Example no. 1
    def __init__(self, *args, **kwargs):

        super().__init__(*args, **kwargs)

        self.nlp_en = spacy.load('en')
        self.nlp_es = spacy.load('es')

        # Add wordnet component
        self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
        self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))
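The constructor above uses the spaCy 2.x calling convention (shortcut model names and a component instance passed to add_pipe). A minimal sketch of the same setup for spaCy 3.x, assuming a spacy-wordnet release that registers the "spacy_wordnet" factory and the full model package names, mirroring the approach in Example no. 3 below:

import spacy

# Assumption: spaCy >= 3.x and a spacy-wordnet version that exposes the
# "spacy_wordnet" pipe factory (see Example no. 3 below).
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")
nlp_en.add_pipe("spacy_wordnet", config={"lang": nlp_en.lang})
nlp_es.add_pipe("spacy_wordnet", config={"lang": nlp_es.lang})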
def crea_grafo(parola1, parola2, num_iter, dominio):
    flag = False

    # create the first links: the synonyms
    all_ss = wn.synsets(parola1)

    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang))

    G = graph.Graph()

    partenza = G.insert_vertex(parola1)
    arrivo = G.insert_vertex(parola2)

    valid_ss = []

    print(dominio)
    for ss in all_ss:
        domini_ss = get_domains_for_synset(ss)
        padri_dominio = lista_domini_padre(dominio)
        for d in padri_dominio:
            if d not in domini_ss:
                continue
            nodo = G.insert_vertex(ss, dominio=d)
            valid_ss.append(ss)
            G.insert_edge(partenza, nodo, 1, "that means")  # SYNONYM
            flag = find_target(G, nodo, arrivo)
            break

    # recursive function
    flag2 = popola_grafo(G, valid_ss, arrivo, num_iter, dominio)
    return G, partenza, arrivo, flag or flag2
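For illustration only, a hypothetical call (argument values are made up; the helpers get_domains_for_synset, lista_domini_padre, find_target and popola_grafo, plus the graph module, are assumed to be defined elsewhere in the project):

# Hypothetical usage: build the graph between two words within a domain,
# allowing up to 3 expansion iterations.
G, start, goal, found = crea_grafo('bank', 'money', num_iter=3, dominio='economy')
print(found)  # True if a path from 'bank' to 'money' was found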
Example no. 3
    def __init__(self, *args, **kwargs):

        super().__init__(*args, **kwargs)

        self.nlp_en = spacy.load("en_core_web_sm")
        self.nlp_es = spacy.load("es_core_news_sm")

        try:
            # Add wordnet component
            self.nlp_en.add_pipe("spacy_wordnet",
                                 config={"lang": self.nlp_en.lang})
            self.nlp_es.add_pipe("spacy_wordnet",
                                 config={"lang": self.nlp_es.lang})
        except TypeError:  # spacy 2.x
            self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
            self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))
 def __init__(self):
     self.__nlp = spacy.load('en_core_web_lg')
     self.__nlp.add_pipe(WordnetAnnotator(self.__nlp.lang), after='tagger')
     self.__domains = [
         'finance', 'banking', 'betting', 'insurance', 'money', 'commerce'
     ]
     self.__lista_elementos_aceitos = ['VERB', 'ADV', 'ADJ', 'NOUN']
     self.__tradutor = Tradutor()
Example no. 5
    def __init__(self, config, feature_extractor, model_trainer):
        assert isinstance(feature_extractor, FeatureExtractor)
        assert isinstance(model_trainer, ModelTrainer)
        assert isinstance(config, ClassifierConfig)

        self._language = config.language
        self._dataset_name = config.dataset
        self._balancing = config.balancing_strategy
        self._nlp = spacy.load(config.language_model)
        self._folder = config.folder
        self.__configure_logger(config)
        if config.with_wordnet is True:
            self._nlp.add_pipe(WordnetAnnotator(self._nlp.lang),
                               after='tagger')
        self._model_trainer = model_trainer
        #ModelTrainer(config.testset_ratio, self._logger.name)
        self._feature_extractor = feature_extractor
        self._data = None
        self._is_testdata = config.is_testdata
    def SpacyStanzaPatternMatching(self):
        path = "../data/originalTexts/" + self.filename

        # download WordNet in English and multilingual form (omw = Open Multilingual Wordnet)
        nltk.downloader.download('wordnet')
        nltk.downloader.download('omw')
        stanza.download("en")

        snlp = stanza.Pipeline(lang="en")
        nlp = spacy_stanza.StanzaLanguage(snlp)
        nlp.add_pipe(WordnetAnnotator(snlp.lang))
        text = open(path).read()
        doc = nlp(text)

        for e in doc.sents:
            print(e.text + "\n|---------------------------------------------|\n")

        # SAVE SENTENCES IN FILE
        '''f = open("SpacyStanzaSegmentation.txt", "w+")
    def __init__(self, use_wordnet=True,  \
            use_synlist=False, synlist_path="./syn_test.txt"):
        """
        Set up the Synonym Expander with a spaCy pipeline for synonym replacement

        Inputs
        ------
        use_wordnet : Boolean
            Whether to query the WordNet DB for synonyms

        use_synlist : Boolean
            Whether to use manually specified synonym lists

        synlist_path : String
            If use_synlist is set to True, a path to the synonym-list file
            must be specified
            The format of the synlist is

            A, B, C
            ED, BG, CG

            Where each row is a set of synonyms
        """
        # Load a spaCy model (supported models are "es" and "en")

        nlp = en_core_web_sm.load()
        # nlp = spacy.load('en')
        nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

        self.nlp = nlp
        self.domain_of_interest = ['person']

        self.all_stopwords = nlp.Defaults.stop_words

        self.use_wordnet = use_wordnet
        self.use_synlist = use_synlist

        if self.use_synlist:
            # TODO : Throw an error if path is not valid
            self.word_to_name_dict, self.name_to_syn_dict = \
                self.parse_synlist(synlist_path)
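The synlist format described in the docstring (one comma-separated synonym set per row) could be read along these lines. This is only a sketch under that assumption; the real parse_synlist and the exact shape of word_to_name_dict / name_to_syn_dict may differ:

def parse_synlist_sketch(synlist_path):
    # Hypothetical reader for the documented format, e.g. a row "A, B, C".
    # The first word of each row is taken as the set's name; every word in
    # the row maps back to that name.
    word_to_name, name_to_syn = {}, {}
    with open(synlist_path) as f:
        for line in f:
            words = [w.strip() for w in line.split(',') if w.strip()]
            if not words:
                continue
            name = words[0]
            name_to_syn[name] = words
            for w in words:
                word_to_name[w] = name
    return word_to_name, name_to_syn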
Example no. 8
    def __init__(self, spaCy_core='small'):
        # load spaCy English model
        if spaCy_core == 'large':
            # includes vectors & entities
            self.nlp = spacy.load('en_core_web_lg')
            self.model_flag = 'large'
        else:
            # includes entities
            self.nlp = spacy.load('en_core_web_sm')
            self.model_flag = 'small'

        ### set up wordnet
        self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')
        self.token = self.nlp('prices')[0]

        # The wordnet extension links the spaCy token to the NLTK WordNet interface, giving access to:
        self.token._.wordnet.synsets()
        self.token._.wordnet.lemmas()

        # And automatically tag with wordnet domains
        self.token._.wordnet.wordnet_domains()
Example no. 9
def spacySynset(word):
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
    token = nlp(word)[0]

    #Load all synsets
    token._.wordnet.synsets()
    #load all lemmas
    token._.wordnet.lemmas()
    token._.wordnet.wordnet_domains()

    economy_domains = ['economy', 'linguistics']

    # Restrict synsets to the chosen domains
    synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
    lemmas_for_synset = []
    if synsets:
        for s in synsets:
            lemmas_for_synset.extend(s.lemma_names())

    return lemmas_for_synset
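A quick, hypothetical call ('price' is an arbitrary example word; the output depends on the installed WordNet data):

# Returns the lemma names of 'price' synsets tagged with the
# 'economy' or 'linguistics' WordNet domains.
print(spacySynset('price'))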
Example no. 10
    def transform(self, X, y=None):
        t0 = time.time()

        nlp = models[self.lang]
        if self.lang == SupportedLanguages.English and self.with_wordnet:
            nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

        logger.debug('transforming with spacy...')

        logger.debug('%d %d', len(X['def1']), len(X['def2']))

        X.loc[:, 'processed_1'] = pd.Series(
            list(nlp.pipe(iter(X['def1']), batch_size=1000)))
        logger.debug('----------processed 1 ------------')

        X.loc[:, 'processed_2'] = pd.Series(
            list(nlp.pipe(iter(X['def2']), batch_size=1000)))
        logger.debug('----------processed 2 ------------')

        X.loc[:, 'word_processed'] = pd.Series(
            list(nlp.pipe(iter(X['word']), batch_size=1000)))
        logger.debug('------------word processed ------------')

        X.loc[:, 'lemmatized_1'] = X['processed_1'].map(
            lambda doc: lemmatizer(doc, nlp))
        X.loc[:,
              'stopwords_removed_1'] = X['lemmatized_1'].map(remove_stopwords)
        logger.debug('-------------lemma and sw removed 1  ------------')

        X.loc[:, 'lemmatized_2'] = X['processed_2'].map(
            lambda doc: lemmatizer(doc, nlp))
        X.loc[:,
              'stopwords_removed_2'] = X['lemmatized_2'].map(remove_stopwords)
        logger.debug('-------------lemma and sw removed 2  ------------')

        logger.debug('SpacyProcessor.transform() took %.3f seconds' %
                     (time.time() - t0))

        return X
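A hypothetical usage sketch of this transform step, assuming X is a pandas DataFrame with 'def1', 'def2' and 'word' columns and that `processor` is a configured SpacyProcessor instance (the class name appears in the timing log above):

import pandas as pd

# Example input: two candidate definitions and the headword they define.
X = pd.DataFrame({'def1': ['a financial institution'],
                  'def2': ['the sloping land beside a body of water'],
                  'word': ['bank']})
X = processor.transform(X)  # adds processed_*, lemmatized_* and stopwords_removed_* columns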
Example no. 11
 def __init__(self):
     print('loading the wordnet corpus...')
     wordnet.ensure_loaded()
     print('loading done')
     self.nlp = spacy.load('en')
     self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')
     f = open('sorted_first_names.txt', 'r')
     lines = f.readlines()
     self.first_name_array = []
     for line in lines:
         line = line.rstrip()
         self.first_name_array.append(line)
     f = open('sorted_last_names.txt', 'r')
     lines = f.readlines()
     self.last_name_array = []
     for line in lines:
         line = line.rstrip()
         self.last_name_array.append(line)
     f = open('bad_words.txt', 'r')
     lines = f.readlines()
     self.profane_words_array = []
     for line in lines:
         line = line.rstrip()
         self.profane_words_array.append(line)
Example no. 12
def main(args):
    if args.cuda:
        spacy.require_gpu()
    # Load a spaCy model (supported models are "es" and "en")
    print("Loading spacy...")
    nlp = spacy.load("en_core_web_lg")
    print("Done")
    nlp.tokenizer = lambda text: whitespace_tokenizer(text, nlp.vocab)
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
    nlp.add_pipe(BeneparComponent("benepar_en2"))

    with open(args.data) as f:
        lines = [line.strip() for line in list(f)]

    all_texts = []
    all_feats = []
    docs = nlp.pipe(lines, batch_size=args.batch_size)
    for doc in tqdm(docs, desc="Extracting feats", total=len(lines)):
        doc_feats = []
        doc_texts = []
        for token in doc:
            t_feats = extract_feats(token)
            doc_feats.append(t_feats)
            doc_texts.append(token.text)
        all_feats.append(doc_feats)
        all_texts.append(doc_texts)

    with open(args.data.replace(".tok", ".feats"), "w") as f:
        f.write("|".join((";".join(fn[:2]) for fn in FEATS)))
        f.write("\n")
        for text, doc_feats in zip(all_texts, all_feats):
            t_feats_joined = ["|".join(tf) for tf in doc_feats]
            line_feats = " ".join(
                ["|".join((t, f)) for t, f in zip(text, t_feats_joined)])
            f.write(line_feats)
            f.write("\n")
Example no. 13
import spacy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

stopwords = None

nlp_en = spacy.load('en_core_web_sm')
nlp_en.add_pipe(WordnetAnnotator(nlp_en.lang), after='tagger')
def calcola_domini(word1: str, word2: str, phrase: str):

    # Loading a supported spaCy model (either "es" or "en")
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang))

    # Define the token, i.e. the user's keyword whose definition is wanted (in this particular context)
    token1 = nlp(word1)[0]
    type1_k = token1.pos_
    #if not (type1_k is 'NOUN' or type1_k is 'VERB' or type1_k is 'ADJ' or type1_k is 'INTJ' or type1_k is 'PROPN'):
    #    return [None,1]
    token2 = nlp(word2)[0]
    type2_k = token2.pos_
    #if not (type2_k is 'NOUN' or type2_k is 'VERB' or type2_k is 'ADJ' or type2_k is 'INTJ' or type2_k is 'PROPN'):
    #    return [None, 1]

    # Define the sentence, i.e. the user's phrase that contains the token
    sentence = nlp(phrase)

    #domains 1
    token1_domains = token1._.wordnet.wordnet_domains()
    if (len(token1_domains) == 0):
        return [None, 1]
    #domains 2
    token2_domains = token2._.wordnet.wordnet_domains()
    if (len(token2_domains) == 0):
        return [None, 1]

    print(token2_domains)
    print(token1_domains)

    #common domains
    c_domains = []
    for d in token1_domains:
        if d in token2_domains:
            c_domains.append(d)

    token_domains_dict = {d: 0 for d in c_domains}

    # For each word in the sentence
    for sentence_token in sentence:
        type = sentence_token.pos_
        #if type is 'NOUN' or type is 'VERB' or type is 'ADJ' or type is 'INTJ' or type is 'PROPN':
        # Retrieve the list of domains of the given token
        list_domains = sentence_token._.wordnet.wordnet_domains()
        # Loop over the list: if a domain is also among the common domains, increment its counter
        encountered_domains = []
        for i in list_domains:
            if i in c_domains and not (i in encountered_domains):
                token_domains_dict[i] += 1
                encountered_domains.append(i)

    final_dict = sorted(token_domains_dict.items(),
                        key=operator.itemgetter(1),
                        reverse=True)

    print("________________________________________________________")
    print(token_domains_dict.items())

    ordered_domains = [domain for domain, _ in final_dict]

    return ordered_domains
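A hypothetical call (example values only; the function prints the shared domains it finds and returns them ordered by how often they occur in the sentence):

# Rank the WordNet domains shared by 'bank' and 'loan', using the sentence
# for disambiguation.
domains = calcola_domini('bank', 'loan', 'I asked the bank for a loan')
print(domains)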
Example no. 15
 def __init__(self,selectivity_rate):
     # percentage of words that have to match on the WordNet domain
     self.rate=selectivity_rate
     self.nlp = spacy.load('en')
     self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')
Example no. 16
def findDefinition(word: str, phrase: str, word_index: int):

    # Loading a supported spaCy model (either "es" or "en")
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang))

    # Define the token, i.e. the user's keyword whose definition is wanted (in this particular context)
    token = nlp(word)[0]
    # Define the sentence, i.e. the user's phrase that contains the token
    sentence = nlp(phrase)

    # From WordNet Domains retrieve all the token's domains
    token_domains = token._.wordnet.wordnet_domains()
    token_syn = wn.synsets(token.text)
    if (len(token_domains) == 0):
        if (len(token_syn) == 0):
            return [['No definition found'], 1]
        return [[
            token_syn[0]._pos.upper() + ' - ' + token_syn[0].definition()
        ], 0]
    # Support structures
    # Convert the token's domain list into a dictionary (domain -> count)
    token_domains_dict = {d: 0 for d in token_domains}
    max_token_domains = []  #list of frequent domains
    synset_dict = dict()  #synsets dictionary
    max_synset_list = []  #list of frequent synsets
    def_list = []  #definitions list
    # For each word in the sentence
    for sentence_token in sentence:
        type = sentence_token.pos_
        if word_index == sentence_token.idx:
            type_k = type

        # Retrieve the list of domains of the given token
        list_domains = sentence_token._.wordnet.wordnet_domains()
        # Loop over the list: if a domain is also among the token's domains, increment its counter
        encountered_domains = []
        for i in list_domains:
            if i in token_domains and not (i in encountered_domains):
                token_domains_dict[i] += 1
                encountered_domains.append(i)

    # Take the most repeated domain(s):
    # find the item with the max value in the dictionary
    itemMaxValue = max(token_domains_dict.items(), key=lambda x: x[1])
    for key, value in token_domains_dict.items():
        if value == itemMaxValue[1]:
            max_token_domains.append(key)
    # Retrieve the synset that is repeated frequently
    for j in max_token_domains:
        verified_pos = verify_pos(type_k)
        synsets = all_the_synsets(token.text, verified_pos, [j])
        if len(synsets) == 0:
            synsets = all_the_synsets(token.text, None, [j])
            all_syn = wn.synsets(token.text, verified_pos)
            if len(all_syn) != 0:
                synsets.append(all_syn[0])
        for i in synsets:
            if i not in synset_dict:
                synset_dict[i] = 1
            else:
                synset_dict[i] += 1
    # Max value in the dictionary, used to find the most frequent items
    max_synset = max(synset_dict.items(), key=lambda x: x[1])
    for key, value in synset_dict.items():
        if value == max_synset[1]:  # check synset and token types
            max_synset_list.append(key)
    if (max_synset[1] - 1) != 0 and len(max_synset_list) == 1:
        for key, value in synset_dict.items():
            if value == max_synset[1] - 1:
                max_synset_list.append(key)

    #List of definitions
    for k in max_synset_list:
        def_list.append(k._pos.upper() + ' - ' + k.definition())
    return [def_list, 0]
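A hypothetical call, assuming word_index is the character offset of the keyword inside the phrase (the function compares it against spaCy's Token.idx):

# 'bank' starts at character offset 4 in the phrase below.
definitions, error = findDefinition('bank', 'the bank approved my loan', 4)
print(definitions)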
Example no. 17
from typing import List, NewType

import spacy
import numpy as np

from textacy.similarity import jaccard
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

# project
from woffle.functions.lists import strip
from woffle.parse.prob.spacy import roots

# -- Type synonyms --------------------------------------------------------------
Array = NewType('Array', np.ndarray)
Doc = NewType('Doc', spacy.tokens.doc.Doc)

model = spacy.load('en_core_web_md')
model.add_pipe(WordnetAnnotator(model.lang), after='tagger')


# -- Interfaces -----------------------------------------------------------------
# semantic similarity
def condition(xs: List[str]) -> float:
    "implement hypernym lookup"
    xs_ = strip(xs)
    return (0.0 if len(xs_) <= 1 else 1.0)


def selection(xs: List[str]) -> str:
    return hypernyms(xs)


def hypernyms(xs: List[str]) -> str:
Example no. 18
def useAVLFunction(inputWriting):
    acadCoreAVL = open(
        '/Users/yanisa/Code_GitHub/MAThesis_YourAcadWritingFriend/miscInputFiles_WordListsPhrasesEtc/acadCore_AVL.txt'
    ).read()
    stopWordsFile = open(
        '/Users/yanisa/Code_GitHub/MAThesis_YourAcadWritingFriend/codeFiles/acadWritingFeaturesAnalysisFiles/acadWordLists/avlStopWords.txt'
    ).read().split('\n')

    dictWithAVLSuggestionsPerInputWord = {}
    # Load a spacy model and run my writing sample through (turns it into a Doc object)
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
    inputWritingTokenized = nlp(inputWriting)

    # Make my AVL into a list with each word as an element
    # Then make it into a set so it's more efficient (looks up hash value rather than the actual word each time)
    acadCoreAVL_Set = set(acadCoreAVL.split('\n'))

    # Prepare stop words list so it ignores anything that has YANISA
    stopWordsList = []
    for eachLine in stopWordsFile:
        if 'YANISA' in eachLine:
            continue
        else:
            stopWordsList.append(eachLine)

    # Check to see if any of the synsets match with an academic word from the AVL AND have the same POS
    for word in inputWritingTokenized:
        # Check the POS - only move forward if they are one of the WN synset POS-tags
        if word.pos_ != 'NOUN' and \
        word.pos_ != 'ADJ' and \
        word.pos_ != 'ADV' and \
        word.pos_ != 'VERB':
            continue
        # If word is already in the AVL, skip it so it doesn't provide a suggestion
        if word.text.lower() in acadCoreAVL_Set:
            continue
        # If word is in stop words list, skip it too
        if word.text.lower() in stopWordsList:
            continue

        # Check the spacy POS and give it a hacky code that matches the wordnet synset output
        spacyPOS = word.pos_
        if spacyPOS == 'NOUN':
            spacyToWordNetPOS = 'n'
        if spacyPOS == 'VERB':
            spacyToWordNetPOS = 'v'
        if spacyPOS == 'ADJ':
            spacyToWordNetPOS = 'a'
        if spacyPOS == 'ADV':
            spacyToWordNetPOS = 'r'

        # Get the word index from spacy to be used later
        #   index is used in the file that calls this, so it doesn't duplicate suggestions where MWEs are
        # wordIndex = word.i

        # Get the synsets from WordNet for each word in the writing sample
        inputWritingSynsets = word._.wordnet.synsets()

        # Cleaning the synsets out so I only get the actual word
        #   synsets print like: Synset('synset.n.01')
        for eachSynset in inputWritingSynsets:
            cleanOutFrontOfSynset = str(eachSynset).replace('Synset(\'', '')
            splitEachSynset = str(cleanOutFrontOfSynset).split('.')
            actualWordInSynset = splitEachSynset[0]
            posInSynset = splitEachSynset[1]

            # Keep the synset and original word if a word in the synset is in the AVL
            #   and the synset word has the same POS as the orig word in the writing sample
            #   and the suggested and orig words are not identical
            #   and the suggested and orig words are not lemmas
            synsetToAddToDict = ''
            actualWordInSynsetStringLower = str(actualWordInSynset).lower()
            wordStringLower = str(word).lower()
            if actualWordInSynset.lower() in acadCoreAVL_Set and \
            posInSynset == spacyToWordNetPOS and \
            actualWordInSynsetStringLower != wordStringLower and \
            actualWordInSynsetStringLower not in wordStringLower and \
            wordStringLower not in actualWordInSynsetStringLower:
                # Removes super short words so my bottom script can run (e.g. 'am', 'is')
                if len(word.text) >= 3:

                    # Use word_forms to make sure suggestion has the correct word form
                    # e.g. so 'publishing' (rather than 'publish') is suggested for 'writing',
                    # but 'publish' is still suggested for 'write'
                    wordFormsEachSynonym = get_word_forms(
                        actualWordInSynsetStringLower)
                    # For each POS, get the word forms, and keep the one which has the appropriate ending
                    if posInSynset == 'n':
                        nounSet = wordFormsEachSynonym[posInSynset]
                        if word.text[-1] == 's':
                            for eachSyn in nounSet:
                                if eachSyn[-1] == 's':
                                    synsetToAddToDict = eachSyn
                        else:
                            synsetToAddToDict = actualWordInSynsetStringLower
                    if posInSynset == 'r':
                        # No adv endings added
                        synsetToAddToDict = actualWordInSynsetStringLower
                    if posInSynset == 'a':
                        # No adj endings added
                        synsetToAddToDict = actualWordInSynsetStringLower
                    if posInSynset == 'v':
                        verbSet = wordFormsEachSynonym[posInSynset]
                        if word.text[-3:] == 'ing':
                            for eachSyn in verbSet:
                                if eachSyn[-3:] == 'ing':
                                    synsetToAddToDict = eachSyn
                        elif word.text[-1] == 'd':
                            for eachSyn in verbSet:
                                if eachSyn[-1] == 'd':
                                    synsetToAddToDict = eachSyn
                        elif word.text[-1] == 's':
                            for eachSyn in verbSet:
                                if eachSyn[-1] == 's':
                                    synsetToAddToDict = eachSyn
                    # TODO add other verb endings
                        else:
                            synsetToAddToDict = actualWordInSynsetStringLower

                    if word not in dictWithAVLSuggestionsPerInputWord:
                        # Set dict so word is key and synonym is the first element in a list in the dict (as the value)
                        # The list thing is for if/when I add more synonyms to the orig word
                        dictWithAVLSuggestionsPerInputWord[word] = [
                            synsetToAddToDict
                        ]
                    else:
                        dictWithAVLSuggestionsPerInputWord[word].append(
                            synsetToAddToDict)
                    #print('AVL Suggested Word: ' + str(actualWordInSynset))
                    #print('Original Word: ' + str(word))
    return dictWithAVLSuggestionsPerInputWord
Example no. 19
def processAllArticles(directory):

    wikipediaArticles = os.listdir(directory)
    #print(wikipediaArticles)

    nlp = spacy.load("en_core_web_sm")

    #nltk.download('wordnet')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

    dependCMap = dict()
    headsMap = dict()
    namedEntityMap = dict()
    sentenceMap = dict()

    # Process all the articles and find the respective NLP features
    for article in wikipediaArticles:

        #--------------- SCANNING THE ARTICLE -----------------
        filepath = directory + article
        print('Now Reading File . . . . . . . ', article)

        corpus = read_data(filepath)
        doc = nlp(corpus)

        #------------- EXTRACTING NLP FEATURES ----------------
        lemmasMap = dict()
        posMap = dict()
        tagsMap = dict()

        synsetsMap = dict()
        hypernymsMap = dict()
        hyponymsMap = dict()

        partMeronymsMap = dict()
        substanceMeronymsMap = dict()
        holonymsMap = dict()

        #1. TOKENIZATION
        tokens = tokennize(doc)

        for token in doc:
            #print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

            lemmasMap[token] = token.lemma_  # 2. Lemmatization
            posMap[token] = token.pos_  # 3. POS tagging (coarse-grained)
            tagsMap[token] = token.tag_  # 4. POS tagging (fine-grained)

            synsetsMap[token] = getSynsets(token)  # 5. SynSet of the token
            hypernymsMap[token] = getHypernyms(token)  # 6. Hypernyms
            hyponymsMap[token] = getHyponyms(token)  # 7. Hyponyms

            partMeronymsMap[token] = getPartMeronyms(token)  # 8. Part Meronyms
            substanceMeronymsMap[token] = getSubstanceMeronyms(
                token)  # 9. Substance Meronyms
            holonymsMap[token] = getHolonyms(token)  # 10. Holonyms
        #endFor

        # 11. DEPENDENCY labels and 12. SYNTACTIC HEADS for each token in the doc
        dependCMap[article], headsMap[article] = synParsing(doc)

        # 13. NAMED ENTITIES
        namedEntityMap[article] = getNamedEntities(doc)

        # 14. SENTENCE TOKENIZATION
        sentenceMap[article] = sentenceTokennize(doc)

    # Returning all the 14 NLP features collected FOR EVERY WIKIPEDIA ARTICLE
    return tokens, lemmasMap, posMap, tagsMap, dependCMap, headsMap, namedEntityMap, synsetsMap, hypernymsMap, hyponymsMap, partMeronymsMap, substanceMeronymsMap, holonymsMap, sentenceMap
Example no. 20
def processFirstArticle(directory, resultDirectory):

    #------------- SCANNING THE RELEVANT FILE -------------
    wikipediaArticles = os.listdir(directory)
    #print(wikipediaArticles)

    nlp = spacy.load("en_core_web_sm")

    #nltk.download('wordnet')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

    dependCMap = dict()
    headsMap = dict()
    namedEntityMap = dict()
    sentenceMap = dict()

    firstArticle = wikipediaArticles[0]
    filepath = directory + firstArticle

    print('Now Reading File . . . . . . . ', firstArticle)

    corpus = read_data(filepath)
    doc = nlp(corpus)

    #------------- EXTRACTING NLP FEATURES ----------------
    lemmasMap = dict()
    posMap = dict()
    tagsMap = dict()

    synsetsMap = dict()
    hypernymsMap = dict()
    hyponymsMap = dict()

    partMeronymsMap = dict()
    substanceMeronymsMap = dict()
    holonymsMap = dict()

    #1. TOKENIZATION
    tokens = tokennize(doc)

    # For each token in the document, extracting features
    for token in doc:

        lemmasMap[token] = token.lemma_  # 2. Lemmatization
        posMap[token] = token.pos_  # 3. POS tagging (coarse-grained)
        tagsMap[token] = token.tag_  # 4. POS tagging (fine-grained)

        synsetsMap[token] = getSynsets(token)  # 5. SynSet of the token
        hypernymsMap[token] = getHypernyms(token)  # 6. Hypernyms
        hyponymsMap[token] = getHyponyms(token)  # 7. Hyponyms

        partMeronymsMap[token] = getPartMeronyms(token)  # 8. Part Meronyms
        substanceMeronymsMap[token] = getSubstanceMeronyms(
            token)  # 9. Substance Meronyms
        holonymsMap[token] = getHolonyms(token)  # 10. Holonyms
    #endFor

    # 11. DEPENDENCY labels and 12. SYNTACTIC HEADS for each token in the doc
    dependCMap[firstArticle], headsMap[firstArticle] = synParsing(doc)

    # 13. NAMED ENTITIES
    namedEntityMap[firstArticle] = getNamedEntities(doc)

    # 14. SENTENCE TOKENIZATION
    sentenceMap[firstArticle] = sentenceTokennize(doc)

    #---------------- WRITING THE FEATURES ----------------

    with open(resultDirectory + 'D01-Tokens.txt', 'w',
              encoding='utf-8') as f01:
        json.dump(tokens, f01)

    f02 = open(resultDirectory + 'D02-Lemmas.txt', 'w')
    f02.write(str(lemmasMap))
    f02.close()

    f03 = open(resultDirectory + 'D03-POS.txt', 'w')
    f03.write(str(posMap))
    f03.close()

    f04 = open(resultDirectory + 'D04-Tags.txt', 'w')
    f04.write(str(tagsMap))
    f04.close()

    f05 = open(resultDirectory + 'D05-Synsets.txt', 'w')
    f05.write(str(synsetsMap))
    f05.close()

    f06 = open(resultDirectory + 'D06-Hypernyms.txt', 'w')
    f06.write(str(hypernymsMap))
    f06.close()

    f07 = open(resultDirectory + 'D07-Hyponyms.txt', 'w')
    f07.write(str(hyponymsMap))
    f07.close()

    f08 = open(resultDirectory + 'D08-Meronyms-Part.txt', 'w')
    f08.write(str(partMeronymsMap))
    f08.close()

    f09 = open(resultDirectory + 'D09-Meronyms-Substance.txt', 'w')
    f09.write(str(substanceMeronymsMap))
    f09.close()

    f10 = open(resultDirectory + 'D10-Holonyms.txt', 'w')
    f10.write(str(holonymsMap))
    f10.close()

    f11 = open(resultDirectory + 'D11-Dependencies.txt', 'w')
    f11.write(str(dependCMap))
    f11.close()

    f12 = open(resultDirectory + 'D12-Syntactic-Heads.txt', 'w')
    f12.write(str(headsMap))
    f12.close()

    f13 = open(resultDirectory + 'D13-Named-Entities.txt', 'w')
    f13.write(str(namedEntityMap))
    f13.close()

    f14 = open(resultDirectory + 'D14-Tokenized-Sentences.txt', 'w')
    f14.write(str(sentenceMap))
    f14.close()
Example no. 21
# coding: utf-8
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
from spacy import load
import pandas as pd
nlp = load('en_core_web_sm')
nlp.add_pipe(WordnetAnnotator(nlp.lang))
token = nlp('Calculator.')[0]
token._.wordnet.synsets()
meaning1, meaning2 = token._.wordnet.synsets()
meaning1
meaning1.name()
meaning1.lemmas()
meaning2.lemmas()
token._.wordnet.wordnet_domains()
nlp('mathematics')[0]._.wordnet.wordnet_domains()
'science' in _
nlp('pure_science')[0]._.wordnet.wordnet_domains()
nlp('science')[0]._.wordnet.wordnet_domains()
wnet = nlp('science')[0]._.wordnet
wnet.wordnet_synsets_for_domain(['science'])  # a domain list argument is required
wnet.lemmas()
token = nlp('human')[0]
token._.wordnet.lemmas()
token._.wordnet.synsets()
[c.lemmas() for c in token._.wordnet.synsets()]
syn =token._.wordnet.synsets()
x = syn[0]
x
x.common_hypernyms()