def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.nlp_en = spacy.load('en')
    self.nlp_es = spacy.load('es')
    # Add wordnet component
    self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
    self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))
def crea_grafo(parola1, parola2, num_iter, dominio):
    flag = False
    # create the first links: the synonyms
    all_ss = wn.synsets(parola1)
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang))
    G = graph.Graph()
    partenza = G.insert_vertex(parola1)
    arrivo = G.insert_vertex(parola2)
    valid_ss = []
    print(dominio)
    for ss in all_ss:
        domini_ss = get_domains_for_synset(ss)
        padri_dominio = lista_domini_padre(dominio)
        for d in padri_dominio:
            if d not in domini_ss:
                continue
            nodo = G.insert_vertex(ss, dominio=d)
            valid_ss.append(ss)
            G.insert_edge(partenza, nodo, 1, "that means")  # SYNONYM
            flag = find_target(G, nodo, arrivo)
            break
    # recursive function
    flag2 = popola_grafo(G, valid_ss, arrivo, num_iter, dominio)
    return G, partenza, arrivo, flag or flag2
def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.nlp_en = spacy.load("en_core_web_sm") self.nlp_es = spacy.load("es_core_news_sm") try: # Add wordnet component self.nlp_en.add_pipe("spacy_wordnet", config={"lang": self.nlp_en.lang}) self.nlp_es.add_pipe("spacy_wordnet", config={"lang": self.nlp_es.lang}) except TypeError: # spacy 2.x self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang)) self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))
def __init__(self):
    self.__nlp = spacy.load('en_core_web_lg')
    self.__nlp.add_pipe(WordnetAnnotator(self.__nlp.lang), after='tagger')
    self.__domains = [
        'finance', 'banking', 'betting', 'insurance', 'money', 'commerce'
    ]
    self.__lista_elementos_aceitos = ['VERB', 'ADV', 'ADJ', 'NOUN']
    self.__tradutor = Tradutor()
def __init__(self, config, feature_extractor, model_trainer):
    assert isinstance(feature_extractor, FeatureExtractor)
    assert isinstance(model_trainer, ModelTrainer)
    assert isinstance(config, ClassifierConfig)

    self._language = config.language
    self._dataset_name = config.dataset
    self._balancing = config.balancing_strategy
    self._nlp = spacy.load(config.language_model)
    self._folder = config.folder
    self.__configure_logger(config)

    if config.with_wordnet is True:
        self._nlp.add_pipe(WordnetAnnotator(self._nlp.lang), after='tagger')

    self._model_trainer = model_trainer  # ModelTrainer(config.testset_ratio, self._logger.name)
    self._feature_extractor = feature_extractor
    self._data = None
    self._is_testdata = config.is_testdata
def SpacyStanzaPatternMatching(self):
    path = "../data/originalTexts/" + self.filename
    # download WordNet in English and multi-language (omw = Open Multilingual Wordnet)
    nltk.downloader.download('wordnet')
    nltk.downloader.download('omw')
    stanza.download("en")
    snlp = stanza.Pipeline(lang="en")
    nlp = spacy_stanza.StanzaLanguage(snlp)
    nlp.add_pipe(WordnetAnnotator(snlp.lang))
    text = open(path).read()
    doc = nlp(text)
    for e in doc.sents:
        print(e.text + "\n|---------------------------------------------|\n")
    # SAVE SENTENCES IN FILE
    '''f = open("SpacyStanzaSegmentation.txt", "w+")
def __init__(self, use_wordnet=True,
             use_synlist=False, synlist_path="./syn_test.txt"):
    """
    Set up the synonym expander with a spaCy pipeline for synonym replacement.

    Inputs
    ------
    use_wordnet : Boolean
        Whether to query the WordNet DB for synonyms
    use_synlist : Boolean
        Whether to use manually specified synonym lists
    synlist_path : String
        If use_synlist is set to True, a path to the synonym list file must
        be specified. The format of the synlist is
            A, B, C
            ED, BG, CG
        where each row is a set of synonyms.
    """
    # Load a spaCy model (supported models are "es" and "en")
    nlp = en_core_web_sm.load()
    # nlp = spacy.load('en')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
    self.nlp = nlp
    self.domain_of_interest = ['person']
    self.all_stopwords = nlp.Defaults.stop_words
    self.use_wordnet = use_wordnet
    self.use_synlist = use_synlist
    if self.use_synlist:
        # TODO : Throw an error if path is not valid
        self.word_to_name_dict, self.name_to_syn_dict = \
            self.parse_synlist(synlist_path)
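The parse_synlist helper referenced above is not included in this snippet. A minimal sketch, assuming the comma-separated row format described in the docstring and that the first word of each row acts as the group's canonical name (the real project may map the two dictionaries differently):

def parse_synlist(self, synlist_path):
    # Hypothetical reconstruction: builds the two dictionaries consumed in __init__.
    word_to_name_dict = {}   # each word -> canonical name of its synonym group
    name_to_syn_dict = {}    # canonical name -> full list of synonyms in that row
    with open(synlist_path) as f:
        for line in f:
            synonyms = [w.strip() for w in line.split(',') if w.strip()]
            if not synonyms:
                continue
            name = synonyms[0]  # assumption: the first entry names the group
            name_to_syn_dict[name] = synonyms
            for word in synonyms:
                word_to_name_dict[word] = name
    return word_to_name_dict, name_to_syn_dict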
def __init__(self, spaCy_core='small'):
    # load spaCy English model
    if spaCy_core == 'large':
        # includes vectors & entities
        self.nlp = spacy.load('en_core_web_lg')
        self.model_flag = 'large'
    else:
        # includes entities
        self.nlp = spacy.load('en_core_web_sm')
        self.model_flag = 'small'

    ### set up wordnet
    self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')
    self.token = self.nlp('prices')[0]
    # The wordnet extension links the spaCy token with the NLTK WordNet interface
    # by giving access to
    self.token._.wordnet.synsets()
    self.token._.wordnet.lemmas()
    # and automatically tags with WordNet domains
    self.token._.wordnet.wordnet_domains()
def spacySynset(word):
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
    token = nlp(word)[0]
    # Load all synsets
    token._.wordnet.synsets()
    # Load all lemmas
    token._.wordnet.lemmas()
    token._.wordnet.wordnet_domains()
    economy_domains = ['economy', 'linguistics']
    # Constrain synsets to a domain
    synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
    lemmas_for_synset = []
    if synsets:
        for s in synsets:
            lemmas_for_synset.extend(s.lemma_names())
    return lemmas_for_synset
def transform(self, X, y=None):
    t0 = time.time()
    nlp = models[self.lang]
    if self.lang == SupportedLanguages.English and self.with_wordnet:
        nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
    logger.debug('transforming with spacy...')
    logger.debug('%d %d', len(X['def1']), len(X['def2']))

    X.loc[:, 'processed_1'] = pd.Series(
        list(nlp.pipe(iter(X['def1']), batch_size=1000)))
    logger.debug('----------processed 1 ------------')
    X.loc[:, 'processed_2'] = pd.Series(
        list(nlp.pipe(iter(X['def2']), batch_size=1000)))
    logger.debug('----------processed 2 ------------')
    X.loc[:, 'word_processed'] = pd.Series(
        list(nlp.pipe(iter(X['word']), batch_size=1000)))
    logger.debug('------------word processed ------------')

    X.loc[:, 'lemmatized_1'] = X['processed_1'].map(
        lambda doc: lemmatizer(doc, nlp))
    X.loc[:, 'stopwords_removed_1'] = X['lemmatized_1'].map(remove_stopwords)
    logger.debug('-------------lemma and sw removed 1 ------------')
    X.loc[:, 'lemmatized_2'] = X['processed_2'].map(
        lambda doc: lemmatizer(doc, nlp))
    X.loc[:, 'stopwords_removed_2'] = X['lemmatized_2'].map(remove_stopwords)
    logger.debug('-------------lemma and sw removed 2 ------------')

    logger.debug('SpacyProcessor.transform() took %.3f seconds' % (time.time() - t0))
    return X
def __init__(self):
    print('loading the wordnet corpus...')
    wordnet.ensure_loaded()
    print('loading done')
    self.nlp = spacy.load('en')
    self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')

    f = open('sorted_first_names.txt', 'r')
    lines = f.readlines()
    self.first_name_array = []
    for line in lines:
        line = line.rstrip()
        self.first_name_array.append(line)

    f = open('sorted_last_names.txt', 'r')
    lines = f.readlines()
    self.last_name_array = []
    for line in lines:
        line = line.rstrip()
        self.last_name_array.append(line)

    f = open('bad_words.txt', 'r')
    lines = f.readlines()
    self.profane_words_array = []
    for line in lines:
        line = line.rstrip()
        self.profane_words_array.append(line)
def main(args):
    if args.cuda:
        spacy.require_gpu()

    # Load a spaCy model (supported models are "es" and "en")
    print("Loading spacy...")
    nlp = spacy.load("en_core_web_lg")
    print("Done")
    nlp.tokenizer = lambda text: whitespace_tokenizer(text, nlp.vocab)
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
    nlp.add_pipe(BeneparComponent("benepar_en2"))

    with open(args.data) as f:
        lines = [line.strip() for line in list(f)]

    all_texts = []
    all_feats = []
    docs = nlp.pipe(lines, batch_size=args.batch_size)
    for doc in tqdm(docs, desc="Extracting feats", total=len(lines)):
        doc_feats = []
        doc_texts = []
        for token in doc:
            t_feats = extract_feats(token)
            doc_feats.append(t_feats)
            doc_texts.append(token.text)
        all_feats.append(doc_feats)
        all_texts.append(doc_texts)

    with open(args.data.replace(".tok", ".feats"), "w") as f:
        f.write("|".join((";".join(fn[:2]) for fn in FEATS)))
        f.write("\n")
        for text, doc_feats in zip(all_texts, all_feats):
            t_feats_joined = ["|".join(tf) for tf in doc_feats]
            line_feats = " ".join(
                ["|".join((t, f)) for t, f in zip(text, t_feats_joined)])
            f.write(line_feats)
            f.write("\n")
import spacy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

stopwords = None
nlp_en = spacy.load('en_core_web_sm')
nlp_en.add_pipe(WordnetAnnotator(nlp_en.lang), after='tagger')
def calcola_domini(word1: str, word2: str, phrase: str):
    # Loading the supported spacy model (between "es" and "en")
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang))

    # The tokens are the user's keywords whose definition is wanted (in this particular context)
    token1 = nlp(word1)[0]
    type1_k = token1.pos_
    # if not (type1_k is 'NOUN' or type1_k is 'VERB' or type1_k is 'ADJ' or type1_k is 'INTJ' or type1_k is 'PROPN'):
    #     return [None, 1]
    token2 = nlp(word2)[0]
    type2_k = token2.pos_
    # if not (type2_k is 'NOUN' or type2_k is 'VERB' or type2_k is 'ADJ' or type2_k is 'INTJ' or type2_k is 'PROPN'):
    #     return [None, 1]

    # The sentence is the user's phrase in which the tokens appear
    sentence = nlp(phrase)

    # domains 1
    token1_domains = token1._.wordnet.wordnet_domains()
    if len(token1_domains) == 0:
        return [None, 1]
    # domains 2
    token2_domains = token2._.wordnet.wordnet_domains()
    if len(token2_domains) == 0:
        return [None, 1]
    print(token2_domains)
    print(token1_domains)

    # common domains
    c_domains = []
    for d in token1_domains:
        if d in token2_domains:
            c_domains.append(d)
    token_domains_dict = {c_domains[i]: 0 for i in range(0, len(c_domains), 1)}

    # For each word in the sentence
    for sentence_token in sentence:
        type = sentence_token.pos_
        # if type is 'NOUN' or type is 'VERB' or type is 'ADJ' or type is 'INTJ' or type is 'PROPN':
        # Retrieve the list of domains of the given token
        list_domains = sentence_token._.wordnet.wordnet_domains()
        # Loop on the list: if a domain is also in the common list, its counter is incremented
        encountered_domains = []
        for i in list_domains:
            if i in c_domains and not (i in encountered_domains):
                token_domains_dict[i] += 1
                encountered_domains.append(i)

    final_dict = sorted(token_domains_dict.items(), key=operator.itemgetter(1), reverse=True)
    print("________________________________________________________")
    print(token_domains_dict.items())
    list = []
    for i in final_dict:
        list.append(i[0])
    return list
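calcola_domini returns the WordNet domains shared by both keywords, ordered by how often they occur across the words of the phrase (or [None, 1] when either keyword has no domains). A hypothetical call, with made-up inputs for illustration:

common_domains = calcola_domini("bank", "money", "I withdrew cash from the bank")
print(common_domains)  # shared domains, most frequent in the phrase first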
def __init__(self, selectivity_rate):
    # percentage of words that have to match on the wordnet domain
    self.rate = selectivity_rate
    self.nlp = spacy.load('en')
    self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')
def findDefinition(word: str, phrase: str, word_index: int):
    # Loading the supported spacy model (between "es" and "en")
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang))

    # The token is the user's keyword whose definition is wanted (in this particular context)
    token = nlp(word)[0]
    # The sentence is the user's phrase in which the token appears
    sentence = nlp(phrase)

    # From WordNet Domains retrieve all the token's domains
    token_domains = token._.wordnet.wordnet_domains()
    token_syn = wn.synsets(token.text)
    if len(token_domains) == 0:
        if len(token_syn) == 0:
            return [['No definition found'], 1]
        return [[token_syn[0]._pos.upper() + ' - ' + token_syn[0].definition()], 0]

    # Support structures
    token_domains_dict = {
        token_domains[i]: 0 for i in range(0, len(token_domains), 1)
    }  # Convert the list into a dictionary (domain, int)
    max_token_domains = []  # list of frequent domains
    synset_dict = dict()    # synsets dictionary
    max_synset_list = []    # list of frequent synsets
    def_list = []           # definitions list

    # For each word in the sentence
    for sentence_token in sentence:
        type = sentence_token.pos_
        if word_index == sentence_token.idx:
            type_k = type
        # Retrieve the list of domains of the given token
        list_domains = sentence_token._.wordnet.wordnet_domains()
        # Loop on the list: if a domain is also in the token's list, its counter is incremented
        encountered_domains = []
        for i in list_domains:
            if i in token_domains and not (i in encountered_domains):
                token_domains_dict[i] += 1
                encountered_domains.append(i)

    # Take the most repeated domain(s)
    # Find the item with the max value in the dictionary
    itemMaxValue = max(token_domains_dict.items(), key=lambda x: x[1])
    for key, value in token_domains_dict.items():
        if value == itemMaxValue[1]:
            max_token_domains.append(key)

    # Retrieve the synset that is repeated most frequently
    for j in max_token_domains:
        verified_pos = verify_pos(type_k)
        synsets = all_the_synsets(token.text, verified_pos, [j])
        if len(synsets) == 0:
            synsets = all_the_synsets(token.text, None, [j])
            all_syn = wn.synsets(token.text, verified_pos)
            if len(all_syn) != 0:
                synsets.append(all_syn[0])
        for i in synsets:
            if i not in synset_dict:
                synset_dict[i] = 1
            else:
                synset_dict[i] += 1

    # Max value in the dictionary, used to find the frequent items
    max_synset = max(synset_dict.items(), key=lambda x: x[1])
    for key, value in synset_dict.items():
        if value == max_synset[1]:
            # check the synset and token type
            max_synset_list.append(key)
    if (max_synset[1] - 1) != 0 and len(max_synset_list) == 1:
        for key, value in synset_dict.items():
            if value == max_synset[1] - 1:
                max_synset_list.append(key)

    # List of definitions
    for k in max_synset_list:
        def_list.append(k._pos.upper() + ' - ' + k.definition())
    return [def_list, 0]
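A hypothetical invocation of findDefinition, assuming the helpers it relies on (verify_pos, all_the_synsets) are defined elsewhere in the same project; word_index is the character offset of the word inside the phrase, matched against token.idx:

definitions, error = findDefinition("bank", "I withdrew cash from the bank", 25)
if error == 0:
    for d in definitions:
        print(d)  # each entry looks like "N - <definition text>"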
from typing import List, NewType

import spacy
import numpy as np
from textacy.similarity import jaccard
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

# project
from woffle.functions.lists import strip
from woffle.parse.prob.spacy import roots

# -- Type synonyms --------------------------------------------------------------
Array = NewType('Array', np.array)
Doc = NewType('Doc', spacy.tokens.doc.Doc)

model = spacy.load('en_core_web_md')
model.add_pipe(WordnetAnnotator(model.lang), after='tagger')

# -- Interfaces -----------------------------------------------------------------
# semantic similarity
def condition(xs: List[str]) -> float:
    "implement hypernym lookup"
    xs_ = strip(xs)
    return (0.0 if len(xs_) <= 1 else 1.0)


def selection(xs: List[str]) -> str:
    return hypernyms(xs)


def hypernyms(xs: List[str]) -> str:
def useAVLFunction(inputWriting):
    acadCoreAVL = open(
        '/Users/yanisa/Code_GitHub/MAThesis_YourAcadWritingFriend/miscInputFiles_WordListsPhrasesEtc/acadCore_AVL.txt'
    ).read()
    stopWordsFile = open(
        '/Users/yanisa/Code_GitHub/MAThesis_YourAcadWritingFriend/codeFiles/acadWritingFeaturesAnalysisFiles/acadWordLists/avlStopWords.txt'
    ).read().split('\n')
    dictWithAVLSuggestionsPerInputWord = {}

    # Load a spacy model and run my writing sample through (turns it into a Doc object)
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
    inputWritingTokenized = nlp(inputWriting)

    # Make my AVL into a list with each word as an element
    # Then make it into a set so it's more efficient (looks up hash value rather than the actual word each time)
    acadCoreAVL_Set = set(acadCoreAVL.split('\n'))

    # Prepare stop words list so it ignores anything that has YANISA
    stopWordsList = []
    for eachLine in stopWordsFile:
        if 'YANISA' in eachLine:
            continue
        else:
            stopWordsList.append(eachLine)

    # Check to see if any of the synsets match with an academic word from the AVL AND have the same POS
    for word in inputWritingTokenized:
        # Check the POS - only move forward if they are one of the WN synset POS-tags
        if word.pos_ != 'NOUN' and \
                word.pos_ != 'ADJ' and \
                word.pos_ != 'ADV' and \
                word.pos_ != 'VERB':
            continue
        # If word is already in the AVL, skip it so it doesn't provide a suggestion
        if word.text.lower() in acadCoreAVL_Set:
            continue
        # If word is in stop words list, skip it too
        if word.text.lower() in stopWordsList:
            continue

        # Check the spacy POS and give it a hacky code that matches the wordnet synset output
        spacyPOS = word.pos_
        if spacyPOS == 'NOUN':
            spacyToWordNetPOS = 'n'
        if spacyPOS == 'VERB':
            spacyToWordNetPOS = 'v'
        if spacyPOS == 'ADJ':
            spacyToWordNetPOS = 'a'
        if spacyPOS == 'ADV':
            spacyToWordNetPOS = 'r'

        # Get the word index from spacy to be used later
        # index is used in the file that calls this, so it doesn't duplicate suggestions where MWEs are
        # wordIndex = word.i

        # Get the synsets from WordNet for each word in the writing sample
        inputWritingSynsets = word._.wordnet.synsets()

        # Cleaning the synsets out so I only get the actual word
        # synsets print like: Synset('synset.n.01')
        for eachSynset in inputWritingSynsets:
            cleanOutFrontOfSynset = str(eachSynset).replace('Synset(\'', '')
            splitEachSynset = str(cleanOutFrontOfSynset).split('.')
            actualWordInSynset = splitEachSynset[0]
            posInSynset = splitEachSynset[1]

            # Keep the synset and original word if a word in the synset is in the AVL
            # and the synset word has the same POS as the orig word in the writing sample
            # and the suggested and orig words are not identical
            # and the suggested and orig words are not lemmas
            synsetToAddToDict = ''
            actualWordInSynsetStringLower = str(actualWordInSynset).lower()
            wordStringLower = str(word).lower()
            if actualWordInSynset.lower() in acadCoreAVL_Set and \
                    posInSynset == spacyToWordNetPOS and \
                    actualWordInSynsetStringLower != wordStringLower and \
                    actualWordInSynsetStringLower not in wordStringLower and \
                    wordStringLower not in actualWordInSynsetStringLower:
                # Removes super short words so my bottom script can run (e.g. 'am', 'is')
                if len(word.text) >= 3:
                    # Use word_forms to make sure suggestion has the correct word form
                    # e.g. so 'publishing' (rather than 'publish') is suggested for 'writing',
                    # but 'publish' is still suggested for 'write'
                    wordFormsEachSynonym = get_word_forms(
                        actualWordInSynsetStringLower)

                    # For each POS, get the word forms, and keep the one which has the appropriate ending
                    if posInSynset == 'n':
                        nounSet = wordFormsEachSynonym[posInSynset]
                        if word.text[-1] == 's':
                            for eachSyn in nounSet:
                                if eachSyn[-1] == 's':
                                    synsetToAddToDict = eachSyn
                        else:
                            synsetToAddToDict = actualWordInSynsetStringLower
                    if posInSynset == 'r':
                        # No adv endings added
                        synsetToAddToDict = actualWordInSynsetStringLower
                    if posInSynset == 'a':
                        # No adj endings added
                        synsetToAddToDict = actualWordInSynsetStringLower
                    if posInSynset == 'v':
                        verbSet = wordFormsEachSynonym[posInSynset]
                        if word.text[-3:] == 'ing':
                            for eachSyn in verbSet:
                                if eachSyn[-3:] == 'ing':
                                    synsetToAddToDict = eachSyn
                        elif word.text[-1] == 'd':
                            for eachSyn in verbSet:
                                if eachSyn[-1] == 'd':
                                    synsetToAddToDict = eachSyn
                        elif word.text[-1] == 's':
                            for eachSyn in verbSet:
                                if eachSyn[-1] == 's':
                                    synsetToAddToDict = eachSyn
                        # TODO add other verb endings
                        else:
                            synsetToAddToDict = actualWordInSynsetStringLower

                    if word not in dictWithAVLSuggestionsPerInputWord:
                        # Set dict so word is key and synonym is the first element in a list in the dict (as the value)
                        # The list thing is for if/when I add more synonyms to the orig word
                        dictWithAVLSuggestionsPerInputWord[word] = [
                            synsetToAddToDict
                        ]
                    else:
                        dictWithAVLSuggestionsPerInputWord[word].append(
                            synsetToAddToDict)
                    # print('AVL Suggested Word: ' + str(actualWordInSynset))
                    # print('Original Word: ' + str(word))

    return dictWithAVLSuggestionsPerInputWord
def processAllArticles(directory):
    wikipediaArticles = os.listdir(directory)
    # print(wikipediaArticles)
    nlp = spacy.load("en_core_web_sm")
    # nltk.download('wordnet')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

    dependCMap = dict()
    headsMap = dict()
    namedEntityMap = dict()
    sentenceMap = dict()

    # Process all the articles and find the respective NLP features
    for article in wikipediaArticles:
        # --------------- SCANNING THE ARTICLE -----------------
        filepath = directory + article
        print('Now Reading File . . . . . . . ', article)
        corpus = read_data(filepath)
        doc = nlp(corpus)

        # ------------- EXTRACTING NLP FEATURES ----------------
        lemmasMap = dict()
        posMap = dict()
        tagsMap = dict()
        synsetsMap = dict()
        hypernymsMap = dict()
        hyponymsMap = dict()
        partMeronymsMap = dict()
        substanceMeronymsMap = dict()
        holonymsMap = dict()

        # 1. TOKENIZATION
        tokens = tokennize(doc)

        for token in doc:
            # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
            lemmasMap[token] = token.lemma_  # 2. Lemmatization
            posMap[token] = token.pos_  # 3. POS tagging (coarse-grained)
            tagsMap[token] = token.tag_  # 4. POS tagging (fine-grained)
            synsetsMap[token] = getSynsets(token)  # 5. SynSet of the token
            hypernymsMap[token] = getHypernyms(token)  # 6. Hypernyms
            hyponymsMap[token] = getHyponyms(token)  # 7. Hyponyms
            partMeronymsMap[token] = getPartMeronyms(token)  # 8. Part Meronyms
            substanceMeronymsMap[token] = getSubstanceMeronyms(token)  # 9. Substance Meronyms
            holonymsMap[token] = getHolonyms(token)  # 10. Holonyms
        # endFor

        # 11. DEPENDENCY labels and 12. SYNTACTIC HEADS for each token in the doc
        dependCMap[article], headsMap[article] = synParsing(doc)
        # 13. NAMED ENTITIES
        namedEntityMap[article] = getNamedEntities(doc)
        # 14. SENTENCE TOKENIZATION
        sentenceMap[article] = sentenceTokennize(doc)

    # Returning all the 14 NLP features collected FOR EVERY WIKIPEDIA ARTICLE
    return tokens, lemmasMap, posMap, tagsMap, dependCMap, headsMap, namedEntityMap, synsetsMap, hypernymsMap, hyponymsMap, partMeronymsMap, substanceMeronymsMap, holonymsMap, sentenceMap
def processFirstArticle(directory, resultDirectory):
    # ------------- SCANNING THE RELEVANT FILE -------------
    wikipediaArticles = os.listdir(directory)
    # print(wikipediaArticles)
    nlp = spacy.load("en_core_web_sm")
    # nltk.download('wordnet')
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

    dependCMap = dict()
    headsMap = dict()
    namedEntityMap = dict()
    sentenceMap = dict()

    firstArticle = wikipediaArticles[0]
    filepath = directory + firstArticle
    print('Now Reading File . . . . . . . ', firstArticle)
    corpus = read_data(filepath)
    doc = nlp(corpus)

    # ------------- EXTRACTING NLP FEATURES ----------------
    lemmasMap = dict()
    posMap = dict()
    tagsMap = dict()
    synsetsMap = dict()
    hypernymsMap = dict()
    hyponymsMap = dict()
    partMeronymsMap = dict()
    substanceMeronymsMap = dict()
    holonymsMap = dict()

    # 1. TOKENIZATION
    tokens = tokennize(doc)

    # For each token in the document, extracting features
    for token in doc:
        lemmasMap[token] = token.lemma_  # 2. Lemmatization
        posMap[token] = token.pos_  # 3. POS tagging (coarse-grained)
        tagsMap[token] = token.tag_  # 4. POS tagging (fine-grained)
        synsetsMap[token] = getSynsets(token)  # 5. SynSet of the token
        hypernymsMap[token] = getHypernyms(token)  # 6. Hypernyms
        hyponymsMap[token] = getHyponyms(token)  # 7. Hyponyms
        partMeronymsMap[token] = getPartMeronyms(token)  # 8. Part Meronyms
        substanceMeronymsMap[token] = getSubstanceMeronyms(token)  # 9. Substance Meronyms
        holonymsMap[token] = getHolonyms(token)  # 10. Holonyms
    # endFor

    # 11. DEPENDENCY labels and 12. SYNTACTIC HEADS for each token in the doc
    dependCMap[firstArticle], headsMap[firstArticle] = synParsing(doc)
    # 13. NAMED ENTITIES
    namedEntityMap[firstArticle] = getNamedEntities(doc)
    # 14. SENTENCE TOKENIZATION
    sentenceMap[firstArticle] = sentenceTokennize(doc)

    # ---------------- WRITING THE FEATURES ----------------
    with open(resultDirectory + 'D01-Tokens.txt', 'w', encoding='utf-8') as f01:
        json.dump(tokens, f01)

    f02 = open(resultDirectory + 'D02-Lemmas.txt', 'w')
    f02.write(str(lemmasMap))
    f02.close()
    f03 = open(resultDirectory + 'D03-POS.txt', 'w')
    f03.write(str(posMap))
    f03.close()
    f04 = open(resultDirectory + 'D04-Tags.txt', 'w')
    f04.write(str(tagsMap))
    f04.close()
    f05 = open(resultDirectory + 'D05-Synsets.txt', 'w')
    f05.write(str(synsetsMap))
    f05.close()
    f06 = open(resultDirectory + 'D06-Hypernyms.txt', 'w')
    f06.write(str(hypernymsMap))
    f06.close()
    f07 = open(resultDirectory + 'D07-Hyponyms.txt', 'w')
    f07.write(str(hyponymsMap))
    f07.close()
    f08 = open(resultDirectory + 'D08-Meronyms-Part.txt', 'w')
    f08.write(str(partMeronymsMap))
    f08.close()
    f09 = open(resultDirectory + 'D09-Meronyms-Substance.txt', 'w')
    f09.write(str(substanceMeronymsMap))
    f09.close()
    f10 = open(resultDirectory + 'D10-Holonyms.txt', 'w')
    f10.write(str(holonymsMap))
    f10.close()
    f11 = open(resultDirectory + 'D11-Dependencies.txt', 'w')
    f11.write(str(dependCMap))
    f11.close()
    f12 = open(resultDirectory + 'D12-Syntactic-Heads.txt', 'w')
    f12.write(str(headsMap))
    f12.close()
    f13 = open(resultDirectory + 'D13-Named-Entities.txt', 'w')
    f13.write(str(namedEntityMap))
    f13.close()
    f14 = open(resultDirectory + 'D14-Tokenized-Sentences.txt', 'w')
    f14.write(str(sentenceMap))
    f14.close()
# coding: utf-8
# Interactive session history exploring the spacy-wordnet token extension.
from spacy_wordnet.wordnet_annotator import WordnetAnnotator
from spacy import load
import pandas as pd

nlp = load('en_core_web_sm')
nlp.add_pipe(WordnetAnnotator(nlp.lang))

token = nlp('Calculator.')[0]
token._.wordnet.synsets()
meaning1, meaning2 = token._.wordnet.synsets()
meaning1
meaning1.name()
meaning1.lemmas()
meaning2.lemmas()
token._.wordnet.wordnet_domains()
nlp('mathematics')[0]._.wordnet.wordnet_domains()
'science' in _  # '_' is the previous result in the interactive session
nlp('pure_science')[0]._.wordnet.wordnet_domains()
nlp('science')[0]._.wordnet.wordnet_domains()
wnet = nlp('science')[0]._.wordnet
wnet.wordnet_synsets_for_domain()
wnet.lemmas()
token = nlp('human')[0]
token._.wordnet.lemmas()
token._.wordnet.synsets()
[c.lemmas() for c in token._.wordnet.synsets()]
syn = token._.wordnet.synsets()
x = syn[0]
x
x.common_hypernyms()