def relevancy_score(desiredDoc):
    # Each keyword gets a path-similarity score between 0 and 1 against the
    # search word; wn.path_similarity returns None if there is no similarity.
    # Relies on module-level `searchWord` and `keywords`; `desiredDoc` is unused.
    searchWordwn = wn.synset(searchWord + ".n.01")
    relevancyScore = 0
    currentWordScore = 0
    memo = {}  # cache per-word scores so repeated keywords are not recomputed
    for i in range(len(keywords)):
        currentWord = keywords[i][0]
        if currentWord in memo:
            currentWordScore = memo[currentWord]
            if currentWordScore is not None:
                relevancyScore += currentWordScore
        else:
            if wn.synsets(currentWord, pos=wn.NOUN):
                currentWordwn = wn.synsets(currentWord, pos=wn.NOUN)[0]
                currentWordScore = wn.path_similarity(searchWordwn, currentWordwn)
                memo[currentWord] = currentWordScore
                if currentWordScore is not None:
                    relevancyScore += currentWordScore
    return relevancyScore
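# Minimal usage sketch. `searchWord` and `keywords` are module-level globals
# the function reads; the (word, count) shape of `keywords` is an assumption
# for illustration, not taken from the original code.
from nltk.corpus import wordnet as wn

searchWord = "dog"
keywords = [("cat", 3), ("bone", 1), ("galaxy", 1)]  # assumed (word, count) pairs
print(relevancy_score(None))  # sums path similarities of each keyword to dog.n.01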
def sentence_similarity(idx, ob, mode):
    s_list = list()
    pbar = ProgressBar(widgets=['%s: image ' % mode, SimpleProgress()],
                       maxval=len(sentences)).start()
    for im_idx, sentence_group in enumerate(np.array(sentences)[idx, :]):
        pbar.update(im_idx + 1)
        for sent in sentence_group:
            words = analyze(sent)
            sim = list()
            for w in words:
                syn1 = wn.synsets(w)
                syn2 = wn.synsets(ob)
                if syn1 and syn2:
                    # path_similarity can itself return None, so filter before max()
                    scores = [s1.path_similarity(s2)
                              for (s1, s2) in product(syn1, syn2)]
                    scores = [s for s in scores if s is not None]
                    sim.append(max(scores) if scores else None)
                else:
                    # ignore word if no synset combination was found on WordNet
                    sim.append(None)
            valid = [s for s in sim if s is not None]
            if valid and max(valid):
                s_list.append(max(valid))
            else:
                # ignore sentence if no word was similar enough
                s_list.append(float('nan'))
    pbar.finish()
    return s_list
def relation1_old(a, b):
    '''
    This method takes two words as arguments and returns their similarity
    based on the wup_similarity method of the NLTK WordNet interface.

    Parameters
    ----------
    a : string
    b : string

    Returns
    -------
    float
        relation between the two strings

    References
    ----------
    .. [1] NLTK WordNet <http://www.nltk.org/howto/wordnet.html>
    '''
    syna = wn.synsets(a, pos=wn.NOUN)
    synb = wn.synsets(b, pos=wn.NOUN)
    mx = 0
    mxa = None
    mxb = None
    # only the first (most frequent) synset of each word is compared
    for i in syna[:1]:
        for j in synb[:1]:
            temp = wn.wup_similarity(i, j)
            if temp is not None and temp > mx:
                mx = temp
                mxa = i
                mxb = j
    return mx
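# Illustrative call. Since only the first noun synsets are compared, this is
# dog.n.01 vs. cat.n.01, whose Wu-Palmer similarity is roughly 0.86 in
# WordNet 3.0.
print(relation1_old('dog', 'cat'))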
def findSimilarity(self):
    # As we recommend only one item, the first item of this list is the
    # recommended item; the second item can be a list of items.
    '''So what we try to do is get the exact synsets of the first item and get
    10 synsets (to reduce computation cost) of the second list of items over
    which the first item was preferred/recommended.'''
    recommendation = wn.synsets(self.recoItems[0])  # @UndefinedVariable
    recommendationFiltered = []
    for eachSyn in recommendation:
        if self.recoItems[0] in str(eachSyn):
            recommendationFiltered.append(eachSyn)
    choices = {}
    for eachItem in self.recoItems[1]:
        choices[eachItem] = wn.synsets(eachItem)[:10]  # keep only 10 synsets
    choiceScores = {}
    for key, value in choices.items():  # .iteritems() is Python 2 only
        choiceScores[key] = []
        for eachValue in value:
            for eachRecoSyn in recommendationFiltered:
                choiceScores[key].append(eachRecoSyn.path_similarity(eachValue))
    maxChoiceScores = {}
    for eachKey in choiceScores.keys():
        # path_similarity may return None, which max() cannot compare in Py3
        scores = [s for s in choiceScores[eachKey] if s is not None]
        maxChoiceScores[eachKey] = max(scores) if scores else None
    return maxChoiceScores
def relation(a, b):
    '''
    Given two words (strings), returns a number that denotes the relation
    between the two words.

    Parameters
    ----------
    a : string
    b : string

    Returns
    -------
    float
        relation (less than 1) between the two strings

    Notes
    -----
    Applies BFS over the NLTK WordNet graph and finds the least distance
    between the two given words. If the distance is x the function returns
    1/(x+1), else returns 0.
    '''
    a = wn.synsets(a)
    b = wn.synsets(b)
    visited_a = set([])
    visited_b = set([])
    stemmed_a = set([])
    stemmed_b = set([])
    depth = 0
    while True:
        if depth > 2:
            return 0
        # expand the frontier on side a
        new_a = set([])
        depth += 1
        for syn in a:
            if stemmer.stem(syn.lemma_names()[0]) in stemmed_b:
                return 1.0 / depth
            if syn in visited_a:
                continue
            visited_a.add(syn)
            stemmed_a.add(stemmer.stem(syn.lemma_names()[0]))
            hyp = set(syn.hyponyms())
            for lemma in syn.lemma_names():  # lemma_names is a method in NLTK 3
                hyp |= set(wn.synsets(lemma))
            new_a |= hyp
        a = new_a
        # expand the frontier on side b
        new_b = set([])
        depth += 1
        for syn in b:
            if stemmer.stem(syn.lemma_names()[0]) in stemmed_a:
                return 1.0 / depth
            if syn in visited_b:
                continue
            visited_b.add(syn)
            stemmed_b.add(stemmer.stem(syn.lemma_names()[0]))
            hyp = set(syn.hyponyms())
            for lemma in syn.lemma_names():
                hyp |= set(wn.synsets(lemma))
            new_b |= hyp
        b = new_b
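# Illustrative call; the function needs a `stemmer` in scope, which the
# original snippet leaves undefined, so one plausible setup is:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(relation('dog', 'cat'))  # 1/(x+1) for BFS distance x, or 0 beyond depth 2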
def find_nearest_synset(in_tag, in_taglist):
    """
    For the given in_tag, find its nearest (most similar) tag in in_taglist
    and return its tag name.
    in_tag is a string; in_taglist is a big list produced by
    fileparser.parse_imageclef_concepts_wn.
    """
    # unpack the input parameters
    concept_tag = in_taglist[0]
    concept_type = in_taglist[1]
    concept_sense = in_taglist[2]
    numConcepts = len(concept_tag)
    # distance matrix
    dist_score = np.zeros([1, numConcepts])
    # calculate the distance between in_tag and each concept
    syn_in_tag = wn.synsets(in_tag)[0]
    for idx in range(numConcepts):
        offset = concept_sense[idx] - 1
        syn_concept = wn.synsets(concept_tag[idx])[offset]
        path_sim = syn_in_tag.path_similarity(syn_concept)
        if path_sim is None:
            path_sim = 0
        # path_sim = compare(in_tag, concept_tag[idx])
        dist_score[0][idx] = path_sim
    # sort per row and flip to descending order
    indices = np.argsort(dist_score, axis=1)
    sorted_indices = np.fliplr(indices)
    return concept_tag[sorted_indices[0][0]]
def parseLyrics2(outlist):
    bandLyricInfo = {}
    master = [['death', 0], ['violence', 0], ['sacrifice', 0], ['nature', 0],
              ['peace', 0], ['storm', 0], ['spirit', 0], ['dark', 0],
              ['scream', 0], ['pain', 0], ['blood', 0], ['flesh', 0],
              ['love', 0], ['greed', 0], ['poison', 0], ['anger', 0],
              ['revenge', 0], ['misery', 0], ['hell', 0], ['heaven', 0],
              ['hate', 0], ['soul', 0], ['battle', 0], ['ghost', 0],
              ['joy', 0], ['light', 0], ['omen', 0], ['miracle', 0],
              ['magic', 0], ['universe', 0], ['disease', 0], ['god', 0],
              ['satan', 0], ['struggle', 0], ['heart', 0]]
    for key in outlist:
        templist = copy.deepcopy(master)
        raw = outlist[key].lower()
        # strip punctuation (re.LOCALE is invalid with str patterns in Py3)
        words = re.findall(r'\w+', raw, flags=re.UNICODE)
        # filter noise (stop words); the original lemmatized the unfiltered
        # list, leaving this filter unused -- fixed here
        imp_words = [x for x in words if x not in stopwords.words('english')]
        lmt = WordNetLemmatizer()
        words_new = [lmt.lemmatize(x) for x in imp_words]
        dw = list(set(words_new))
        for word in dw:
            for m in templist:
                p1 = wordnet.synsets(word)
                p2 = wordnet.synsets(m[0])
                if len(p1) > 0 and len(p2) > 0:
                    c = p1[0].wup_similarity(p2[0])
                    if c is not None and c > m[1]:
                        m[1] = c
        # sort theme words by similarity and keep the top ten
        tnew = sorted(templist, key=lambda val: val[1], reverse=True)[0:10]
        # drop the score column
        for l in tnew:
            del l[1]
        print('Done', key)
        bandLyricInfo[key] = tnew
    return bandLyricInfo
def get_attributes():
    """
    Gets all attributes for all vehicles ("GROUND * such as VEHICLE").
    """
    wd = Data()
    # materialize keys with list() so entries can be deleted while iterating
    for vehicle in list(wd.vehicles.keys()):
        if not wn.synsets(vehicle, wn.NOUN):
            del wd.vehicles[vehicle]
    for ground in list(wd.grounds.keys()):
        if not wn.synsets(ground, wn.ADJ):
            del wd.grounds[ground]
    with open('../res/wiki/allsuchas.txt', 'r') as f:
        for line in f:
            split_line = line.split(" such as ")
            left_side = split_line[0]
            right_side = split_line[1]
            for ground in wd.grounds.keys():
                ground_temp = " " + ground + " "
                if ground_temp.replace("_", " ") in left_side:
                    for vehicle in wd.vehicles.keys():
                        vehicle_temp = " a " + vehicle.replace("_", " ") + " "
                        vehicle_plural = " " + vehicle.replace("_", " ") + "s "
                        if vehicle_temp in right_side or vehicle_plural in right_side:
                            wd.get_vehicle(vehicle).add_attribute(wd.get_ground(ground))
                            print(str(vehicle) + " " + str(ground) + " ... " + line)
    wd.save()
def parse_file(f):
    for l in f.readlines():
        word = l.strip()
        synsets = wn.synsets(word)
        if word in synonym_values:
            continue
        # get first-order synonyms
        synonyms = set()
        for synset in synsets:
            synonyms |= set(synset.lemma_names())
        # add in synonyms of those synonyms (iterate a copy, since the
        # set is extended inside the loop)
        for syn in list(synonyms):
            for syn_synset in wn.synsets(syn):
                synonyms |= set(syn_synset.lemma_names())
        synonyms_with_values = synonyms & set(synonym_values.keys())
        if not len(synonyms_with_values):
            continue
        # running average of the known synonym values
        avg = 0
        total = 0
        for syn in synonyms_with_values:
            value = synonym_values[syn]
            avg = (avg * total + float(value)) / (total + 1)
            total += 1
        # print("Adding", word, avg)
        synonym_values[word] = int(abs_ceil(avg))  # abs_ceil defined elsewhere
    f.close()
def get_similarity(self, string1, string2):
    """
    Calculate the similarity of two statements.
    This is based on the total similarity between each word in each sentence.
    """
    import itertools

    tokens1 = self.get_tokens(string1)
    tokens2 = self.get_tokens(string2)
    total_similarity = 0
    # Get the highest matching value for each possible combination of words
    for combination in itertools.product(*[tokens1, tokens2]):
        synset1 = wordnet.synsets(combination[0])
        synset2 = wordnet.synsets(combination[1])
        if synset1 and synset2:
            # Compare the first synset in each list of synsets
            similarity = synset1[0].path_similarity(synset2[0])
            if similarity:
                total_similarity = total_similarity + similarity
    return total_similarity
def tell(para1, para2):
    # strip anything that is not alphanumeric
    para1 = re.sub(r'[^\w ]+', '', para1)
    para2 = re.sub(r'[^\w ]+', '', para2)
    para1 = para1.lower().split()
    para2 = para2.lower().split()
    if para1 == [] or para2 == []:
        return 0
    # drop stop words, but keep the original list if everything is a stop word
    filtered1 = [t for t in para1 if t.lower() not in stopwords]
    if filtered1:
        para1 = filtered1
    filtered2 = [t for t in para2 if t.lower() not in stopwords]
    if filtered2:
        para2 = filtered2
    score = len(set(para1).intersection(para2))
    score_1 = float(score) / math.sqrt(len(para2) * len(para1))
    # expand each paragraph with the lemma names of every synset of its words;
    # duplicates are removed by the set() conversions below
    para1_with_dictionary = [l.name().lower() for word in para1
                             for s in wordnet.synsets(word) for l in s.lemmas()]
    para2_with_dictionary = [l.name().lower() for word in para2
                             for s in wordnet.synsets(word) for l in s.lemmas()]
    score1 = len(set(para1_with_dictionary).intersection(para2))
    score2 = len(set(para2_with_dictionary).intersection(para1))
    score_2 = float(max(score1, score2)) / min(len(para2), len(para1))
    score = (score_1 + score_2) / 2
    return score
def add_word(word):
    maximum = 0
    maxJCN = 0
    flag = 0
    maxChain = None
    for chain in lexical_chains:          # for all chains that are present
        for synset in wn.synsets(word):   # for all synsets of the current word
            for sense in chain.senses:    # for all senses in the current chain
                similarity = sense.wup_similarity(synset)  # using wup_similarity
                if similarity is not None and similarity >= maximum:
                    if similarity >= threshold:
                        JCN = sense.jcn_similarity(synset, brown_ic)  # using jcn_similarity
                        if JCN >= jcnTreshold:
                            # using path similarity (may be None -> treat as 0)
                            if (sense.path_similarity(synset) or 0) >= 0.2:
                                if JCN >= maxJCN:
                                    maximum = similarity
                                    maxJCN = JCN
                                    maxChain = chain
                                    flag = 1
    if flag == 1:
        # note: `synset` here is the last synset iterated, which is how the
        # original behaved; the best-matching synset is not remembered
        maxChain.addWord(word)
        maxChain.addSense(synset)
        return
    lexical_chains.append(Chain([word], wn.synsets(word)))
def hypernyms(self, word, question):
    # Despite the name, this collects up to seven lemma names (synonyms)
    # from the synsets matching the word's POS tag in the question.
    hyper = []
    sentence = self.parse(question)
    pos = ''
    for sent, tag in sentence[0]:
        if sent == word:
            pos = tag
            break
    # map Penn Treebank tag families onto WordNet POS constants
    tag_to_wn = [
        (('JJ', 'JJR', 'JJS'), wn.ADJ),
        (('NN', 'NNS'), wn.NOUN),
        (('VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ'), wn.VERB),
        (('RB', 'RBR', 'RBS'), wn.ADV),
    ]
    for tags, wn_pos in tag_to_wn:
        if pos in tags:
            for synset in wn.synsets(word, pos=wn_pos):
                for lemma in synset.lemmas():
                    if lemma.name() not in hyper and len(hyper) < 7:
                        hyper.append(lemma.name())
            break
    return hyper
def subclass(feats):
    if string_match(feats).endswith("False"):
        try:
            result = False
            i_clean = wn.morphy(feats.i_cleaned.lower(), wn.NOUN)
            i_synsets = wn.synsets(i_clean)
            j_clean = wn.morphy(feats.j_cleaned.lower(), wn.NOUN)
            j_synsets = wn.synsets(j_clean)

            def get_common_hypernym(i_synset, j_synset):
                i_hypernyms = i_synset.hypernyms()
                j_hypernyms = j_synset.hypernyms()
                # instances (e.g. named entities) have no direct hypernyms
                if len(i_hypernyms) == 0:
                    i_synset = i_synset.instance_hypernyms()[0]
                if len(j_hypernyms) == 0:
                    j_synset = j_synset.instance_hypernyms()[0]
                subc = i_synset.common_hypernyms(j_synset)
                return (i_synset in subc) or (j_synset in subc)

            for synset in i_synsets:
                for syn in j_synsets:
                    result = get_common_hypernym(synset, syn)
                    if result:
                        break
                if result:
                    break
            return "subclass={}".format(result)
        except Exception:  # WordNet lookup failed
            return "subclass={}".format(False)
    else:
        return "subclass={}".format(False)
def scoreFile(filename, targetWords, verbose=False):
    meanScore = 0.0
    baseWordCount = 0
    wordCount = 0
    f = open(filename)  # file() is Python 2 only
    for l in f:
        wordScored = False
        fields = [x.strip().lower() for x in re.split(r"\s+", l)]
        if (targetWords is not None) and (fields[0] not in targetWords):
            continue
        baseSynsets = wordnet.synsets(fields[0])
        if not baseSynsets:  # synsets() returns an empty list, never None
            continue
        for word in fields[1:]:
            # ignore identical word if it occurs
            if word == fields[0]:
                continue
            targetSynsets = wordnet.synsets(word)
            if not targetSynsets:
                continue
            wordScore = scoreWord(baseSynsets, targetSynsets)
            meanScore += wordScore
            wordCount += 1
            wordScored = True
        baseWordCount += 1 if wordScored else 0
        if verbose:
            if (baseWordCount > 0) and (baseWordCount % 1000 == 0):
                print("Words scored : %d, Current Score : %f" % (
                    baseWordCount,
                    meanScore / (wordCount if wordCount > 0 else 1),
                ))
    f.close()
    meanScore /= wordCount if wordCount > 0 else 1
    return {"baseWordCount": baseWordCount, "totalWordCount": wordCount,
            "meanScore": meanScore}
def compare(self, word1, word2):
    # Compare the first (most frequent) synsets of the two words directly;
    # the original's round trip through synset names used the old NLTK
    # `.name` attribute and was unnecessary.
    w1 = wn.synsets(word1)[0]
    w2 = wn.synsets(word2)[0]
    return w1.wup_similarity(w2)
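# The method above raises IndexError when either word has no synsets; a
# guarded standalone variant (a sketch, not part of the original class):
def safe_compare(word1, word2):
    s1, s2 = wn.synsets(word1), wn.synsets(word2)
    return s1[0].wup_similarity(s2[0]) if s1 and s2 else None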
def wndist(fs):
    """
    Distance between NP1 and NP2 in WordNet (using the first sense only).
    """
    wndist = -100000
    i_pos = __get_pos__(fs.article, fs.sentence, fs.offset_begin, fs.offset_end)
    j_pos = __get_pos__(fs.article, fs.sentence_ref, fs.offset_begin_ref,
                        fs.offset_end_ref)
    # consider only common nouns
    if (i_pos.startswith('NN') and j_pos.startswith('NN')
            and not i_pos.endswith('P') and not j_pos.endswith('P')):
        lemmatizer = nltk.WordNetLemmatizer()
        i = lemmatizer.lemmatize(fs.i_cleaned, pos='n')
        j = lemmatizer.lemmatize(fs.j_cleaned, pos='n')
        synsets_i = wn.synsets(i)
        synsets_j = wn.synsets(j)
        if len(synsets_i) > 0 and len(synsets_j) > 0:
            wn_sense1_i = synsets_i[0]
            wn_sense1_j = synsets_j[0]
            # lch_similarity is only defined for synsets with the same POS
            if wn_sense1_i.pos() == wn_sense1_j.pos():
                wndist = wn_sense1_i.lch_similarity(wn_sense1_j)
                wndist = ceil(wndist * 100) / 100.0  # needs `from math import ceil`
    return "wndist={}".format(wndist)
def semantic_similarity(word1, word2):
    words1 = word1.split('_')
    words2 = word2.split('_')
    if fast_semantic_similarity(word1, word2) == 1:
        return 1
    max_p = 0
    # gather each word's synsets plus their "similar to" satellites
    word1_sim = set([])
    for s1 in wn.synsets(word1):
        word1_sim.add(s1)
        word1_sim.update(s1.similar_tos())
    word2_sim = set([])
    for s2 in wn.synsets(word2):
        word2_sim.add(s2)
        word2_sim.update(s2.similar_tos())
    for st1 in word1_sim:
        for st2 in word2_sim:
            p = wn.wup_similarity(st1, st2)
            if p is None:  # no common path
                continue
            if p == 1:
                return p
            if p > max_p:
                max_p = p
    # for multi-word terms, recurse on the head (last) words
    if len(words1) > 1 or len(words2) > 1:
        sub_similarity = .9 * semantic_similarity(words1[-1], words2[-1])
    else:
        sub_similarity = 0
    return max(max_p, sub_similarity)
def processarticle(self, articleid):
    self.update_state(state='PROCESSING',
                      meta={'current': 5, 'total': 100,
                            'status': 'Downloading article...'})
    wp = wikipedia.page(articleid)
    content = wp.content
    self.update_state(state='PROCESSING',
                      meta={'current': 10, 'total': 100,
                            'status': 'Processing article...'})
    words = content.split()
    replacecount = 0
    output = ""
    for i in range(0, len(words)):
        # a random word is picked each pass rather than words[i]; note that
        # the original's `i -= 2` statements have no effect on a Python
        # for loop, so they are kept only as no-op retries
        word = random.choice(words)
        if len(wn.synsets(word)) >= 1 and checkword(word):
            newword = wn.synsets(word)[0].lemma_names()[0]
            if not checksyn(newword):
                if len(wn.synsets(word)) >= 2:
                    newword = wn.synsets(word)[1].lemma_names()[0]
                    if not checksyn(newword):
                        continue
                else:
                    continue
            if newword == word:
                continue
            content = content.replace(" " + word + " ", " " + newword + " ", 1)
            output += "Replaced " + word + " with " + newword + "\n"
            replacecount += 1
        self.update_state(state='PROCESSING',
                          meta={'current': i / 100, 'total': 100,
                                'status': 'Editing article...'})
        # sleep(0.5)
    return {'current': 100, 'total': 100, 'status': 'Processing complete!',
            'article': content, 'info': output}
def syns(q):
    # union of q's synsets with their hypernym/hyponym/similar-to closures
    # (depth 3), minus the abstract synsets
    return set(wn.synsets(q) + [x for y in wn.synsets(q) for x in set(
        set(y.closure(lambda a: a.hypernyms(), depth=3)) |
        set(y.closure(lambda a: a.hyponyms(), depth=3)) |
        set(y.closure(lambda a: a.hyponyms() + a.similar_tos(), depth=3))
    )]) - ABSTRACT()


def defs(q):
    # the original referenced an undefined `word`; the parameter is `q`,
    # and definition() is a method in NLTK 3
    return [l.definition() for l in wn.synsets(q)]
def SynsetwithCategry():
    hypo = lambda s: s.hyponyms()
    for entry in db.freqbyCtgry.find():
        synsetLists = []
        category = ctgryName.get(entry['category'], entry['category'])
        if category == 'Other':
            continue
        if category == 'Travel':
            synsetLists.append(getTreesList(wn.synset('travel.n.01').tree(hypo)))
            synsetLists.append(getTreesList(wn.synset('travel.v.03').tree(hypo)))
            synsetLists.append(getTreesList(wn.synset('travel.v.04').tree(hypo)))
            synsetLists.append(getTreesList(wn.synset('travel.v.05').tree(hypo)))
            synsetLists.append(getTreesList(wn.synset('travel.v.06').tree(hypo)))
        else:
            for word in category.split():
                synsets = wn.synsets(word, 'n')
                synsets += wn.synsets(word, 'v')
                for synset in synsets:
                    synsetLists.append(getTreesList(synset.tree(hypo)))
        for synsetList in synsetLists:
            for synset in synsetList:
                for lemma in wn.synset(synset[0]).lemmas():
                    if db.wordSynsetMap.find({'word': lemma.name(),
                                              'category': entry['category']}).count():
                        # if the word is in several synsets, choose the one
                        # with the least distance from the root
                        if db.wordSynsetMap.find({'word': lemma.name(), 'category': entry['category']})[0]['depth'] > synset[1]:
                            db.wordSynsetMap.remove({'word': lemma.name(), 'category': entry['category']})
                            db.wordSynsetMap.insert({'word': lemma.name(), 'synset': synset[0], 'depth': synset[1], 'category': entry['category']})
                            print(lemma.name(), synset[0], synset[1], entry['category'])
                    else:
                        db.wordSynsetMap.insert({'word': lemma.name(), 'synset': synset[0], 'depth': synset[1], 'category': entry['category']})
                        print(lemma.name(), synset[0], synset[1], entry['category'])
def get_least_specific(n, word_list):
    # pair each noun with the min_depth of its shallowest synset; the inner
    # tuple variable is renamed from the original's `n` to avoid shadowing
    # the parameter
    word_depths = [(w, min(synset.min_depth() for synset in wn.synsets(w, 'n')))
                   for w in word_list if len(wn.synsets(w, 'n')) > 0]
    # keep words with positive depth, sorted shallowest-first, return n of them
    return [w for (w, d) in
            sorted(filter(lambda pair: pair[1] > 0, word_depths),
                   key=itemgetter(1))[:n]]
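# Illustrative call. "entity" sits at depth 0 and is filtered out, so the
# shallower (more generic) nouns come back first.
from operator import itemgetter
from nltk.corpus import wordnet as wn

print(get_least_specific(2, ['entity', 'animal', 'poodle', 'object']))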
def c_wn_max_path_similarity(score, word_from, word_to):
    """
    WordNet path similarity for the most similar synsets. (1 if same word)
    This feature can be precomputed by EQUALS
    """
    # Enforce returning 1 when words are equal (would be 0 if synset not found)
    # NOTE: since EQUALS precomputes this feature, the assignment in the second
    # if is redundant. It is maintained to keep the independence from the
    # implementation of EQUALS.
    if not score.is_feature_set[score.EQUALS]:
        c_equals(score, word_from, word_to)
    if score.features[score.EQUALS] == 1:
        score.set_feature(score.WN_MAX_PATH_SIMILARITY, 1)
        return
    # Compute the actual distance
    _r = 0
    for ss_from in wn.synsets(word_from.text):
        for ss_to in wn.synsets(word_to.text):
            current_similarity = ss_to.path_similarity(ss_from)
            # path_similarity may return None when no path exists
            if current_similarity is not None and current_similarity > _r:
                _r = current_similarity
    score.set_feature(score.WN_MAX_PATH_SIMILARITY, _r)
def generatesynsets(table):
    table2 = []
    table3 = {}
    for i in table:
        # choose POS by the Penn tag: N* -> noun, V* -> verb
        if re.findall("N.*", i[1]):
            x = wns.synsets(i[0], pos=wns.NOUN)
        elif re.findall("V.*", i[1]):
            x = wns.synsets(i[0], pos=wns.VERB)
        else:
            x = []  # the original left x unbound for other tags
        for z in range(len(x)):
            for y in x[z].lemma_names():  # lemma_names is a method in NLTK 3
                if y not in ['match', 'be', 'in', 'is']:
                    table2.append((y, 'SYN'))
    # count occurrences of each (lemma, 'SYN') pair
    for i in table2:
        table3[i] = table3.get(i, 0) + 1
    return table3
def polar_values(self, positive_seeds, negative_seeds):
    self.values = []
    POS_tags = list(set(nltk.pos_tag(WordPunctTokenizer().tokenize(self.data))))
    words = []
    for (w, s) in POS_tags:
        w = w.lower()
        POS = self.get_wordnet_pos(s)
        if POS == '' or re.match(r"^[\w]+$", w) is None:
            words.append('0')
        else:
            # build a synset name such as "good.a.01"
            words.append(w + "." + POS + ".01")
    negative_set = []
    for nw in negative_seeds:
        for s in wordnet.synsets(nw):
            negative_set.append(s)
    positive_set = []
    for pw in positive_seeds:
        for s in wordnet.synsets(pw):
            positive_set.append(s)
    self.eval_words(words, positive_set, negative_set)
    return self.values
def xhyper(words) -> [str]:
    '''returns the highest order x hypernyms'''
    x = UI.request_x()
    print("\nNote: this program will use the first parallel synset if there are any")
    print("\nGathering data...")
    result = [x]
    hyp = lambda w: w.hypernyms()
    # This would pick up the deepest branch's depth -> valueAt returns None
    # -> returns None:
    # depth = lambda L: isinstance(L, list) and max(map(depth, L)) + 1
    for i in range(len(words)):
        synsets = wordnet.synsets(words[i])
        if len(synsets) > 0:
            for s in range(len(synsets)):
                hyper = wordnet.synsets(words[i])[s].tree(hyp)
                # adjectives/adverbs have no hypernym tree worth walking
                if hyper[0].pos() in ['a', 's', 'r']:
                    result.append([words[i], 'None', 'None', [None]])
                    continue
                d = first_depth(hyper) - 1
                xhyper = []
                for j in range(x):
                    xhyper.append(valueAt(d - j, hyper))
                    if xhyper[-1] is None:
                        break
                result.append([words[i], pos_redef(hyper[0].pos()),
                               hyper[0], xhyper])
        else:
            result.append([words[i], 'None', 'None', [None]])
    return result
def getSynonym(word, tag):
    # map Penn Treebank tags directly onto WordNet POS constants
    pos_list = {"JJ": wn.ADJ, "JJR": wn.ADJ, "JJS": wn.ADJ,
                "NN": wn.NOUN, "NNS": wn.NOUN, "NPS": wn.NOUN, "NP": wn.NOUN,
                "RBR": wn.ADV, "RBS": wn.ADV, "RB": wn.ADV,
                "VB": wn.VERB, "VBD": wn.VERB, "VBG": wn.VERB,
                "VBN": wn.VERB, "VBP": wn.VERB, "VBZ": wn.VERB}
    li = {}
    if tag in pos_list:
        for key in wn.synsets(word, pos=pos_list[tag]):
            for s in key.lemma_names():  # lemma_names is a method in NLTK 3
                li[s] = s
    return list(li.keys())
def CollectSemcorSupersenses():
    # The original used the old NLTK tree API (.node, .name, .lexname as
    # attributes); current NLTK exposes them as .label(), .name(), .lexname().
    oracle_matrix = collections.defaultdict(WordSupersenses)
    for sent in semcor.tagged_sents(tag='both'):
        for chk in sent:
            label = chk.label()
            if label and len(label) > 3 and label[-3] == '.' and label[-2:].isdigit():
                if chk[0].label().startswith('N'):
                    pos = "n"
                elif chk[0].label().startswith('V'):
                    pos = "v"
                else:
                    continue
                lemmas = label[:-3]
                wnsn = int(label[-2:])
                ssets = wn.synsets(lemmas, pos)
                sorted_ssets = sorted(ssets, key=lambda x: x.name())
                filtered_ssets = None
                for lemma in lemmas.split("_"):
                    if not filtered_ssets or len(filtered_ssets) == 0:
                        filtered_ssets = [x for x in sorted_ssets
                                          if lemma in x.name()]
                if filtered_ssets and len(filtered_ssets) > 0:
                    sorted_ssets = filtered_ssets
                try:
                    supersense = sorted_ssets[wnsn - 1].lexname()  # e.g. 'noun.group'
                except IndexError:
                    continue
                for lemma in lemmas.split("_"):
                    ssets = wn.synsets(lemma, pos)
                    if len(ssets) > 0:
                        if lemma.isdigit():
                            lemma = "0"
                        oracle_matrix[lemma].Add(supersense, "semcor")
    return oracle_matrix
def userEnteredWordSensor(user_input):
    # Pseudocode sketch: which stage are we currently in (AS, IM or WI), and
    # what response did the user enter? `exactly_right`, `nearly_right`,
    # `cursor`, `session` and `word` are placeholders from the original.
    if exactly_right:
        # save our total action plan and LOG the path
        cursor.executeQuery("insert into path values('', session['uid'], session['wordid'])")
        pathid = cursor.executeQuery("select pathid from path where wordid = session['wordid']")
        cursor.executeQuery("insert into waypoint values('', pathid, session['type'], session['waypoint_info'])")
        # proceed to the next word
        perform()
    elif nearly_right:
        # "nearly right" means the input matched one of the tags...
        wid = cursor.executeQuery("select wordid from words where word like session['word']")
        tags = cursor.executeQuery("select tags from words where wordid = wid")
        for tag in tags:
            if tag == word:
                # perform action sequence for NEXT
                break
        # ...or a synonym of the word itself...
        for s in wn.synsets(session['word']):
            if s == user_input:
                # perform action sequence for NEXT
                break
        else:
            # ...or a tag's synonym?
            for s in wn.synsets(tag):
                if s == user_input:
                    # perform action sequence for NEXT
                    break
def ch03_42_wordnet_semantic_index():
    from nltk.corpus import webtext
    from nltk.corpus import wordnet as wn
    postings = []
    docids = {}
    for (pos, fileid) in enumerate(webtext.fileids()):
        docids[pos] = fileid
        wpos = 0
        words = webtext.words(fileid)
        for word in words:
            try:
                postings.append((word.lower(), (pos, wpos)))
                offset = wn.synsets(word)[0].offset()  # method in NLTK 3
                postings.append((offset, (pos, wpos)))
                poffset = wn.synsets(word)[0].hypernyms()[0].offset()
                postings.append((poffset, (pos, wpos)))
            except IndexError:
                continue
            wpos = wpos + 1
    index = nltk.Index(postings)
    query = "canine"
    qpostings = []
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
    try:
        offset = wn.synsets(query)[0].offset()
        qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
    except IndexError:
        pass
    for (pos, wpos) in qpostings:
        left = webtext.words(docids[pos])[wpos - 4:wpos]
        right = webtext.words(docids[pos])[wpos:wpos + 4]
        print(left, right)
def gloss(self, word):
    # return the dictionary definition of the word's first synset, if any
    if wordnet.synsets(word):
        syn = wordnet.synsets(word)[0]
        return syn.definition()
    else:
        return None
def get_all_synsets(word, pos=None):
    for ss in wn.synsets(word, pos):
        for lemma in ss.lemma_names():
            # yield (lemma, ss.name())
            yield (lemma, ss)
# synonyms test
from nltk.corpus import wordnet

synonyms = []
for syn in wordnet.synsets('sailing'):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(synonyms)
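# The list above contains duplicates because the same lemma can appear in
# several synsets; a set comprehension gives unique synonyms (a sketch of the
# same lookup):
unique_synonyms = {lemma.name() for syn in wordnet.synsets('sailing')
                   for lemma in syn.lemmas()}
print(sorted(unique_synonyms))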
def generateQuestion():
    item = ['Pens', 'Books', 'Boxes', 'Chocolates', 'Biscuits', 'Mangos',
            'Bananas', 'Dolls', 'Flowers', 'Breads', 'Watches', 'Apples',
            'Apricots', 'Avocadoes', 'Blackberries', 'Blueberries', 'Cherries',
            'Figs', 'toys', 'kiwi(fruit)', 'lemons', 'oranges', 'Papers',
            'Peaches', 'pears', 'pineapples', 'plums', 'raspberries',
            'strawberries', 'watermelons']
    z = random.choice(item)
    addition = ['originally', 'in the first', 'in the beginning', 'earlier',
                'to begin with', 'primitively', 'at first', 'initially',
                'incipiently']
    az1 = random.choice(addition)

    def multiwordReplace(text, wordDic):
        """
        Take a text and replace words that match a key in a dictionary with
        the associated value; return the changed text.
        """
        rc = re.compile('|'.join(map(re.escape, wordDic)))

        def translate(match):
            return wordDic[match.group(0)]

        return rc.sub(translate, text)

    p1 = random.randint(2, 9)
    p2 = random.randint(10, 50)
    q = str(p1)
    x = str(p2)

    # These helpers were pasted in from the `names` package; `get_name` and
    # `FILES` belong to that package and are unused here, since
    # names.get_first_name() is called directly below.
    def get_first_name(gender=None):
        if gender not in ('male', 'female'):
            gender = random.choice(('male', 'female'))
        return get_name(FILES['first:%s' % gender]).capitalize()

    def get_last_name():
        return get_name(FILES['last']).capitalize()

    def get_full_name(gender=None):
        return u"%s %s" % (get_first_name(gender), get_last_name())

    tn = names.get_first_name()
    p = names.get_first_name()
    str1 = """John had some marbles, Jim gave him 3 more, Now John has 8 marbles. How many marbles did John have to begin with ?"""
    # the dictionary has target_word : replacement_word pairs
    wordDic = {'John': tn, 'marbles': z, 'Jim': p, '3': q, '8': x,
               'to begin with': az1}
    # call the function and get the changed text
    str2 = multiwordReplace(str1, wordDic)
    str3 = str2
    output = ""
    # synset lookups for swapping nouns, verbs, etc.
    # Load the pretrained sentence tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Tokenize the text
    tokenized = tokenizer.tokenize(str3)
    # Get the list of words from the entire text
    words = word_tokenize(str3)
    # Identify the parts of speech
    tagged = nltk.pos_tag(words)
    for i in range(0, len(words)):
        replacements = []
        # Only replace nouns with nouns, verbs with verbs, etc.
        for syn in wordnet.synsets(words[i]):
            # Do not attempt to replace proper nouns or determiners
            if tagged[i][1] == 'NNP' or tagged[i][1] == 'DT':
                break
            # The tagger returns tags like NNP, VBP etc. while synset names
            # carry tags like ".n.", so match on the lowercased first letter.
            word_type = tagged[i][1][0].lower()
            # find() returns -1 on no match; the original's bare truthiness
            # test treated -1 as a match
            if syn.name().find("." + word_type + ".") != -1:
                # extract the word only
                r = syn.name()[0:syn.name().find(".")]
                replacements.append(r)
        if len(replacements) > 0:
            # Choose a random replacement
            replacement = replacements[randint(0, len(replacements) - 1)]
            output = output + " " + replacement
        else:
            # If no replacement could be found, just use the original word
            output = output + " " + words[i]
    strr = str3
    tn = [int(s) for s in strr.split() if s.isdigit()]
    t[0] = tn[0]  # `t` is assumed to be a module-level list of length >= 2
    t[1] = tn[1]
    return strr
def my_get_term_sim(term1, term2, score_level):
    # normalize: lowercase, collapse separators, drop commas
    term1 = term1.lower().replace('\t', ' ').replace('-', ' ').replace(',', '')
    term2 = term2.lower().replace('\t', ' ').replace('-', ' ').replace(',', '')
    term1 = term1.strip()  # the original discarded strip()'s return value
    term2 = term2.strip()
    term1 = [w for w in term1.split(" ") if w not in filter_words]
    term2 = [w for w in term2.split(" ") if w not in filter_words]
    # keep only non-empty words that WordNet knows about
    words1 = [w for w in term1 if w != "" and len(wn.synsets(w)) > 0]
    words2 = [w for w in term2 if w != "" and len(wn.synsets(w)) > 0]
    match = 0.0
    max_i = len(words1)
    max_j = len(words2)
    if max_i <= 0 or max_j <= 0:
        # fall back to direct token overlap
        direct = [x for x in term1 if x in term2]
        return 2 * float(len(direct)) / (len(term1) + len(term2))
    flag_i = -1
    flag_j = -1
    max_step_num = min(max_i, max_j)
    max_sim = []
    # greedy best-pair matching: repeatedly pick the most similar word pair
    for k in range(max_step_num):
        max_temp = -1
        for i in range(k, max_i):
            for j in range(k, max_j):
                sim = get_wordnet_sim(words1[i], words2[j])
                if sim > 1 or sim < 0:
                    print(sim)
                    raise Exception("Sorry, similarity score is not in [0,1]!")
                if sim > max_temp:
                    flag_i = i
                    flag_j = j
                    max_temp = sim
        # swap the matched pair to the front so it is not reused
        words1[k], words1[flag_i] = words1[flag_i], words1[k]
        words2[k], words2[flag_j] = words2[flag_j], words2[k]
        max_sim.append(max_temp)
        match += f_identify(max_temp, score_level)
    return 2 * match / (max_i + max_j)
import nltk
from nltk.corpus import wordnet as wn

synonyms_1 = wn.synsets("auto")
print(synonyms_1)
synonyms_2 = wn.synsets("car")
print(synonyms_2)
print()
if len(synonyms_1) == 0 or len(synonyms_2) == 0:
    print("No results")
    # return None, None
else:
    max_sim = -1
    best_pair = None, None
    for synonym in synonyms_1:
        for synonym_2 in synonyms_2:
            sim = synonym.path_similarity(synonym_2)
            if sim is None:  # check before using sim, or 1 - sim would fail
                continue
            print(1 - sim)  # print the pair's distance
            if sim > max_sim:
                max_sim = sim
                best_pair = synonym, synonym_2
    max_sim = 1 - max_sim  # convert the best similarity to a distance
    print(best_pair)
    print(max_sim)
    # return best_pair
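# Sanity check for the demo above: "auto" and "car" share the synset
# car.n.01, so the best pair should be (car.n.01, car.n.01) with path
# similarity 1.0, i.e. distance 0.0.
assert wn.synsets("auto")[0] in wn.synsets("car")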
from nltk.corpus import wordnet

antonyms = []
for syn in wordnet.synsets("pain"):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(antonyms)
def tps_word_embeddings(
    word_embeddings_name: str,
    neighbourhood_sizes: list,
    semeval_target_words: np.ndarray,
    semeval_target_words_gs_clusters: np.ndarray,
    word_embeddings_normalized: np.ndarray,
    word_to_int: dict,
    word_vocabulary: list,
    num_top_k_words_frequencies: int,
    output_dir: str,
    word_counts: Optional[list] = None,
    ann_instance: ApproxNN = None,
) -> None:
    """
    Computes TPS for word embeddings and saves correlation plots.

    Parameters
    ----------
    word_embeddings_name : str
        Name of the word embeddings.
    neighbourhood_sizes : list
        Neighbourhood sizes to compute TPS scores of.
    semeval_target_words : np.ndarray
        SemEval-2010 task 14 target words.
    semeval_target_words_gs_clusters : np.ndarray
        SemEval-2010 task 14 GS clusters.
    word_embeddings_normalized : np.ndarray
        Normalized word embeddings.
    word_to_int : dict
        Dictionary for mapping a word to its integer representation.
    word_vocabulary : list
        List of words/word ints to use for the vocabulary.
    num_top_k_words_frequencies : int
        Number of top words to use when computing TPS scores vs. word frequencies.
    output_dir : str
        Output directory.
    word_counts : list
        List containing word counts.
    ann_instance : ApproxNN
        ApproxNN instance to use for computing TPS scores.
    """
    # Ensure output directory exists
    output_dir_plots = join(output_dir, word_embeddings_name)
    makedirs(output_dir_plots, exist_ok=True)

    # Only use the SemEval-2010 task 14 words in vocabulary
    semeval_target_words_in_vocab_filter = [
        i for i, word in enumerate(semeval_target_words) if word in word_to_int
    ]
    semeval_target_words_in_vocab = semeval_target_words[
        semeval_target_words_in_vocab_filter
    ]
    semeval_target_words_gs_clusters_in_vocab = semeval_target_words_gs_clusters[
        semeval_target_words_in_vocab_filter
    ]

    tps_vs_gs_key = "TPS_n vs. GS"
    tps_vs_synsets_key = "TPS_n vs. synsets"
    tps_vs_frequency_key = "TPS_n vs. frequency"
    result_dict: dict = {
        "n": neighbourhood_sizes,
        tps_vs_gs_key: [],
        tps_vs_synsets_key: [],
    }
    has_word_counts = word_counts is not None
    if has_word_counts:
        result_dict[tps_vs_frequency_key] = []
    for neighbourhood_size in neighbourhood_sizes:
        print(f"-- Neighbourhood size: {neighbourhood_size} --")

        # -- Compute TPS scores and correlation vs GS words --
        output_plot_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_gs.pdf",
        )
        output_tps_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_gs.npy",
        )
        if not isfile(output_plot_filepath):
            print("Computing TPS scores for GS words")
            tps_scores_semeval = tps_multiple(
                target_words=semeval_target_words_in_vocab,
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                ann_instance=ann_instance,
                n_jobs=-1,
                progressbar_enabled=True,
            )

            # Compute correlation vs GS word meanings
            tps_score_vs_gs_correlation, _ = pearsonr(
                x=tps_scores_semeval, y=semeval_target_words_gs_clusters_in_vocab
            )
            result_dict[tps_vs_gs_key].append(tps_score_vs_gs_correlation)

            # Save plot of TPS scores vs. GS
            tps_word_embeddings_correlation_plot(
                tps_scores=tps_scores_semeval,
                y_values=semeval_target_words_gs_clusters_in_vocab,
                y_label="Clusters in GS",
                tps_vs_y_correlation=tps_score_vs_gs_correlation,
                output_plot_filepath=output_plot_filepath,
                neighbourhood_size=neighbourhood_size,
            )

            # Save TPS scores to file
            np.save(output_tps_filepath, tps_scores_semeval)

        # -- Compute TPS scores and correlation vs WordNet synsets --
        output_plot_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_synsets.pdf",
        )
        output_tps_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_synsets.npy",
        )
        if not isfile(output_plot_filepath):
            # Find words in vocabulary that have synsets in WordNet
            tps_scores_wordnet_synsets = []
            wordnet_synsets_words_in_vocab = []
            wordnet_synsets_words_in_vocab_meanings = []
            print("Computing TPS scores for words in vocabulary with WordNet synsets")
            for word in tqdm(word_vocabulary):
                num_synsets_word = len(wn.synsets(word))
                if num_synsets_word > 0:
                    wordnet_synsets_words_in_vocab.append(word)
                    wordnet_synsets_words_in_vocab_meanings.append(num_synsets_word)
            wordnet_synsets_words_in_vocab = np.array(wordnet_synsets_words_in_vocab)
            tps_scores_wordnet_synsets = tps_multiple(
                target_words=wordnet_synsets_words_in_vocab,
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                ann_instance=ann_instance,
                n_jobs=-1,
                progressbar_enabled=True,
            )

            # Compute correlation vs WordNet synsets
            tps_score_vs_wordnet_synsets_correlation, _ = pearsonr(
                x=tps_scores_wordnet_synsets, y=wordnet_synsets_words_in_vocab_meanings
            )
            result_dict[tps_vs_synsets_key].append(
                tps_score_vs_wordnet_synsets_correlation
            )

            # Save plot of TPS scores vs. WordNet synsets
            tps_word_embeddings_correlation_plot(
                tps_scores=tps_scores_wordnet_synsets,
                y_values=wordnet_synsets_words_in_vocab_meanings,
                y_label="Synsets in WordNet",
                tps_vs_y_correlation=tps_score_vs_wordnet_synsets_correlation,
                output_plot_filepath=output_plot_filepath,
                neighbourhood_size=neighbourhood_size,
            )

            # Save TPS scores to file
            np.save(output_tps_filepath, tps_scores_wordnet_synsets)

        # -- Compute TPS scores and correlation vs word frequencies --
        output_plot_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_frequency.pdf",
        )
        output_tps_filepath = join(
            output_dir_plots,
            f"tps_{neighbourhood_size}_vs_frequency.npy",
        )
        if has_word_counts and not isfile(output_plot_filepath):
            print(
                f"Computing TPS scores for top {num_top_k_words_frequencies} words vs. word frequencies"
            )
            tps_score_word_frequencies = tps_multiple(
                target_words=word_vocabulary[:num_top_k_words_frequencies],
                word_to_int=word_to_int,
                neighbourhood_size=neighbourhood_size,
                word_embeddings_normalized=word_embeddings_normalized,
                ann_instance=ann_instance,
                n_jobs=-1,
                progressbar_enabled=True,
            )

            # Compute correlation vs word frequencies
            tps_score_vs_word_frequency_correlation, _ = pearsonr(
                x=tps_score_word_frequencies,
                y=word_counts[:num_top_k_words_frequencies],
            )
            result_dict[tps_vs_frequency_key].append(
                tps_score_vs_word_frequency_correlation
            )

            # Save plot of TPS scores vs. word frequencies
            tps_word_embeddings_correlation_plot(
                tps_scores=tps_score_word_frequencies,
                y_values=word_counts[:num_top_k_words_frequencies],
                y_label="Word frequency",
                tps_vs_y_correlation=tps_score_vs_word_frequency_correlation,
                output_plot_filepath=output_plot_filepath,
                neighbourhood_size=neighbourhood_size,
            )

            # Save TPS scores to file
            np.save(output_tps_filepath, tps_score_word_frequencies)
from nltk.corpus import wordnet

word = input("Enter word : ")
# hypernyms are more generic words
hypernyms = []
# hyponyms are more specific words than the given word
hyponyms = []
for syn in wordnet.synsets(word):
    if syn.hyponyms():
        for hypo in syn.hyponyms():
            if hypo.lemmas():
                for l in hypo.lemmas():
                    hyponyms.append(l.name())
    if syn.hypernyms():
        for hyper in syn.hypernyms():
            if hyper.lemmas():
                for l in hyper.lemmas():
                    hypernyms.append(l.name())
hypernyms = set(hypernyms)
hyponyms = set(hyponyms)
print("hypernyms :", ", ".join(hypernyms))
print("\n")
print("hyponyms :", ", ".join(hyponyms))
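# Non-interactive variant of the same lookup with a fixed word (a sketch):
# for "dog", the direct hypernym lemmas include e.g. 'canine' and
# 'domestic_animal'.
dog_hypernyms = {l.name() for s in wordnet.synsets('dog')
                 for h in s.hypernyms() for l in h.lemmas()}
print(dog_hypernyms)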
def pos(self, word):
    # return the part of speech of the word's first synset, if any
    if wordnet.synsets(word):
        syn = wordnet.synsets(word)[0]
        return syn.pos()
    else:
        return None
def get_verbnet_args(verb, verbose=False):
    lemmatizer = WordNetLemmatizer()
    lemmatized_verb = lemmatizer.lemmatize(verb.lower(), 'v')
    classids = verbnet.classids(lemma=lemmatized_verb)
    if verbose:
        print('Class IDs for "{}": {}'.format(lemmatized_verb, classids))
    if len(classids) < 1:
        if verbose:
            print('No entry found on verbnet for "{}". Attempting WordNet synsets!'
                  .format(lemmatized_verb))
        wn_synsets = wordnet.synsets(lemmatized_verb)
        for synset in wn_synsets:
            if len(synset.lemmas()) < 1:
                continue
            candidate = str(synset.lemmas()[0].name())
            classids = verbnet.classids(lemma=candidate)
            if verbose:
                print('Class IDs for "{}": {}'.format(candidate, classids))
            if len(classids) > 0:
                break
        if len(classids) < 1:
            if verbose:
                print('Unable to find entries on verbnet for neither of the '
                      'synsets... Will go recursive now (which is not a good thing!)')
            for synset in wn_synsets:
                if len(synset.lemmas()) < 1:
                    continue
                candidate = str(synset.hypernyms()[0].lemmas()[0].name())
                return NLPUtils.get_verbnet_args(candidate, verbose=verbose)
            if verbose:
                print('Exhausted attempts... returning an empty list.')
            return []
    for id in classids:
        class_number = id[id.find('-') + 1:]
        try:
            v = verbnet.vnclass(class_number)
            roles = [t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE')]
        except ValueError:
            print('VN class number not found: {}'.format(class_number))
            # Will handle these both below
            v = [None]
            roles = []
        while len(roles) < 1 and len(v) > 0:
            # strip the last dash-segment and retry with the parent class
            fallback_class_number = class_number[:class_number.rfind('-')]
            if verbose:
                print('No roles found for class {}, falling back to {}.'
                      .format(class_number, fallback_class_number))
            class_number = fallback_class_number
            try:
                v = verbnet.vnclass(class_number)
                roles = [t.attrib['type'] for t in v.findall('THEMROLES/THEMROLE')]
            except ValueError:
                # Go on with the loop
                v = [None]
                roles = []
        if len(roles) > 0:
            if verbose:
                print('Roles found: {}'.format(roles))
            return roles
    return None
def synset_from_sense_key(sense_key):
    # (reconstructed opening -- the original snippet was cut off here)
    # Parses a WordNet sense key such as "steal%2:38:01::"; `sense_key_regex`
    # and the `synset_types` mapping are assumed to be defined elsewhere.
    lemma, ss_type, lex_id = re.match(sense_key_regex, sense_key).groups()
    ss_idx = '.'.join([lemma, synset_types[int(ss_type)], lex_id])
    return wn.synset(ss_idx)


#x = "visit%2:38:00::"
#y = "visit%2:41:02::"
#x = "come%2:38:04::"
#y = "come%2:30:01::"
#x = "quit%2:38:00::"
x = "steal%2:38:01::"
#print(synset_from_sense_key(x))
#print(synset_from_sense_key(y))

ls = []
syn = wn.synsets("quit", pos=wn.VERB)
print(syn)
for item in syn:
    print(item)
    ls = ls + wn.synset(item.name()).lemma_names()
    print(wn.synset(item.name()).lemma_names())
print()
print(ls)
print("=============================")
#print(wn.synset("embark.v.02").lemma_names())
#eat = wn.lemma('arrive.v.01.arrive')
#print(eat.key())
#print(wn.synset("come.v.04").lemma_names())
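# Modern NLTK can also resolve sense keys directly: wn.lemma_from_key returns
# the Lemma, whose synset() is the target synset (sketch, using the test key
# from the snippet above):
print(wn.lemma_from_key("steal%2:38:01::").synset())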
def openfile(f):
    # Reads URL rows in pairs, fetches both pages, and scores overlap by
    # counting words whose WordNet synonyms appear in the other text.
    csvfile = open(f, 'r')
    reader = csv.DictReader(csvfile)
    i = 2
    similar = 0.0
    word_count = 0.0
    text1_split = []
    text2_split = []
    for row in reader:
        if i == 0:
            for j in text1_split:
                flag = 0
                if j not in text2_split:
                    synonyms = wordnet.synsets(j)
                    synonyms = set(chain.from_iterable(
                        [word.lemma_names() for word in synonyms]))
                    for k in synonyms:
                        if k in text2_split:
                            flag = 1
                    if flag == 1:
                        similar += 1
                else:
                    similar += 1
                word_count += 1
            for j in text2_split:
                flag = 0
                if j not in text1_split:
                    synonyms = wordnet.synsets(j)
                    synonyms = set(chain.from_iterable(
                        [word.lemma_names() for word in synonyms]))
                    for k in synonyms:
                        if k in text1_split:
                            flag = 1
                    if flag == 1:
                        similar += 1
                else:
                    similar += 1
                word_count += 1
            print(similar / word_count)
            i = 2
            similar = 0.0
            word_count = 0.0
        elif i == 1:
            text1 = main_text(row['url'])
            text1_split = [x.upper() for x in text1.split(' ') if x]
        elif i == 2:
            text2 = main_text(row['url'])
            text2_split = [x.upper() for x in text2.split(' ') if x]
        i -= 1  # reconstructed: the counter decrement was lost in the original
    csvfile.close()
"""
Created on Thu Apr 29 11:12:07 2018

@author: Ayshwarya
"""
import nltk
from nltk.corpus import wordnet
import re

# READ FILE Railway Station
Inputfile = open("Railway station.txt", "r")
nouns = []

# print a sample synset
syn = wordnet.synsets("Animal")
print(syn[0].lemmas()[0])

File = [line for line in Inputfile]
String = ''.join(File)
sentences = re.split(r'[.!?]', String)

# Find all the noun words in all the sentences of the text
for sentence in sentences:
    for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
        if pos in ('NNPS', 'NN', 'NNS', 'NNP'):
            nouns.append(word)

print("LIST OF NOUNS IN THE TEXT")
print("-------------------------")
def getWUPSimilarity(w1, w2):
    doc1 = nlp(w1)
    doc2 = nlp(w2)
    if doc1[0].lemma_ == doc2[0].lemma_:
        return 1
    synonyms, _ = getSynAnt(w1)
    if w2 in synonyms:
        return 0.9
    synonyms, _ = getSynAnt(w2)
    if w1 in synonyms:
        return 0.9
    # Try each POS for w1 in order (NOUN, VERB, ADJ, ADV); for the first POS
    # with synsets, return the similarity against the first w2 synset found,
    # again trying the POS tags in order. This replaces the original's four
    # near-identical blocks with the same lookup order.
    pos_order = (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV)
    for pos1 in pos_order:
        synw1s = wordnet.synsets(w1, pos1)
        if len(synw1s) == 0:
            continue
        for pos2 in pos_order:
            synw2s = wordnet.synsets(w2, pos2)
            if len(synw2s) > 0:
                return synw1s[0].wup_similarity(synw2s[0])
    return None  # the original fell off the end, implicitly returning None
list_synonyms.pop(0)
list_synonyms.pop(0)           # remove the headers
tuplas_synonym_UNESCO.pop(0)   # remove the headers
d = {}
# remove duplicate values while preserving order
list_synonyms_clear = [d.setdefault(x, x) for x in list_synonyms if x not in d]
list_synonyms_clear.pop()
dict_synonym = {}
dict_synonym_UNESCO = {}
for word in list_synonyms_clear:
    synonyms = []
    cont = 0
    for syn in wordnet.synsets(word, lang='spa'):
        for l in syn.lemmas(lang='spa'):
            synonyms.append(l.name().lower())
            cont += 1
            #if l.antonyms():
            #    antonyms.append(l.antonyms()[0].name())
    if cont > 1:
        dict_synonym[word] = set(synonyms)
    else:
        # fall back to the UNESCO thesaurus tuples
        list_tmp = []
        for i in tuplas_synonym_UNESCO:
            if word == i[0]:
                list_tmp.append(i[1])
        if len(list_tmp) > 1:
            dict_synonym[word] = set(list_tmp)
def wsd_lesk(raw_df, algorithm_choice):
    """This finds the synset of the word using the original sentence as
    context and different lesk algorithms from the nltk and pywsd packages.
    Algorithm choices are: 1. nltk's lesk, 2. pywsd simple_lesk,
    3. pywsd advanced_lesk, 4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]
    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        # try the whole phrase joined with underscores first
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string,
                                              pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], combined_word_string, find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i], word[0], find_wordnet_pos(word[1]))
                        if aspect is not None:
                            # use == for string comparison; the original's
                            # `is` compared object identity
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []
    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
# Fragment of a file-rewriting loop: each input line is "<label> <sentence>";
# the output keeps the label and appends up to three synonyms per word. The
# enclosing read loop is a reconstruction implied by the break/close calls.
while True:
    line = f.readline()
    if not line:
        break
    line = line.replace('\n', '')
    line = line.split(" ", 1)
    new_line = line[0]
    line[1] = line[1].lower()
    line[1] = line[1].translate(str.maketrans('', '', string.punctuation))
    word_tokens = word_tokenize(line[1])
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    synonyms = []
    count = 0
    for x in filtered_sentence:
        for syn in wordnet.synsets(x):
            for l in syn.lemmas():
                if count < 3:
                    if l.name() not in synonyms:
                        synonyms.append(l.name())
                        count += 1
        count = 0
        synonyms_string = ' '.join(synonyms)
        new_line = " ".join([str(new_line), synonyms_string])
        synonyms = []
    fout.write(new_line)
    fout.write('\n')
f.close()
def candidates_for_word_type(self, trips, word, pos):
    ss = wn.synsets(word, pos)
    res = {s: self.trips_candidate(trips, s) for s in ss}
    # keep only synsets that mapped to a TRIPS candidate
    return {s: t for s, t in res.items() if t}
def get_synsets(word):
    synsets = wn.synsets(word)
    return synsets
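# The wrapper above looks up all parts of speech; an optional POS filter is a
# common extension (a sketch, not part of the original):
def get_synsets_by_pos(word, pos=None):
    return wn.synsets(word, pos=pos)  # e.g. pos=wn.VERB

print(get_synsets('run')[:3])
print(get_synsets_by_pos('run', wn.VERB)[:3])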
def precisionatk_nltk(self, pred_words, klist):
    '''
    precision at k function which takes into account polysemy
    using the NLTK wordmap
    '''
    try:
        import nltk
        nltk.data.path = ['./nltk_data']
        from nltk.corpus import wordnet as wn
    except ImportError:
        raise RuntimeError("Need NLTK for this function.")
    nltk_map = {'es': 'spa', 'fr': 'fra', 'it': 'ita', 'en': 'eng',
                'zh': 'cmn'}

    def set_correct(correct, val, prediction):
        for i, k in enumerate(klist):
            if len(set(correct) & set(prediction[:k])) > 0:
                val[i] = 1

    ret_val = 1. * np.zeros_like(klist)
    word_map = {}
    for idx, (src, gold) in enumerate(self.word_map):
        if src not in word_map:
            word_map[src] = ([], pred_words[idx])
        word_map[src][0].append(gold)
    d = len(word_map)
    for word in word_map:
        prediction = self.tgt.ix2word[word_map[word][1]]
        val = np.zeros_like(klist)
        src_word = self.src.ix2word[word]
        gold = self.tgt.ix2word[word_map[word][0]]
        # normal dictionary matching
        set_correct(gold, val, prediction)
        # check whether any sense of the gold word matches the prediction
        if self.tgt.name not in nltk_map:
            ret_val += val
            continue
        tgt_lang = nltk_map[self.tgt.name]
        synsets = [w for gold_word in gold for w in wn.synsets(gold_word)]
        similar_words = [w for synset in synsets
                         for w in synset.lemma_names(tgt_lang)]
        set_correct(similar_words, val, prediction)
        # check whether the prediction translates any sense of the source word
        if self.src.name not in nltk_map:
            ret_val += val
            continue
        synsets = wn.synsets(src_word)
        similar_words = [w for synset in synsets
                         for w in synset.lemma_names(tgt_lang)]
        set_correct(similar_words, val, prediction)
        ret_val += val
    ret_val *= (100. / d)
    return ret_val, len(set(self.word_map[:, 0]))
def find_wordnet_synonyms_nouns(noun_synset):
    start = timer()
    original_synset = noun_synset
    synonym_words = []
    # Synonym words from this exact synset could be taken from
    # original_synset.lemma_names() directly, but that path is disabled; the
    # code below instead compares other synsets against the original synset.
    if original_synset.pos() == "n":
        for synonym_synset in wn.synsets(original_synset.lemma_names()[0],
                                         original_synset.pos()):
            if (original_synset != synonym_synset) and \
                    (original_synset.lch_similarity(synonym_synset) >= 2.5):
                if synonym_synset.lemma_names()[0] not in synonym_words:
                    synonym_words.append(synonym_synset.lemma_names()[0])
                    print("Original: %s other synsets: %s LCH-similarity %s" % (
                        original_synset, synonym_synset,
                        original_synset.lch_similarity(synonym_synset)))
                for nested_hyponym_synset in synonym_synset.hyponyms():
                    if original_synset.lch_similarity(nested_hyponym_synset) >= 2.5:
                        synonym_words.append(nested_hyponym_synset.lemma_names()[0])
                        print("Other synset: %s nested_hyponym words: %s LCH(original) %s" % (
                            synonym_synset, nested_hyponym_synset,
                            original_synset.lch_similarity(nested_hyponym_synset)))
    # Going into hyponyms of hyponyms seems too deep for now. Iterating a
    # level up via hypernyms (e.g. computer.n.01 -> machine.n.01 and then over
    # all its hyponyms) produced too much noise, with all distances the same.
    # Adjectives have different relations than nouns: antonyms (opposites,
    # e.g. dry-wet, usually attached to the first lemma of the set) and
    # "similar to" satellites
    # (https://wordnet.princeton.edu/documentation/wngloss7wn); those
    # branches are left disabled here as in the original.
    end = timer()
    logging.debug("Wordnet cycle: %.2f seconds" % (end - start))
    return synonym_words
def run(queryList):
    # stemmer = PorterStemmer()
    stemmer = SnowballStemmer("english")
    f = open("data/expanded.txt", "w+")
    for query in queryList:
        querySplitted = query.split(",")
        # tokenize the query
        tokens = nltk.word_tokenize(querySplitted[1])
        # remove stop words from the query
        filtered_words = [word for word in tokens
                          if word not in stopwords.words('english')]
        # POS-tag the tokens
        pos = nltk.pos_tag(filtered_words)
        synonyms = []  # synonyms of all the tokens
        index = 0
        # iterate through the tokens
        for item in filtered_words:
            synsets = wordnet.synsets(item)
            if not synsets:
                # fall back to the stemmed token
                synsets = wordnet.synsets(stemmer.stem(item))
            currentSynonyms = []  # synonyms of the current token
            currentPOS = get_wordnet_pos(pos[index])
            # iterate through the synsets
            for i in synsets:
                # only accept synsets with the same part of speech as the token
                if str(i.pos()) == str(currentPOS):
                    for j in i.lemmas():
                        if j.name() not in currentSynonyms:  # skip duplicates
                            currentSynonyms.append(j.name().replace("_", " "))
            synonyms.append(currentSynonyms)
            index += 1
        f.write(querySplitted[0] + ", " + querySplitted[1] + ", ")
        # remove duplicate lists in the synonyms list
        tmp = []
        for elem in synonyms:
            if elem and elem not in tmp:
                tmp.append(elem)
        synonyms = tmp
        # now that we have all the synonyms, write every combination
        for x in itertools.product(*synonyms):
            current = ""
            for item in x:
                current += item
                current += " "
            current += ", "
            f.write(current)
        f.write("\n")
# * **Hypernym:** a more general synset that can cover several words. The example from class is that Artifact is a hypernym of motor vehicle.
# * **Hyponym:** a synset that is not general but more specific.
#
# Importing **WordNet**
#
# In[16]:

nltk.download('omw')
from nltk.corpus import wordnet as wn

# **synset:** a group of synonyms of a word.

# In[17]:

ss = wn.synsets('carro', lang='spa')
ss

# Exploring the synsets

# In[18]:

for syn in ss:
    print(syn.name(), ': ', syn.definition())
    for name in syn.lemma_names():
        print(' * ', name)

# ### visualization references
#
# [Visualizing WordNet relationships as graphs](http://www.randomhacks.net/2009/12/29/visualizing-wordnet-relationships-as-graphs/)
#
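# A small English illustration of the hypernym/hyponym relations described
# above (car.n.01 is an illustrative choice):
from nltk.corpus import wordnet as wn

car = wn.synset('car.n.01')
print(car.hypernyms())  # more general synsets, e.g. motor_vehicle.n.01
print(car.hyponyms())   # more specific synsets, e.g. cab.n.03, ambulance.n.01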
def iterate(df):
    correct_count = 0
    wrong_count = 0
    logloss = 0
    for index, row in df.iterrows():
        res = row["is_duplicate"]
        terms1 = get_terms(row["question1"])
        terms2 = get_terms(row["question2"])
        sims = []
        for word1 in terms1:
            word1_sim = []
            try:
                syn1 = wn.synsets(word1)[0]
            except IndexError:
                # no synset for word1: similarity 0 against all of terms2
                sims.append([0 for i in range(0, len(terms2))])
                continue
            for word2 in terms2:
                try:
                    syn2 = wn.synsets(word2)[0]
                except IndexError:
                    word1_sim.append(0)
                    continue
                # wup_similarity may return None when no common ancestor exists
                word_similarity = syn1.wup_similarity(syn2) or 0
                word1_sim.append(word_similarity)
            sims.append(word1_sim)
        # print(sims)
        word1_score = 0
        for i in range(0, len(terms1), 1):
            try:
                word1_score += max(sims[i])
            except ValueError:
                continue
        word1_score /= max(len(terms1), 1)  # guard against empty term lists
        word2_score = 0
        for i in range(0, len(terms2), 1):
            try:
                word2_score += max([j[i] for j in sims])
            except ValueError:
                continue
        word2_score /= max(len(terms2), 1)
        pair_score = (word1_score + word2_score) / 2
        # accumulate binary log-loss, clamping pair_score away from 0 and 1
        # so that log() is always defined
        p = min(max(pair_score, 1e-15), 1 - 1e-15)
        logloss -= res * math.log(p) + (1 - res) * math.log(1 - p)
        if pair_score > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == res:
            correct_count += 1
        else:
            wrong_count += 1
        if index % 100 == 0:
            print(correct_count, wrong_count)
            print(logloss / (correct_count + wrong_count))
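# For reference, a minimal sketch of the standard binary log-loss that the
# logloss accumulator above targets (the eps clamp is an assumption made here
# to avoid log(0); it is not from the original code):
import math

def binary_log_loss(y_true, p_pred, eps=1e-15):
    # clamp the prediction away from 0 and 1 before taking logs
    p = min(max(p_pred, eps), 1 - eps)
    return -(y_true * math.log(p) + (1 - y_true) * math.log(1 - p))

print(binary_log_loss(1, 0.8))  # ~0.223
print(binary_log_loss(0, 0.8))  # ~1.609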
def get_antonym(x):
    # load the list of negation prefixes, stripping trailing newlines
    f = open("semantic/prefixes.txt", "r")
    prefix = f.readlines()
    for i in range(0, len(prefix) - 1):
        prefix[i] = prefix[i][:-1]
    f.close()
    dic_words, dic_ant = d.extract_more_antonyms()
    xx = []
    ant_a = []  # adjective antonyms
    ant_n = []  # noun antonyms
    ant_v = []  # verb antonyms
    temp = []   # antonyms that are negation-prefixed forms
    n = None    # POS tag of the last antonym seen (guards the checks below)
    for syn in wn.synsets(str(x)):
        xx.append(chunck(syn.name(), 0))
    for synx in xx:
        for syn in wn.synsets(synx):
            for l in syn.lemmas():
                string = l.name()
                # antonyms from the external dictionary, in both directions
                if string in dic_words:
                    index = dic_words.index(string)
                    ant_v.append(dic_ant[index])
                if string in dic_ant:
                    index = dic_ant.index(string)
                    ant_v.append(dic_words[index])
                for ant in l.antonyms():
                    n = chunck(str(ant), 1)
                    name = str(ant.name())
                    if name.startswith("re") and name in prefix:
                        # strip the "re" prefix
                        temp.append(name[2:])
                    elif (name.startswith(("un", "ir", "il", "im", "non", "in"))
                          and name in prefix):
                        temp.append(name)
                    # otherwise sort the antonym by part of speech
                    elif n == "a" or n == "s":
                        ant_a.append(name)
                    elif n == "n":
                        ant_n.append(name)
                    else:
                        ant_v.append(name)
    if not (ant_a or ant_n or ant_v or temp):
        return None
    # prefer adjective antonyms, then nouns, then verbs, then prefixed forms;
    # within each group print the counts and return the most frequent candidate
    if len(ant_a) >= 1 and n != "v":
        c = Counter(ant_a)
        for word, count in c.items():
            print(word, count)
        return c.most_common(1)[0][0]
    if len(ant_n) >= 1 and n != "v":
        c = Counter(ant_n)
        for word, count in c.items():
            print(word, count)
        return c.most_common(1)[0][0]
    if len(ant_v) >= 1:
        c = Counter(ant_v)
        for word, count in c.items():
            print(word, count)
        return c.most_common(1)[0][0]
    # fall back to the negation-prefixed forms
    c = Counter(temp)
    for word, count in c.items():
        print(word, count)
    return c.most_common(1)[0][0]
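# A hedged, simplified sketch of the prefix branch above: the original strips
# "re" from re-prefixed antonyms found in prefixes.txt and keeps the other
# prefixed forms verbatim. strip_re_prefix is a hypothetical helper, not part
# of the original code.
def strip_re_prefix(word, known_prefixed_words):
    # drop the leading "re" only when the word is on the known-prefix list
    if word.startswith("re") and word in known_prefixed_words:
        return word[2:]
    return word

print(strip_re_prefix("rewind", ["rewind"]))   # wind
print(strip_re_prefix("unhappy", ["rewind"]))  # unhappy (kept verbatim)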
import nltk
from nltk.corpus import wordnet

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))
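# Note: antonyms hang off Lemma objects rather than Synsets, which is why the
# loop above goes through syn.lemmas() first. A quick check (the printed value
# is what WordNet typically returns here; treat it as illustrative):
from nltk.corpus import wordnet

lemma = wordnet.synsets("good")[0].lemmas()[0]
print(lemma.antonyms())  # e.g. [Lemma('evil.n.03.evil')]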
from nltk.tokenize import word_tokenize

tokens = word_tokenize(text_file)
# print(tokens)

# whitespace tokenizer
from nltk.tokenize import regexp_tokenize
tokenizer = regexp_tokenize(text_file, r'\s+', gaps=True)
# print(tokenizer)

from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = tokenizer
# print([word for word in words if word not in english_stops])

# look up a word and print its synset relations
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())
print(syn.hypernyms())
print(syn.hypernyms()[0].hyponyms())
print(syn.root_hypernyms())
print(syn.hypernym_paths())

# for w in words:
#     print(w)
#     syn = wordnet.synsets(w)
#     if type(syn) == 'list':
#         syn = syn[0]
#     # print(syn)
#     if len(syn) != 0:
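# A related sketch: lowest_common_hypernyms finds the shared ancestor that the
# hypernym_paths printed above climb toward (the synsets here are chosen for
# illustration):
from nltk.corpus import wordnet

cookbook = wordnet.synsets('cookbook')[0]
textbook = wordnet.synsets('textbook')[0]
print(cookbook.lowest_common_hypernyms(textbook))  # e.g. [Synset('book.n.01')]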
spa_nodes = deepcopy(pickled_graph.nodes(data=True))
for term_ind, node in spa_nodes:
    term = dictionary[node['term_id']]
    # syn nodes have already been created; the values are in the list
    if term in syns_per_term:
        for syn in syns_per_term[term]:
            pickled_graph.add_edge(term_ind,
                                   syn_to_node_map[syn_dict.token2id[syn]],
                                   attr_dict={'weight': 0.5})
    # synonyms have not been created for this term
    else:
        # get syns for the term
        syns = wn.synsets(term)
        for syn_obj in syns:
            # extract the text value from the syn object
            syn = syn_obj.name().split('.')[0]
            # we have not seen this syn yet
            if syn not in syn_dict.token2id:
                # add the syn term to the dictionary
                syn_dict.add_documents([[syn]])
                # add the syn node to the graph
                pickled_graph.add_node(node_count, type='SYN',
                                       term_id=syn_dict.token2id[syn],
                                       freq_per_doc=-1,
# This snippet is presumably the body of a try block inside a per-file loop
# (hence the except/continue at the end); the opening try is not shown here.
with open(input_file, 'r', encoding='utf-8') as file:
    for text in file:
        # strip leading "AdvertisementSupported..." article boilerplate
        text = re.sub(r"^AdvertisementSupported.*— ", '', text)
        # drop all digits
        text = ''.join([i for i in text if not i.isdigit()])
        # drop the retweet marker
        if 'RT @' in text:
            text = text[4:]
        text = clean_tweet_url(text)
        # strip ASCII emoticons such as :) or ;-D
        text = re.sub(r"([:=;X][oO\-]?[D\)\]\(\]/\\OpP]) ", '', text)
        emojis = extract_emojis(text)
        tweet_xx = clean_tweet(text)
        for em in emojis[:]:
            tweet_xx = re.sub(em, '', tweet_xx)
        tweet_xx = re.sub(emoji_pattern, '', tweet_xx)
        stop = set(stopwords.words('english'))
        # lowercase and drop stop words
        sentence = " ".join([word.lower() for word in word_tokenize(tweet_xx)
                             if word.lower() not in stop])
        # keep only words that WordNet knows
        sentence = " ".join([word for word in word_tokenize(sentence)
                             if wordnet.synsets(word)])
        # stem what is left
        sentence = " ".join([stem(word) for word in word_tokenize(sentence)])
        textOutFile.writelines(sentence + '\n')
textOutFile.close()
except BaseException as e:
    print('processing file: %s' % filename)
    print("Error while searching: %s" % str(e))
    exc_type, exc_obj, exc_tb = sys.exc_info()
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
    print(exc_type, fname, exc_tb.tb_lineno)
    continue