def path_similarity(self, word1, word2):
    """Sum WordNet path similarity over all sense pairs of two words.

    Every synset of ``word1`` is compared with every synset of ``word2``.
    Truthy similarity values (neither ``None`` nor zero) are added up and
    the total is appended to ``self.path``.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        None. The result is recorded in ``self.path``.
    """
    # Query each word once instead of re-querying word2 for every synset
    # of word1.
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)

    score = 0
    for syn1 in synsets1:
        for syn2 in synsets2:
            # Compute the similarity a single time per pair.
            sim = wn.path_similarity(syn1, syn2)
            if sim:
                score += sim
    self.path.append(score)
def checkFirstSentence_test5(paragraphID, paragraph):
    """Tell whether the first sentence of a paragraph relates to its key tokens.

    The key tokens come from ``getParagraphTokenIntersectionByID`` and are
    memoised in the module-level ``cache``. The check passes when either

    * a token occurs verbatim (substring test) in ``paragraph[0]``, or
    * a word of ``paragraph[0]`` has the same part of speech as a token and
      their synsets have a path similarity above 0.13.

    Args:
        paragraphID: Key used for ``cache``, ``POSCache`` and ``getSynset``.
        paragraph: Indexable whose first element is the first sentence (str).

    Returns:
        True when a match is found, False otherwise.
    """
    if paragraphID in cache:
        tokens = cache[paragraphID]
    else:
        tokens = getParagraphTokenIntersectionByID(paragraphID)
        cache[paragraphID] = tokens

    first_sentence = paragraph[0]

    # Cheap literal check first.
    for token in tokens:
        if token in first_sentence:
            return True

    # Fall back to a WordNet comparison between same-POS words.
    for token in tokens:
        tokenSynset = getSynset(token, paragraphID)
        if not tokenSynset:
            continue
        partOfSpeechMap = POSCache[paragraphID]
        tokenPos = partOfSpeechMap[token]
        for word in first_sentence.split(" "):
            word = word.strip().lower()
            if word not in partOfSpeechMap:
                continue
            if partOfSpeechMap[word] != tokenPos:
                continue
            wordSynset = getSynset(word, paragraphID)
            if not wordSynset:
                continue
            # Compute the similarity once; it may be None when no path exists.
            similarity = wn.path_similarity(tokenSynset, wordSynset)
            if similarity and similarity > 0.13:
                return True
    return False
def test_path_similarities(self):
    """Check that ``our_wn`` returns the same similarity scores as NLTK.

    Path, Wu-Palmer and Leacock-Chodorow similarities are compared for the
    pairs (cat, dog) and (cat, bus).
    """
    from nltk.corpus import wordnet as nltk_wn

    synset_names = ('cat.n.1', 'dog.n.1', 'bus.n.1')
    nltk_cat, nltk_dog, nltk_bus = [nltk_wn.synset(name) for name in synset_names]
    our_cat, our_dog, our_bus = [our_wn.synset(name) for name in synset_names]

    compared_pairs = (
        ((nltk_cat, nltk_dog), (our_cat, our_dog)),
        ((nltk_cat, nltk_bus), (our_cat, our_bus)),
    )
    measures = ('path_similarity', 'wup_similarity', 'lch_similarity')

    for nltk_pair, our_pair in compared_pairs:
        for measure in measures:
            expected = getattr(nltk_wn, measure)(*nltk_pair)
            actual = getattr(our_wn, measure)(*our_pair)
            assert expected == actual
def get_synonym(tok_details):
    """Attach a synset to every token and flag tokens that have a synonym.

    ``tok_details`` is a list of mutable lists that are modified in place:

    * index 0 is the token text used for the WordNet lookup;
    * index 1 is a tag whose first character is translated through
      ``Dic_pos`` (``None`` when the character is not a key);
    * index 3 is a status flag; tokens flagged 0 are switched to 2 when
      another token's synset has path similarity >= 1 with theirs;
    * the token's first synset (or ``None``) is appended and read back at
      index 4, so entries presumably arrive with exactly four elements
      (TODO confirm against the caller).

    The number of tokens flagged 1 and every detected pair are printed.

    Args:
        tok_details: Token records as described above.

    Returns:
        The same ``tok_details`` list, updated in place.
    """
    count = 0
    for details in tok_details:
        tag_initial = details[1][0]
        details[1] = Dic_pos[tag_initial] if tag_initial in Dic_pos else None
        # Single lookup per token; the first sense is used.
        synsets = wn.synsets(details[0], details[1])
        details.append(synsets[0] if synsets else None)
        if details[3] == 1:
            count += 1
    print(count)

    for i, current in enumerate(tok_details):
        if current[4] is None or current[3] != 0:
            continue
        for j, other in enumerate(tok_details):
            if i == j or other[4] is None:
                continue
            similarity = wn.path_similarity(current[4], other[4])
            if similarity is not None and similarity >= 1:
                current[3] = 2
                print(current, other)
                print(i, j)
                break
    return tok_details
def getSimilarity(w1, w2, method='max'):
    """Return the WordNet path similarity of two words over all their senses.

    Args:
        w1: First word.
        w2: Second word.
        method: ``'max'`` for the best score over all sense pairs, or
            ``'mean'`` for the average over all sense pairs.

    Returns:
        The aggregated similarity. Sense pairs without a connecting path
        count as 0, and 0 is returned when either word has no senses.

    Raises:
        ValueError: If ``method`` is neither ``'max'`` nor ``'mean'``.
    """
    # Fail fast instead of hitting an unbound local after the lookups.
    if method not in ('max', 'mean'):
        raise ValueError("Unknown method %r, expected 'max' or 'mean'" % (method,))

    meanings_1 = wn.synsets(w1)
    meanings_2 = wn.synsets(w2)

    # path_similarity returns None when no path exists; treat that as 0 so
    # max()/sum() never see None.
    similarities = [
        wn.path_similarity(m1, m2) or 0
        for m1 in meanings_1
        for m2 in meanings_2
    ]
    if not similarities:
        return 0
    if method == 'max':
        return max(similarities)
    return float(sum(similarities)) / len(similarities)
def similarity_by_path(sense1, sense2, option="path"):
    """Return a path-based WordNet similarity between two senses.

    Args:
        sense1: First synset.
        sense2: Second synset.
        option: Case-insensitive measure name: ``"path"``/``"path_similarity"``,
            ``"wup"``/``"wupa"``/``"wu-palmer"`` or
            ``"lch"``/``"leacock-chodorow"`` (the historical spelling
            ``"leacock-chordorow"`` is still accepted).

    Returns:
        The similarity score. For ``path`` the larger of both argument
        orders is returned, with a missing path counted as 0. For ``lch``
        0 is returned when the parts of speech differ. ``None`` is returned
        for an unrecognised option.
    """
    option = option.lower()

    if option in ("path", "path_similarity"):
        # Compare in both directions (the result can depend on argument
        # order) and map "no path" (None) to 0 so max() is well defined.
        forward = wn.path_similarity(sense1, sense2) or 0
        backward = wn.path_similarity(sense2, sense1) or 0
        return max(forward, backward)

    if option in ("wup", "wupa", "wu-palmer"):
        return wn.wup_similarity(sense1, sense2)

    if option in ("lch", "leacock-chordorow", "leacock-chodorow"):
        # ``pos`` is a method on current NLTK synsets and a plain attribute
        # on older ones; comparing the bound methods would never be equal.
        pos1 = sense1.pos() if callable(sense1.pos) else sense1.pos
        pos2 = sense2.pos() if callable(sense2.pos) else sense2.pos
        if pos1 != pos2:  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)

    return None
def wnsensesim(synset1, synset2, metric):
    """Return the WordNet similarity of two synsets according to ``metric``.

    Both synsets are printed for debugging. Only ``'path_similarity'`` is
    supported; its value is printed and returned.

    Args:
        synset1: First synset.
        synset2: Second synset.
        metric: Name of the similarity measure.

    Returns:
        The path similarity, or ``None`` (after printing a notice) when
        ``metric`` is not supported.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("synset1:%r" % synset1)
    print("synset2:%r" % synset2)
    if metric == 'path_similarity':
        similarity = wn.path_similarity(synset1, synset2)
        print(similarity)
        return similarity
    # add more similarity measures e.g., jcn
    print("Unsupported wn similarity measure requested")
    return None
def similarity_by_path(sense1, sense2, option="path"):
    """
    Returns maximum path similarity between two senses.

    :param sense1: first synset
    :param sense2: second synset
    :param option: case-insensitive measure name; one of
        "path"/"path_similarity", "wup"/"wupa"/"wu-palmer",
        "lch"/"leacock-chodorow" (legacy spelling "leacock-chordorow"
        is still accepted)
    :return: the similarity score; for "path" the larger of the two
        argument orders with a missing path counted as 0; for "lch" 0 when
        the parts of speech differ; None for an unrecognised option
    """
    measure = option.lower()

    if measure in ("path", "path_similarity"):
        # The original compared (sense1, sense2) with itself; the second
        # call is meant to be the reverse direction. None means "no path".
        scores = (
            wn.path_similarity(sense1, sense2),
            wn.path_similarity(sense2, sense1),
        )
        return max(score or 0 for score in scores)

    if measure in ("wup", "wupa", "wu-palmer"):
        return wn.wup_similarity(sense1, sense2)

    if measure in ("lch", "leacock-chordorow", "leacock-chodorow"):
        # Support both NLTK styles: ``pos()`` method and ``pos`` attribute.
        tags = []
        for sense in (sense1, sense2):
            pos = sense.pos
            tags.append(pos() if callable(pos) else pos)
        if tags[0] != tags[1]:  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)

    return None
def most_similar_path(synsets_dict, verb):
    """Return the key whose synset is closest to the first sense of ``verb``.

    Args:
        synsets_dict: Mapping of verb (key) to a synset (value).
        verb: Verb whose first verb sense is used as the reference.

    Returns:
        The key with the highest path similarity to the reference sense.
        An empty string is returned when ``verb`` has no verb sense or no
        entry has a defined similarity.
    """
    most_similar = str()
    reference_synsets = wn.synsets(verb, pos=wn.VERB)
    if not reference_synsets:
        # Nothing to compare against.
        return most_similar
    verb_synset = reference_synsets[0]

    best_similarity = -1
    # Use a distinct loop name so the ``verb`` parameter is not shadowed.
    for candidate, synset in synsets_dict.items():
        similarity = wn.path_similarity(synset, verb_synset)
        # None means no path; comparing it with a number fails on Python 3.
        if similarity is not None and similarity > best_similarity:
            best_similarity = similarity
            most_similar = candidate
    return most_similar
def similarity_by_path(sense1, sense2, option="path"):
    """
    Returns maximum path similarity between two senses.

    :param sense1: first synset
    :param sense2: second synset
    :param option: case-insensitive measure name; "path"/"path_similarity",
        "wup"/"wupa"/"wu-palmer" or "lch"/"leacock-chodorow" (legacy
        spelling "leacock-chordorow" is still accepted)
    :return: the similarity score; for "path" the larger of the two
        argument orders with a missing path counted as 0; for "lch" 0 when
        the parts of speech differ; for any other option the Lin similarity
        computed with the 'ic-bnc-add1.dat' information content
    """
    chosen = option.lower()

    if chosen in ("path", "path_similarity"):
        # Evaluate both directions and treat "no path" (None) as 0.
        return max(wn.path_similarity(sense1, sense2) or 0,
                   wn.path_similarity(sense2, sense1) or 0)

    if chosen in ("wup", "wupa", "wu-palmer"):
        return wn.wup_similarity(sense1, sense2)

    if chosen in ("lch", "leacock-chordorow", "leacock-chodorow"):
        # ``pos`` may be a method (current NLTK) or an attribute (older).
        pos1 = sense1.pos() if callable(sense1.pos) else sense1.pos
        pos2 = sense2.pos() if callable(sense2.pos) else sense2.pos
        if pos1 != pos2:  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)

    # NOTE(review): assumed to be the fallback for every other option
    # value -- confirm against the original layout.
    return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
def find_exercise(self, sentence, dist_th=0.2):
    """Pick a word of ``sentence`` that is related to sport/exercise/practice.

    The sentence is tagged with ``self.postag``. For sentences with more
    than one tagged word, every noun/verb (tag containing ``NN`` or ``VB``)
    that does not itself contain 'sport', 'exe' or 'practi' is scored with
    its best path similarity to the first senses of 'sport', 'exercise'
    and 'practice'; words scoring above ``dist_th`` become candidates.
    A single-word sentence yields that word directly.

    Args:
        sentence: Text passed to ``self.postag``.
        dist_th: Minimum similarity for a word to become a candidate.

    Returns:
        The word of the first candidate after an ascending sort, or
        ``None`` when there is no candidate.
    """
    anchors = (
        wn.synsets('sport')[0],
        wn.synsets('exercise')[0],
        wn.synsets('practice')[0],
    )
    tagged = self.postag(sentence)
    candidates = []

    if len(tagged) > 1:
        for word, tag in tagged:
            if 'sport' in word or 'exe' in word or 'practi' in word:
                continue
            if 'NN' not in tag and 'VB' not in tag:
                continue
            senses = wn.synsets(word)
            raw_scores = [
                wn.path_similarity(sense, anchor, simulate_root=False)
                for anchor in anchors
                for sense in senses
            ]
            defined = [value for value in raw_scores if value is not None]
            best = max(defined) if defined else 0
            if best > dist_th:
                candidates.append([best, word])
    elif len(tagged) == 1:
        candidates.append((1.0, tagged[0][0]))

    if not candidates:
        return None
    # NOTE(review): ascending sort means the lowest-scoring candidate is
    # returned -- confirm this is intended.
    candidates.sort()
    return candidates[0][1]
def get_relation(syn1,syn2,sim_metric):
    """Return the similarity of two synsets for the requested metric.

    Args:
        syn1: First synset.
        syn2: Second synset.
        sim_metric: ``"path"``, ``"lch"`` or ``"wup"``.

    Returns:
        The similarity score, or 0 when the score is undefined or falsy.
        For ``"path"`` the smaller of both argument orders is used; for
        ``"lch"`` 0 is returned when the parts of speech differ.

    Raises:
        ValueError: If ``sim_metric`` is not one of the supported names.
    """
    if sim_metric == "path":
        # Only this branch needs the corpus reader.
        from nltk.corpus import wordnet as wn
        # https://stackoverflow.com/questions/20075335/is-wordnet-path-similarity-commutative
        forward = wn.path_similarity(syn1,syn2)
        backward = wn.path_similarity(syn2,syn1)
        # None means "no path"; min() cannot order None on Python 3.
        if forward is None or backward is None:
            return 0
        sim_score = min(forward, backward)
    elif sim_metric == "lch":
        if syn1.pos() == syn2.pos():
            sim_score = syn1.lch_similarity(syn2)
        else:
            sim_score = 0
    elif sim_metric == "wup":
        sim_score = syn1.wup_similarity(syn2)
    else:
        raise ValueError("Unsupported sim_metric: %r" % (sim_metric,))

    return sim_score if sim_score else 0
def get_relation(syn1, syn2, sim_metric):
    """Compute a WordNet similarity between two synsets.

    Args:
        syn1: First synset.
        syn2: Second synset.
        sim_metric: ``"path"`` (minimum of both argument orders), ``"lch"``
            (0 when the parts of speech differ) or ``"wup"``.

    Returns:
        The score, with undefined or falsy scores reported as 0.

    Raises:
        ValueError: If ``sim_metric`` is not supported.
    """
    if sim_metric not in ("path", "lch", "wup"):
        raise ValueError("Unsupported sim_metric: %r" % (sim_metric,))

    if sim_metric == "lch":
        sim_score = syn1.lch_similarity(syn2) if syn1.pos() == syn2.pos() else 0
    elif sim_metric == "wup":
        sim_score = syn1.wup_similarity(syn2)
    else:
        # The corpus reader is only needed for the path measure.
        from nltk.corpus import wordnet as wn
        # https://stackoverflow.com/questions/20075335/is-wordnet-path-similarity-commutative
        both_ways = (wn.path_similarity(syn1, syn2),
                     wn.path_similarity(syn2, syn1))
        # A missing path (None) cannot be ordered by min() on Python 3.
        sim_score = None if None in both_ways else min(both_ways)

    if sim_score:
        return sim_score
    return 0
def domain_range_measure(q_type, p_domain, p_range):
    """Score how well a question type matches a property's domain and range.

    Coarse question-type codes (HUM, NUM, ENTY, DESC, ABBR, LOC) are first
    translated to plain words; any other value is used as is.

    Args:
        q_type: Question type code or word.
        p_domain: Domain label of the property.
        p_range: Range label of the property.

    Returns:
        1 when the type equals both domain and range, 0.75 when it equals
        one of them. Otherwise the best WordNet path similarity between the
        type and the domain or range (1 when both bests are equal and above
        0.75), which is 0.0 when nothing is comparable.
    """
    type_words = {
        'HUM': "person",
        'NUM': "number",
        'ENTY': "entity",
        'DESC': "",
        'ABBR': "abbreviation",
        'LOC': "location",
    }
    q_type = type_words.get(q_type, q_type)

    if q_type == p_domain and q_type == p_range:
        return 1
    if q_type == p_domain or q_type == p_range:
        return 0.75

    def best_similarity(first_syns, second_syns):
        """Return the highest defined path similarity, or 0.0."""
        best = 0.0
        for first in first_syns:
            for second in second_syns:
                sim = wn.path_similarity(first, second)
                # Skip undefined pairs individually; previously the first
                # None raised inside a blanket try/except and silently
                # aborted every remaining comparison.
                if sim is not None and sim > best:
                    best = sim
        return best

    f_syns = wn.synsets(q_type)
    path_sim_d = best_similarity(f_syns, wn.synsets(p_domain))
    path_sim_r = best_similarity(f_syns, wn.synsets(p_range))

    if path_sim_d == path_sim_r and path_sim_d > 0.75:
        return 1
    return max(path_sim_d, path_sim_r)
def sentence_similarity(wordSense1, wordSense2, similarity_metric = 'path'):
    '''
    Average the pairwise WordNet similarity of two lists of senses.

    Parameters:
        wordSense1 (list): senses extracted from the first sentence.
        wordSense2 (list): senses extracted from the second sentence.
        similarity_metric (str): 'path' for path similarity (default) or
            'lcs' for Wu-Palmer similarity.
            See http://www.nltk.org/howto/wordnet.html

    Return:
        0 when either list is empty; otherwise the sum of all truthy
        pairwise scores divided by the number of pairs (float).

    Raises:
        ValueError: for an unknown metric when both lists are non-empty.
    '''
    if len(wordSense1) == 0 or len(wordSense2) == 0:
        return 0

    if similarity_metric == 'path':
        measure_name = 'path_similarity'
    elif similarity_metric == 'lcs':
        measure_name = 'wup_similarity'
    else:
        raise ValueError('ERROR: given similarity metric is not defined.')
    measure = getattr(wn, measure_name)

    accumulated = 0.0
    pair_count = 0.0
    for first in wordSense1:
        for second in wordSense2:
            pair_count += 1.0
            value = measure(first, second)
            if value:
                accumulated += value
    return accumulated / pair_count
def disambiguateWordSenses(self,sentence,word):
    """Pick a sense for ``word`` in ``sentence`` and look up its scores.

    Each sense of ``word`` is scored against every token of the sentence
    by summing the defined path similarities to that token's senses; the
    sense achieving the highest single-token score wins. The winning
    sense's ``(pos, offset)`` is then looked up in ``self.db``.

    Args:
        sentence: Sentence text, tokenised with ``nltk.word_tokenize``.
        word: Word to disambiguate.

    Returns:
        Tuple ``(obj, pos, pos_score, neg_score)``. ``obj`` is
        ``1.0 - (pos_score + neg_score)``. Both scores are 0.0 when the
        sense is not in ``self.db``; when no sense is chosen the result is
        ``(1.0, None, 0.0, 0.0)``.
    """
    chosen = None
    top_score = 0.0
    for candidate in wn.synsets(word):
        for token in nltk.word_tokenize(sentence):
            token_score = 0.0
            for token_sense in wn.synsets(token):
                similarity = wn.path_similarity(token_sense, candidate)
                if similarity is not None:
                    token_score += similarity
            if token_score > top_score:
                top_score = token_score
                chosen = candidate

    if not chosen:
        return 1.0, None, 0.0, 0.0

    pos = chosen.pos()
    key = (pos, chosen.offset())
    pos_score, neg_score = 0.0, 0.0
    if key in self.db:
        pos_score, neg_score = self.db[key]
    obj = 1.0 - (pos_score + neg_score)
    return obj, pos, pos_score, neg_score
def semantic_similarity(word1, word2, speech, measure):
    """
    Return the best similarity score over every pair of senses of two words.

    :param word1: First word in the pair of words
    :param word2: Second word in the pair of words
    :param speech: part of speech passed to the synset lookup, e.g. wn.NOUN
    :param measure: similarity measure name
        ("path" = path ; "res" = Resnik ; "lin" = Lin)
    :return: The highest defined score across all sense pairs, or 0 when
        no pair yields a score above 0
    :raises ValueError: if measure is not one of the three supported names
    """
    # Reject unsupported measures before touching WordNet.
    if measure not in ("path", "res", "lin"):
        raise ValueError("Not a valid similarity type \n Must be 'path'(path), 'res'(Resnik) or 'lin'(Lin)")

    scorers = {
        "path": lambda a, b: wn.path_similarity(a, b),
        "res": lambda a, b: wn.res_similarity(a, b, brown_ic),
        "lin": lambda a, b: wn.lin_similarity(a, b, brown_ic),
    }
    score_pair = scorers[measure]

    senses_a = wn.synsets(word1, speech)
    senses_b = wn.synsets(word2, speech)

    best = 0
    # Try every combination of senses and keep the highest score.
    for sense_a in senses_a:
        for sense_b in senses_b:
            value = score_pair(sense_a, sense_b)
            # No score available for this pair.
            if value is not None and value > best:
                best = value
    return best
def distance_between_pairs(self, lemma_i, lemma_j, pos_i, pos_j):
    '''Computes path distance between a pair of words

    Args:
        lemma_i: i-th word lemma.
        lemma_j: j-th word lemma.
        pos_i: i-th word part of speech tag.
        pos_j: j-th word part of speech tag.

    Returns:
        The minimal distance d_path(i,j), computed as the inverse of the
        highest path similarity over all synset pairs. None when a tag has
        no entry in ``constants.pos2wnpos``, when either lemma has no
        synsets, or when no pair has a positive similarity.
    '''
    pos_map = constants.pos2wnpos
    if pos_i not in pos_map or pos_j not in pos_map:
        return None

    # Look each lemma up once; the lists are reused for the emptiness
    # check and for the pairwise comparison.
    synsets_i = wn.synsets(lemma_i, pos=pos_map[pos_i], lang=self.iso_lang)
    synsets_j = wn.synsets(lemma_j, pos=pos_map[pos_j], lang=self.iso_lang)
    if not synsets_i or not synsets_j:
        return None

    max_similarity = 0.
    # TODO: consider language, maybe use another type of similarity
    for i_synset in synsets_i:
        for j_synset in synsets_j:
            pair_sim = wn.path_similarity(i_synset, j_synset)
            if pair_sim and pair_sim > max_similarity:
                max_similarity = pair_sim

    if max_similarity == 0.:
        return None
    return 1. / max_similarity
def return_relationship_matrix(sentence1, sentence2, posGroup):
    """Build a word-by-word relationship matrix between two sentences.

    Row ``r`` / column ``c`` relates the ``r``-th word of ``sentence1`` to
    the ``c``-th word of ``sentence2``.

    Args:
        sentence1: Words providing the rows.
        sentence2: Words providing the columns.
        posGroup: ``NOUNS``/``VERBS`` selects WordNet path similarity of
            the words' ``wordSense``; ``SINGULARS`` selects a
            case-insensitive exact match of ``word`` (1 or 0).

    Returns:
        A list of rows; an empty list for any other ``posGroup``.
    """
    # Indirect measure: path similarity between the two word senses.
    if posGroup == NOUNS or posGroup == VERBS:
        return [
            [wn.path_similarity(word_A.wordSense, word_B.wordSense)
             for word_B in sentence2]
            for word_A in sentence1
        ]

    # Direct measure: the surface forms must be identical.
    if posGroup == SINGULARS:
        return [
            [1 if word_A.word.lower() == word_B.word.lower() else 0
             for word_B in sentence2]
            for word_A in sentence1
        ]

    return []
def disambiguate_word_senses(self, sentence, word):
    """
    Guess which sense of ``word`` is meant in ``sentence``.

    Every sense of the word is scored against each token of the sentence:
    the score is the sum of the defined path similarities between the
    sense and all senses of that token. The sense reaching the highest
    single-token score is returned.

    Args:
        sentence: String representation of the sentence
        word: String representation of the word

    Returns:
        The best-guess synset, or None when no score exceeds 0.

    Example:
        disambiguate_word_senses('A cat is a good pet', 'cat')
        OUT: Synset('cat.v.01')
    """
    winner = None
    top_score = 0.0
    tokens_checked = None
    for candidate in wn.synsets(word):
        tokens_checked = nltk.word_tokenize(sentence)
        for token in tokens_checked:
            similarities = (
                wn.path_similarity(token_sense, candidate)
                for token_sense in wn.synsets(token)
            )
            token_score = sum(
                (value for value in similarities if value is not None), 0.0
            )
            if token_score > top_score:
                top_score, winner = token_score, candidate
    return winner
def disambiguate_word_senses(self, sentence, word):
    """
    Choose the most plausible sense of a word from its sentence context.

    For each candidate sense and each sentence token, the defined path
    similarities between the candidate and the token's senses are summed;
    the candidate with the highest such sum is kept.

    Args:
        sentence: String representation of the sentence
        word: String representation of the word

    Returns:
        The selected synset, or None if no candidate scores above 0.

    Example:
        disambiguate_word_senses('A cat is a good pet', 'cat')
        OUT: Synset('cat.v.01')
    """
    selected = None
    highest = 0.0
    for sense in wn.synsets(word):
        for context_word in nltk.word_tokenize(sentence):
            total = 0.0
            for context_sense in wn.synsets(context_word):
                similarity = wn.path_similarity(context_sense, sense)
                if similarity is not None:
                    total += similarity
            if total > highest:
                highest = total
                selected = sense
    return selected
def get_best_synset_pair(word_1, word_2):
    """
    Choose the pair with highest path similarity among all pairs.
    Mimics pattern-seeking behavior of humans.

    Only noun senses are considered. Every WordNet access is performed
    while holding the module-level ``lock``.

    Args:
        word_1: First word.
        word_2: Second word.

    Returns:
        Tuple ``(synset_1, synset_2)`` of the most similar noun senses, or
        ``(None, None)`` when either word has no noun sense.
    """
    with lock:
        synsets_1 = wn.synsets(word_1, pos=wn.NOUN)
        synsets_2 = wn.synsets(word_2, pos=wn.NOUN)

    if not synsets_1 or not synsets_2:
        return None, None

    max_sim = -1.0
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            with lock:
                sim = wn.path_similarity(synset_1, synset_2)
            # path_similarity may return None; ordering None against a
            # float raises TypeError on Python 3.
            if sim is not None and sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2
    return best_pair
def intersection(h, ref, wordnetTest=True):
    """Count how many items of ``h`` can be matched against ``ref``.

    Matching is clipped: each occurrence in ``ref`` can be consumed by at
    most one item of ``h``. Items are matched exactly first. When
    ``wordnetTest`` is true, a non-tuple item that does not occur in
    ``ref`` at all may instead consume one still-available reference word
    whose first synset has a path similarity above 0.9 with the item's
    first synset.

    Args:
        h: Hypothesis items (words, or tuples such as n-grams).
        ref: Reference items.
        wordnetTest: Enable the WordNet fallback for unmatched words.

    Returns:
        The number of matched items of ``h``.
    """
    def as_text(value):
        """Decode byte strings; leave text untouched."""
        return value.decode("utf-8") if isinstance(value, bytes) else value

    refmap = {}
    for word in ref:
        refmap[word] = refmap.get(word, 0) + 1

    matched = 0
    for run in h:
        if refmap.get(run, 0) > 0:
            matched += 1
            refmap[run] -= 1
        elif wordnetTest and run not in refmap and not isinstance(run, tuple):
            # Use wordnet to match
            synsets = wordnet.synsets(as_text(run))
            if not synsets:
                continue
            for word, remaining in refmap.items():
                # Exhausted reference words and tuples cannot be matched.
                if remaining <= 0 or isinstance(word, tuple):
                    continue
                candidates = wordnet.synsets(as_text(word))
                if not candidates:
                    continue
                similarity = wordnet.path_similarity(synsets[0], candidates[0])
                if similarity is not None and similarity > 0.9:
                    matched += 1
                    refmap[word] -= 1
                    # One hypothesis item consumes one reference word only.
                    break
    return matched
def semantic_diff(a: object, b: object):
    """
    Computes the semantic difference, as 1 - path similarity, between two strings.

    The result is memoised in ``DiffDataFrame.semantic_diff_dict`` under
    both argument orders so it is only computed once. The path similarity
    is calculated with WordNet on the synsets stored in
    ``DiffDataFrame.sysnset_dict``.

    If one of the two parameters is NaN, the distance returned is infinity.
    If both are NaN, the distance returned is 0. If WordNet reports no path
    between the two synsets, the similarity is taken as 0 (difference 1).

    :param a: first term
    :type a: str or float for NaN value
    :param b: second term
    :type b: str or float for NaN value
    :return: the semantic difference between a and b
    :rtype: float
    """
    a_missing = isinstance(a, float) and np.isnan(a)
    b_missing = isinstance(b, float) and np.isnan(b)
    if a_missing and b_missing:
        return 0
    if a_missing or b_missing:
        return np.inf
    if a == b:
        return 0

    known = DiffDataFrame.semantic_diff_dict
    if (a, b) in known:
        return known[(a, b)]
    if (b, a) in known:
        return known[(b, a)]

    t = wn.path_similarity(DiffDataFrame.sysnset_dict[a], DiffDataFrame.sysnset_dict[b])
    # path_similarity returns None when no path exists; ``1 - None`` would
    # raise, so fall back to the maximal difference.
    diff = 1 if t is None else 1 - t
    known[(a, b)] = diff
    known[(b, a)] = diff
    return diff
def word_probability(word, tag, hashtag):
    """Estimate a smoothed probability of ``word`` for a hashtag category.

    When ``tag`` is non-empty and the word has a synset for it, each
    training word in ``bags_of_words[hashtag]`` contributes the square of
    the path similarity between its first synset and the word's first
    synset. The sum plus 1 is divided by
    ``hashtag_frequency[hashtag] + unique_word_count``.

    Args:
        word: Word to score.
        tag: WordNet part of speech for ``word``; "" disables the lookup.
        hashtag: Category key into ``bags_of_words``/``hashtag_frequency``.

    Returns:
        The probability estimate.
    """
    hashtag_words = bags_of_words[hashtag]

    # Partial counts: similar training words count for less than one.
    weighted_count = 0
    word_senses = wn.synsets(word, pos=tag) if tag != "" else []
    if len(word_senses) != 0:
        for training_word, training_tag in hashtag_words:
            # NOTE: only the first sense of each word is compared; this
            # could be made more thorough at some performance cost.
            training_senses = wn.synsets(training_word, pos=training_tag)
            if len(training_senses) == 0:
                continue
            similarity = wn.path_similarity(word_senses[0], training_senses[0])
            if similarity is None:
                continue
            weighted_count += similarity * similarity

    # Start from 1, add the weighted count, then normalise by the number
    # of words in the category plus the unique word count.
    probability = 1
    probability += weighted_count
    probability /= hashtag_frequency[hashtag] + unique_word_count
    return probability
def __similarity(self, word, compareto):
    """Return the similarity of two synsets, preferring Jiang-Conrath.

    ``word.jcn_similarity`` is tried first with ``self.wordnet_ic``; if it
    raises, WordNet path similarity is used instead.

    Args:
        word: Synset to compare.
        compareto: Synset to compare against.

    Returns:
        The similarity score, or ``None`` when the score is -1 (no path
        between the words was found) or the fallback returns ``None``.
    """
    try:
        score = word.jcn_similarity(compareto, self.wordnet_ic, True)
    except Exception:
        # Catch ordinary errors only, so KeyboardInterrupt/SystemExit are
        # no longer swallowed by the fallback.
        score = wordnet.path_similarity(word, compareto)

    if score == -1:
        score = None  # No path between the words was found
    return score
def get_best_synset_pair(word_1, word_2):
    """Return the most similar pair of senses of two words.

    Args:
        word_1: First word.
        word_2: Second word.

    Returns:
        Tuple ``(synset_1, synset_2)`` with the highest defined path
        similarity, or ``(None, None)`` when either word has no synsets or
        no pair has a defined similarity.
    """
    candidates_1 = wn.synsets(word_1)
    candidates_2 = wn.synsets(word_2)
    if len(candidates_1) == 0 or len(candidates_2) == 0:
        return None, None

    top_score = -1.0
    winner = None, None
    scored_pairs = (
        (wn.path_similarity(first, second), first, second)
        for first in candidates_1
        for second in candidates_2
    )
    for score, first, second in scored_pairs:
        # The similarity is None when the two senses are not connected.
        if score is not None and score > top_score:
            top_score = score
            winner = first, second
    # Return the most similar pair among the synsets of the two words.
    return winner
def similarity_score(s1, s2):
    """Return the normalized similarity score of ``s1`` onto ``s2``.

    For each synset in ``s1`` the largest defined path similarity to any
    synset in ``s2`` is taken; the result is the mean of those maxima.

    Args:
        s1: List of synsets of the first document.
        s2: List of synsets of the second document.

    Returns:
        The mean of the per-synset best scores, or 0 when no synset of
        ``s1`` has a defined similarity to any synset of ``s2``.
    """
    max_score = []
    for syn1 in s1:
        # Collect scores for this synset only; sharing one list across all
        # synsets turned every entry into a running global maximum.
        scores = [
            score
            for score in (wn.path_similarity(syn1, syn2) for syn2 in s2)
            if score is not None
        ]
        if scores:
            max_score.append(max(scores))

    if not max_score:
        # Nothing comparable: avoid dividing by zero.
        return 0
    return sum(max_score) / len(max_score)
def sentence_similarity(sentence1, sentence2):
    """Compute the similarity of two sentences using WordNet, scaled to 0-100.

    Both sentences are tokenised, POS-tagged and mapped to synsets through
    ``tagged_to_synset``; words without a synset are dropped. For each
    synset of the first sentence the best defined path similarity to the
    second sentence is taken, and the mean of those values times 100 is
    returned.

    Args:
        sentence1: First sentence (text).
        sentence2: Second sentence (text).

    Returns:
        The score, or 0 as soon as one synset of the first sentence has no
        defined similarity to any synset of the second. When the first
        sentence yields no synsets, 'oops' is printed and 0 is returned.
    """
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in tagged1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in tagged2]
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    total, matched = 0.0, 0
    for synset in synsets1:
        raw = [wn.path_similarity(synset, other) for other in synsets2]
        defined = [value for value in raw if value is not None]
        if not defined:
            # NOTE(review): a single unmatched synset zeroes the whole
            # score -- confirm this is intended rather than a skip.
            return 0
        total += max(defined)
        matched += 1

    if matched == 0:
        total = 0
        print('oops')
    else:
        total /= matched
    return total * 100
def syn(conw,candw):
    """Return the path similarity of the first compatible sense pair.

    Senses of the context word and of the candidate word are paired in
    lookup order; the first pair whose synset names end in the same four
    characters (e.g. ``n.01``) is compared.

    Args:
        conw: Context word.
        candw: Candidate word.

    Returns:
        The path similarity of that pair, or 0 when no pair is compatible
        or the similarity is undefined.
    """
    def name_tail(synset):
        """Last four characters of the name inside ``Synset('...')``."""
        return str(synset)[8:-2][-4:]

    # synsets of the context word / of the candidate word
    ssets = wn.synsets(conw)
    csets = wn.synsets(candw)

    # Plain lists are used throughout: on Python 3 ``map`` returns an
    # iterator, so the previous ``len(map(...))`` raised TypeError.
    ctails = [name_tail(cset) for cset in csets]

    # NOTE(review): the key includes the sense number, not only the part
    # of speech -- confirm that matching "n.01" with "n.01" is intended.
    for sset in ssets:
        stail = name_tail(sset)
        for cset, ctail in zip(csets, ctails):
            if stail == ctail:
                similarity = wn.path_similarity(sset, cset)
                return 0 if similarity is None else similarity
    return 0
def spreadItemsIntoGroups(groupNames, groupItemList):
    """Assign each item to the group name(s) it is semantically closest to.

    For every item/group pair the best path similarity and the best
    Wu-Palmer similarity over all sense pairs are computed; their product
    is the "mix" score. Rankings are printed for each item.

    Args:
        groupNames: Candidate group names; must not be empty when
            ``groupItemList`` is non-empty.
        groupItemList: Items to distribute.

    Returns:
        A list of ``(best_mix_score, item, [group names reaching it])``,
        one entry per item.
    """
    res = []
    for groupItem in groupItemList:
        # The item's senses do not depend on the group: look them up once.
        groupItemSynsets = wordnet.synsets(groupItem)
        maxPsSims = []
        maxWupSims = []
        maxMixSims = []
        for groupName in groupNames:
            # Start at 0 so that words without any synset give a score of
            # 0 instead of crashing max() on an empty list.
            bestPs = (0, groupItem, groupName)
            bestWup = (0, groupItem, groupName)
            for nameSyns, itemSyns in product(wordnet.synsets(groupName), groupItemSynsets):
                ps = wordnet.path_similarity(nameSyns, itemSyns) or 0
                bestPs = max(bestPs, (ps, groupItem, groupName))
                wup = wordnet.wup_similarity(nameSyns, itemSyns) or 0
                bestWup = max(bestWup, (wup, groupItem, groupName))
            maxPsSims.append(bestPs)
            maxWupSims.append(bestWup)
            maxMixSims.append((bestPs[0] * bestWup[0], groupItem, groupName))
        print(' path:', sorted(maxPsSims, key=lambda item: item[0], reverse=True))
        print(' wup:', sorted(maxWupSims, key=lambda item: item[0], reverse=True))
        print(' mix:', sorted(maxMixSims, key=lambda item: item[0], reverse=True))
        print('path:', max(maxPsSims))
        print('wup:', max(maxWupSims))
        maxMix = max(maxMixSims)[0]
        print('mix:', max(maxMixSims))
        res.append((maxMix, groupItem, [maxMixSim[2] for maxMixSim in maxMixSims if maxMixSim[0] == maxMix]))
    return res

#---------------------------------------------------------------------------------------------
# Sample output:
# path: [(0.3333333333333333, 'Apple', 'Fruit'), (0.3333333333333333, 'Apple', 'Berry'), (0.25, 'Apple', 'Vegetable'), (0.2, 'Apple', 'Mushroom'), (0.125, 'Apple', 'Plant')]
# wup: [(0.9, 'Apple', 'Fruit'), (0.8571428571428571, 'Apple', 'Berry'), (0.8, 'Apple', 'Vegetable'), (0.75, 'Apple', 'Mushroom'), (0.6666666666666666, 'Apple', 'Plant')]
# mix: [(0.3, 'Apple', 'Fruit'), (0.2857142857142857, 'Apple', 'Berry'), (0.2, 'Apple', 'Vegetable'), (0.15000000000000002, 'Apple', 'Mushroom'), (0.08333333333333333, 'Apple', 'Plant')]
def get_category(text):
    """Guess the category of ``text`` from WordNet path similarity.

    The text is lower-cased, stripped of the words in ``stop`` and the
    characters in ``punc``, and lemmatised with ``lemma``. For every key of
    ``categories`` (used as a synset name) the words' similarities to that
    synset are accumulated under the label ``categories[key]``.

    Returns:
        The best-scoring category label when its score exceeds 0.048,
        otherwise ``None``. ``None`` is also returned as soon as the
        effective word count of a category is 0.
    """
    stop_free = " ".join([i for i in text.lower().split() if i not in stop])
    punc_free = "".join([i for i in stop_free if i not in punc])
    normalized = [lemma.lemmatize(i) for i in punc_free.split()]
    score = {}
    for i in categories:
        # Effective number of words; reduced below for unknown or
        # ambiguous words.
        nb_word = len(normalized)
        score[categories[i]] = 0
        for j in normalized:
            try:
                # limit the impact of words with many different senses because it could lead to misinterpretation
                nb_sense = len(wn.synsets(j))
                # Only the first sense of the word is compared; the [0]
                # lookup raises IndexError when the word has no synsets.
                score[categories[i]] += wn.path_similarity(wn.synset(i), wn.synsets(j)[0])/nb_sense
                nb_word -= (nb_sense-1)/nb_sense
            except IndexError:  # word can not be found in wordnet
                nb_word -= 1  # in this case we should not count this word
            except TypeError:  # or no similarity is found
                pass  # in this case we count the word
        if nb_word == 0:  # if no word is considered interesting, the category cannot be defined
            return None
        score[categories[i]] /= nb_word  # nb_word is the same at each iteration
    # Label with the highest normalised score.
    maxScore = max(score, key=lambda x: score[x])
    if score[maxScore] > 0.048:
        return maxScore
    return None
def wn_pairwise_similarity(f, namer):
    """
    Get pairwise similarity of labels given wordnet similarity.

    The values of ``f`` are named with ``namer``, lower-cased, cleaned and
    mapped to synsets with ``get_synset``; duplicates and labels without a
    synset are dropped.

    Args:
        f: Object whose ``get_vals()`` returns the leaves to compare.
        namer: Callable turning a leaf into its label string.

    Returns:
        Tuple ``(mean, min, max)`` of the path similarities over all pairs
        of distinct synsets. ``(0., 0., 0.)`` when no label has a synset,
        ``(1., 1., 1.)`` when only one distinct synset remains. Pairs
        without a connecting path count as 0.
    """
    from nltk.corpus import wordnet as wn

    leaves = [namer(leaf).lower() for leaf in f.get_vals()]
    leaves = clean(leaves, to_underscore=True)
    synsets = [get_synset(s) for s in leaves]
    if all(s is None for s in synsets):
        return 0., 0., 0.

    synsets = list(set(s for s in synsets if s is not None))
    if len(synsets) == 1:
        return 1., 1., 1.

    sims = []
    for a, b in itertools.combinations(synsets, 2):
        sim = wn.path_similarity(a, b)
        # None (no path) would break min()/max()/np.mean below.
        sims.append(0. if sim is None else sim)

    return np.mean(sims), min(sims), max(sims)
def dependency_parse(result, sentence, label):
    """Extract a base/target pair from a parse and score their similarity.

    The target is located with ``target_search`` and, when found, the base
    with ``base_search``. Both are normalised with ``changePronoun`` before
    their path similarity is computed.

    Args:
        result: Parse result exposing ``nodes``.
        sentence: Sentence text, copied into the returned record.
        label: Label copied into the returned record.

    Returns:
        A feature dict with keys ``base``, ``target``, ``similarity``,
        ``sentence``, ``tree_type``, ``detected`` and ``label``.
        ``detected`` is False (with empty/zero features) when no base was
        found or the target has at most one element.
    """
    base, tree = None, None
    target, tar_index = target_search(result.nodes)
    if tar_index is not None:
        base, tree = base_search(tar_index, result.nodes)

    if base is None or not len(target) > 1:
        return {
            "base": "",
            "target": "",
            "similarity": 0.0,
            "sentence": sentence,
            "tree_type": 0,
            "detected": False,
            "label": label
        }

    base, b = changePronoun(base)  # check if it is a person name
    print(sentence, "---", base, target)
    target, t = changePronoun(target)

    similarity = wn.path_similarity(b, t)
    features = {
        "base": base,
        "target": target,
        "similarity": 0.0 if similarity is None else similarity,
        "sentence": sentence,
        "tree_type": tree,
        "detected": True,
        "label": label
    }
    # add features here
    return features
def return_relationship_matrix(sentence1, sentence2, posGroup):
    """Relate every word of ``sentence1`` to every word of ``sentence2``.

    Args:
        sentence1: Words forming the rows of the matrix.
        sentence2: Words forming the columns of the matrix.
        posGroup: ``NOUNS`` or ``VERBS`` to use the WordNet path similarity
            of the ``wordSense`` attributes; ``SINGULARS`` to use a
            case-insensitive comparison of the ``word`` attributes.

    Returns:
        A list of rows (one per word of ``sentence1``); empty when
        ``posGroup`` is none of the three groups.
    """
    def sense_similarity(word_A, word_B):
        return wn.path_similarity(word_A.wordSense, word_B.wordSense)

    def exact_match(word_A, word_B):
        if word_A.word.lower() == word_B.word.lower():
            return 1
        return 0

    if posGroup == NOUNS or posGroup == VERBS:
        relate = sense_similarity
    elif posGroup == SINGULARS:
        relate = exact_match
    else:
        return []

    relationshipMatrix = []
    for word_A in sentence1:
        row = [relate(word_A, word_B) for word_B in sentence2]
        relationshipMatrix.append(row)
    return relationshipMatrix
def relevancy_score(desiredDoc):
    """Sum the similarity of every keyword to the search word.

    The first noun sense of ``searchWord`` is compared (path similarity,
    between 0 and 1, ``None`` when undefined) with the first noun sense of
    each entry of ``keywords``. Scores are memoised per keyword.

    Args:
        desiredDoc: Not used by the computation; the keywords are read
            from the module-level ``keywords``.

    Returns:
        The sum of all defined similarity scores.
    """
    searchWordwn = wn.synset(searchWord + ".n.01")

    relevancyScore = 0
    memo = {}
    for keyword in keywords:
        currentWord = keyword[0]
        if currentWord not in memo:
            # One lookup per distinct word; words without a noun sense are
            # memoised as None so they are not queried again.
            nounSenses = wn.synsets(currentWord, pos=wn.NOUN)
            if nounSenses:
                memo[currentWord] = wn.path_similarity(searchWordwn, nounSenses[0])
            else:
                memo[currentWord] = None
        currentWordScore = memo[currentWord]
        if currentWordScore is not None:
            relevancyScore += currentWordScore
    return relevancyScore
def get_best_synset_pair(self, word1, word2, word1_tag=None, word2_tag=None):
    """
    Returns the best synset pair with the highest similarity among all pairs

    Only pairs sharing the same part of speech are compared, and satellite
    adjectives ('s') on the ``word1`` side are ignored.

    Args:
        word1: source word
        word2: Word compared to
        word1_tag: POS tag of word1
        word2_tag: POS tag of word 2

    Returns:
        Tuple of (synset1, synset2); ``(None, None)`` when either word has
        no synsets or no comparable pair has a defined similarity.
    """
    max_similarity = -1.0
    best_pair = None, None
    word1_synsets = wn.synsets(word1, word1_tag)
    word2_synsets = wn.synsets(word2, word2_tag)
    if not word1_synsets or not word2_synsets:
        return best_pair

    for syn1 in word1_synsets:
        if syn1._pos == 's':
            continue
        for syn2 in word2_synsets:
            # Compare pos tags for both words here
            if syn1._pos != syn2._pos:
                continue
            sim = wn.path_similarity(syn1, syn2)
            # Skip pairs without a path instead of aborting the search, so
            # a later, connected pair can still win.
            if sim is not None and sim > max_similarity:
                max_similarity = sim
                best_pair = syn1, syn2
    return best_pair
def create_graphs(doc_list):
    """Create the similarity graphs for a list of documents.

    Three sets of graphs are produced: top senses against three fixed
    concepts ('phyl_eco_pol'), top senses against the hand-picked
    ``SYNSETS`` ('handpicked'), and the top senses among themselves.

    Args:
        doc_list: Documents to plot; ``default_document_list()`` is used
            when ``None``.
    """
    documents = default_document_list() if doc_list is None else doc_list

    reference = SYNSETS[0]
    lch_reference = wn.lch_similarity(reference, reference)
    jcn_reference = wn.jcn_similarity(reference, reference, CORPUS)

    # Each entry: (reference value, short name, similarity function).
    # The first element is presumably the measure's maximum, used for
    # scaling -- TODO confirm against create_against_graph.
    distance_functions = [
        (lch_reference, 'lch',
         lambda sense_1, sense_2: wn.lch_similarity(sense_1, sense_2)),
        (1.0, 'lin',
         lambda sense_1, sense_2: wn.lin_similarity(sense_1, sense_2, CORPUS)),
        (10.636958516573292, 'res',
         lambda sense_1, sense_2: wn.res_similarity(sense_1, sense_2, CORPUS)),
        (jcn_reference, 'jcn',
         lambda sense_1, sense_2: wn.jcn_similarity(sense_1, sense_2, CORPUS)),
        (1.0, 'path',
         lambda sense_1, sense_2: wn.path_similarity(sense_1, sense_2)),
    ]

    all_senses = [
        (sense, doc.name)
        for doc in documents
        for sense in doc.top_senses()
    ]

    fixed_concepts = ["economy.n.01", "philosophy.n.02", "politics.n.01"]
    against_to = [wn.synset(word) for word in fixed_concepts]
    create_against_graph('phyl_eco_pol', documents, all_senses, against_to,
                         distance_functions, ['r', 'b', 'g'])

    # One random RGB colour per hand-picked synset.
    random_colors = [(random(), random(), random()) for _ in range(len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, SYNSETS,
                         distance_functions, random_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
def get_score(tags, groups):
    """Score how well a set of tags matches a set of group words.

    For every group/tag pair: if the group word is a substring of the
    lower-cased tag text the pair scores 2.0; otherwise the pair scores
    the path similarity of their first noun senses when it is at least
    0.3. Pairs that cannot be scored are ignored.

    Args:
        tags: Iterable of objects with a ``text`` attribute, or ``None``.
        groups: Iterable of group words.

    Returns:
        The mean score over all scoring pairs, or 0 when ``tags`` is
        ``None`` or no pair scored.
    """
    if tags is None:
        return 0

    sscore = 0
    scount = 0
    for g in groups:
        # Iterate the ``tags`` argument; the code previously read the
        # unrelated global ``k.tags`` here.
        # NOTE(review): assumes callers pass the tag collection itself.
        for x in tags:
            try:
                text = str(x.text).lower()
                # check substring else calculate words similarity score
                if g in text:
                    sscore += 2.0
                    scount += 1
                else:
                    tag = wn.synset(text + '.n.01')
                    group = wn.synset(g + '.n.01')
                    sem = wn.path_similarity(group, tag)
                    if sem is not None and sem >= 0.3:
                        sscore += sem
                        scount += 1
            except Exception:
                # Words unknown to WordNet (or otherwise unusable) do not
                # contribute to the score.
                continue

    if scount != 0:
        return sscore / scount
    return 0
def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest
    similarity value. Sums all of the largest similarity values and
    normalizes this value by dividing it by the number of largest
    similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2; 0 when no synset of s1
        has a defined similarity to any synset of s2

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    sim_s = []
    for syn1 in s1:
        sim_v = []
        for syn2 in s2:
            val = wn.path_similarity(syn1, syn2)
            # Only float results are kept (None means "no path").
            if isinstance(val, float):
                sim_v.append(val)
        if sim_v:
            sim_s.append(max(sim_v))

    if not sim_s:
        # No comparable synsets: avoid ZeroDivisionError.
        return 0
    return sum(sim_s) / len(sim_s)
def internal_word_max_WSD(sentence, word):
    """
    Auxiliary function for sem_wsd()

    Input: a sentence and a word in the sentence; the sentence is a list
    of words, not a string.
    Return: the synset (sense) of the word whose summed path similarity to
    the senses of a single other sentence word is highest, or None when no
    sense scores above 0.

    Derived from code at http://www.jaist.ac.jp/~s1010205/sitemap-2/styled-7/
    """
    def support(sense, context_word):
        """Sum of the defined similarities between a sense and a word."""
        total = 0.0
        for context_sense in wn.synsets(context_word):
            similarity = wn.path_similarity(context_sense, sense)
            if similarity is not None:
                total += similarity
        return total

    best_sense = None
    best_support = 0.0
    for sense in wn.synsets(word):
        for context_word in sentence:
            current = support(sense, context_word)
            if current > best_support:
                best_support = current
                best_sense = sense
    return best_sense
def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest
    similarity value. Sums all of the largest similarity values and
    normalizes this value by dividing it by the number of largest
    similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2; 0 when no largest
        similarity value could be found at all

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    largest_similarity_values = []
    for syn1 in s1:
        similarity_values = [
            simi_value
            for simi_value in (wn.path_similarity(syn1, syn2) for syn2 in s2)
            if simi_value is not None
        ]
        if similarity_values:
            largest_similarity_values.append(max(similarity_values))

    # Guard the division: empty inputs or unrelated documents yield 0.
    if not largest_similarity_values:
        return 0
    return sum(largest_similarity_values) / len(largest_similarity_values)
def findMaxPathSimilarity(ingredSynsets, foodSynsets):
    """Return the highest path similarity between two groups of synsets.

    Args:
        ingredSynsets: Synsets of the ingredient.
        foodSynsets: Synsets of the food.

    Returns:
        The largest defined path similarity over all pairs, or 0 when
        there is no pair with a defined, positive similarity.
    """
    maxPathSimilarity = 0
    for synseta in ingredSynsets:
        for synsetb in foodSynsets:
            pathSim = wn.path_similarity(synseta, synsetb)
            # None means no path; comparing None with a number raises
            # TypeError on Python 3.
            if pathSim is not None and pathSim > maxPathSimilarity:
                maxPathSimilarity = pathSim
    return maxPathSimilarity
def wnsensesim(synset1, synset2, metric):
    """Compute a WordNet similarity between two synsets with a named metric.

    Args:
        synset1, synset2: the synsets to compare.
        metric: one of 'path_similarity', 'lch_similarity' or
            'wup_similarity'; the name is also the `wn` function called.

    Returns:
        The value returned by the selected `wn` similarity function, or None
        (after printing a message) when the metric is not supported.
    """
    if metric in ('path_similarity', 'lch_similarity', 'wup_similarity'):
        return getattr(wn, metric)(synset1, synset2)
    # add more similarity measures e.g., jcn
    # Call form of print works on both Python 2 and Python 3.
    print("Unsupported wn similarity measure requested")
    return None
def word_similarity(self, word1, word2):
    """Return the highest path similarity between any senses of two words.

    All synsets of `word1` are compared with all synsets of `word2`.

    Returns:
        The maximum path similarity, or 0 when either word has no synsets
        or no pair of synsets has a similarity.
    """
    w1synsets = wn.synsets(word1)
    w2synsets = wn.synsets(word2)
    maxsim = 0
    for w1s in w1synsets:
        for w2s in w2synsets:
            current = wn.path_similarity(w1s, w2s)
            # None (no path) cannot be ordered against numbers on Python 3.
            if current is not None and current > maxsim:
                maxsim = current
    return maxsim
def get_path_similarity_between_boy_and_dog():
    """
    Computes the path similarity between the synsets 'boy.n.01' and
    'dog.n.01'.

    Returns
    -------
    A float.
    """
    boy = wn.synset('boy.n.01')
    dog = wn.synset('dog.n.01')
    return wn.path_similarity(boy, dog)
def compare(a, b, min=0.31):
    """Return True if `a` has equal meaning to `b`, False otherwise.

    Two words count as equal in meaning when some synset of `a` and some
    synset of `b` have a path similarity strictly greater than `min`.
    When either word has no synsets, plain equality `a == b` is used.

    Args:
        a, b: the words to compare.
        min: similarity threshold (the name shadows the builtin but is kept
            because it is part of the call interface).
    """
    asyn = wn.synsets(a)
    bsyn = wn.synsets(b)
    if not asyn or not bsyn:
        return a == b

    for ax in asyn:
        for bx in bsyn:
            sim = wn.path_similarity(ax, bx)
            # Skip None (no path): it is not orderable on Python 3.
            if sim is not None and sim > min:
                return True
    return False
def word_similarity(word1, word2):
    """Return the best path similarity across all sense pairs of two words.

    Returns:
        The largest `wn.path_similarity` between a synset of `word1` and a
        synset of `word2`; 0 when there are no synsets or no similarity.
    """
    w1synsets = wn.synsets(word1)
    w2synsets = wn.synsets(word2)
    similarities = (
        wn.path_similarity(w1s, w2s)
        for w1s in w1synsets
        for w2s in w2synsets
    )
    # Drop None results (no path) before taking the maximum; the trailing 0
    # is the floor used when nothing comparable exists.
    return max([sim for sim in similarities if sim is not None] + [0])
def __init__(self, obs_corpus, target_corpus, metric="path", aggregation_mode_prev="", aggregation_mode=""):
    """Initialise the base class and select the WordNet similarity function.

    The corpora and aggregation modes are forwarded to the parent
    initialiser as (obs_corpus, target_corpus, aggregation_mode, None,
    aggregation_mode_prev).

    Args:
        metric: "path", "lch" or "wup"; selects which `wn` similarity
            function `self.metric_func` delegates to.

    Raises:
        ValueError: if `metric` is none of path/lch/wup.
    """
    super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
    self.metric = metric

    # Reject unknown metrics first, then pick the matching delegate.
    if self.metric not in ("path", "lch", "wup"):
        raise ValueError("Wrong similarity metric: %s, should be one of path/lch/wup." % self.metric)

    if self.metric == "path":
        self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
    elif self.metric == "lch":
        self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
    else:
        self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
def semantic_score(word1, word2):
    '''
    Semantic score between two words based on WordNet

    Both words are looked up as their first noun sense ('<word>.n.01') and
    compared with path similarity (simulate_root = False).

    Returns:
        float (the semantic score between word1 and word2); 0 when a lookup
        or the comparison fails, or when WordNet reports no similarity.
    '''
    try:
        w1 = wn.synset('%s.n.01' % (word1))
        w2 = wn.synset('%s.n.01' % (word2))
        score = wn.path_similarity(w1, w2, simulate_root=False)
    except Exception:
        # Best-effort: any lookup failure scores 0, but KeyboardInterrupt /
        # SystemExit are no longer swallowed as a bare `except:` would.
        return 0
    # path_similarity may return None when no path exists.
    return 0 if score is None else score
def wnpath(self, target, neighbor):
    r"""Return the best path_similarity between `target` and `neighbor`.

    Synsets of both words are looked up with `self.args.wordnet_pos_tag`.
    Returns 0 when either word has no synsets or when no pair of synsets
    has a similarity.
    """
    synsetsT = wn.synsets(target, self.args.wordnet_pos_tag)
    synsetsN = wn.synsets(neighbor, self.args.wordnet_pos_tag)
    if not synsetsT:
        return 0  # XXX no synsets for `target`
    if not synsetsN:
        return 0  # XXX no synsets for `neighbor`
    # When `wn` returns None, we just say sim==0. The coercion happens per
    # pair because max() over a mix of None and floats raises TypeError on
    # Python 3.
    return max(
        wn.path_similarity(sT, sN) or 0
        for sT in synsetsT
        for sN in synsetsN
    )
def get_path_similarity_between_girl_and_girl():
    """
    Computes the path similarity of the synset 'girl.n.01' with itself.

    Returns
    -------
    A float.
    """
    first_girl = wn.synset('girl.n.01')
    second_girl = wn.synset('girl.n.01')
    return wn.path_similarity(first_girl, second_girl)
def wordNetSimilarity(self, term1, term2):
    """Return the path similarity between the first synsets of two terms.

    See http://www.nltk.org/howto/wordnet.html

    Returns:
        The path similarity of `wn.synsets(term1)[0]` and
        `wn.synsets(term2)[0]`, or 0 when it cannot be computed (for
        instance a term without synsets) or WordNet reports none.
    """
    sim = None
    try:
        wn_term1 = wn.synsets(term1)[0]
        wn_term2 = wn.synsets(term2)[0]
        sim = wn.path_similarity(wn_term1, wn_term2)
    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` that
        # would also trap KeyboardInterrupt and SystemExit.
        print("Error computing similarity.")
    # None (no path, or failure above) is reported as 0.
    return sim or 0
def extract_feature(self, sent):
    """Build the feature vector of a sentence.

    The vector has `self.n_bow + self.n_verbs` entries:
      * entry i (i < n_bow) is 1 if `self.BOW[i]` is one of the
        whitespace-separated words of `sent`, else 0;
      * entry n_bow + i is the largest path similarity between
        `self.VERBS[i]` and the '<verb>.v.01' synset of any token tagged
        'VB' in the sentence, or 0 when there is no such token or no
        similarity.

    NOTE(review): `wn.synset(v + '.v.01')` is not guarded here, exactly as
    before; presumably it can fail for a token without that synset.
    """
    feature = [0] * (self.n_bow + self.n_verbs)
    verbs = [w for w, pos in self.tagger.tag(word_tokenize(sent)) if pos == 'VB']
    words = set(sent.split())

    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for i in range(self.n_bow):
        feature[i] = 1 if self.BOW[i] in words else 0

    # Without verbs (or verb slots) the verb features keep their initial 0.
    if verbs and self.n_verbs > 0:
        # Look each sentence verb up once instead of once per reference verb.
        verb_synsets = [wn.synset(v + '.v.01') for v in verbs]
        for i in range(self.n_verbs):
            # None (no path) counts as 0 so max() stays well-defined.
            similarities = [
                wn.path_similarity(self.VERBS[i], verb_synset) or 0
                for verb_synset in verb_synsets
            ]
            feature[self.n_bow + i] = max(similarities)
    return feature
def meansimilarity(word):
    """Helper: mean path similarity of a word's first synset to its others.

    The path similarities between the first synset of `word` and each of
    its remaining synsets are summed (pairs without a similarity add
    nothing) and divided by the total number of synsets of the word.

    Returns:
        The mean as a float, or 0.0 when the word has no synsets (previously
        a ZeroDivisionError).
    """
    synsets = wn.synsets(word)
    if not synsets:
        return 0.0

    first = synsets[0]
    sums = 0.0
    for other in synsets[1:]:
        ps = wn.path_similarity(first, other)
        if ps is not None:
            sums += ps
    # The divisor is the full synset count, first synset included.
    return sums / len(synsets)
def word_path_similarity(s1, s2):
    """Return the best path similarity between two words, ignoring case.

    Words that are equal after lower-casing score 1.0 directly. Otherwise
    every synset of one word is compared with every synset of the other
    while holding `wplock`.

    Returns:
        1.0 for case-insensitively equal words, else the largest path
        similarity found, or 0 when there is none.
    """
    word1 = s1.lower()
    word2 = s2.lower()
    # The original compared the bound methods `s1.lower == s2.lower` and
    # returned while still holding the lock; compare the lowered strings,
    # and do it before any locking since no WordNet access is needed.
    if word1 == word2:
        return 1.0

    val = 0
    # `with` releases the lock even if a WordNet call raises.
    with wplock:
        for t1 in wn.synsets(word1):
            for t2 in wn.synsets(word2):
                # None (no path) counts as 0 so max() never sees None.
                val = max(wn.path_similarity(t1, t2) or 0, val)
    return val
def cmp_text_word_net(self, annotation, candidate, entire_annotation):
    """
    Compare the retrieved answer with the annotation using WordNet path distance.
    THIS IS VERY SLOW, RESULTS ARE NOT CACHED

    For every annotation token the best similarity to any candidate token is
    summed, likewise for every candidate token towards the annotation, and
    the total is divided by the number of tokens (with synsets) on both sides.

    :param annotation: The correct Answer
    :type annotation: String
    :param candidate: The retrieved Answer
    :type candidate: [String, String]
    :param entire_annotation: not used by this comparison

    :return: Float score; -1 if the annotation is None or 'NULL', -2 if no
        answer was extracted, -3 if one side has no token with synsets
    """
    # `==` instead of `is`: identity against a string literal is unreliable.
    if annotation is None or annotation == 'NULL':
        # annotation is NULL
        return -1
    if candidate is None:
        # no answer was extracted
        return -2

    # fetch synsets for both answers; `with` releases the lock on errors too
    with self._lock:
        syn_a = [wordnet.synsets(t) for t in word_tokenize(annotation)]
        syn_b = [wordnet.synsets(t[0]) for t in candidate]

    # drop tokens without synsets
    syn_a = [syn for syn in syn_a if syn]
    syn_b = [syn for syn in syn_b if syn]

    if not syn_a or not syn_b:
        # no synsets were found for one of the answers!
        return -3

    score = 0
    max_b = [0] * len(syn_b)

    with self._lock:
        for senses_a in syn_a:
            max_a = 0
            for j, senses_b in enumerate(syn_b):
                # best similarity between any sense pair; None counts as 0
                sim = max(
                    [wordnet.path_similarity(a, b) or 0
                     for a, b in product(senses_a, senses_b)]
                    or [0]
                )
                max_a = max(sim, max_a)
                max_b[j] = max(max_b[j], sim)
            score += max_a

    score += sum(max_b)
    # Normalise by the total token count; the original
    # `score / len(syn_a) + len(syn_b)` divided by len(syn_a) only and then
    # added len(syn_b) because of operator precedence.
    return score / (len(syn_a) + len(syn_b))
def wnsim(synset1, synset2, method='all'):
    """Compute WordNet similarity between two synset references.

    Each reference is resolved with `wn.synset` when it looks like a full
    synset name ('<lemma>.<pos>.<number>'), otherwise with `wn_synset`.

    Args:
        synset1, synset2: synset references as strings.
        method: 'lin', 'res', 'jcn' (these use `wn_ic`), 'wup', 'path',
            'lch', or 'all'.

    Returns:
        0 if either reference resolves to None; the similarity value for a
        single method; for 'all', a list of (name, value) tuples in the
        order lin, res, jcn, wup, path, lch; None for any other method.
    """
    def _resolve(reference):
        # Full synset names go straight to WordNet, the rest to the helper.
        if re.match(r'^.+\..+\.\d+$', reference):
            return wn.synset(reference)
        return wn_synset(reference)

    first = _resolve(synset1)
    second = _resolve(synset2)
    if first is None or second is None:
        return 0

    ic_based = ('lin', 'res', 'jcn')
    path_based = ('wup', 'path', 'lch')

    def _score(name):
        similarity = getattr(wn, name + '_similarity')
        # Information-content measures take the IC corpus as third argument.
        if name in ic_based:
            return similarity(first, second, wn_ic)
        return similarity(first, second)

    if method == 'all':
        return [(name, _score(name)) for name in ic_based + path_based]
    if method in ic_based or method in path_based:
        return _score(method)
    return None
def get_best_synset_pair(word_1, word_2):
    """Return the pair of synsets of two words with the highest similarity.

    Returns:
        A tuple (synset_1, synset_2) maximising `wn.path_similarity`, or
        (None, None) when either word has no synsets or no pair has a
        similarity.
    """
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if not synsets_1 or not synsets_2:
        return None, None

    max_sim = -1.0
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            sim = wn.path_similarity(synset_1, synset_2)
            # None (no path) cannot be ordered against floats on Python 3.
            if sim is not None and sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2
    return best_pair