def create_graphs(doc_list):
    documents = doc_list
    if documents is None:
        documents = default_document_list()
    distance_functions = [
        (wn.lch_similarity(SYNSETS[0], SYNSETS[0]), 'lch',
         lambda sense_1, sense_2: wn.lch_similarity(sense_1, sense_2)),
        (1.0, 'lin',
         lambda sense_1, sense_2: wn.lin_similarity(sense_1, sense_2, CORPUS)),
        (10.636958516573292, 'res',
         lambda sense_1, sense_2: wn.res_similarity(sense_1, sense_2, CORPUS)),
        (wn.jcn_similarity(SYNSETS[0], SYNSETS[0], CORPUS), 'jcn',
         lambda sense_1, sense_2: wn.jcn_similarity(sense_1, sense_2, CORPUS)),
        (1.0, 'path',
         lambda sense_1, sense_2: wn.path_similarity(sense_1, sense_2)),
    ]
    all_senses = []
    for doc in documents:
        for sense in doc.top_senses():
            all_senses.append((sense, doc.name))

    against_colors = ['r', 'b', 'g']
    against_to = [wn.synset(word)
                  for word in ["economy.n.01", "philosophy.n.02", "politics.n.01"]]
    create_against_graph('phyl_eco_pol', documents, all_senses, against_to,
                         distance_functions, against_colors)

    against_to = SYNSETS
    against_colors = [(random(), random(), random()) for _i in range(0, len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, against_to,
                         distance_functions, against_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
def test_path_similarities(self):
    from nltk.corpus import wordnet as nltk_wn
    nltk_cat = nltk_wn.synset('cat.n.1')
    nltk_dog = nltk_wn.synset('dog.n.1')
    nltk_bus = nltk_wn.synset('bus.n.1')
    our_cat = our_wn.synset('cat.n.1')
    our_dog = our_wn.synset('dog.n.1')
    our_bus = our_wn.synset('bus.n.1')
    assert nltk_wn.path_similarity(nltk_cat, nltk_dog) == our_wn.path_similarity(our_cat, our_dog)
    assert nltk_wn.wup_similarity(nltk_cat, nltk_dog) == our_wn.wup_similarity(our_cat, our_dog)
    assert nltk_wn.lch_similarity(nltk_cat, nltk_dog) == our_wn.lch_similarity(our_cat, our_dog)
    assert nltk_wn.path_similarity(nltk_cat, nltk_bus) == our_wn.path_similarity(our_cat, our_bus)
    assert nltk_wn.wup_similarity(nltk_cat, nltk_bus) == our_wn.wup_similarity(our_cat, our_bus)
    assert nltk_wn.lch_similarity(nltk_cat, nltk_bus) == our_wn.lch_similarity(our_cat, our_bus)
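Most of the snippets in this collection assume the NLTK WordNet data, and for the IC-based measures an information-content table, are already installed. A minimal setup sketch (the corpus choices here are assumptions; each project loads its own):

# Assumed setup, not part of any single example above or below.
import nltk
nltk.download('wordnet')
nltk.download('wordnet_ic')

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')

cat, dog = wn.synset('cat.n.01'), wn.synset('dog.n.01')
print(wn.lch_similarity(cat, dog))            # LCH is defined only within one POS
print(wn.lin_similarity(cat, dog, brown_ic))  # IC-based measures need an IC table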
def get_synset_similarity(first_sentence: str, second_sentence: str, method: str):
    sentence_tagged_1 = pos_tag(first_sentence)
    sentence_tagged_2 = pos_tag(second_sentence)
    sentence_tagged_wn_1 = get_sentences_tagged_with_wn_and_cleaned(sentence_tagged_1)
    sentence_tagged_wn_2 = get_sentences_tagged_with_wn_and_cleaned(sentence_tagged_2)
    synsets = {}
    synsets, key_list1 = get_synset_tag(sentence_tagged_wn_1, synsets)
    synsets, key_list2 = get_synset_tag(sentence_tagged_wn_2, synsets)
    synsets_combinations = list(product(key_list1, key_list2))
    resulting_similarity = []
    for first_word, second_word in synsets_combinations:
        if first_word == second_word:
            resulting_similarity.append(1)
            continue
        first_synset = synsets[first_word][0]
        second_synset = synsets[second_word][0]
        first_tag = synsets[first_word][1]
        second_tag = synsets[second_word][1]
        if method == "path":
            path_sim = first_synset.path_similarity(second_synset)
            if path_sim is None:
                resulting_similarity.append(0)
            else:
                resulting_similarity.append(path_sim)
        if method == "lch":
            if first_tag == second_tag:
                lch_sim = wn.lch_similarity(first_synset, second_synset)
                if lch_sim is None:
                    resulting_similarity.append(0)
                else:
                    lch_norm = lch_sim / wn.lch_similarity(first_synset, first_synset)
                    resulting_similarity.append(lch_norm)
        if method == "wup":
            wup_sim = first_synset.wup_similarity(second_synset)
            if wup_sim is None:
                resulting_similarity.append(0)
            else:
                resulting_similarity.append(wup_sim)
        if method == "lin":
            if first_tag == second_tag and first_tag in ['n', 'v']:
                lin_sim = first_synset.lin_similarity(second_synset, brown_ic)
                if lin_sim is None:
                    resulting_similarity.append(0)
                else:
                    resulting_similarity.append(lin_sim)
    if not resulting_similarity:
        return 0
    return sum(resulting_similarity) / len(resulting_similarity)
def most_similar_lch(synsets_dict, verb):
    best_similarity = -1
    most_similar = str()
    verb_synset = wn.synsets(verb, pos=wn.VERB)[0]
    # Use a loop variable that does not shadow the `verb` argument,
    # and compute each similarity only once.
    for candidate, synset in synsets_dict.items():
        similarity = wn.lch_similarity(synset, verb_synset)
        if similarity > best_similarity:
            best_similarity = similarity
            most_similar = candidate
    return most_similar
def compare_allsynsets(method, word1, word2):
    ss1 = wordnet.synsets(word1)
    ss2 = wordnet.synsets(word2)
    simi, simi_value = 0.0, 0.0
    for (s1, s2) in product(ss1, ss2):
        # if SYNpos and s1.pos() != s2.pos():  # SYN-POS
        #     continue
        # if TWpos and s1.pos() != pos:  # Target word POS
        #     continue
        if method == "PATH":
            simi = s1.path_similarity(s2)
        elif method == "LCH":
            simi = wordnet.lch_similarity(s1, s2)
        elif method == "WUP":
            simi = wordnet.wup_similarity(s1, s2)
        elif method == "RES":
            simi = wordnet.res_similarity(s1, s2, brown_ic)
        elif method == "JCN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v']:  # can't do diff POS
                simi = wordnet.jcn_similarity(s1, s2, brown_ic)
        elif method == "LIN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v']:  # can't do diff POS
                simi = wordnet.lin_similarity(s1, s2, brown_ic)
        else:
            sys.exit("Error! No similarity methods!")
        # path/wup can return None when no path exists; guard before comparing
        if simi is not None and simi > simi_value:
            simi_value = simi
    return simi_value
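A minimal usage sketch for the helper above; the imports, the `brown_ic` table, and the example words are assumptions rather than part of the original project:

# Assumed setup: `product` and an IC table for the RES/JCN/LIN branches.
from itertools import product
from nltk.corpus import wordnet, wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')

# Hypothetical example words; PATH/WUP may return None for unrelated senses,
# which the None-guard above simply skips.
print(compare_allsynsets("PATH", "car", "bicycle"))
print(compare_allsynsets("WUP", "car", "bicycle"))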
def similarityWordNet(word1, word2):
    """
    Similarity between two words with nltk.
    Input: word1, word2 (str)
    Return: similarity (float)
    """
    #print(word1, "-", word2)
    word1 = str(wn.morphy(word1))
    word2 = str(wn.morphy(word2))

    palabras = wn.synsets(word1)
    #print(palabras)
    if len(palabras) == 0:
        print("does not exist")
        return False
    temp = str(palabras[0])
    temp = temp[8:-2]  # extract the synset name from "Synset('...')"
    #print(">", temp)
    word1 = wn.synset(str(temp))
    #print(word1)

    palabras = wn.synsets(word2)
    #print(palabras)
    if len(palabras) == 0:
        print("does not exist")
        return False
    temp = str(palabras[0])
    temp = temp[8:-2]
    #print(">", temp)
    word2 = wn.synset(str(temp))
    #print(word2)

    # Path similarity: a score denoting how similar two word senses are, based on
    # the shortest path that connects the senses in the is-a (hypernym/hyponym)
    # taxonomy. The score is in the range 0 to 1.
    #similarity1 = word1.path_similarity(word2)
    #similarity1 = wn.path_similarity(word1, word2)
    try:
        #print(wn.path_similarity(word1, word2))
        #if wn.path_similarity(word1, word2) > 0.5:  # (hypernym/hyponym) taxonomy
        print(wn.wup_similarity(word1, word2))
        if wn.wup_similarity(word1, word2) > 0.5:  # Wu-Palmer Similarity
            return True
    except:
        return False

    # Leacock-Chodorow Similarity: a score based on the shortest path that connects
    # the senses (as above) and the maximum depth of the taxonomy in which the
    # senses occur; range up to ~3.6.
    similarity2 = wn.lch_similarity(word1, word2)

    # Wu-Palmer Similarity: a score based on the depth of the two senses in the
    # taxonomy and that of their Least Common Subsumer (most specific ancestor
    # node); range up to ~0.92.
    similarity3 = wn.wup_similarity(word1, word2)
def get_best_synset_pair(word_1, word_2, pos_1=POS_SET, pos_2=POS_SET):
    """
    Choose the pair with highest path similarity among all pairs.
    Mimics pattern-seeking behavior of humans.
    """
    #synsets_1 = wn.synsets(word_1)
    synsets_1 = [s for s in wn.synsets(word_1) if s.pos() in pos_1]
    #synsets_2 = wn.synsets(word_2)
    synsets_2 = [s for s in wn.synsets(word_2) if s.pos() in pos_2]
    max_sim = None
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            if synset_1.pos() == synset_2.pos():
                #sim = wn.path_similarity(synset_1, synset_2)
                sim = wn.lch_similarity(synset_1, synset_2)  # same POS needed
                if sim is not None and (max_sim is None or max_sim < sim):
                    max_sim = sim
                    best_pair = synset_1, synset_2
    #if best_pair != (None, None):  # or max_sim is not None
    if max_sim is not None:
        spd = best_pair[0].shortest_path_distance(best_pair[1])
        lch = best_pair[0].lowest_common_hypernyms(best_pair[1])
        lch_depth = None
        if lch:
            lch_depth = max(s.min_depth() for s in lch)
        return best_pair, max_sim, spd, lch_depth
    return None
def __init__(self, metric="path", double_aggregator=False):
    """
    :param metric: path, lch, or wup metric
    :param double_aggregator:
    """
    self.metric = metric
    self.aggregation_mode_prev = ['max', 'mean', 'median']  # ["mean", "max", "median"]
    self.aggregation_mode = ["mean", "std", "max", "min", "median"]
    self.aggregator = [
        None if m == "" else getattr(np, m) for m in self.aggregation_mode
    ]
    self.aggregator_prev = [
        None if m == "" else getattr(np, m) for m in self.aggregation_mode_prev
    ]
    self.double_aggregator = double_aggregator
    if self.metric == "path":  # shortest path
        self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
    elif self.metric == "lch":
        self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
    elif self.metric == "wup":  # words' depth and ancestor depth + shortest path
        self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
    else:
        raise ValueError(
            "Wrong similarity metric: %s, should be one of path/lch/wup." % self.metric)
def get_lch_average(self, sentence1, sentence2):
    sentence1_unique, sentence2_unique = self.sentence_difference(sentence1, sentence2)
    avg_similarity = 0
    total_count = 0
    # Measure similarity for each unique word in A against each unique word in B
    for sentence1_word in sentence1_unique:
        for sentence2_word in sentence2_unique:
            sentence1_word_tag = sentence1.get_tag(sentence1_word)
            sentence2_word_tag = sentence2.get_tag(sentence2_word)
            synsets_word1 = wordnet.synsets(sentence1_word, sentence1_word_tag)
            synsets_word2 = wordnet.synsets(sentence2_word, sentence2_word_tag)
            if len(synsets_word1) == 0:
                synsets_word1 = wordnet.synsets(sentence1_word)
            if len(synsets_word2) == 0:
                synsets_word2 = wordnet.synsets(sentence2_word)
            if len(synsets_word1) > 0 and len(synsets_word2) > 0:
                # Skip words with different tags
                if synsets_word1[0].pos() != synsets_word2[0].pos():
                    continue
                similarity = wordnet.lch_similarity(synsets_word1[0], synsets_word2[0])
                if similarity is not None:
                    avg_similarity += similarity
                    total_count += 1
    if total_count == 0:
        return 0
    return float(avg_similarity) / float(total_count)
def get_lch_min(self, sentence1, sentence2):
    sentence1_unique, sentence2_unique = self.sentence_difference(sentence1, sentence2)
    min_similarity = maxint  # sys.maxint (Python 2); float("inf") is the Python 3 equivalent
    # Measure similarity for each unique word in A against each unique word in B
    for sentence1_word in sentence1_unique:
        for sentence2_word in sentence2_unique:
            sentence1_word_tag = sentence1.get_tag(sentence1_word)
            sentence2_word_tag = sentence2.get_tag(sentence2_word)
            synsets_word1 = wordnet.synsets(sentence1_word, sentence1_word_tag)
            synsets_word2 = wordnet.synsets(sentence2_word, sentence2_word_tag)
            if len(synsets_word1) == 0:
                synsets_word1 = wordnet.synsets(sentence1_word)
            if len(synsets_word2) == 0:
                synsets_word2 = wordnet.synsets(sentence2_word)
            if len(synsets_word1) > 0 and len(synsets_word2) > 0:
                # Skip words with different tags
                if synsets_word1[0].pos() != synsets_word2[0].pos():
                    continue
                similarity = wordnet.lch_similarity(synsets_word1[0], synsets_word2[0])
                if similarity is not None:
                    min_similarity = min(similarity, min_similarity)
    if min_similarity == maxint:
        return 0
    return min_similarity
def _get_simil_term(self, x, y, mode='lch'):
    '''
    Returns the similarity between two terms x and y
    Args:
        x, y (str)
        mode = lch | path | wup
    '''
    w1 = wn.synsets(x)
    w2 = wn.synsets(y)
    if len(w1) == 0 or len(w2) == 0:
        return 0
    # Compare only same-POS synset pairs; lch requires matching POS.
    if mode == 'lch':
        return max([wn.lch_similarity(e1, e2)
                    for e1 in w1 for e2 in w2 if e1.pos() == e2.pos()])
    elif mode == 'path':
        return max([wn.path_similarity(e1, e2)
                    for e1 in w1 for e2 in w2 if e1.pos() == e2.pos()])
    elif mode == 'wup':
        return max([wn.wup_similarity(e1, e2)
                    for e1 in w1 for e2 in w2 if e1.pos() == e2.pos()])
def relaxedSimi(syn1, syn2):
    """Compute similarity between two synsets."""
    try:
        return wn.lch_similarity(syn1, syn2) or 0
    except WordNetError:
        return 0
def lch(self, synset_a, synset_b):
    return (
        self.normalize(
            self.MAX_VALUE,
            wordnet.lch_similarity(synset_a, synset_b, verbose=True),
        )
        if synset_a.pos() == synset_b.pos()
        else 0
    )
def classify(self, ex):
    word = ex["word"].value
    synset_ex = wn.synsets(word.replace(" ", "_"))[0]
    similarities = [
        wn.lch_similarity(synset_ex, synset_t)
        for synset_t in self.training_synsets
    ]
    cls_i = na.argmax(similarities)
    return self.training_table[cls_i]["class"]
def similarity_by_path(sense1, sense2, option="path"):
    if option.lower() in ["path", "path_similarity"]:  # Path similarities
        # path_similarity is direction-dependent when a root is simulated,
        # so take the best of both directions and ignore missing paths.
        sims = [wn.path_similarity(sense1, sense2), wn.path_similarity(sense2, sense1)]
        sims = [s for s in sims if s is not None]
        return max(sims) if sims else None
    elif option.lower() in ["wup", "wupa", "wu-palmer"]:  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2)
    elif option.lower() in ['lch', "leacock-chordorow"]:  # Leacock-Chodorow
        if sense1.pos() != sense2.pos():  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)
def wnsensesim(synset1, synset2, metric):
    if metric == 'path_similarity':
        return wn.path_similarity(synset1, synset2)
    elif metric == 'lch_similarity':
        return wn.lch_similarity(synset1, synset2)
    elif metric == 'wup_similarity':
        return wn.wup_similarity(synset1, synset2)
    else:  # add more similarity measures here, e.g. jcn
        print("Unsupported wn similarity measure requested")
def similarity_by_path(sense1, sense2, option="path"):
    """
    Returns maximum path similarity between two senses.
    """
    if option.lower() in ["path", "path_similarity"]:  # Path similarities
        # Take the best of both directions and ignore missing paths.
        sims = [wn.path_similarity(sense1, sense2), wn.path_similarity(sense2, sense1)]
        sims = [s for s in sims if s is not None]
        return max(sims) if sims else None
    elif option.lower() in ["wup", "wupa", "wu-palmer"]:  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2)
    elif option.lower() in ['lch', "leacock-chordorow"]:  # Leacock-Chodorow
        if sense1.pos() != sense2.pos():  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)
def __init__(self, obs_corpus, target_corpus, metric="path",
             aggregation_mode_prev="", aggregation_mode=""):
    super().__init__(obs_corpus, target_corpus, aggregation_mode,
                     None, aggregation_mode_prev)
    self.metric = metric
    if self.metric == "path":
        self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
    elif self.metric == "lch":
        self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
    elif self.metric == "wup":
        self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
    else:
        raise ValueError(
            "Wrong similarity metric: %s, should be one of path/lch/wup." % self.metric)
def compute_similarities(s1, s2, sim):
    if sim == "path":
        return wn.path_similarity(s1, s2)
    elif sim == "lch":
        return wn.lch_similarity(s1, s2)
    elif sim == "wup":
        return wn.wup_similarity(s1, s2)
    elif sim == "res":
        return wn.res_similarity(s1, s2, genesis_ic)
    elif sim == "jcn":
        return wn.jcn_similarity(s1, s2, genesis_ic)
    elif sim == "lin":
        return wn.lin_similarity(s1, s2, genesis_ic)
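This dispatcher relies on a module-level `genesis_ic` table that is not shown; a hedged sketch of how such a table is commonly built with NLTK's Genesis corpus (corpus choice and smoothing value are assumptions, and the corpus must already be downloaded):

# Assumed setup for the snippet above.
from nltk.corpus import genesis, wordnet as wn

genesis_ic = wn.ic(genesis, False, 0.0)  # (corpus, weight_senses_equally, smoothing)

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(compute_similarities(dog, cat, "lch"))
print(compute_similarities(dog, cat, "lin"))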
def similarity_by_path(sense1, sense2, option="path"):
    """
    Returns maximum path similarity between two senses.
    """
    if option.lower() in ["path", "path_similarity"]:  # Path similarities
        # Take the best of both directions and ignore missing paths.
        sims = [wn.path_similarity(sense1, sense2), wn.path_similarity(sense2, sense1)]
        sims = [s for s in sims if s is not None]
        return max(sims) if sims else None
    elif option.lower() in ["wup", "wupa", "wu-palmer"]:  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2)
    elif option.lower() in ['lch', "leacock-chordorow"]:  # Leacock-Chodorow
        if sense1.pos() != sense2.pos():  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)
    # Fallback for any other option: Lin similarity with BNC information content.
    return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
def bagSimilarity(s1, s2):
    if len(s1) == 0 or len(s2) == 0:
        return 1
    total = 0
    for a in s1:
        for b in s2:
            if a.pos() == b.pos():
                total += wn.lch_similarity(a, b)
    total /= (len(s1) * len(s2))
    return total
def dist_all_synsets(first, second):
    f_syns = wn.synsets(first)
    s_syns = wn.synsets(second)

    # Path Similarity
    # A 0-1 similarity score based on the shortest path that connects the senses
    # in the is-a (hypernym/hyponym) taxonomy.
    # A score of 1 represents identity, i.e. comparing a sense with itself returns 1.
    least_sim = 0.0
    try:
        for f in f_syns:
            for s in s_syns:
                path_sim = wn.path_similarity(f, s)
                if path_sim > least_sim:
                    least_sim = path_sim
    except:
        pass

    # Leacock-Chodorow Similarity
    # A similarity score based on the shortest path connecting the senses and the
    # maximum depth of the taxonomy in which the senses occur.
    # The relationship is given as -log(p / 2d), where p is the shortest path
    # length and d the taxonomy depth.
    max_lch = 0.0
    for f in f_syns:
        for s in s_syns:
            lch = 0.0
            try:
                lch = wn.lch_similarity(s, f)
            except WordNetError:
                pass
            if lch > max_lch:
                max_lch = lch
    max_lch = max_lch / 3.6375  # normalise by the maximum LCH value

    # Wu-Palmer Similarity
    # A similarity score based on the depth of the two senses in the taxonomy and
    # that of their Least Common Subsumer (most specific ancestor node).
    # The LCS does not necessarily feature in the shortest path connecting the two
    # senses, as it is by definition the common ancestor deepest in the taxonomy,
    # not closest to the two senses. Typically, however, it will so feature.
    # Where multiple candidates for the LCS exist, the one whose shortest path to
    # the root node is the longest is selected. Where the LCS has multiple paths
    # to the root, the longer path is used for the purposes of the calculation.
    wup_sim = 0
    try:
        wup_sim = wn.wup_similarity(f_syns[0], s_syns[0])
        if wup_sim is None:
            wup_sim = -1
    except:
        pass

    return (least_sim, max_lch, wup_sim)
def checksim(self, synset1, synset2):
    score = 0
    for syn1 in synset1:
        for syn2 in synset2:
            try:
                ns = wn.lch_similarity(syn1, syn2)
            except:
                ns = 0
            # ns = wn.wup_similarity(syn1, syn2)
            if isinstance(ns, float):
                if ns > score:
                    score = ns
    return score
def wnsim(synset1, synset2, method='all'):
    synset_patt = re.compile(r'^.+\..+\.\d+$')
    if synset_patt.match(synset1):
        s1 = wn.synset(synset1)
    else:
        s1 = wn_synset(synset1)
    if synset_patt.match(synset2):
        s2 = wn.synset(synset2)
    else:
        s2 = wn_synset(synset2)
    if s1 is None or s2 is None:
        return 0
    if method == 'lin':
        return wn.lin_similarity(s1, s2, wn_ic)
    elif method == 'res':
        return wn.res_similarity(s1, s2, wn_ic)
    elif method == 'jcn':
        return wn.jcn_similarity(s1, s2, wn_ic)
    elif method == 'wup':
        return wn.wup_similarity(s1, s2)
    elif method == 'path':
        return wn.path_similarity(s1, s2)
    elif method == 'lch':
        return wn.lch_similarity(s1, s2)
    elif method == 'all':
        return [
            ('lin', wn.lin_similarity(s1, s2, wn_ic)),
            ('res', wn.res_similarity(s1, s2, wn_ic)),
            ('jcn', wn.jcn_similarity(s1, s2, wn_ic)),
            ('wup', wn.wup_similarity(s1, s2)),
            ('path', wn.path_similarity(s1, s2)),
            ('lch', wn.lch_similarity(s1, s2)),
        ]
def __word_net_lch_eval(self, hint: str, target: str):
    h_synsets = wn.synsets(hint)
    t_synsets = wn.synsets(target)
    lst = []
    for h in h_synsets:
        for t in t_synsets:
            try:
                strength = wn.lch_similarity(h, t)
            except WordNetError:
                strength = -1
            lst.append(strength if strength is not None else -1)
    if all(x == -1 for x in lst):
        return -9.999
    else:
        return max(lst)  # get strongest hint
def wn_similarity(synset_1, synset_2, similarity='Shortest_Path'):
    if similarity == "Shortest_Path":
        sim = wn.path_similarity(synset_1, synset_2)
    elif similarity == "Leacock_Chodorow":
        sim = wn.lch_similarity(synset_1, synset_2)
    elif similarity == "Wu_Palmer":
        sim = wn.wup_similarity(synset_1, synset_2)
    elif similarity == "Resnik":
        sim = synset_1.res_similarity(synset_2, ic)
    elif similarity == "Jiang_Conrath":
        sim = synset_1.jcn_similarity(synset_2, ic)
    elif similarity == "Lin":
        sim = synset_1.lin_similarity(synset_2, ic)
    else:
        sim = 0
    return sim
def assignToCategoriesLCH(category_synsets, word_synsets):
    prettyprint("start assigning lch_similarity...")
    assignedDict = {}
    for category in category_synsets:
        assignedDict[category] = []
    for word in word_synsets:
        tempValues = []
        for category in category_synsets:
            #similarity = wn.path_similarity(word, category)
            similarity = wn.lch_similarity(word, category)
            tempValues.append(similarity)
            #print("appended " + str(similarity) + " for " + str(category) + " and " + str(word))
        #print("__________________________________")
        indexOfMaxValue = tempValues.index(getMaxFromList(tempValues))
        assignedDict[category_synsets[indexOfMaxValue]].append(word)
    return assignedDict
def lch_sim(word1, word2):
    """
    Leacock-Chodorow Similarity: return a score denoting how similar two word
    senses are, based on the shortest path that connects the senses (as above)
    and the maximum depth of the taxonomy in which the senses occur (range ~3.6).
    The relationship is given as -log(p / 2d), where p is the shortest path
    length and d the taxonomy depth.
    """
    try:
        try:
            value = wn.lch_similarity(word1, word2)
            value = value / 3.6  # scale to roughly the 0-1 range
            return value
        except ValueError:
            return 0
    except:
        return 0
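Several snippets normalise LCH scores by a hard-coded constant (3.6 here, 3.6375... elsewhere). As other examples in this collection do, the exact per-POS maximum can instead be taken from the measure itself, since lch_similarity(s, s) returns the ceiling value; a small sketch with a hypothetical helper name:

from nltk.corpus import wordnet as wn

def lch_sim_normalized(syn1, syn2):
    # Hypothetical helper: rescale LCH to [0, 1] using the measure's own
    # maximum, wn.lch_similarity(s, s), rather than a hard-coded constant.
    if syn1.pos() != syn2.pos():  # LCH is only defined within a single POS
        return 0
    return wn.lch_similarity(syn1, syn2) / wn.lch_similarity(syn1, syn1)

print(lch_sim_normalized(wn.synset('cat.n.01'), wn.synset('dog.n.01')))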
def similarity(words: list) -> list:
    '''Calculates similarity based on the given synsets'''
    results = []
    synsets = ask_for_word_defs(words)
    print("\n{}\n".format('*' * 80))
    for i in range(int(len(synsets) / 2)):
        print("{:30}{}".format(str(synsets[2 * i]), str(synsets[2 * i + 1])))
    print("\n{}\n".format('*' * 80))
    print("Running comparisons...")
    for i in range(int(len(synsets) / 2)):
        try:
            if synsets[2 * i] is None or synsets[2 * i + 1] is None:
                results.append(["Undefined", "Undefined", -1, -1, -1, "None", "None"])
                continue
        except:
            pass
        result = [words[2 * i], words[2 * i + 1], 0, 0, 0,
                  synsets[2 * i].definition(), synsets[2 * i + 1].definition()]
        result[2] = wordnet.lch_similarity(synsets[2 * i], synsets[2 * i + 1])
        result[3] = wordnet.wup_similarity(synsets[2 * i], synsets[2 * i + 1])
        result[4] = wordnet.path_similarity(synsets[2 * i], synsets[2 * i + 1])
        results.append(result)
    print("\n{}\n".format('*' * 80))
    return results
def word_similarity(self, w1, w2, syns, loc, thr_sim):
    # Note: `wn.NOUN or wn.ADJ` evaluates to wn.NOUN only; adjective synsets
    # would need a separate lookup.
    syn1 = wn.synsets(w1, wn.NOUN or wn.ADJ)
    syn2 = wn.synsets(w2, wn.NOUN or wn.ADJ)
    if len(syn1) > 0 and len(syn2) > 0:
        score = 0
        max_score = 0
        count = 0
        sns1 = syn1[0]
        sns2 = syn2[0]
        for i in range(0, len(syn1)):
            for j in range(0, len(syn2)):
                if self.wordnet_metric == 'j':
                    # Jiang-Conrath Similarity (requires an information-content corpus argument)
                    score = wn.jcn_similarity(syn1[i], syn2[j])
                elif self.wordnet_metric == 'le':
                    # Leacock-Chodorow Similarity
                    score = wn.lch_similarity(syn1[i], syn2[j], simulate_root=False)
                elif self.wordnet_metric == 'li':
                    # Lin Similarity (requires an information-content corpus argument)
                    score = wn.lin_similarity(syn1[i], syn2[j])
                elif self.wordnet_metric == 'p':
                    # Path Similarity
                    score = wn.path_similarity(syn1[i], syn2[j])
                elif self.wordnet_metric == 'w':
                    # Wu-Palmer Similarity; it cannot be 0, it ranges in (0, 1]
                    score = wn.wup_similarity(syn1[i], syn2[j])
                if score > max_score:  # keep the best-scoring synset pair
                    max_score = score
                    sns1 = syn1[i]
                    sns2 = syn2[j]
                if max_score >= thr_sim:
                    # Store all the synset pairs whose score exceeds the threshold
                    syns, loc = self.merging_synsets(syns, w1, w2, sns1, sns2, max_score, loc)
                    count = count + 1
        if count == 0:
            # Store the synset pair with the maximum score even though it is below the threshold
            syns, loc = self.merging_synsets(syns, w1, w2, sns1, sns2, max_score, loc)
    return syns, loc
def word_similarity(measure, word1, word2, pos):
    wsim = 0.0
    if pos == "n" or pos == "v":
        if pos == "n":
            word1 = wn.synsets(word1, wn.NOUN)
            word2 = wn.synsets(word2, wn.NOUN)
        else:
            word1 = wn.synsets(word1, wn.VERB)
            word2 = wn.synsets(word2, wn.VERB)
        if word1 != [] and word2 != []:
            word1 = word1[0]
            word2 = word2[0]
            if measure == "path":
                wsim = wn.path_similarity(word1, word2)
            if measure == "lch":
                wsim = wn.lch_similarity(word1, word2)
                wsim = wsim / 3.63758615973  # maximum LCH value for nouns
            if measure == "wup":
                wsim = wn.wup_similarity(word1, word2)
            if measure == "res":
                wsim = word1.res_similarity(word2, brown_ic)
                wsim = wsim / 9.00601439892
            if measure == "jcn":
                wsim = word1.jcn_similarity(word2, brown_ic)
                wsim = wsim / (1e+300)  # NLTK returns 1e+300 for identical senses
            if measure == "lin":
                wsim = word1.lin_similarity(word2, brown_ic)
            return wsim
        else:
            return wsim
    else:
        if pos == "r" or pos == "a":
            if word1 == word2:
                return 1.0
            else:
                return 0.0
def lch_similarity(synsets1, synsets2):
    """
    This function returns Leacock Chodorow similarity (LCH) between two synsets,
    based on the shortest path distance and the maximum depth of the taxonomy.
    The equation to calculate LCH similarity is shown below:

    .. math::

        lch\\_similarity = -\\log\\left(
            \\frac{shortest\\_path\\_distance(synsets1, synsets2)}
                  {2 \\times taxonomy\\_depth}
        \\right)

    :param `Synset` synsets1: first synset supplied to measure the LCH similarity
    :param `Synset` synsets2: second synset supplied to measure the LCH similarity

    :return: LCH similarity between two synsets
    :rtype: float

    :Example:

        >>> from pythainlp.corpus.wordnet import lch_similarity, synset
        >>>
        >>> entity = synset('entity.n.01')
        >>> obj = synset('object.n.01')
        >>> cat = synset('cat.n.01')
        >>>
        >>> lch_similarity(entity, obj)
        2.538973871058276
        >>> lch_similarity(entity, cat)
        0.9985288301111273
        >>> lch_similarity(obj, cat)
        1.1526795099383855
    """
    return wordnet.lch_similarity(synsets1, synsets2)
def similarity_by_path(sense1, sense2, option="path", no_path_value=0):
    """
    Returns maximum path similarity between two senses.
    If no path is found between the two senses, returns no_path_value.
    """
    if option.lower() in ["path", "path_similarity"]:  # Path similarities
        sim_dir1 = wn.path_similarity(sense1, sense2)
        sim_dir2 = wn.path_similarity(sense2, sense1)
        if sim_dir1 is None and sim_dir2 is None:
            return no_path_value
        elif sim_dir1 is None:
            return sim_dir2
        elif sim_dir2 is None:
            return sim_dir1
        else:
            return max(sim_dir2, sim_dir1)
    elif option.lower() in ["wup", "wupa", "wu-palmer"]:  # Wu-Palmer
        wup_sim = wn.wup_similarity(sense1, sense2)
        return wup_sim if wup_sim is not None else no_path_value
    elif option.lower() in ['lch', "leacock-chordorow"]:  # Leacock-Chodorow
        if sense1.pos() != sense2.pos():  # lch can't do diff POS
            return no_path_value
        return wn.lch_similarity(sense1, sense2)
distances_res_bnc.append(sim7)
distances_jcn_bnc.append(sim8)
distances_lin_bnc.append(sim9)

# Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

# For each pair of synsets, compute the distance (1 - similarity)
for s1 in synsets:
    syn1 = wn.of2ss(s1)
    for s2 in synsets:
        syn2 = wn.of2ss(s2)
        distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.path_similarity(syn1, syn2)
        distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lch_similarity(syn1, syn2)
        distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.wup_similarity(syn1, syn2)
        distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1, syn2, brown_ic)
        distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1, syn2, brown_ic)
        distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1, syn2, brown_ic)
        distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1, syn2, bnc_ic)
        distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1, syn2, bnc_ic)
        distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1, syn2, bnc_ic)
        # Debug alternative: fill each matrix with dummy values instead.
        #distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
        #distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 / (labelsNLTK.index(s2) + 1)
def lch_similarity(synset1, synset2):
    return wn.lch_similarity(synset1, synset2)
#coeffs['drink'] = 1.0 - float(sys.argv[4])
coeffs['eat'] = 1.0
coeffs['drink'] = 1.0

for s in sentences:
    s = s.split()
    if s[1] not in verbs_gref:
        verbs_gref[s[1]] = np.zeros((num_basis, num_basis))
    verbs_gref[s[1]] = verbs_gref[s[1]] + np.outer(matrix[s[0]], matrix[s[2]])

verbs[s[1]] = np.zeros((num_basis, num_basis))
simweights = {}
for v in verbs:
    if sys.argv[3] == 'wup':
        simweights[v] = wn.wup_similarity(wn.synset(s[1] + '.v.01'), wn.synset(v + '.v.01'))
    elif sys.argv[3] == 'lch':
        simweights[v] = wn.lch_similarity(wn.synset(s[1] + '.v.01'), wn.synset(v + '.v.01'))
    elif sys.argv[3] == 'path':
        simweights[v] = wn.path_similarity(wn.synset(s[1] + '.v.01'), wn.synset(v + '.v.01'))
    verbs[s[1]] += coeffs[v] * simweights[v] * verbs_gref[v]
verbs[s[1]] /= float(sum(simweights.values()))

# Learn the reference matrices using Grefenstette for swallow, consume and gulp
gold_verbs = ['swallow', 'consume', 'gulp']
for gv in gold_verbs:
    with open('train/' + gv + '_train') as f:
        sentences = f.readlines()
    for s in sentences:
        s = s.split()
        if s[1] not in verbs_gref:
            verbs_gref[s[1]] = np.zeros((num_basis, num_basis))
        verbs_gref[s[1]] = verbs_gref[s[1]] + np.outer(matrix[s[0]], matrix[s[2]])
def wn_similarity(synset_1, synset_2):
    return wn.lch_similarity(synset_1, synset_2)
def lch_similarity(synsets1, synsets2):
    return wordnet.lch_similarity(synsets1, synsets2)
def lch_sim_fun(vq_words=[]):
    l1 = knowledge = [
        'recite', 'review', 'point', 'recognize', 'describe', 'choose', 'examine', 'identify', 'enumerate', 'find',
        'select', 'what', 'memorize', 'collect', 'sequence', 'when', 'duplicate', 'who', 'label', 'write',
        'indicate', 'state', 'tabulate', 'which', 'relate', 'show', 'arrange', 'cite', 'match', 'define',
        'locate', 'draw', 'repeat', 'remember', 'trace', 'read', 'quote', 'spell', 'memorise', 'how',
        'observe', 'recognise', 'copy', 'why', 'outline', 'count', 'name', 'recall', 'study', 'omit',
        'list', 'tell', 'reproduce', 'record', 'retell', 'meet', 'listen', 'where', 'order', 'view'
    ]
    l2 = comprehension = [
        'compare', 'cite', 'give', 'predict', 'recognize', 'describe', 'articulate', 'detail', 'order', 'characterize',
        'generalize', 'factor', 'summarize', 'select', 'illustrate', 'visualize', 'group', 'trace', 'purpose', 'defend',
        'rewrite', 'relate', 'approximate', 'demonstrate', 'indicate', 'add', 'interact', 'tell', 'extrapolate', 'show',
        'rephrase', 'paraphrase', 'infer', 'contrast', 'locate', 'picture', 'extend', 'associate', 'conclude', 'express',
        'interpolate', 'generalise', 'clarify', 'observe', 'understand', 'differentiate', 'review', 'distinguish',
        'estimate', 'subtract', 'discuss', 'interpret', 'summarise', 'convert', 'translate', 'compute', 'outline',
        'identify', 'elaborate', 'ask', 'example', 'classify', 'report', 'restate', 'explain', 'match'
    ]
    l3 = application = [
        'represent', 'show', 'identify', 'participate', 'derive', 'group', 'calculate', 'graph', 'dramatize', 'choose',
        'factor', 'include', 'allocate', 'handle', 'practice', 'relate', 'schedule', 'report', 'assess', 'collect',
        'investigate', 'categorise', 'ascertain', 'round', 'sketch', 'transcribe', 'sequence', 'imitate', 'discover',
        'connect', 'tabulate', 'employ', 'avoid', 'experiment', 'manipulate', 'exercise', 'extend', 'associate',
        'modify', 'personalize', 'dramatise', 'explore', 'teach', 'change', 'perform', 'summarise', 'act', 'implement',
        'assign', 'alphabetize', 'relate', 'articulate', 'administer', 'subscribe', 'instruct', 'determine', 'apply',
        'establish', 'select', 'illustrate', 'plot', 'use', 'prepare', 'paint', 'transfer', 'construct', 'process',
        'interpret', 'translate', 'depreciate', 'complete', 'expose', 'acquire', 'adapt', 'link', 'simulate',
        'diminish', 'compute', 'project', 'demonstrate', 'control', 'predict', 'contribute', 'examine', 'attain',
        'capture', 'develop', 'provide', 'utilize', 'write', 'build', 'interview', 'organise', 'classify', 'draw',
        'express', 'customize', 'price', 'chart', 'produce', 'plan', 'inform', 'solve', 'correlation', 'model',
        'operate', 'convert'
    ]
    l4 = analysis = [
        'find', 'focus', 'identify', 'query', 'debate', 'relationships', 'derive', 'group', 'calculate', 'explain',
        'theme', 'choose', 'reason', 'proof', 'reorganise', 'point', 'interrupt', 'difference', 'arrange', 'list',
        'investigate', 'classify', 'discover', 'motive', 'deduce', 'connect', 'advertise', 'detect', 'confirm',
        'research', 'experiment', 'size', 'cause', 'contrast', 'inspect', 'explore', 'distinguish', 'layout',
        'optimize', 'interpret', 'question', 'omit', 'depth', 'ensure', 'distinction', 'inference', 'divide', 'relate',
        'manage', 'rank', 'maximize', 'categorize', 'establish', 'select', 'illustrate', 'subdivide', 'transform',
        'comparing', 'assumption', 'analyze', 'function', 'analyse', 'train', 'differentiate', 'breadboard', 'dissect',
        'see', 'limit', 'highlight', 'appraise', 'diagnose', 'blueprint', 'compare', 'recognize', 'characterize',
        'examine', 'file', 'discriminate', 'discussion', 'isolate', 'inventory', 'test', 'survey', 'document',
        'infer', 'categorise', 'breakdown', 'separate', 'effect', 'diagram', 'simplify', 'point', 'audit', 'criticize',
        'outline', 'correlate', 'minimize', 'prioritize', 'organise', 'model', 'order', 'test'
    ]
    l5 = synthesis = [
        'incorporate', 'code', 'reorganize', 'invent', 'generalize', 'compose', 'overhaul', 'explain', 'hypothesize',
        'program', 'combine', 'choose', 'frame', 'integrate', 'collaborate', 'handle', 'format', 'propose', 'express',
        'progress', 'reconstruct', 'speculate', 'discuss', 'comply', 'arrange', 'intervene', 'collect', 'hypothesise',
        'debug', 'enhance', 'anticipate', 'originate', 'formulate', 'discover', 'reinforce', 'design', 'animate',
        'substitute', 'network', 'join', 'experiment', 'adapt', 'lecture', 'contrast', 'extend', 'visualise', 'modify',
        'makeup', 'prescribe', 'imagine', 'interface', 'estimate', 'generate', 'change', 'improve', 'convert',
        'elaborate', 'initiate', 'individualize', 'think', 'revise', 'organize', 'relate', 'assemble', 'synthesize',
        'categorize', 'summarize', 'prepare', 'create', 'transform', 'construct', 'predict', 'theorise', 'minimise',
        'tell', 'cope', 'maximise', 'innovate', 'specify', 'communicate', 'setup', 'pretend', 'budget', 'compile',
        'suppose', 'tabulate', 'delete', 'compare', 'rewrite', 'devise', 'abstract', 'dictate', 'cultivate', 'happen',
        'portray', 'depict', 'develop', 'perform', 'make', 'write', 'build', 'test', 'negotiate', 'rearrange',
        'simplify', 'produce', 'plan', 'validate', 'structure', 'add', 'outline', 'facilitate', 'correspond', 'solve',
        'model', 'original'
    ]
    l6 = evaluation = [
        'validate', 'compare', 'deduct', 'useful', 'consider', 'conclude', 'predict', 'relate', 'describe', 'influence',
        'rank', 'assess', 'rate', 'persuade', 'determine', 'measure', 'critique', 'mark', 'summarize', 'select',
        'discuss', 'discriminate', 'prove', 'verify', 'defend', 'support', 'debate', 'grade', 'argue', 'disprove',
        'recommend', 'test', 'infer', 'contrast', 'choose', 'attach', 'good', 'importance', 'evaluate', 'criteria',
        'prescribe', 'hire', 'award', 'perceive', 'dispute', 'know', 'decide', 'opinion', 'judge', 'estimate',
        'why', 'interpret', 'counsel', 'criticize', 'effective', 'prioritize', 'value', 'agree', 'bad', 'convince',
        'prioritise', 'release', 'frame', 'appraise', 'explain', 'criticise', 'justify'
    ]
    cl_listoflist = []
    cl_listoflist.append(l1)
    cl_listoflist.append(l2)
    cl_listoflist.append(l3)
    cl_listoflist.append(l4)
    cl_listoflist.append(l5)
    cl_listoflist.append(l6)
    cnt_log = 0
    final_level_of_ques = -1
    final_sim_of_ques_with_all_levels = [0, 0, 0, 0, 0, 0]
    final_area_sim_of_ques_with_all_levels = [0, 0, 0, 0, 0, 0]
    for vq_word in vq_words:
        # Calculate the sum and average similarity of the word with each list.
        # print("\n\ndoing for word -----", vq_word)
        sum_of_sim_all_levels = []
        avg_of_sim_all_levels = []
        for i, list_i in enumerate(cl_listoflist):
            # print("list number : ", i)
            sum_of_sim = 0
            for l_word in list_i:
                # print("two words ", vq_word, l_word)
                if len(wordnet.synsets(vq_word)) == 0:
                    # print(vq_word)
                    break
                vq_word_syn = wordnet.synsets(vq_word)[0]
                # print("l_word => wordnet.synsets(l_word)", l_word, "=>", wordnet.synsets(l_word))
                if len(wordnet.synsets(l_word)) == 0:
                    # print(l_word)
                    continue
                l_word_syn = wordnet.synsets(l_word)[0]
                try:
                    wup_sim = wordnet.lch_similarity(vq_word_syn, l_word_syn)
                except:
                    # print(vq_word_syn, l_word_syn, "->exception")
                    continue
                # wup_sim = (vq_word_syn).jcn_similarity(l_word_syn)
                if wup_sim is not None:
                    sum_of_sim = sum_of_sim + wup_sim
                    # sum_of_sim += 1
                    # print(" counted ", vq_word, l_word, "synset ", vq_word_syn, l_word_syn)
                else:
                    cnt_log = cnt_log + 1
                    # print("Not counted ", vq_word, l_word, "synset ", vq_word_syn, l_word_syn)
                    # input()
            sum_of_sim_all_levels.append(sum_of_sim)
            avg_of_sim_all_levels.append(sum_of_sim / len(list_i))
        # print("\n\n printing all lists")
        # for l in cl_listoflist:
        #     print(l)
        # QUES WORK BEGIN
        # print("Sim")
        for i in range(0, 6):
            final_sim_of_ques_with_all_levels[i] += avg_of_sim_all_levels[i]
            # print(final_sim_of_ques_with_all_levels[i], ",")
        # print("area sim")
        for i in range(0, 6):
            final_area_sim_of_ques_with_all_levels[i] += sum_of_sim_all_levels[i]
            # print(final_area_sim_of_ques_with_all_levels[i], ",")
    # print("cnt_log", cnt_log)
    # print("Final Sim")
    # for i in range(0, 6):
    #     print(final_sim_of_ques_with_all_levels[i], ",")
    # print("Final Area Sim")
    # for i in range(0, 6):
    #     print(final_area_sim_of_ques_with_all_levels[i], ",")
    # Take the maximum of all similarity values to find the CL level.
    final_level = 0
    max_sim = final_sim_of_ques_with_all_levels[0]
    for index, sim in enumerate(final_sim_of_ques_with_all_levels):
        if sim > max_sim:
            max_sim = sim
            final_level = index
    # print("avg list: ", avg_of_sim_all_levels)
    # print("sum list: ", sum_of_sim_all_levels)
    # Find whether the word would be classified into more than one level.
    count = 0
    indices_of_same_sim = []
    for i, sim in enumerate(final_sim_of_ques_with_all_levels):
        if sim == max_sim:
            count += 1
            indices_of_same_sim.append(i)
    # If the word ties across levels, break the tie with the area (sum) similarity.
    if len(indices_of_same_sim) > 1:
        # print("ques is in more than two levels")
        same_sim_list = []
        for index in indices_of_same_sim:
            same_sim_list.append(final_area_sim_of_ques_with_all_levels[index])
        max_sim_area = same_sim_list[0]
        for sim_area, index_of_max_sim in zip(same_sim_list, indices_of_same_sim):
            if sim_area > max_sim_area:
                max_sim_area = sim_area
                final_level = index_of_max_sim
    # print("final_level ", final_level)
    return final_level