def _synset_similarity(method, s1, s2):
    """Score a single synset pair with the named method.

    Returns None when the method yields no score for this pair (including
    JCN/LIN pairs that are skipped because their POS differ).  Calls
    ``sys.exit`` for an unknown method name.
    """
    if method == "PATH":
        return s1.path_similarity(s2)
    if method == "LCH":
        return wordnet.lch_similarity(s1, s2)
    if method == "WUP":
        return wordnet.wup_similarity(s1, s2)
    if method == "RES":
        return wordnet.res_similarity(s1, s2, brown_ic)
    if method in ("JCN", "LIN"):
        # can't do diff POS
        if s1.pos() != s2.pos() or s1.pos() not in ('n', 'a', 'v'):
            return None
        if method == "JCN":
            return wordnet.jcn_similarity(s1, s2, brown_ic)
        return wordnet.lin_similarity(s1, s2, brown_ic)
    sys.exit("Error! No similarity methods!")


def compare_allsynsets(method, word1, word2):
    """Return the highest similarity over all synset pairs of two words.

    :param method: one of "PATH", "LCH", "WUP", "RES", "JCN", "LIN"
    :param word1: first word, looked up with ``wordnet.synsets``
    :param word2: second word, looked up with ``wordnet.synsets``
    :return: the largest score found, or 0.0 when no pair scores above zero
        (including when either word has no synsets)

    An unknown ``method`` terminates the program through ``sys.exit``, but
    only once a synset pair is actually scored, as in the original code.
    """
    ss1 = wordnet.synsets(word1)
    ss2 = wordnet.synsets(word2)

    simi_value = 0.0
    for s1, s2 in product(ss1, ss2):
        simi = _synset_similarity(method, s1, s2)
        # A missing score must be skipped explicitly: comparing None with a
        # float raises TypeError on Python 3.
        if simi is not None and simi > simi_value:
            simi_value = simi
    return simi_value
def semantic_similarity(word1, word2, speech, measure):
    """
    Finds the highest similarity score for the given pair of words.
    Compares every sense of the first word with every sense of the second;
    only senses with the requested part of speech are looked up.

    :param word1: First word in the pair of words
    :param word2: Second word in the pair of words
    :param speech: part of speech passed to ``wn.synsets`` (e.g. wn.NOUN)
    :param measure: String naming the similarity measure
                    ("path" = path ; "res" = Resnik ; "lin" = Lin)
    :return: The highest similarity score across all sense pairs, or 0 when
             no pair produces a score
    :raises ValueError: if ``measure`` is not "path", "res" or "lin"
    """
    # Reject an invalid measure before doing any WordNet lookups.
    if measure not in ("path", "res", "lin"):
        raise ValueError("Not a valid similarity type \n Must be 'path'(path), 'res'(Resnik) or 'lin'(Lin)")

    # Choose the scoring function once instead of re-testing `measure` for
    # every pair of senses.
    if measure == "path":
        def score(concept_a, concept_b):
            return wn.path_similarity(concept_a, concept_b)
    elif measure == "res":
        def score(concept_a, concept_b):
            return wn.res_similarity(concept_a, concept_b, brown_ic)
    else:
        def score(concept_a, concept_b):
            return wn.lin_similarity(concept_a, concept_b, brown_ic)

    greatest = 0
    concepts_a = wn.synsets(word1, speech)
    concepts_b = wn.synsets(word2, speech)

    for concept_a in concepts_a:
        for concept_b in concepts_b:
            similarity = score(concept_a, concept_b)
            # A None score means no similarity could be computed for the pair.
            if similarity is None:
                continue
            if similarity > greatest:
                greatest = similarity
    return greatest
def create_graphs(doc_list):
    """Draw the sense-similarity graphs for a list of documents.

    ``doc_list`` may be None, in which case ``default_document_list()`` is
    used.  Each entry of the distance-function table is a triple of
    (reference value, short name, two-argument similarity callable); the
    reference value is either a fixed constant or the similarity of
    ``SYNSETS[0]`` with itself (presumably the scale maximum -- confirm
    against ``create_against_graph``).
    """
    documents = default_document_list() if doc_list is None else doc_list

    def lch(sense_1, sense_2):
        return wn.lch_similarity(sense_1, sense_2)

    def lin(sense_1, sense_2):
        return wn.lin_similarity(sense_1, sense_2, CORPUS)

    def res(sense_1, sense_2):
        return wn.res_similarity(sense_1, sense_2, CORPUS)

    def jcn(sense_1, sense_2):
        return wn.jcn_similarity(sense_1, sense_2, CORPUS)

    def path(sense_1, sense_2):
        return wn.path_similarity(sense_1, sense_2)

    lch_reference = wn.lch_similarity(SYNSETS[0], SYNSETS[0])
    jcn_reference = wn.jcn_similarity(SYNSETS[0], SYNSETS[0], CORPUS)
    distance_functions = [
        (lch_reference, 'lch', lch),
        (1.0, 'lin', lin),
        (10.636958516573292, 'res', res),
        (jcn_reference, 'jcn', jcn),
        (1.0, 'path', path),
    ]

    # Pair every top sense with the name of the document it came from.
    all_senses = [
        (sense, doc.name)
        for doc in documents
        for sense in doc.top_senses()
    ]

    # First comparison: three fixed reference synsets, fixed colours.
    topic_names = ["economy.n.01", "philosophy.n.02", "politics.n.01"]
    topic_synsets = [wn.synset(name) for name in topic_names]
    create_against_graph('phyl_eco_pol', documents, all_senses,
                         topic_synsets, distance_functions, ['r', 'b', 'g'])

    # Second comparison: the SYNSETS list, one random RGB colour each.
    random_colors = [(random(), random(), random()) for _ in SYNSETS]
    create_against_graph('handpicked', documents, all_senses,
                         SYNSETS, distance_functions, random_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
def get_lin_min(self, sentence1, sentence2):
    """Return the smallest Lin similarity between the sentences' differing words.

    The two word collections returned by ``self.sentence_difference`` are
    compared pairwise.  For each word the synsets are looked up with the tag
    from ``get_tag``; when that finds nothing, the lookup is retried without
    a tag.  Only the first synset of each word is used, and pairs whose first
    synsets have different POS are skipped.

    :return: the minimum similarity found, or 0 when no pair could be scored
    """
    sentence1_unique, sentence2_unique = self.sentence_difference(
        sentence1, sentence2)

    # None marks "nothing scored yet"; this replaces the Python-2-only
    # sys.maxint sentinel.
    min_similarity = None

    for sentence1_word in sentence1_unique:
        # The lookups for the first word do not depend on the second word,
        # so they are done once per outer iteration.
        synsets_word1 = wordnet.synsets(
            sentence1_word, sentence1.get_tag(sentence1_word))
        if not synsets_word1:
            synsets_word1 = wordnet.synsets(sentence1_word)
        if not synsets_word1:
            continue

        for sentence2_word in sentence2_unique:
            synsets_word2 = wordnet.synsets(
                sentence2_word, sentence2.get_tag(sentence2_word))
            if not synsets_word2:
                synsets_word2 = wordnet.synsets(sentence2_word)
            if not synsets_word2:
                continue

            # Skip words with different tags
            if synsets_word1[0].pos() != synsets_word2[0].pos():
                continue

            # Best effort: a pair the corpus cannot score is skipped.
            try:
                similarity = wordnet.lin_similarity(
                    synsets_word1[0], synsets_word2[0], self.brown_ic)
            except Exception:
                continue

            if similarity is None:
                continue
            if min_similarity is None or similarity < min_similarity:
                min_similarity = similarity

    if min_similarity is None:
        return 0
    return min_similarity
def similarity_by_infocontent(sense1, sense2, option):
    """
    Returns similarity scores by information content.

    :param sense1: first synset
    :param sense2: second synset
    :param option: 'res'/'resnik', 'jcn'/'jiang-conrath' or 'lin'
    :return: 0 when the two senses have different POS, otherwise the score
        of the selected measure; None for an unrecognised option

    NOTE: the information-content file is loaded through ``wnic.ic`` on
    every call.
    """
    def pos_of(sense):
        # `pos` is a method on NLTK 3 synsets and a plain attribute on older
        # ones.  Comparing the bound methods themselves is always "unequal",
        # which made every pair look like a POS mismatch.
        tag = sense.pos
        return tag() if callable(tag) else tag

    if pos_of(sense1) != pos_of(sense2):  # infocontent sim can't do diff POS.
        return 0

    if option in ('res', 'resnik'):
        return wn.res_similarity(sense1, sense2,
                                 wnic.ic('ic-bnc-resnik-add1.dat'))
    if option in ('jcn', "jiang-conrath"):
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
    if option == 'lin':
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
    return None
def linWordnetSimilarity(word1, word2, ic):
    """Return the best Lin similarity between any synsets of two words.

    Every synset of each word is passed through ``smoothed`` first.  A pair
    is scored only when both synsets have the same ``pos`` and that ``pos``
    is a key of ``ic``.  Negative scores are discarded.

    :return: the maximum non-negative score, or None when no pair qualifies

    NOTE(review): ``pos`` is read as an attribute here; confirm that the
    objects returned by ``smoothed`` expose it that way.
    """
    # Build both smoothed lists once.  The original rebuilt the second list
    # (WordNet lookup plus smoothing) for every synset of the first word.
    synsets1 = [smoothed(s) for s in wordnet.synsets(word1)]
    synsets2 = [smoothed(s) for s in wordnet.synsets(word2)]

    possibleValues = [
        wordnet.lin_similarity(synset1, synset2, ic)
        for synset1 in synsets1
        for synset2 in synsets2
        if synset1.pos == synset2.pos and synset1.pos in ic
    ]
    possibleValues = [v for v in possibleValues if v >= 0]
    return max(possibleValues) if possibleValues else None
def get_similarity(synsets, c, ic):
    """Build a matrix of Lin similarities between two synset collections.

    :param synsets: synsets forming the columns of the matrix
    :param c: synsets forming the rows of the matrix
    :param ic: information-content data passed to ``wn.lin_similarity``
    :return: a list with one row per element of ``c``; each row holds one
        score per element of ``synsets``.  A pair whose similarity raises
        an error is recorded as 0.
    """
    matrix = []
    for ci in c:
        row = []
        for si in synsets:
            # Best effort: a pair that cannot be scored counts as 0.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            try:
                row.append(wn.lin_similarity(si, ci, ic))
            except Exception:
                row.append(0)
        matrix.append(row)
    return matrix
def lin(self, synset_a, synset_b, ic):
    """Return the normalized Lin similarity of two synsets.

    :param synset_a: first synset
    :param synset_b: second synset
    :param ic: information-content data passed to ``wordnet.lin_similarity``
    :return: ``self.normalize(self.MAX_VALUE, score)``; 0 when the synsets
        have different POS, when normalization yields None, or when a
        ZeroDivisionError is raised while scoring or normalizing
    """
    if synset_a.pos() != synset_b.pos():
        return 0

    try:
        norm = self.normalize(
            self.MAX_VALUE,
            wordnet.lin_similarity(synset_a, synset_b, ic)
        )
    except ZeroDivisionError:
        return 0

    return 0 if norm is None else norm
def similarity_by_path(sense1, sense2, option="path"):
    """
    Returns a path-based similarity between two senses.

    :param sense1: first synset
    :param sense2: second synset
    :param option: case-insensitive measure name:
        "path"/"path_similarity", "wup"/"wupa"/"wu-palmer", or
        "lch"/"leacock-chodorow" (the historical misspelling
        "leacock-chordorow" is still accepted)
    :return: the selected score.  For "path" the larger of the two
        directions is returned, or None when neither direction has a score.
        For "lch" 0 is returned when the senses have different POS.
        Any other option falls back to Lin similarity over the
        'ic-bnc-add1.dat' information content.
    """
    def pos_of(sense):
        # `pos` is a method on NLTK 3 synsets and an attribute on older
        # ones; comparing bound methods directly is always "unequal".
        tag = sense.pos
        return tag() if callable(tag) else tag

    choice = option.lower()

    if choice in ("path", "path_similarity"):  # Path similarity
        # The original took max() over two identical calls; score both
        # directions and drop missing scores so max() never sees None.
        scores = [wn.path_similarity(sense1, sense2),
                  wn.path_similarity(sense2, sense1)]
        scores = [score for score in scores if score is not None]
        return max(scores) if scores else None

    if choice in ("wup", "wupa", "wu-palmer"):  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2)

    if choice in ("lch", "leacock-chordorow", "leacock-chodorow"):  # Leacock-Chodorow
        if pos_of(sense1) != pos_of(sense2):  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)

    return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
def test_wordnet_ic(self):
    """Check that our IC-based similarities for car/bus equal NLTK's.

    Resnik, Jiang-Conrath and Lin similarity are each computed with our
    implementation and with NLTK, both using the BNC Resnik add-1
    information content, and the results are compared for equality.
    """
    from nltk.corpus import wordnet as nltk_wn
    from nltk.corpus import wordnet_ic as nltk_wnic

    nltk_car = nltk_wn.synset('car.n.1')
    nltk_bus = nltk_wn.synset('bus.n.1')

    our_ic = WordNetInformationContent('bnc', resnik=True, add1=True)
    our_car = our_wn.synset('car.n.1')
    our_bus = our_wn.synset('bus.n.1')

    nltk_ic = nltk_wnic.ic('ic-bnc-resnik-add1.dat')

    for measure in ('res_similarity', 'jcn_similarity', 'lin_similarity'):
        ours = getattr(our_wn, measure)(our_car, our_bus, our_ic)
        theirs = getattr(nltk_wn, measure)(nltk_car, nltk_bus, nltk_ic)
        assert ours == theirs
def compute_similarities(s1, s2, sim):
    """Score two synsets with the similarity measure named by ``sim``.

    Recognised names are "path", "lch", "wup", "res", "jcn" and "lin"; the
    last three use ``genesis_ic`` as information content.  Any other name
    yields None.
    """
    # Each entry is deferred so that only the requested measure is evaluated.
    measures = {
        "path": lambda: wn.path_similarity(s1, s2),
        "lch": lambda: wn.lch_similarity(s1, s2),
        "wup": lambda: wn.wup_similarity(s1, s2),
        "res": lambda: wn.res_similarity(s1, s2, genesis_ic),
        "jcn": lambda: wn.jcn_similarity(s1, s2, genesis_ic),
        "lin": lambda: wn.lin_similarity(s1, s2, genesis_ic),
    }
    compute = measures.get(sim)
    if compute is None:
        return None
    return compute()
def similarity2(lverb, el):
    """Return, for each synset in ``lverb``, its best Lin similarity to ``el``.

    Each entry is the largest ``wn.lin_similarity`` score (using
    ``brown_ic``) against the synsets in ``el``, or -1 when no score
    exceeds -1 (for example when ``el`` is empty).  If the result does not
    contain exactly two entries, a single 0 is appended.
    """
    ret = [
        # Seeding with -1 reproduces the original running maximum.
        max([-1] + [wn.lin_similarity(v1, v2, brown_ic, verbose=False)
                    for v2 in el])
        for v1 in lverb
    ]
    if len(ret) != 2:
        ret.append(0)
    return ret
def wnsim(synset1, synset2, method='all'):
    """Compute WordNet similarity between two synset references.

    A reference shaped like ``name.pos.number`` is resolved with
    ``wn.synset``; anything else goes through ``wn_synset``.  If either
    reference resolves to None the result is 0.

    ``method`` selects one of 'lin', 'res', 'jcn', 'wup', 'path', 'lch'
    (the first three use ``wn_ic``) and returns that score.  'all' returns
    a list of ``(name, score)`` tuples in that order.  An unrecognised
    method returns None.
    """
    is_synset_name = re.compile(r'^.+\..+\.\d+$').match

    def resolve(reference):
        if is_synset_name(reference):
            return wn.synset(reference)
        return wn_synset(reference)

    s1 = resolve(synset1)
    s2 = resolve(synset2)
    if s1 is None or s2 is None:
        return 0

    # Deferred computations, listed in the order used for method='all'.
    measures = (
        ('lin', lambda: wn.lin_similarity(s1, s2, wn_ic)),
        ('res', lambda: wn.res_similarity(s1, s2, wn_ic)),
        ('jcn', lambda: wn.jcn_similarity(s1, s2, wn_ic)),
        ('wup', lambda: wn.wup_similarity(s1, s2)),
        ('path', lambda: wn.path_similarity(s1, s2)),
        ('lch', lambda: wn.lch_similarity(s1, s2)),
    )

    for name, compute in measures:
        if method == name:
            return compute()
    if method == 'all':
        return [(name, compute()) for name, compute in measures]
    return None
def computeLinSimilarity(term1, term2):
    """Return the highest Lin similarity between any synsets of two terms.

    The module-level ``ic`` is loaded from 'ic-brown.dat' on first use.
    Synset pairs whose scoring raises an exception are ignored.  The result
    is 0 when no pair scores above zero.
    """
    global ic
    if not ic:
        ic = wordnet_ic.ic('ic-brown.dat')

    first_senses = wn.synsets(term1)
    second_senses = wn.synsets(term2)

    best = 0
    for sense_a in first_senses:
        for sense_b in second_senses:
            try:
                # max() keeps `best` unless the new score is strictly larger.
                best = max(best, wn.lin_similarity(sense_a, sense_b, ic))
            except Exception:
                continue
    return best
def word_similarity(self, w1, w2, syns, loc, thr_sim):
    """Find the best-scoring synset pair for two words and merge it into ``syns``.

    The metric is selected by ``self.wordnet_metric``: 'j' (Jiang-Conrath),
    'le' (Leacock-Chodorow), 'li' (Lin), 'p' (path) or 'w' (Wu-Palmer).
    Any other value scores every pair as 0.

    :param w1: first word
    :param w2: second word
    :param syns: accumulator handed to and returned by ``self.merging_synsets``
    :param loc: second value handed to and returned by ``self.merging_synsets``
    :param thr_sim: similarity threshold
    :return: the ``(syns, loc)`` pair, unchanged when either word has no
        noun synsets

    Each time a new best pair is found whose score reaches ``thr_sim`` it is
    merged.  If no pair was merged that way, the overall best pair (or the
    first synsets of each word when nothing scored above 0) is merged once.
    """
    # The original passed `wn.NOUN or wn.ADJ`, which always evaluates to
    # wn.NOUN, so only noun synsets were ever looked up.  This spells that
    # out without changing the result.
    syn1 = wn.synsets(w1, wn.NOUN)
    syn2 = wn.synsets(w2, wn.NOUN)
    if not syn1 or not syn2:
        return syns, loc

    metric = self.wordnet_metric
    max_score = 0
    count = 0
    sns1 = syn1[0]
    sns2 = syn2[0]

    for candidate1 in syn1:
        for candidate2 in syn2:
            if metric == 'j':  # Jiang-Conrath Similarity
                # NOTE(review): NLTK's jcn_similarity normally takes an
                # information-content argument -- confirm this call works.
                score = wn.jcn_similarity(candidate1, candidate2)
            elif metric == 'le':  # Leacock-Chodorow Similarity
                score = wn.lch_similarity(candidate1, candidate2,
                                          simulate_root=False)
            elif metric == 'li':  # Lin Similarity
                # NOTE(review): same concern as for jcn_similarity above.
                score = wn.lin_similarity(candidate1, candidate2)
            elif metric == 'p':  # Path Similarity
                score = wn.path_similarity(candidate1, candidate2)
            elif metric == 'w':  # Wu-Palmer Similarity
                score = wn.wup_similarity(candidate1, candidate2)
            else:
                score = 0

            # Some measures return None when no score exists; comparing None
            # with a number raises TypeError on Python 3.
            if score is None:
                continue

            if score > max_score:  # Finding the maximum score
                max_score = score
                sns1 = candidate1
                sns2 = candidate2
                if max_score >= thr_sim:
                    # Store each new best pair that reaches the threshold.
                    syns, loc = self.merging_synsets(
                        syns, w1, w2, sns1, sns2, max_score, loc)
                    count = count + 1

    if count == 0:
        # Nothing reached the threshold: store the best pair found anyway.
        syns, loc = self.merging_synsets(
            syns, w1, w2, sns1, sns2, max_score, loc)
    return syns, loc
def create_graphs(doc_list):
    """Produce every similarity graph for the given documents.

    Falls back to ``default_document_list()`` when ``doc_list`` is None.
    Two "against" graphs are created (three fixed topic synsets, then the
    SYNSETS list with random colours), followed by the top-senses graph.
    """
    if doc_list is None:
        documents = default_document_list()
    else:
        documents = doc_list

    # (reference value, label, similarity callable) for each measure.
    anchor = SYNSETS[0]
    distance_functions = []
    distance_functions.append((
        wn.lch_similarity(anchor, anchor), 'lch',
        lambda sense_1, sense_2: wn.lch_similarity(sense_1, sense_2)))
    distance_functions.append((
        1.0, 'lin',
        lambda sense_1, sense_2: wn.lin_similarity(sense_1, sense_2, CORPUS)))
    distance_functions.append((
        10.636958516573292, 'res',
        lambda sense_1, sense_2: wn.res_similarity(sense_1, sense_2, CORPUS)))
    distance_functions.append((
        wn.jcn_similarity(anchor, anchor, CORPUS), 'jcn',
        lambda sense_1, sense_2: wn.jcn_similarity(sense_1, sense_2, CORPUS)))
    distance_functions.append((
        1.0, 'path',
        lambda sense_1, sense_2: wn.path_similarity(sense_1, sense_2)))

    # Each top sense is tagged with the name of its source document.
    all_senses = []
    for doc in documents:
        all_senses.extend((sense, doc.name) for sense in doc.top_senses())

    fixed_targets = list(map(
        wn.synset, ["economy.n.01", "philosophy.n.02", "politics.n.01"]))
    create_against_graph('phyl_eco_pol', documents, all_senses,
                         fixed_targets, distance_functions, ['r', 'b', 'g'])

    # One random RGB triple per entry of SYNSETS.
    handpicked_colors = []
    for _unused in range(len(SYNSETS)):
        handpicked_colors.append((random(), random(), random()))
    create_against_graph('handpicked', documents, all_senses,
                         SYNSETS, distance_functions, handpicked_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
def get_lin_average(self, sentence1, sentence2):
    """Return the mean Lin similarity between the sentences' differing words.

    The two word collections returned by ``self.sentence_difference`` are
    compared pairwise.  For each word the synsets are looked up with the tag
    from ``get_tag``; when that finds nothing, the lookup is retried without
    a tag.  Only the first synset of each word is used, and pairs whose first
    synsets have different POS are skipped.

    :return: the average over all scored pairs as a float, or 0 when no pair
        could be scored
    """
    sentence1_unique, sentence2_unique = self.sentence_difference(
        sentence1, sentence2)

    similarity_sum = 0
    total_count = 0

    for sentence1_word in sentence1_unique:
        # The lookups for the first word do not depend on the second word,
        # so they are done once per outer iteration.
        synsets_word1 = wordnet.synsets(
            sentence1_word, sentence1.get_tag(sentence1_word))
        if not synsets_word1:
            synsets_word1 = wordnet.synsets(sentence1_word)
        if not synsets_word1:
            continue

        for sentence2_word in sentence2_unique:
            synsets_word2 = wordnet.synsets(
                sentence2_word, sentence2.get_tag(sentence2_word))
            if not synsets_word2:
                synsets_word2 = wordnet.synsets(sentence2_word)
            if not synsets_word2:
                continue

            # Skip words with different tags
            if synsets_word1[0].pos() != synsets_word2[0].pos():
                continue

            # Best effort: a pair the corpus cannot score is skipped.
            try:
                similarity = wordnet.lin_similarity(
                    synsets_word1[0], synsets_word2[0], self.brown_ic)
            except Exception:
                continue

            if similarity is not None:
                similarity_sum += similarity
                total_count += 1

    if total_count == 0:
        return 0
    return float(similarity_sum) / float(total_count)
# Verb lists for the six levels, in the order: knowledge, comprehension,
# application, analysis, synthesis, evaluation.  The position in this tuple
# is the level index returned by lin_sim_fun.
_COGNITIVE_LEVEL_VERBS = (
    # 0: knowledge
    (
        'recite', 'review', 'point', 'recognize', 'describe', 'choose',
        'examine', 'identify', 'enumerate', 'find', 'select', 'what',
        'memorize', 'collect', 'sequence', 'when', 'duplicate', 'who',
        'label', 'write', 'indicate', 'state', 'tabulate', 'which', 'relate',
        'show', 'arrange', 'cite', 'match', 'define', 'locate', 'draw',
        'repeat', 'remember', 'trace', 'read', 'quote', 'spell', 'memorise',
        'how', 'observe', 'recognise', 'copy', 'why', 'outline', 'count',
        'name', 'recall', 'study', 'omit', 'list', 'tell', 'reproduce',
        'record', 'retell', 'meet', 'listen', 'where', 'order', 'view',
    ),
    # 1: comprehension
    (
        'compare', 'cite', 'give', 'predict', 'recognize', 'describe',
        'articulate', 'detail', 'order', 'characterize', 'generalize',
        'factor', 'summarize', 'select', 'illustrate', 'visualize', 'group',
        'trace', 'purpose', 'defend', 'rewrite', 'relate', 'approximate',
        'demonstrate', 'indicate', 'add', 'interact', 'tell', 'extrapolate',
        'show', 'rephrase', 'paraphrase', 'infer', 'contrast', 'locate',
        'picture', 'extend', 'associate', 'conclude', 'express',
        'interpolate', 'generalise', 'clarify', 'observe', 'understand',
        'differentiate', 'review', 'distinguish', 'estimate', 'subtract',
        'discuss', 'interpret', 'summarise', 'convert', 'translate',
        'compute', 'outline', 'identify', 'elaborate', 'ask', 'example',
        'classify', 'report', 'restate', 'explain', 'match',
    ),
    # 2: application
    (
        'represent', 'show', 'identify', 'participate', 'derive', 'group',
        'calculate', 'graph', 'dramatize', 'choose', 'factor', 'include',
        'allocate', 'handle', 'practice',
        # The original read 'relate' 'schedule' with no comma, which Python
        # silently joined into the non-word 'relateschedule'.
        'relate', 'schedule',
        'report', 'assess', 'collect', 'investigate', 'categorise',
        'ascertain', 'round', 'sketch', 'transcribe', 'sequence', 'imitate',
        'discover', 'connect', 'tabulate', 'employ', 'avoid', 'experiment',
        'manipulate', 'exercise', 'extend', 'associate', 'modify',
        'personalize', 'dramatise', 'explore', 'teach', 'change', 'perform',
        'summarise', 'act', 'implement', 'assign', 'alphabetize', 'relate',
        'articulate', 'administer', 'subscribe', 'instruct', 'determine',
        'apply', 'establish', 'select', 'illustrate', 'plot', 'use',
        'prepare', 'paint', 'transfer', 'construct', 'process', 'interpret',
        'translate', 'depreciate', 'complete', 'expose', 'acquire', 'adapt',
        'link', 'simulate', 'diminish', 'compute', 'project', 'demonstrate',
        'control', 'predict', 'contribute', 'examine', 'attain', 'capture',
        'develop', 'provide', 'utilize', 'write', 'build', 'interview',
        'organise', 'classify', 'draw', 'express', 'customize', 'price',
        'chart', 'produce', 'plan', 'inform', 'solve', 'correlation',
        'model', 'operate', 'convert',
    ),
    # 3: analysis
    (
        'find', 'focus', 'identify', 'query', 'debate', 'relationships',
        'derive', 'group', 'calculate', 'explain', 'theme', 'choose',
        'reason', 'proof', 'reorganise', 'point', 'interrupt', 'difference',
        'arrange', 'list', 'investigate', 'classify', 'discover', 'motive',
        'deduce', 'connect', 'advertise', 'detect', 'confirm', 'research',
        'experiment', 'size', 'cause', 'contrast', 'inspect', 'explore',
        'distinguish', 'layout', 'optimize', 'interpret', 'question', 'omit',
        'depth', 'ensure', 'distinction', 'inference', 'divide', 'relate',
        'manage', 'rank', 'maximize', 'categorize', 'establish', 'select',
        'illustrate', 'subdivide', 'transform', 'comparing', 'assumption',
        'analyze', 'function', 'analyse', 'train', 'differentiate',
        'breadboard', 'dissect', 'see', 'limit', 'highlight', 'appraise',
        'diagnose', 'blueprint', 'compare', 'recognize', 'characterize',
        'examine', 'file', 'discriminate', 'discussion', 'isolate',
        'inventory', 'test', 'survey', 'document', 'infer', 'categorise',
        'breakdown', 'separate', 'effect', 'diagram', 'simplify', 'point',
        'audit', 'criticize', 'outline', 'correlate', 'minimize',
        'prioritize', 'organise', 'model', 'order', 'test',
    ),
    # 4: synthesis
    (
        'incorporate', 'code', 'reorganize', 'invent', 'generalize',
        'compose', 'overhaul', 'explain', 'hypothesize', 'program',
        'combine', 'choose', 'frame', 'integrate', 'collaborate', 'handle',
        'format', 'propose', 'express', 'progress', 'reconstruct',
        'speculate', 'discuss', 'comply', 'arrange', 'intervene', 'collect',
        'hypothesise', 'debug', 'enhance', 'anticipate', 'originate',
        'formulate', 'discover', 'reinforce', 'design', 'animate',
        'substitute', 'network', 'join', 'experiment', 'adapt', 'lecture',
        'contrast', 'extend', 'visualise', 'modify', 'makeup', 'prescribe',
        'imagine', 'interface', 'estimate', 'generate', 'change', 'improve',
        'convert', 'elaborate', 'initiate', 'individualize', 'think',
        'revise', 'organize', 'relate', 'assemble', 'synthesize',
        'categorize', 'summarize', 'prepare', 'create', 'transform',
        'construct', 'predict', 'theorise', 'minimise', 'tell', 'cope',
        'maximise', 'innovate', 'specify', 'communicate', 'setup', 'pretend',
        'budget', 'compile', 'suppose', 'tabulate', 'delete', 'compare',
        'rewrite', 'devise', 'abstract', 'dictate', 'cultivate', 'happen',
        'portray', 'depict', 'develop', 'perform', 'make', 'write', 'build',
        'test', 'negotiate', 'rearrange', 'simplify', 'produce', 'plan',
        'validate', 'structure', 'add', 'outline', 'facilitate',
        'correspond', 'solve', 'model', 'original',
    ),
    # 5: evaluation
    (
        'validate', 'compare', 'deduct', 'useful', 'consider', 'conclude',
        'predict', 'relate', 'describe', 'influence', 'rank', 'assess',
        'rate', 'persuade', 'determine', 'measure', 'critique', 'mark',
        'summarize', 'select', 'discuss', 'discriminate', 'prove', 'verify',
        'defend', 'support', 'debate', 'grade', 'argue', 'disprove',
        'recommend', 'test', 'infer', 'contrast', 'choose', 'attach', 'good',
        'importance', 'evaluate', 'criteria', 'prescribe', 'hire', 'award',
        'perceive', 'dispute', 'know', 'decide', 'opinion', 'judge',
        'estimate', 'why', 'interpret', 'counsel', 'criticize', 'effective',
        'prioritize', 'value', 'agree', 'bad', 'convince', 'prioritise',
        'release', 'frame', 'appraise', 'explain', 'criticise', 'justify',
    ),
)


def _first_synsets(words):
    """Return the first WordNet synset of every word in *words* that has one."""
    found = []
    for word in words:
        candidates = wordnet.synsets(word)
        if candidates:
            found.append(candidates[0])
    return found


def _lin_similarity_sum(synset, others):
    """Sum the Lin similarity of *synset* to each synset in *others*.

    Pairs whose scoring raises an error or yields None contribute nothing.
    """
    total = 0
    for other in others:
        try:
            score = wordnet.lin_similarity(synset, other, brown_ic)
        except Exception:
            continue
        if score is not None:
            total = total + score
    return total


def lin_sim_fun(vq_words=None):
    """Pick the level whose verb list is most Lin-similar to the given words.

    For every word in ``vq_words`` that has a WordNet synset, the Lin
    similarity (using ``brown_ic``) between its first synset and the first
    synset of each verb in a level's list is summed.  Per level, both this
    sum and the sum divided by the length of the level's verb list are
    accumulated over all words.

    :param vq_words: iterable of words; None or empty means no words
    :return: index (0-5) of the level with the highest accumulated average.
        Ties are broken by the highest accumulated sum, then by the lowest
        index.  With no usable words the result is 0.
    """
    levels = _COGNITIVE_LEVEL_VERBS
    average_totals = [0] * len(levels)
    sum_totals = [0] * len(levels)

    # The synsets of the level verbs do not depend on the query word, so
    # they are resolved once per call, and only when actually needed.
    level_synsets = None

    for vq_word in vq_words or ():
        candidates = wordnet.synsets(vq_word)
        if not candidates:
            # A word unknown to WordNet contributes nothing to any level.
            continue
        if level_synsets is None:
            level_synsets = [_first_synsets(level) for level in levels]

        for index, level in enumerate(levels):
            level_sum = _lin_similarity_sum(candidates[0],
                                            level_synsets[index])
            sum_totals[index] += level_sum
            # The divisor is the full list length, including verbs that
            # could not be scored.
            average_totals[index] += level_sum / len(level)

    # max() keeps the first maximal item, so an exact tie on both keys
    # resolves to the lowest level index.
    return max(range(len(levels)),
               key=lambda index: (average_totals[index], sum_totals[index]))
def get_best_synset_pair(word_1, word_2):
    """
    Choose the synset pair with the highest similarity among all pairs.

    The measure is selected by the module-level ``word_sim_algo`` ('wup',
    'lin', 'li', anything else = path similarity).  When the module-level
    ``pos`` is 'noun', only noun/noun pairs are scored.  Pairs with
    different POS always score 0.

    Returns a ``(synset_1, synset_2, similarity)`` tuple.  When either word
    has no synsets the result depends on the module-level ``string_sim``:
    'croft' gives ``(None, None, <SequenceMatcher ratio>)``, 'li' gives
    ``(None, None, 0.0)`` and any other value gives None.
    """
    senses_1 = wn.synsets(word_1)
    senses_2 = wn.synsets(word_2)

    # if zero synsets are found, assign string similarity
    if not senses_1 or not senses_2:
        if string_sim == 'croft':
            return None, None, SequenceMatcher(None, word_1, word_2).ratio()
        if string_sim == 'li':
            return None, None, 0.0
        return None

    def score(first, second):
        # ignore if both words are from different POS
        if first._pos != second._pos:
            return 0

        nouns_only = pos == 'noun'
        if nouns_only and (first._pos != 'n' or second._pos != 'n'):
            return 0

        if word_sim_algo == 'wup':
            value = wn.wup_similarity(first, second)
        elif word_sim_algo == 'lin':
            # Outside noun-only mode, Lin is limited to verbs and nouns.
            lin_ok = nouns_only or (first._pos in ('v', 'n')
                                    and second._pos in ('v', 'n'))
            value = (wn.lin_similarity(first, second, brown_ic)
                     if lin_ok else 0)
        elif word_sim_algo == 'li':
            value = ws.li_similarity(first, second)
        else:
            value = wn.path_similarity(first, second)

        return 0 if value is None else value

    max_sim = -1.0
    best_pair = None, None
    for first in senses_1:
        for second in senses_2:
            sim = score(first, second)
            if sim > max_sim:
                max_sim = sim
                best_pair = first, second, max_sim
    return best_pair
def lin_similarity(synsets1, synsets2):
    """Apply ``__max_similarity`` to two synset collections using Lin similarity.

    The pairwise measure is ``wn.lin_similarity`` with the module-level
    ``corpus`` as information content.
    """
    def similarity_function(ss1, ss2):
        return wn.lin_similarity(ss1, ss2, corpus)

    return __max_similarity(synsets1, synsets2, similarity_function)
def l_similarity(a, b):
    """Return the Lin similarity between the first synsets of two words.

    Uses ``genesis_ic`` as information content.

    :param a: first word
    :param b: second word
    :return: the similarity score, or 0 when the score is None or when
        either word has no synsets
    """
    synsets_a = wn.synsets(a)
    synsets_b = wn.synsets(b)
    # A word without synsets cannot be compared; treat it as "no similarity"
    # (the original raised IndexError here).
    if not synsets_a or not synsets_b:
        return 0

    # Compute the score once; the original evaluated it twice.
    score = wn.lin_similarity(synsets_a[0], synsets_b[0], genesis_ic)
    return 0 if score is None else score
wn.synset(ani_list[indcolum][0][8:-2]), brown_ic) if results_jcn[indrow][indcolum] == 1: word_pairs.append(ani_list[indrow][0] + " vs. " + ani_list[indcolum][0]) print(results_jcn) #Lin Similarity results_lin = np.zeros((len(ani_list), len(ani_list))) word_pairs = [] for indrow in range(0, len(ani_list)): print("word: " + ani_list[indrow][0][8:-2]) for indcolum in range(0, len(ani_list)): print(ani_list[indrow][0][8:-2] + " vs. " + ani_list[indcolum][0][8:-2]) results_lin[indrow][indcolum] = wn.lin_similarity( wn.synset(ani_list[indrow][0][8:-2]), wn.synset(ani_list[indcolum][0][8:-2]), brown_ic) if results_lin[indrow][indcolum] == 1: word_pairs.append(ani_list[indrow][0] + " vs. " + ani_list[indcolum][0]) print(results_lin) #path similarity results_path = np.zeros((len(ani_list), len(ani_list))) word_pairs = [] for indrow in range(0, len(ani_list)): print("word: " + ani_list[indrow][0][8:-2]) for indcolum in range(0, len(ani_list)): print(ani_list[indrow][0][8:-2] + " vs. " + ani_list[indcolum][0][8:-2]) results_path[indrow][indcolum] = wn.path_similarity(
# Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

# Resolve every identifier once: its position in labelsNLTK (the matrix
# index) and its synset object.  The original repeated the linear
# labelsNLTK.index() search eighteen times per pair and re-resolved the
# second synset for every pair.
_resolved = [(s, labelsNLTK.index(s), wn.of2ss(s)) for s in synsets]

# For each pair of synsets, compute distance as 1 - similarity
for s1, _row, syn1 in _resolved:
    for s2, _col, syn2 in _resolved:
        distances_path[_row][_col] = 1 - wn.path_similarity(syn1, syn2)
        distances_lch[_row][_col] = 1 - wn.lch_similarity(syn1, syn2)
        distances_wup[_row][_col] = 1 - wn.wup_similarity(syn1, syn2)
        distances_res[_row][_col] = 1 - wn.res_similarity(syn1, syn2, brown_ic)
        distances_jcn[_row][_col] = 1 - wn.jcn_similarity(syn1, syn2, brown_ic)
        distances_lin[_row][_col] = 1 - wn.lin_similarity(syn1, syn2, brown_ic)
        distances_res_bnc[_row][_col] = 1 - wn.res_similarity(syn1, syn2, bnc_ic)
        distances_jcn_bnc[_row][_col] = 1 - wn.jcn_similarity(syn1, syn2, bnc_ic)
        distances_lin_bnc[_row][_col] = 1 - wn.lin_similarity(syn1, syn2, bnc_ic)

# Parenthesised single-argument form prints identically on Python 2 and 3.
print('done computing wordnet distances')
def similarity_by_infocontent(sense1, sense2, option):
    """
    Returns similarity scores by information content.

    :param sense1: first synset
    :param sense2: second synset
    :param option: 'res'/'resnik', 'jcn'/'jiang-conrath' or 'lin'
    :return: 0 when the two senses have different POS, otherwise the score
        of the selected measure; None for an unrecognised option

    NOTE: the information-content file is loaded through ``wnic.ic`` on
    every call.
    """
    def pos_of(sense):
        # NLTK 3 exposes `pos` as a method, older releases as an attribute.
        # Two bound methods never compare equal across different synsets,
        # so the raw `sense1.pos != sense2.pos` test rejected every pair.
        tag = sense.pos
        if callable(tag):
            return tag()
        return tag

    # infocontent sim can't do diff POS.
    if pos_of(sense1) != pos_of(sense2):
        return 0

    if option in ('res', 'resnik'):
        ic_file = 'ic-bnc-resnik-add1.dat'
        return wn.res_similarity(sense1, sense2, wnic.ic(ic_file))
    if option in ('jcn', "jiang-conrath"):
        ic_file = 'ic-bnc-add1.dat'
        return wn.jcn_similarity(sense1, sense2, wnic.ic(ic_file))
    if option == 'lin':
        ic_file = 'ic-bnc-add1.dat'
        return wn.lin_similarity(sense1, sense2, wnic.ic(ic_file))
    return None