def create_graphs(doc_list):
    documents = doc_list
    if documents is None:
        documents = default_document_list()
    distance_functions = [
        (wn.lch_similarity(SYNSETS[0], SYNSETS[0]), 'lch',
         lambda sense_1, sense_2: wn.lch_similarity(sense_1, sense_2)),
        (1.0, 'lin',
         lambda sense_1, sense_2: wn.lin_similarity(sense_1, sense_2, CORPUS)),
        (10.636958516573292, 'res',
         lambda sense_1, sense_2: wn.res_similarity(sense_1, sense_2, CORPUS)),
        (wn.jcn_similarity(SYNSETS[0], SYNSETS[0], CORPUS), 'jcn',
         lambda sense_1, sense_2: wn.jcn_similarity(sense_1, sense_2, CORPUS)),
        (1.0, 'path',
         lambda sense_1, sense_2: wn.path_similarity(sense_1, sense_2)),
    ]
    all_senses = []
    for doc in documents:
        for sense in doc.top_senses():
            all_senses.append((sense, doc.name))
    against_colors = ['r', 'b', 'g']
    against_to = [wn.synset(word)
                  for word in ["economy.n.01", "philosophy.n.02", "politics.n.01"]]
    create_against_graph('phyl_eco_pol', documents, all_senses, against_to,
                         distance_functions, against_colors)
    against_to = SYNSETS
    against_colors = [(random(), random(), random()) for _i in range(len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, against_to,
                         distance_functions, against_colors)
    create_graph_top_senses(documents, all_senses, distance_functions)
def wn_similarity(synset_1, synset_2):
    # JCN is only defined where information-content data exists (nouns, verbs),
    # so skip adjectives ("a"/"s") and adverbs ("r").
    if synset_1.pos() not in ["a", "s", "r"] and synset_2.pos() not in ["a", "s", "r"]:
        return wn.jcn_similarity(synset_1, synset_2, brown_ic)
    else:
        return None
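# Minimal usage sketch for wn_similarity() above. The imports and the choice of
# the Brown IC file are assumptions; the snippet itself only presupposes that
# `wn` and `brown_ic` exist at module level.
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')  # information-content counts from Brown

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(wn_similarity(dog, cat))                       # both nouns -> JCN score
print(wn_similarity(dog, wn.synset('good.a.01')))    # adjective involved -> None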
def similarity_by_infocontent(sense1, sense2, option):
    """Returns similarity scores by information content."""
    if sense1.pos() != sense2.pos():  # information-content sim can't compare different POS
        return 0
    info_contents = ['ic-bnc-add1.dat', 'ic-bnc-resnik-add1.dat',
                     'ic-bnc-resnik.dat', 'ic-bnc.dat',
                     'ic-brown-add1.dat', 'ic-brown-resnik-add1.dat',
                     'ic-brown-resnik.dat', 'ic-brown.dat',
                     'ic-semcor-add1.dat', 'ic-semcor.dat',
                     'ic-semcorraw-add1.dat', 'ic-semcorraw-resnik-add1.dat',
                     'ic-semcorraw-resnik.dat', 'ic-semcorraw.dat',
                     'ic-shaks-add1.dat', 'ic-shaks-resnik.dat',
                     'ic-shaks-resnik-add1.dat', 'ic-shaks.dat',
                     'ic-treebank-add1.dat', 'ic-treebank-resnik-add1.dat',
                     'ic-treebank-resnik.dat', 'ic-treebank.dat']
    if option in ['res', 'resnik']:
        return wn.res_similarity(sense1, sense2, wnic.ic('ic-bnc-resnik-add1.dat'))
        # return min(wn.res_similarity(sense1, sense2, wnic.ic(ic))
        #            for ic in info_contents)
    elif option in ['jcn', 'jiang-conrath']:
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
    elif option in ['lin']:
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
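# Hypothetical calls to similarity_by_infocontent(); they assume `wn` is
# nltk.corpus.wordnet and `wnic` is nltk.corpus.wordnet_ic, as the body implies.
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wnic

print(similarity_by_infocontent(wn.synset('dog.n.01'), wn.synset('cat.n.01'), 'jcn'))
print(similarity_by_infocontent(wn.synset('dog.n.01'), wn.synset('run.v.01'), 'jcn'))  # cross-POS -> 0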
def internal_sentence_max_WSD(sentence, word):
    """
    Auxiliary function for sem_wsd().
    Input: a sentence (a list of words, not a string) and a word in the sentence.
    Returns: the synset (sense) of the word that maximizes similarity with all
    other synsets in the sentence.
    """
    # brown_ic = wordnet_ic.ic('ic-brown.dat')
    wordsynsets = wn.synsets(word)
    bestScore = 0.0
    result = None
    for synset in wordsynsets:
        score = 0.0
        for w in sentence:
            for wsynset in wn.synsets(w):
                # sim = wn.path_similarity(wsynset, synset)
                # sim = wn.wup_similarity(wsynset, synset)
                try:
                    # jcn_similarity requires matching POS, so guard with try/except
                    sim = wn.jcn_similarity(wsynset, synset, ic=brown_ic)
                except Exception:
                    sim = None
                if sim is None:
                    continue
                score += sim
        if score > bestScore:
            bestScore = score
            result = synset
    return result
def get_jcn_min(self, sentence1, sentence2):
    sentence1_unique, sentence2_unique = self.sentence_difference(sentence1, sentence2)
    min_similarity = maxint
    # Measure similarity for each unique word from A to each unique word of B
    for sentence1_word in sentence1_unique:
        for sentence2_word in sentence2_unique:
            sentence1_word_tag = sentence1.get_tag(sentence1_word)
            sentence2_word_tag = sentence2.get_tag(sentence2_word)
            synsets_word1 = wordnet.synsets(sentence1_word, sentence1_word_tag)
            synsets_word2 = wordnet.synsets(sentence2_word, sentence2_word_tag)
            if len(synsets_word1) == 0:
                synsets_word1 = wordnet.synsets(sentence1_word)
            if len(synsets_word2) == 0:
                synsets_word2 = wordnet.synsets(sentence2_word)
            if len(synsets_word1) > 0 and len(synsets_word2) > 0:
                # Skip words with different tags
                if synsets_word1[0].pos() != synsets_word2[0].pos():
                    continue
                # Try to find similarity from the corpus
                try:
                    similarity = wordnet.jcn_similarity(
                        synsets_word1[0], synsets_word2[0], self.brown_ic)
                except Exception:
                    continue
                if similarity is not None:
                    min_similarity = min(similarity, min_similarity)
    # 1e-300 is a degenerate score NLTK's jcn_similarity can return
    if min_similarity == maxint or min_similarity == 1e-300:
        return 0
    return min_similarity
def compare_allsynsets(method, word1, word2):
    ss1 = wordnet.synsets(word1)
    ss2 = wordnet.synsets(word2)
    simi, simi_value = 0.0, 0.0
    for (s1, s2) in product(ss1, ss2):
        # if SYNpos and s1.pos() != s2.pos():  # SYN-POS
        #     continue
        # if TWpos and s1.pos() != pos:  # Target word POS
        #     continue
        if method == "PATH":
            simi = s1.path_similarity(s2)
        elif method == "LCH":
            simi = wordnet.lch_similarity(s1, s2)
        elif method == "WUP":
            simi = wordnet.wup_similarity(s1, s2)
        elif method == "RES":
            simi = wordnet.res_similarity(s1, s2, brown_ic)
        elif method == "JCN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v']:  # can't do diff POS
                simi = wordnet.jcn_similarity(s1, s2, brown_ic)
        elif method == "LIN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v']:  # can't do diff POS
                simi = wordnet.lin_similarity(s1, s2, brown_ic)
        else:
            sys.exit("Error! No similarity methods!")
        if simi is not None and simi > simi_value:  # PATH/WUP may return None
            simi_value = simi
    return simi_value
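# Possible driver for compare_allsynsets(); the imports and the brown_ic load
# are assumptions about the surrounding module, inferred from the body above.
import sys
from itertools import product
from nltk.corpus import wordnet, wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')

print(compare_allsynsets("JCN", "car", "bus"))   # best JCN score over all sense pairs
print(compare_allsynsets("PATH", "car", "bus"))  # best path-based score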
def jcn(self, synset_a, synset_b, ic):
    return (
        self.normalize(
            self.MAX_VALUE,
            wordnet.jcn_similarity(synset_a, synset_b, ic),
        )
        if synset_a.pos() == synset_b.pos()
        else 0
    )
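# The normalization in jcn() above is needed because NLTK's jcn_similarity
# returns a huge sentinel (on the order of 1e300) for identical senses or a
# zero IC difference. A standalone sketch of one plausible scheme; the class's
# normalize() is not shown, so the cap-at-1.0 approach here is an assumption.
from nltk.corpus import wordnet, wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')

def capped_jcn(synset_a, synset_b, ic=brown_ic):
    if synset_a.pos() != synset_b.pos():
        return 0.0
    return min(wordnet.jcn_similarity(synset_a, synset_b, ic), 1.0)  # clamp the sentinel

print(capped_jcn(wordnet.synset('car.n.01'), wordnet.synset('car.n.01')))  # 1.0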
def checksimil(synsets1, synsets2, simFactor):
    sim = -1
    similset = []
    unsimilset = []
    for ss1 in synsets1:
        for ss2 in synsets2:
            # JCN needs matching, IC-bearing POS, so skip adjectives ("a"/"s")
            # and adverbs ("r")
            if ss1.pos() == ss2.pos() and ss2.pos() not in ("s", "r", "a"):
                if max(sim, wn.jcn_similarity(ss1, ss2, semcor_ic)) > simFactor:
                    similset.append(ss1)
                else:
                    unsimilset.append(ss1)
    return (similset, unsimilset)
def compute_similarities(s1, s2, sim):
    if sim == "path":
        return wn.path_similarity(s1, s2)
    elif sim == "lch":
        return wn.lch_similarity(s1, s2)
    elif sim == "wup":
        return wn.wup_similarity(s1, s2)
    elif sim == "res":
        return wn.res_similarity(s1, s2, genesis_ic)
    elif sim == "jcn":
        return wn.jcn_similarity(s1, s2, genesis_ic)
    elif sim == "lin":
        return wn.lin_similarity(s1, s2, genesis_ic)
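# One way to exercise compute_similarities(). Building an IC table from the
# Genesis corpus follows the NLTK WordNet howto; treating this as how the
# module's `genesis_ic` was created is an assumption.
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn

genesis_ic = wn.ic(genesis, False, 0.0)  # IC counts computed from raw corpus text

for name in ("path", "lch", "wup", "res", "jcn", "lin"):
    print(name, compute_similarities(wn.synset('car.n.01'), wn.synset('bus.n.01'), name))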
def test_wordnet_ic(self):
    from nltk.corpus import wordnet as nltk_wn
    from nltk.corpus import wordnet_ic as nltk_wnic
    nltk_car = nltk_wn.synset('car.n.1')
    nltk_bus = nltk_wn.synset('bus.n.1')
    our_bnc_resnik_add1 = WordNetInformationContent('bnc', resnik=True, add1=True)
    our_car = our_wn.synset('car.n.1')
    our_bus = our_wn.synset('bus.n.1')
    nltk_bnc_resnik_add1 = nltk_wnic.ic('ic-bnc-resnik-add1.dat')
    assert our_wn.res_similarity(our_car, our_bus, our_bnc_resnik_add1) == \
        nltk_wn.res_similarity(nltk_car, nltk_bus, nltk_bnc_resnik_add1)
    assert our_wn.jcn_similarity(our_car, our_bus, our_bnc_resnik_add1) == \
        nltk_wn.jcn_similarity(nltk_car, nltk_bus, nltk_bnc_resnik_add1)
    assert our_wn.lin_similarity(our_car, our_bus, our_bnc_resnik_add1) == \
        nltk_wn.lin_similarity(nltk_car, nltk_bus, nltk_bnc_resnik_add1)
def avg_wn_dist_dict(dict_file, disambiguated):
    sim_data = wnic.ic('ic-bnc-add1.dat')
    words = open(dict_file).readlines()
    synsets = get_synsets(words, disambiguated, data=sim_data)
    total_dist = 0.0
    num_synsets = len(synsets) // 3
    for i in range(num_synsets):
        for j in range(i + 1, num_synsets):
            sim = wn.jcn_similarity(synsets[i], synsets[j], sim_data)
            # NLTK returns 1e300 for identical senses, so 1e300 - sim acts as a distance
            total_dist += 1.0e+300 - sim
    return total_dist / (num_synsets * (num_synsets - 1) / 2.0)
def wnsim(synset1, synset2, method='all'):
    synset_patt = re.compile(r'^.+\..+\.\d+$')
    if synset_patt.match(synset1):
        s1 = wn.synset(synset1)
    else:
        s1 = wn_synset(synset1)
    if synset_patt.match(synset2):
        s2 = wn.synset(synset2)
    else:
        s2 = wn_synset(synset2)
    if s1 is None or s2 is None:
        return 0
    if method == 'lin':
        return wn.lin_similarity(s1, s2, wn_ic)
    elif method == 'res':
        return wn.res_similarity(s1, s2, wn_ic)
    elif method == 'jcn':
        return wn.jcn_similarity(s1, s2, wn_ic)
    elif method == 'wup':
        return wn.wup_similarity(s1, s2)
    elif method == 'path':
        return wn.path_similarity(s1, s2)
    elif method == 'lch':
        return wn.lch_similarity(s1, s2)
    elif method == 'all':
        return [('lin', wn.lin_similarity(s1, s2, wn_ic)),
                ('res', wn.res_similarity(s1, s2, wn_ic)),
                ('jcn', wn.jcn_similarity(s1, s2, wn_ic)),
                ('wup', wn.wup_similarity(s1, s2)),
                ('path', wn.path_similarity(s1, s2)),
                ('lch', wn.lch_similarity(s1, s2))]
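# Hypothetical calls to wnsim(). Full `word.pos.nn` names bypass the external
# wn_synset() helper (not shown), and `wn_ic` is assumed to be a loaded IC dict.
print(wnsim('dog.n.01', 'cat.n.01', method='jcn'))  # single score
print(wnsim('dog.n.01', 'cat.n.01'))                # method='all': list of (name, score) pairs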
def avg_wn_dist_topic(topic, disambiguated, sim_data):
    words = [w[0] for w in topic]
    synsets = get_synsets(words, disambiguated, data=sim_data)
    total_dist = 0.0
    for i in range(len(synsets)):
        for j in range(i + 1, len(synsets)):
            sim = wn.jcn_similarity(synsets[i], synsets[j], sim_data)
            if sim is None:
                print("syn1: " + str(synsets[i]) + " syn2: " + str(synsets[j]))
                continue
            total_dist += 1.0e+300 - sim
    num_comparisons = len(synsets) * (len(synsets) - 1) / 2.0
    return total_dist / num_comparisons, num_comparisons
def wordnet_distances(synsets):
    print("calculating distances")
    word_dists = np.zeros((len(synsets), len(synsets)), dtype='float64')
    sim_data = wnic.ic('ic-bnc-add1.dat')
    for i in range(len(synsets)):
        for j in range(i + 1, len(synsets)):
            sim = wn.jcn_similarity(synsets[i], synsets[j], sim_data)
            if sim is None:
                print("syn1: " + str(synsets[i]) + " syn2: " + str(synsets[j]))
                continue
            word_dists[i, j] = 1.0e+300 - sim
            word_dists[j, i] = 1.0e+300 - sim
    return word_dists
def get_jcn_average(self, sentence1, sentence2):
    sentence1_unique, sentence2_unique = self.sentence_difference(sentence1, sentence2)
    avg_similarity = 0
    total_count = 0
    # Measure similarity for each unique word from A to each unique word of B
    for sentence1_word in sentence1_unique:
        for sentence2_word in sentence2_unique:
            sentence1_word_tag = sentence1.get_tag(sentence1_word)
            sentence2_word_tag = sentence2.get_tag(sentence2_word)
            synsets_word1 = wordnet.synsets(sentence1_word, sentence1_word_tag)
            synsets_word2 = wordnet.synsets(sentence2_word, sentence2_word_tag)
            if len(synsets_word1) == 0:
                synsets_word1 = wordnet.synsets(sentence1_word)
            if len(synsets_word2) == 0:
                synsets_word2 = wordnet.synsets(sentence2_word)
            if len(synsets_word1) > 0 and len(synsets_word2) > 0:
                # Skip words with different tags
                if synsets_word1[0].pos() != synsets_word2[0].pos():
                    continue
                # Try to find similarity from the corpus
                try:
                    similarity = wordnet.jcn_similarity(
                        synsets_word1[0], synsets_word2[0], self.brown_ic)
                except Exception:
                    continue
                # Clamp NLTK's degenerate extremes (1e-300 underflow, 1e+300 identity)
                if similarity == 1e-300:
                    similarity = 0.0
                if similarity == 1e+300:
                    similarity = 1.0
                if similarity is not None:
                    avg_similarity += similarity
                    total_count += 1
    if total_count == 0:
        return 0
    return float(avg_similarity) / float(total_count)
def word_similarity(self, w1, w2, syns, loc, thr_sim):
    # Note: `wn.NOUN or wn.ADJ` short-circuits to wn.NOUN, so only noun senses
    # are actually retrieved here.
    syn1 = wn.synsets(w1, wn.NOUN or wn.ADJ)
    syn2 = wn.synsets(w2, wn.NOUN or wn.ADJ)
    if len(syn1) > 0 and len(syn2) > 0:
        score = 0
        max_score = 0
        count = 0
        sns1 = syn1[0]
        sns2 = syn2[0]
        for i in range(len(syn1)):
            for j in range(len(syn2)):
                if self.wordnet_metric == 'j':
                    # Jiang-Conrath similarity; it requires an IC dictionary,
                    # and self.brown_ic is assumed to be loaded elsewhere
                    score = wn.jcn_similarity(syn1[i], syn2[j], self.brown_ic)
                elif self.wordnet_metric == 'le':
                    # Leacock-Chodorow similarity
                    score = wn.lch_similarity(syn1[i], syn2[j], simulate_root=False)
                elif self.wordnet_metric == 'li':
                    # Lin similarity; also requires an IC dictionary
                    score = wn.lin_similarity(syn1[i], syn2[j], self.brown_ic)
                elif self.wordnet_metric == 'p':
                    # Path similarity
                    score = wn.path_similarity(syn1[i], syn2[j])
                elif self.wordnet_metric == 'w':
                    # Wu-Palmer similarity; it cannot be 0 and ranges in (0, 1]
                    score = wn.wup_similarity(syn1[i], syn2[j])
                if score > max_score:  # track the maximum score
                    max_score = score
                    sns1 = syn1[i]
                    sns2 = syn2[j]
                if max_score >= thr_sim:
                    # Store the synset pairs whose scores clear the threshold
                    syns, loc = self.merging_synsets(syns, w1, w2, sns1, sns2,
                                                     max_score, loc)
                    count = count + 1
        if count == 0:
            # Store the best-scoring synset pair even though it is below threshold
            syns, loc = self.merging_synsets(syns, w1, w2, sns1, sns2,
                                             max_score, loc)
    return syns, loc
def jcn_sim_fun(vq_words=[]):
    # Bloom's-taxonomy verb lists, one per cognitive level
    l1 = knowledge = [
        'recite', 'review', 'point', 'recognize', 'describe', 'choose', 'examine',
        'identify', 'enumerate', 'find', 'select', 'what', 'memorize', 'collect',
        'sequence', 'when', 'duplicate', 'who', 'label', 'write', 'indicate',
        'state', 'tabulate', 'which', 'relate', 'show', 'arrange', 'cite', 'match',
        'define', 'locate', 'draw', 'repeat', 'remember', 'trace', 'read', 'quote',
        'spell', 'memorise', 'how', 'observe', 'recognise', 'copy', 'why',
        'outline', 'count', 'name', 'recall', 'study', 'omit', 'list', 'tell',
        'reproduce', 'record', 'retell', 'meet', 'listen', 'where', 'order', 'view'
    ]
    l2 = comprehension = [
        'compare', 'cite', 'give', 'predict', 'recognize', 'describe', 'articulate',
        'detail', 'order', 'characterize', 'generalize', 'factor', 'summarize',
        'select', 'illustrate', 'visualize', 'group', 'trace', 'purpose', 'defend',
        'rewrite', 'relate', 'approximate', 'demonstrate', 'indicate', 'add',
        'interact', 'tell', 'extrapolate', 'show', 'rephrase', 'paraphrase',
        'infer', 'contrast', 'locate', 'picture', 'extend', 'associate', 'conclude',
        'express', 'interpolate', 'generalise', 'clarify', 'observe', 'understand',
        'differentiate', 'review', 'distinguish', 'estimate', 'subtract', 'discuss',
        'interpret', 'summarise', 'convert', 'translate', 'compute', 'outline',
        'identify', 'elaborate', 'ask', 'example', 'classify', 'report', 'restate',
        'explain', 'match'
    ]
    l3 = application = [
        'represent', 'show', 'identify', 'participate', 'derive', 'group',
        'calculate', 'graph', 'dramatize', 'choose', 'factor', 'include',
        'allocate', 'handle', 'practice', 'relate', 'schedule', 'report', 'assess',
        'collect', 'investigate', 'categorise', 'ascertain', 'round', 'sketch',
        'transcribe', 'sequence', 'imitate', 'discover', 'connect', 'tabulate',
        'employ', 'avoid', 'experiment', 'manipulate', 'exercise', 'extend',
        'associate', 'modify', 'personalize', 'dramatise', 'explore', 'teach',
        'change', 'perform', 'summarise', 'act', 'implement', 'assign',
        'alphabetize', 'relate', 'articulate', 'administer', 'subscribe',
        'instruct', 'determine', 'apply', 'establish', 'select', 'illustrate',
        'plot', 'use', 'prepare', 'paint', 'transfer', 'construct', 'process',
        'interpret', 'translate', 'depreciate', 'complete', 'expose', 'acquire',
        'adapt', 'link', 'simulate', 'diminish', 'compute', 'project',
        'demonstrate', 'control', 'predict', 'contribute', 'examine', 'attain',
        'capture', 'develop', 'provide', 'utilize', 'write', 'build', 'interview',
        'organise', 'classify', 'draw', 'express', 'customize', 'price', 'chart',
        'produce', 'plan', 'inform', 'solve', 'correlation', 'model', 'operate',
        'convert'
    ]
    l4 = analysis = [
        'find', 'focus', 'identify', 'query', 'debate', 'relationships', 'derive',
        'group', 'calculate', 'explain', 'theme', 'choose', 'reason', 'proof',
        'reorganise', 'point', 'interrupt', 'difference', 'arrange', 'list',
        'investigate', 'classify', 'discover', 'motive', 'deduce', 'connect',
        'advertise', 'detect', 'confirm', 'research', 'experiment', 'size',
        'cause', 'contrast', 'inspect', 'explore', 'distinguish', 'layout',
        'optimize', 'interpret', 'question', 'omit', 'depth', 'ensure',
        'distinction', 'inference', 'divide', 'relate', 'manage', 'rank',
        'maximize', 'categorize', 'establish', 'select', 'illustrate', 'subdivide',
        'transform', 'comparing', 'assumption', 'analyze', 'function', 'analyse',
        'train', 'differentiate', 'breadboard', 'dissect', 'see', 'limit',
        'highlight', 'appraise', 'diagnose', 'blueprint', 'compare', 'recognize',
        'characterize', 'examine', 'file', 'discriminate', 'discussion', 'isolate',
        'inventory', 'test', 'survey', 'document', 'infer', 'categorise',
        'breakdown', 'separate', 'effect', 'diagram', 'simplify', 'point', 'audit',
        'criticize', 'outline', 'correlate', 'minimize', 'prioritize', 'organise',
        'model', 'order', 'test'
    ]
    l5 = synthesis = [
        'incorporate', 'code', 'reorganize', 'invent', 'generalize', 'compose',
        'overhaul', 'explain', 'hypothesize', 'program', 'combine', 'choose',
        'frame', 'integrate', 'collaborate', 'handle', 'format', 'propose',
        'express', 'progress', 'reconstruct', 'speculate', 'discuss', 'comply',
        'arrange', 'intervene', 'collect', 'hypothesise', 'debug', 'enhance',
        'anticipate', 'originate', 'formulate', 'discover', 'reinforce', 'design',
        'animate', 'substitute', 'network', 'join', 'experiment', 'adapt',
        'lecture', 'contrast', 'extend', 'visualise', 'modify', 'makeup',
        'prescribe', 'imagine', 'interface', 'estimate', 'generate', 'change',
        'improve', 'convert', 'elaborate', 'initiate', 'individualize', 'think',
        'revise', 'organize', 'relate', 'assemble', 'synthesize', 'categorize',
        'summarize', 'prepare', 'create', 'transform', 'construct', 'predict',
        'theorise', 'minimise', 'tell', 'cope', 'maximise', 'innovate', 'specify',
        'communicate', 'setup', 'pretend', 'budget', 'compile', 'suppose',
        'tabulate', 'delete', 'compare', 'rewrite', 'devise', 'abstract',
        'dictate', 'cultivate', 'happen', 'portray', 'depict', 'develop',
        'perform', 'make', 'write', 'build', 'test', 'negotiate', 'rearrange',
        'simplify', 'produce', 'plan', 'validate', 'structure', 'add', 'outline',
        'facilitate', 'correspond', 'solve', 'model', 'original'
    ]
    l6 = evaluation = [
        'validate', 'compare', 'deduct', 'useful', 'consider', 'conclude',
        'predict', 'relate', 'describe', 'influence', 'rank', 'assess', 'rate',
        'persuade', 'determine', 'measure', 'critique', 'mark', 'summarize',
        'select', 'discuss', 'discriminate', 'prove', 'verify', 'defend',
        'support', 'debate', 'grade', 'argue', 'disprove', 'recommend', 'test',
        'infer', 'contrast', 'choose', 'attach', 'good', 'importance', 'evaluate',
        'criteria', 'prescribe', 'hire', 'award', 'perceive', 'dispute', 'know',
        'decide', 'opinion', 'judge', 'estimate', 'why', 'interpret', 'counsel',
        'criticize', 'effective', 'prioritize', 'value', 'agree', 'bad',
        'convince', 'prioritise', 'release', 'frame', 'appraise', 'explain',
        'criticise', 'justify'
    ]
    cl_listoflist = [l1, l2, l3, l4, l5, l6]
    cnt_log = 0
    final_sim_of_ques_with_all_levels = [0, 0, 0, 0, 0, 0]
    final_area_sim_of_ques_with_all_levels = [0, 0, 0, 0, 0, 0]
    for vq_word in vq_words:
        # calculate the sum and average JCN similarity of the word with each list
        sum_of_sim_all_levels = []
        avg_of_sim_all_levels = []
        for i, list_i in enumerate(cl_listoflist):
            sum_of_sim = 0
            for l_word in list_i:
                if len(wordnet.synsets(vq_word)) == 0:
                    break
                vq_word_syn = wordnet.synsets(vq_word)[0]
                if len(wordnet.synsets(l_word)) == 0:
                    continue
                l_word_syn = wordnet.synsets(l_word)[0]
                try:
                    # jcn_similarity requires matching POS; guard with try/except
                    wup_sim = wordnet.jcn_similarity(vq_word_syn, l_word_syn, brown_ic)
                except Exception:
                    continue
                if wup_sim is not None:
                    sum_of_sim = sum_of_sim + wup_sim
                else:
                    cnt_log = cnt_log + 1
            sum_of_sim_all_levels.append(sum_of_sim)
            avg_of_sim_all_levels.append(sum_of_sim / len(list_i))
        # accumulate the per-word scores into the question-level totals
        for i in range(6):
            final_sim_of_ques_with_all_levels[i] += avg_of_sim_all_levels[i]
        for i in range(6):
            final_area_sim_of_ques_with_all_levels[i] += sum_of_sim_all_levels[i]
    # take the maximum of all similarity values to find the level
    final_level = 0
    max_sim = final_sim_of_ques_with_all_levels[0]
    for index, sim in enumerate(final_sim_of_ques_with_all_levels):
        if sim > max_sim:
            max_sim = sim
            final_level = index
    # find whether the question ties across more than one level
    indices_of_same_sim = []
    for i, sim in enumerate(final_sim_of_ques_with_all_levels):
        if sim == max_sim:
            indices_of_same_sim.append(i)
    # if it ties, break the tie with the area (sum) similarities
    if len(indices_of_same_sim) > 1:
        same_sim_list = []
        for index in indices_of_same_sim:
            same_sim_list.append(final_area_sim_of_ques_with_all_levels[index])
        max_sim_area = same_sim_list[0]
        for sim_area, index_of_max_sim in zip(same_sim_list, indices_of_same_sim):
            if sim_area > max_sim_area:
                max_sim_area = sim_area
                final_level = index_of_max_sim
    return final_level
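# Sketch of how jcn_sim_fun() might be called to tag a question with a Bloom
# level (0 = knowledge ... 5 = evaluation). The example words are illustrative,
# and `brown_ic` must already be loaded, as the function body assumes.
from nltk.corpus import wordnet, wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
print(jcn_sim_fun(['define', 'list', 'state']))  # verbs drawn from the knowledge list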
def jcn_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.jcn_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)
def jiang_conrath_similarity(a, b):
    # Score the first (most frequent) sense of each word; treat None as 0
    sim = wn.jcn_similarity(wn.synsets(a)[0], wn.synsets(b)[0], genesis_ic)
    if sim is None:
        return 0
    return sim
    for sid in range(len(sslist)):
        prevs.append(0.0)
    for p2term, sim in sims:
        if pterm == p2term:
            continue
        sim_p_p2 = sim
        ss2list = pterm_to_synsets.get(p2term, [])
        # Calculate normalizing wnss
        norm_wnss = 0.0
        for sid in range(len(sslist)):
            max_wnss = 0.0
            for s2id in range(len(ss2list)):
                s_s2_wnss = wordnet.jcn_similarity(sslist[sid], ss2list[s2id], ic)
                if s_s2_wnss > max_wnss:
                    max_wnss = s_s2_wnss
            norm_wnss += max_wnss
        if norm_wnss <= 0.0:
            continue
        # Increment prevalence score for each sense
        for sid in range(len(sslist)):
            max_wnss = 0.0
            for s2id in range(len(ss2list)):
                s_s2_wnss = wordnet.jcn_similarity(sslist[sid], ss2list[s2id], ic)
                if s_s2_wnss > max_wnss:
def computeInfContSimilarity():
    ## Load an information content file from the wordnet_ic corpus
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    print("computing Information Content Similarity...")
    tStart = time.time()

    def best_jcn_scores(sents1, sents2):
        """For each pair of (tokenized) sentences, score every word of the longer
        sentence by its best JCN similarity against the words of the shorter one.
        All words, regardless of POS, are mapped onto their first noun sense
        (word + ".n.01"); senses or pairs that WordNet rejects are skipped."""
        all_sims = []
        for subSent1, subSent2 in zip(sents1, sents2):
            ## use the longer sentence for the score vector
            if len(subSent2) >= len(subSent1):
                subSent1, subSent2 = subSent2, subSent1
            sims = np.zeros(len(subSent1))
            for i, word1 in enumerate(subSent1):
                for word2 in subSent2:
                    try:
                        w1 = wn.synset(word1 + ".n.01")
                        w2 = wn.synset(word2 + ".n.01")
                        sim = wn.jcn_similarity(w1, w2, brown_ic)
                        if sim > sims[i]:
                            sims[i] = sim
                    except Exception:
                        continue
            all_sims.append(sims)
        return all_sims

    ## Compute the similarity between nouns, verbs, and adjectives
    ALLnouns_sim = best_jcn_scores(nouns_text1, nouns_text2)
    ALLverbs_sim = best_jcn_scores(verbs_text1, verbs_text2)
    ALLadjs_sim = best_jcn_scores(adj_text1, adj_text2)

    tEnd = time.time()
    print("..done. Time taken (InformationContentSimilarity): ", tEnd - tStart)
    return ALLnouns_sim, ALLverbs_sim, ALLadjs_sim
def jcn_process(xfpg, keywds):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    s_g = []
    print("TFIDF KEYWORDS NUMBER " + str(len(keywds)))
    # Score each FP-growth itemset against the top TF-IDF keywords
    for j in list(xfpg['itemsets']):
        s = []
        result = []
        for j1 in j:
            for i in keywds[:20]:
                s1 = wn.synsets(i)
                s2 = wn.synsets(j1)
                if len(s1) == 0 or len(s2) == 0:
                    x = 0
                elif s1[0].pos() != s2[0].pos():
                    x = 0
                else:
                    x = wn.jcn_similarity(s1[0], s2[0], brown_ic)
                    if x == 1e300:  # NLTK's sentinel for identical senses
                        x = 1
                result.append(x)
            s1 = sum(result)
            s.append(s1)
        s_g.append(s)
    res = np.asarray(s_g)
    # pick the itemset with the highest aggregate JCN score
    if res.size != 0:
        res_index = res.argmax()
        print("Highest score is {} with {}".format(
            str(xfpg['itemsets'][res_index]), str(s_g[res_index])))
        # Convert frozenset to list
        sets = [xfpg['itemsets'][res_index]]
        final_list = [list(x) for x in sets]
        return final_list
    else:
        return None
            wn.synset(ani_list[indcolum][0][8:-2]), brown_ic)
        if results_res[indrow][indcolum] == 1:
            word_pairs.append(ani_list[indrow][0] + " vs. " + ani_list[indcolum][0])
print(results_res)

# Jiang-Conrath Similarity
results_jcn = np.zeros((len(ani_list), len(ani_list)))
word_pairs = []
for indrow in range(len(ani_list)):
    print("word: " + ani_list[indrow][0][8:-2])
    for indcolum in range(len(ani_list)):
        print(ani_list[indrow][0][8:-2] + " vs. " + ani_list[indcolum][0][8:-2])
        results_jcn[indrow][indcolum] = wn.jcn_similarity(
            wn.synset(ani_list[indrow][0][8:-2]),
            wn.synset(ani_list[indcolum][0][8:-2]), brown_ic)
        if results_jcn[indrow][indcolum] == 1:
            word_pairs.append(ani_list[indrow][0] + " vs. " + ani_list[indcolum][0])
print(results_jcn)

# Lin Similarity
results_lin = np.zeros((len(ani_list), len(ani_list)))
word_pairs = []
for indrow in range(len(ani_list)):
    print("word: " + ani_list[indrow][0][8:-2])
    for indcolum in range(len(ani_list)):
        print(ani_list[indrow][0][8:-2] + " vs. " + ani_list[indcolum][0][8:-2])
        results_lin[indrow][indcolum] = wn.lin_similarity(
def jcn_similarity(synset1, synset2):
    return wn.jcn_similarity(synset1, synset2, info_content)
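# The one-line wrapper above presupposes a module-level `info_content` IC
# dictionary; a plausible setup (the SemCor file choice is an assumption):
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

info_content = wordnet_ic.ic('ic-semcor.dat')
print(jcn_similarity(wn.synset('dog.n.01'), wn.synset('cat.n.01')))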
# Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

# For each pair of synsets, compute distance as 1 - similarity
for s1 in synsets:
    syn1 = wn.of2ss(s1)
    for s2 in synsets:
        syn2 = wn.of2ss(s2)
        distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.path_similarity(syn1, syn2)
        distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lch_similarity(syn1, syn2)
        distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.wup_similarity(syn1, syn2)
        distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1, syn2, brown_ic)
        distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1, syn2, brown_ic)
        distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1, syn2, brown_ic)
        distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1, syn2, bnc_ic)
        distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1, syn2, bnc_ic)
        distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1, syn2, bnc_ic)
print('done computing wordnet distances')