def ourLesk(sentence, word, pos1, forceResponse=False):
    leskList = []
    # Note: the original tested an undefined name `pos` here; it should test `pos1`.
    if pos1 is not None:
        possibility1 = pylesk.cosine_lesk(sentence, word, pos1)
        possibility2 = pylesk.adapted_lesk(sentence, word)
    else:
        possibility1 = pylesk.cosine_lesk(sentence, word)
        possibility2 = pylesk.adapted_lesk(sentence, word)
    if possibility1 is not None and possibility2 is not None:
        possibility1 = [str(lemma.name()) for lemma in possibility1.lemmas()]
        possibility2 = [str(lemma.name()) for lemma in possibility2.lemmas()]
        leskList = set(possibility1).intersection(possibility2)
    else:
        if possibility1 is None:
            if possibility2 is not None:
                leskList = [str(lemma.name()) for lemma in possibility2.lemmas()]
            else:
                return None
        else:
            leskList = [str(lemma.name()) for lemma in possibility1.lemmas()]
    if len(leskList) > 0:
        print "-------"
        print word
        print leskList
        return list(leskList)
    else:
        return None
def get_lesk_answers(senseval_data):
    time_start = time.clock()
    # Getting answers from lesk algorithms
    original_lesk_answers = {}
    simple_lesk_answers = {}
    adapted_lesk_answers = {}
    for sentence_data in senseval_data:
        for phrase in sentence_data["test_phrases"]:
            word_id, word = phrase["headword"]
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        for word_id, word in sentence_data["test_words"].iteritems():
            original_lesk_answers[word_id] = lesk.original_lesk(
                " ".join(sentence_data["sentence"]), word)
            simple_lesk_answers[word_id] = lesk.simple_lesk(
                " ".join(sentence_data["sentence"]), word)
            adapted_lesk_answers[word_id] = lesk.adapted_lesk(
                " ".join(sentence_data["sentence"]), word)
        sys.stdout.write(".")
    lesk_answers_list = []
    lesk_answers_list.append((original_lesk_answers, "original lesk"))
    lesk_answers_list.append((simple_lesk_answers, "simple lesk"))
    lesk_answers_list.append((adapted_lesk_answers, "adapted lesk"))
    time_end = time.clock()
    print "\nlesk took " + str(time_end - time_start) + " seconds"
    return lesk_answers_list
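A minimal sketch, not part of the original snippet, of how the (answers, label) pairs returned by get_lesk_answers might be scored; the gold_answers dict mapping word IDs to gold synsets is hypothetical.

def score_lesk_answers(lesk_answers_list, gold_answers):
    # gold_answers: hypothetical dict of word_id -> correct Synset
    for answers, label in lesk_answers_list:
        correct = sum(1 for word_id, synset in answers.items()
                      if synset is not None and synset == gold_answers.get(word_id))
        accuracy = float(correct) / max(len(answers), 1)
        print(label + " accuracy: " + str(accuracy))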
def get_similarity():
    que_topics, que = get_suggested_answer_topics()
    answer_topics, text = get_student_answer_topics()
    length = len(list(set(que_topics) & set(answer_topics)))
    print(str(length) + " topics matched")
    # Calculating the score based on the number of topics matched
    topics_score = abs(topic_match(que_topics, answer_topics, length))
    print("")
    print(topics_score)
    synsets_que_topics = []
    synsets_ans_topics = []
    sim_score = 0
    # Calculating similarity using WordNet's wup_similarity.
    # Getting the appropriate sense of each topic from the text using
    # "lesk" (a word sense disambiguation algorithm).
    for i in que_topics:
        synsets_que_topics.append(adapted_lesk(que, i, pos='n'))
    for i in answer_topics:
        synset_answer = adapted_lesk(text, i, pos='n')
        print(str(synset_answer) + '..')
        if str(synset_answer) != "None":
            synsets_ans_topics.append(synset_answer)
    print("Similarity Score")
    sim_score = compute_similarity(synsets_que_topics, synsets_ans_topics) * 100
    print(sim_score)
    print("")
    print("Average Score: " + str(abs(topics_score + int(sim_score) / 2)))
def _get_disambiguated_synset(self, token: TokenEN, text: TextHolderEN) -> Optional[Synset]:
    return adapted_lesk(
        context_sentence=text.raw_text,
        ambiguous_word=token.lemma_extended.replace(" ", "_"),
        pos=token.pos_simple,
    )
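A rough standalone sketch of the same call, assuming plain strings in place of the project-specific TokenEN/TextHolderEN wrappers; only the pywsd and NLTK imports are real.

from typing import Optional
from nltk.corpus.reader.wordnet import Synset
from pywsd.lesk import adapted_lesk

def disambiguate(lemma: str, sentence: str, pos: Optional[str] = None) -> Optional[Synset]:
    # Multi-word lemmas are joined with underscores, as WordNet expects.
    return adapted_lesk(sentence, lemma.replace(" ", "_"), pos=pos)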
def get_syns(story_dict, cast_no1, cast_no2):
    syn_dict = OrderedDict()
    syns1 = []
    story1 = remove_stop_words(tokenise(story_dict[cast_no1]))
    for word in story1:
        syns1.append(adapted_lesk(story_dict[cast_no1], word))
    syn_dict[cast_no1] = syns1
    syns2 = []
    # The original tokenised story_dict[cast_no1] again here, which looks like a
    # copy-paste slip; the second story should come from cast_no2.
    story2 = remove_stop_words(tokenise(story_dict[cast_no2]))
    for word in story2:
        syns2.append(adapted_lesk(story_dict[cast_no2], word))
    syn_dict[cast_no2] = syns2
    return syn_dict
def getDef(sent, targetWord):
    # Get the definition of the target word in context
    #defineSent = cosine_lesk(sent, targetWord).definition()
    defineSent = adapted_lesk(sent, targetWord).definition()
    return defineSent
def readGenreFilesAndTagWordsForSenses(core_nlp_files):
    for genre_file_path, genre_file_name in core_nlp_files:
        dictionary = dict()
        with open(genre_file_path) as f:
            print 'Processing File', genre_file_path
            synset_wsd_file = genre_file_path.replace(CORE_NLP_FILE_SUFFIX,
                                                      SYNSET_WSD_FILE_SUFFIX)
            if os.path.exists(synset_wsd_file):
                continue
            lines = f.readlines()[:100]
            output = []
            for line in lines:
                line = 'dictionary=' + line
                exec(line)
                sentences = dictionary[SENTENCES]
                for sent in sentences:
                    parsetree = sent[PARSE_TREE]
                    t = ParentedTree.fromstring(parsetree)
                    sentence_result = []
                    txt = sent[TXT]
                    for word, pos in t.pos():
                        if re.match(POS_PATTERN_FOR_WSD, pos) and pos not in ['DT', 'CC', 'CD']:
                            ranked_synsets = lsk.adapted_lesk(unicode(txt), unicode(word))
                            ranked_synset_prob_names = None
                            if ranked_synsets:
                                ranked_synset_prob_names = [(prob, ranked_synset.name())
                                                            for prob, ranked_synset in ranked_synsets]
                            result = (word, ranked_synset_prob_names)
                            sentence_result.append(result)
                    output.append(sentence_result)
            with open(synset_wsd_file, 'w') as f1:
                f1.write(str(output))
def get_disambiguated_definition(sentence, word, pos):
    translated_pos = get_wordnet_pos(pos)
    try:
        synset = adapted_lesk(sentence, word, pos=translated_pos)
    except:
        synset = None
    if synset is None:
        return word
    else:
        return synset.definition()
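A hedged usage example, not from the original; it assumes get_wordnet_pos maps a Penn Treebank tag such as "NN" to the WordNet POS "n".

text = "I deposited the cheque at the bank."
print(get_disambiguated_definition(text, "bank", "NN"))
# Expected: a WordNet gloss for a financial-institution sense of "bank",
# or the word itself if disambiguation fails.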
def get_syns(story_dict):
    syn_dict = OrderedDict()
    i = 0
    while i < len(story_dict):
        key = 'cast' + str(i)  # the original used Python 2 backtick repr: `i`
        syns = []
        story = remove_stop_words(tokenise(story_dict[key]))
        for word in story:
            syns.append(adapted_lesk(story_dict[key], word))
        syn_dict[key] = syns
        i += 1
    return syn_dict
def get_wordsense(self, sent, word):
    word = word.lower()
    if len(word.split()) > 0:
        word = word.replace(" ", "_")
    synsets = wn.synsets(word, 'n')
    if synsets:
        wup = max_similarity(sent, word, 'wup', pos='n')
        adapted_lesk_output = adapted_lesk(sent, word, pos='n')
        lowest_index = min(synsets.index(wup), synsets.index(adapted_lesk_output))
        return synsets[lowest_index]
    else:
        return None
def seperateByDef(targetWord):
    # Returns a dictionary of sentences grouped by the definition of the target word
    sentList = scrape.scrape(targetWord)
    dictDef = {}
    for i, sent in enumerate(sentList):
        #defineSent = cosine_lesk(sent, targetWord).definition()
        defineSent = adapted_lesk(sent, targetWord).definition()
        if defineSent not in dictDef:
            dictDef[defineSent] = [sent]
        else:
            dictDef[defineSent].append(sent)
    return dictDef
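Hypothetical usage, assuming scrape.scrape (a project-specific module) returns a list of example sentences containing the target word.

groups = seperateByDef("bank")
for definition, sents in groups.items():
    print(definition + ": " + str(len(sents)) + " sentence(s)")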
def bayes_theorem(context, vocab, word_count, sum_word, word_median):
    words_probs = {}
    print len(vocab)
    count = 0
    for word in vocab:
        if count % 1000 == 0:
            print 'word ' + str(count)
        count += 1
        sent = context
        ambiguous = vocab.get(word).split("_")[0]
        post = vocab.get(word).split("_")[1]
        #print ambiguous, post
        try:
            answer = adapted_lesk(sent, ambiguous, pos=penn_to_wn(post), nbest=True)
        except Exception, e:
            continue
        total = 0
        for j in range(len(answer)):
            total += answer[j][0]
        if total == 0:
            continue
        for j in range(len(answer)):
            if answer[j][0] == 0:
                continue
            prob_w = 0.0
            prob_s_w = float(answer[j][0]) / total
            if word_count.has_key(vocab.get(word)):
                prob_w = word_count.get(vocab.get(word)) / float(sum_word)
            else:
                prob_w = word_median
            prob_w_s = prob_s_w * prob_w
            if words_probs.has_key(word):
                aux = words_probs.get(word)
                aux[int(answer[j][1].offset)] = prob_w_s
                words_probs[word] = aux
            else:
                aux = {}
                aux[int(answer[j][1].offset)] = prob_w_s
                words_probs[word] = aux
    # The original snippet ends without a return; callers appear to expect the
    # accumulated {word: {synset_offset: probability}} mapping back.
    return words_probs
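A small illustrative helper, not in the original, isolating the Bayes step used above: the Lesk overlap score is normalised into P(sense | word) and multiplied by a corpus estimate of P(word), giving a joint-probability-style weight; the longer main() variant later in this listing additionally divides by a uniform sense prior.

def joint_probability(lesk_score, total_lesk_score, word_freq, total_word_freq):
    # P(sense | word) from normalised Lesk overlaps, P(word) from corpus counts.
    p_sense_given_word = float(lesk_score) / total_lesk_score
    p_word = float(word_freq) / total_word_freq
    return p_sense_given_word * p_word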
def word_sense(sentence, keyword):
    print("5.Getting word sense to obtain best MCQ options with WordNet...")
    word = keyword.lower()
    if len(word.split()) > 0:
        word = word.replace(" ", "_")
    syon_sets = wordnet.synsets(word, 'n')
    if syon_sets:
        try:
            wup = max_similarity(sentence, word, 'wup', pos='n')
            adapted_lesk_output = adapted_lesk(sentence, word, pos='n')
            lowest_index = min(syon_sets.index(wup), syon_sets.index(adapted_lesk_output))
            return syon_sets[lowest_index]
        except:
            return syon_sets[0]
    else:
        return None
def get_wordsense(sent, word):
    """
    Get the sense (synset) of a word in context, using (1) the adapted Lesk
    algorithm and (2) maximum similarity.

    Useful for word sense disambiguation tasks (one word can mean different
    things depending on context).
    Paper: https://thesai.org/Downloads/Volume11No3/Paper_30-Adapted_Lesk_Algorithm.pdf

    The goal here is to see if the word has synonyms (or words close in meaning)
    that we could potentially use as answer choices.
    """
    word = word.lower()
    if len(word.split()) > 0:
        word = word.replace(" ", "_")

    # get the set of candidate noun synsets
    synsets = wn.synsets(word, 'n')
    if synsets:
        # get similarity between possible synsets of all words in the
        # context sentence and possible synsets of the ambiguous word,
        # to determine the "context" of the word of interest and what it
        # "should" mean
        wup = max_similarity(sent, word, "wup", pos='n')

        # use the Lesk algorithm, which assumes that words in the same
        # "neighborhood", or area of text, tend to share the same topic
        adapted_lesk_output = adapted_lesk(sent, word, pos="n")

        lowest_index = min(synsets.index(wup), synsets.index(adapted_lesk_output))
        return synsets[lowest_index]
    else:
        print(f"No synonyms found for the word {word}")
        return None
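A possible usage sketch, not part of the original; it assumes max_similarity and adapted_lesk are imported from pywsd and wn is NLTK's WordNet, as the function body implies.

from nltk.corpus import wordnet as wn
from pywsd.similarity import max_similarity
from pywsd.lesk import adapted_lesk

sense = get_wordsense("Cricket is played with a bat and a ball on an oval field.", "cricket")
if sense is not None:
    print(sense.name())        # expected: a sport sense rather than the insect sense
    print(sense.definition())  # its WordNet gloss, usable for building distractors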
def get_synset(metode, word, text):
    synset = ""
    if metode == "original_lesk":
        synset = simple_lesk(text, word)
    elif metode == "simple_lesk":
        synset = adapted_lesk(text, word)
    elif metode == "adapted_lesk":
        synset = cosine_lesk(text, word)
    # elif metode == "path":
    #     synset = max_similarity(text, word, "path")
    # elif metode == "path":
    #     synset = max_similarity(text, word, "wup")
    # elif metode == "path":
    #     synset = max_similarity(text, word, "lin")
    # elif metode == "path":
    #     synset = max_similarity(text, word, "res")
    # elif metode == "random_sense":
    #     synset = random_sense(word)
    # elif metode == "first_sense":
    #     synset = first_sense(word)
    # elif metode == "most_frequent_sense":
    #     synset = most_frequent_sense(word)
    return synset
def main(argv):
    cast_no = 'cast' + str(int(argv[0]))  # the original used Python 2 backtick repr
    filepath = os.path.join(os.path.expanduser('~'), 'workspace', 'Dissertation',
                            'resources', 'casts.json')
    with open(filepath) as f:
        all_casts = yaml.safe_load(f.read().encode('utf-8'))
    stories = get_stories(all_casts)
    our_story = stories[cast_no]
    syns = []
    story = remove_stop_words(tokenise(our_story))
    for word in story:
        syns.append(adapted_lesk(our_story, word))
    print syns
    max_depth = 0
    for syn in syns:
        if syn is not None and syn.min_depth() > max_depth:
            max_depth = syn.min_depth()
            print max_depth
            print syn
    print max_depth
print "Context:", plant_sents[0] answer = simple_lesk(plant_sents[0],'plant','n', True, \ nbest=True, keepscore=True, normalizescore=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] try: definition = best_sense.definition() except: definition = best_sense.definition print "Definition:", definition print print "======== TESTING adapted_lesk ===========\n" from pywsd.lesk import adapted_lesk print "#TESTING adapted_lesk() ..." print "Context:", bank_sents[0] answer = adapted_lesk(bank_sents[0],'bank') print "Sense:", answer try: definition = answer.definition() except: definition = answer.definition print "Definition:", definition print print "#TESTING adapted_lesk() with pos, stem, nbest and scores." print "Context:", bank_sents[0] answer = adapted_lesk(bank_sents[0],'bank','n', True, \ nbest=True, keepscore=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] try: definition = best_sense.definition() except: definition = best_sense.definition print "Definition:", definition
from pywsd.lesk import adapted_lesk

raw_sentence = raw_input("Please enter your sentence : ")
raw_word = raw_input("Please enter input word :")

print "#TESTING adapted_lesk() with pos, stem, nbest and scores."
print "Context:", raw_sentence
# The original passed raw_sentence[0] (only the first character of the input);
# adapted_lesk expects the whole sentence string as context.
answer = adapted_lesk(raw_sentence, raw_word, 'n', True,
                      nbest=True, keepscore=True)
print "Senses ranked by #overlaps:", answer
best_sense = answer[0][1]
try:
    definition = best_sense.definition()
except:
    definition = best_sense.definition
print "Definition:", definition
for eachword in words:
    if has_synset(eachword):
        answer = simple_lesk(raw_sentence, eachword)
        simplelesk_answer.append(answer)
        print "Sense :", answer
        print eachword + ":" + answer.definition() + "\n"
    else:
        print eachword + ": " + eachword + "\n"
        simplelesk_answer.append(eachword)

print "\nDisambiguating your sentence word by word using Adapted Lesk algorithm. Hold on. \n======================================================"
for eachword in words:
    if has_synset(eachword):
        answer = adapted_lesk(raw_sentence, eachword)
        adaptedlesk_answer.append(answer)
        print "Sense :", answer
        print eachword + ":" + answer.definition() + "\n"
    else:
        print eachword + ": " + eachword + "\n"
        adaptedlesk_answer.append(eachword)

print "\nDisambiguating your sentence word by word using Cosine Lesk algorithm. Hold on. \n======================================================"
for eachword in words:
    if has_synset(eachword):
        answer = cosine_lesk(raw_sentence, eachword)
        cosinelesk_answer.append(answer)
        print "Sense :", answer
def main(file_name): start = time.time() #string = '/home/adriana/Dropbox/mine/Tese/preprocessing/data_output/' #string = '/home/aferrugento/Desktop/' string = '' h = open(string + file_name + '_proc.txt') sentences = h.read() h.close() extra_synsets = {} sentences = sentences.split("\n") for i in range(len(sentences)): sentences[i] = sentences[i].split(" ") for j in range(len(sentences[i])): if sentences[i][j] == '': continue sentences[i][j] = sentences[i][j].split("_")[0] for i in range(len(sentences)): aux = '' for j in range(len(sentences[i])): aux += sentences[i][j] + ' ' sentences[i] = aux word_count = pickle.load(open('word_count_new.p')) synset_count = pickle.load(open('synset_count.p')) word_count_corpus = calculate_word_frequency(sentences) sum_word_corpus = 0 for key in word_count_corpus.keys(): sum_word_corpus += word_count_corpus.get(key) sum_word = 0 for key in word_count.keys(): sum_word += word_count.get(key) sum_synset = 0 for key in synset_count.keys(): sum_synset += synset_count.get(key) word_list = [] for key in word_count.keys(): word_list.append(word_count.get(key)) synset_list = [] for key in synset_count.keys(): synset_list.append(synset_count.get(key)) word_list.sort() synset_list.sort() #print len(word_list), len(synset_list) #print len(word_list)/2., len(synset_list)/2., (len(word_list)/2.) -1, (len(synset_list)/2.) -1 #print word_list[len(word_list)/2], word_list[(len(word_list)/2)-1] #print synset_list[len(synset_list)/2], synset_list[(len(synset_list)/2)-1] word_median = round(2. / sum_word, 5) synset_median = round(2. / sum_synset, 5) #print word_median, synset_median #print sum_word, sum_synset #return #f = open(string + 'preprocess_semLDA_EPIA/NEWS2_snowballstopword_wordnetlemma_pos_freq.txt') f = open(string + file_name + '_freq.txt') m = f.read() f.close() m = m.split("\n") for i in range(len(m)): m[i] = m[i].split(" ") count = 0 imag = -1 #f = open(string + 'preprocess_semLDA_EPIA/znew_eta_NEWS2.txt') f = open(string + file_name + '_eta.txt') g = f.read() f.close() g = g.split("\n") for i in range(len(g)): g[i] = g[i].split(" ") dic_g = create_dicio(g) g = open(string + file_name + '_wsd.txt', 'w') #dictio = pickle.load(open(string + 'preprocess_semLDA_EPIA/NEWS2_snowballstopword_wordnetlemma_pos_vocab.p')) dictio = pickle.load(open(string + file_name + '_vocab.p')) nn = open(string + file_name + '_synsetVoc.txt', 'w') synsets = {} to_write = [] p = open(string + 'NEWS2_wsd.log', 'w') for i in range(len(m)): nana = str(m[i][0]) + ' ' print 'Doc ' + str(i) p.write('---------- DOC ' + str(i) + ' ----------\n') #words_probs = bayes_theorem(sentences[i], dictio, word_count, sum_word, word_median) #return #g.write(str(m[i][0]) + ' ') for k in range(1, len(m[i])): #print sentences[i] if m[i][k] == '': continue #print dictio.get(int(m[i][k].split(":")[0])) + str(m[i][k].split(":")[0]) #print wn.synsets(dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn(dictio.get(int(m[i][k].split(":")[0])).split("_")[1])) #caso nao existam synsets para aquela palavra if len( wn.synsets( dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn( dictio.get(int( m[i][k].split(":")[0])).split("_")[1]))) == 0: nana += m[i][k] + ":1[" + str(count) + ":" + str(1) + "] " synsets[imag] = count extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0])) #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ") imag -= 1 count += 1 continue sent = sentences[i] ambiguous = dictio.get(int(m[i][k].split(":")[0])).split("_")[0] post = 
dictio.get(int(m[i][k].split(":")[0])).split("_")[1] try: answer = adapted_lesk(sent, ambiguous, pos=penn_to_wn(post), nbest=True) except Exception, e: #caso o lesk se arme em estupido s = wn.synsets( dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn( dictio.get(int(m[i][k].split(":")[0])).split("_")[1])) if len(s) != 0: count2 = 0 #ver quantos synsets existem no semcor #for n in range(len(s)): # if dic_g.has_key(str(s[n].offset)): # words = dic_g.get(str(s[n].offset)) # for j in range(len(words)): # if words[j].split(":")[0] == m[i][k].split(":")[0]: # count2 += 1 # se nao existir nenhum criar synset imaginario #if count2 == 0: # nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] " # synsets[imag] = count # extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0])) #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ") # count += 1 # imag -= 1 # continue #caso existam ir buscar as suas probabilidades ao semcor nana += m[i][k] + ':' + str(len(s)) + '[' c = 1 prob = 1.0 / len(s) for n in range(len(s)): #print answer[n][1].offset #print 'Coco ' + str(s[n].offset) #if dic_g.has_key(str(s[n].offset)): #words = dic_g.get(str(s[n].offset)) #for j in range(len(words)): # if words[j].split(":")[0] == m[i][k].split(":")[0]: # aux = 0 a = (s[n].offset()) #print s[n].offset() if synsets.has_key(a): aux = synsets.get(a) else: synsets[a] = count aux = count count += 1 if n == len(s) - 1: nana += str(aux) + ':' + str(prob) + '] ' else: nana += str(aux) + ':' + str(prob) + ' ' else: nana += m[i][k] + ":1[" + str(count) + ":" + str(1) + "] " synsets[imag] = count extra_synsets[imag] = dictio.get(int( m[i][k].split(":")[0])) #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ") count += 1 imag -= 1 continue #g.write(m[i][k] +':'+ str(len(answer)) + '[') total = 0 for j in range(len(answer)): total += answer[j][0] #caso lesk nao devolva nenhuma resposta criar synset imaginario if len(answer) == 0: nana += m[i][k] + ":1[" + str(count) + ":" + str(1) + "] " synsets[imag] = count extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0])) #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ") count += 1 imag -= 1 continue #print ambiguous #print total #print answer #caso nenhum dos synsets tenha overlap ir ver ao semcor as suas probabilidades if total == 0: #print 'ZERO' count2 = 0 #for n in range(len(answer)): # if dic_g.has_key(str(answer[n][1].offset)): # words = dic_g.get(str(answer[n][1].offset)) # for j in range(len(words)): # if words[j].split(":")[0] == m[i][k].split(":")[0]: # count2 += 1 #if count2 == 0: # nana += m[i][k]+":1[" +str(count)+":"+str(1)+"] " # synsets[imag] = count # extra_synsets[imag] = dictio.get(int(m[i][k].split(":")[0])) #g.write(m[i][k]+":1[" +str(imag)+":"+str(1)+"] ") # count += 1 # imag -= 1 # continue s = wn.synsets( dictio.get(int(m[i][k].split(":")[0])).split("_")[0], penn_to_wn( dictio.get(int(m[i][k].split(":")[0])).split("_")[1])) nana += m[i][k] + ':' + str(len(s)) + '[' c = 1 prob = 1.0 / len(s) for n in range(len(s)): #print answer[n][1].offset #print 'Coco ' + str(s[n].offset) #if dic_g.has_key(str(s[n].offset)): #words = dic_g.get(str(s[n].offset)) #for j in range(len(words)): # if words[j].split(":")[0] == m[i][k].split(":")[0]: # aux = 0 a = (s[n].offset()) #print s[n].offset() if synsets.has_key(a): aux = synsets.get(a) else: synsets[a] = count aux = count count += 1 if n == len(s) - 1: nana += str(aux) + ':' + str(prob) + '] ' else: nana += str(aux) + ':' + str(prob) + ' ' #print nana continue #contar quantos synsets e que nao estao a zero count2 = 0 for j 
in range(len(answer)): if answer[j][0] == 0: continue else: count2 += 1 c = 1 nana += m[i][k] + ':' + str(count2) + '[' for j in range(len(answer)): #words_synsets = words_probs.get(int(m[i][k].split(':')[0])) #s.write(answer[j][1].offset+"\n") if answer[j][0] == 0: continue aux = 0 a = (answer[j][1].offset()) #print 'Coco '+ str(answer[j][1].offset()) if synsets.has_key(a): aux = synsets.get(a) else: synsets[a] = count aux = count count += 1 prob_s = 0.0 prob_w = 0.0 prob_s_w = float(answer[j][0]) / total #if synset_count.has_key(str(answer[j][1].offset)): # prob_s = synset_count.get(str(answer[j][1].offset))/float(sum_synset) #else: # prob_s = 0.1 prob_s_s = 1.0 / count2 #if word_count.has_key(dictio.get(int(m[i][k].split(":")[0]))): # prob_w = word_count.get(dictio.get(int(m[i][k].split(":")[0])))/float(sum_word) #else: # prob_w = 0.1 if word_count_corpus.has_key( dictio.get(int(m[i][k].split(":")[0])).split("_")[0]): prob_w = word_count_corpus.get( dictio.get(int(m[i][k].split(":")[0])).split("_") [0]) / float(sum_word_corpus) else: prob_w = 0.1 prob_w_s = (prob_w * prob_s_w) / prob_s_s if j == len(answer) - 1 or count2 == c: if prob_w_s > 1.0: #print 'Word: 'dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1]) p.write('Word: ' + dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1])) #print 'Synsets disambiguated: ' + str(answer) p.write('---- Synsets disambiguated: ' + str(answer)) #print synset_count.get(str(answer[j][1].offset)), word_count.get(dictio.get(int(m[i][k].split(":")[0]))), sum_synset, sum_word #print 'P(s)=' +prob_s +', P(w)='+prob_w +', P(s|w)='+ prob_s_w +', P(w|s)='+ prob_w_s p.write('---- P(s)=' + str(prob_s) + ', P(w)=' + str(prob_w) + ', P(s|w)=' + str(prob_s_w) + ', P(w|s)=' + str(prob_w_s)) p.write("\n") nana += str(aux) + ':' + str(1) + '] ' #nana += str(aux) + ':' + str(words_synsets.get(answer[j][1].offset)) + '] ' else: nana += str(aux) + ':' + str(prob_w_s) + '] ' #g.write(str(aux) + ':' + str(float(answer[j][0]/total)) + '] ') else: c += 1 if prob_w_s > 1.0: #print 'Word: 'dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1]) p.write('Word: ' + dictio.get(int(m[i][k].split(":")[0])) + ' Synset: ' + str(answer[j][1])) #print 'Synsets disambiguated: ' + str(answer) p.write('---- Synsets disambiguated: ' + str(answer)) #print synset_count.get(str(answer[j][1].offset)), word_count.get(dictio.get(int(m[i][k].split(":")[0]))), sum_synset, sum_word #print 'P(s)=' +prob_s +', P(w)='+prob_w +', P(s|w)='+ prob_s_w +', P(w|s)='+ prob_w_s p.write('---- P(s)=' + str(prob_s) + ', P(w)=' + str(prob_w) + ', P(s|w)=' + str(prob_s_w) + ', P(w|s)=' + str(prob_w_s)) p.write("\n") nana += str(aux) + ':' + str(1) + '] ' #nana += str(aux) + ':' + str(words_synsets.get(answer[j][1].offset)) +' ' else: nana += str(aux) + ':' + str(prob_w_s) + ' ' #g.write(str(aux) + ':' + str(float(answer[j][0]/total)) +' ') nana += '\n' #print nana #return to_write.append(nana)
from pattern.en import tag
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

nltk.data.path.append('/media/santhosh/Data/workspace/nltk_data')

# for word, pos in tag('I feel *happy*!'):
#     print word, pos
# s = parsetree('The cat sat on the mat.', relations=True, lemmata=True)
# print repr(s)
# from pattern.en import parse
# s = 'This is my sample'
# s = parse(s, relations=True, lemmata=True)
# print s

from pywsd import lesk as lsk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset

data = lsk.adapted_lesk(u'I killed Cricket', u'Cricket')
ranked_synsets = data
probs = 0.0
for ranked_synset in ranked_synsets:
    prob, syn = ranked_synset
    print prob, syn.name()
    probs += prob
print probs
def wsd_lesk(raw_df, algorithm_choice):
    """Find the synset of each word, using the original sentence as context and
    different Lesk algorithms from the nltk and pywsd packages. Algorithm
    choices are: 1. nltk's lesk, 2. pywsd simple_lesk, 3. pywsd adapted_lesk
    (stored under the "advanced" column name), 4. pywsd cosine_lesk."""
    start = timer()
    algorithm_dict = {1: "nltk_lesk", 2: "pywsd_simple_lesk",
                      3: "pywsd_advanced_lesk", 4: "pywsd_cosine_lesk"}
    df = raw_df
    full_aspect_synset_list = []
    full_aspect_synset_list_definition = []
    aspect_synset_list_definition = []
    aspect_synset_list = []
    opinion_synset_list = []
    opinion_synset_list_definition = []
    full_opinion_synset_list = []
    full_opinion_synset_list_definition = []
    aspect_opinion = ["aspect_tags", "opinion_tags"]
    tokenized_sentences = raw_df["tokenized_sentence"]
    non_tokenized_sentences = raw_df["original_text"]

    for opinion_list in aspect_opinion:
        for i, phrase in enumerate(df[opinion_list]):
            multiple_word_found = False
            for j, word in enumerate(phrase):
                special_word = False
                if multiple_word_found is False:
                    # Check here for special words such as "bug".
                    aspect = check_for_special_word(word)
                    if aspect is not None:
                        special_word = True
                    wn_check = []
                    if len(phrase) >= 2:
                        k = 0
                        temporary_combined_word = []
                        while k < len(phrase):
                            temporary_combined_word.append(phrase[k][0])
                            k += 1
                        combined_word_string = '_'.join(temporary_combined_word)
                        wn_check = wn.synsets(combined_word_string,
                                              pos=find_wordnet_pos(word[1]))
                        multiple_word_found = True
                    if len(wn_check) == 0:
                        wn_check = wn.synsets(word[0], pos=find_wordnet_pos(word[1]))
                        multiple_word_found = False
                    if len(wn_check) > 0:
                        if special_word is False:
                            if algorithm_choice == 1:
                                if multiple_word_found is True:
                                    aspect = lesk(tokenized_sentences[i], combined_word_string,
                                                  find_wordnet_pos(word[1]))
                                else:
                                    aspect = lesk(tokenized_sentences[i], word[0],
                                                  find_wordnet_pos(word[1]))
                            if algorithm_choice == 2:
                                if multiple_word_found is True:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i],
                                                                combined_word_string,
                                                                find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.simple_lesk(non_tokenized_sentences[i],
                                                                word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 3:
                                if multiple_word_found is True:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i],
                                                                 combined_word_string,
                                                                 find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.adapted_lesk(non_tokenized_sentences[i],
                                                                 word[0], find_wordnet_pos(word[1]))
                            if algorithm_choice == 4:
                                if multiple_word_found is True:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i],
                                                                combined_word_string,
                                                                find_wordnet_pos(word[1]))
                                else:
                                    aspect = pylesk.cosine_lesk(non_tokenized_sentences[i],
                                                                word[0], find_wordnet_pos(word[1]))
                        if aspect is not None:
                            # The original compared strings with "is"; "==" is the safe equivalent.
                            if opinion_list == "aspect_tags":
                                aspect_synset_list.append(aspect)
                                aspect_synset_list_definition.append(aspect.definition())
                            else:
                                opinion_synset_list.append(aspect)
                                opinion_synset_list_definition.append(aspect.definition())
            if opinion_list == "aspect_tags":
                full_aspect_synset_list.append(aspect_synset_list)
                full_aspect_synset_list_definition.append(aspect_synset_list_definition)
                aspect_synset_list = []
                aspect_synset_list_definition = []
            else:
                full_opinion_synset_list.append(opinion_synset_list)
                full_opinion_synset_list_definition.append(opinion_synset_list_definition)
                opinion_synset_list = []
                opinion_synset_list_definition = []

    df[algorithm_dict[algorithm_choice] + "_aspect_synset"] = pd.Series(full_aspect_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_aspect_definition"] = pd.Series(full_aspect_synset_list_definition).values
    df[algorithm_dict[algorithm_choice] + "_opinion_synset"] = pd.Series(full_opinion_synset_list).values
    df[algorithm_dict[algorithm_choice] + "_opinion_definition"] = pd.Series(full_opinion_synset_list_definition).values
    end = timer()
    logging.debug("WSD Lesk Time: %.2f seconds" % (end - start))
    return df
sentences.append(z)
all_sentences.append(sentences)

sentences_tagged = []
sentence = 1

# Add POS tags to all the sentences and extract the dependencies
verbs = set()
children = []
# spacy.displacy.serve(sentences[25], style='dep')
for x in sentences:
    word = 0
    sentence_data = []
    for token in x:
        synset = adapted_lesk(str(x), token.text)
        synset = str(synset)
        if synset != "None":
            # Pull the synset name out of its repr, e.g. "Synset('dog.n.01')" -> dog.n.01
            token_synset = synset.split('(', 1)[1].split(')')[0]
            token_synset = token_synset[1:-1]
        else:
            token_synset = "None"
        sentence_data.append([
            token.text, token.pos_, token.dep_, sentence, token.i, token.head.i,
            token.head, token.lemma_, token_synset
        ])
        word = word + 1
    sentences_tagged.append(
        pd.DataFrame(sentence_data,
print "Context:", plant_sents[0] answer = simple_lesk(plant_sents[0],'plant','n', True, \ nbest=True, keepscore=True, normalizescore=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] definition = best_sense.definition() #except: definition = best_sense.definition print "Definition:", definition print print "======== TESTING adapted_lesk ===========\n" from pywsd.lesk import adapted_lesk print "#TESTING adapted_lesk() ..." print "Context:", bank_sents[0] answer = adapted_lesk(bank_sents[0],'bank') print "Sense:", answer definition = answer.definition() #except: definition = answer.definition print "Definition:", definition print print "#TESTING adapted_lesk() with pos, stem, nbest and scores." print "Context:", bank_sents[0] answer = adapted_lesk(bank_sents[0],'bank','n', True, \ nbest=True, keepscore=True) print "Senses ranked by #overlaps:", answer best_sense = answer[0][1] definition = best_sense.definition() #except: definition = best_sense.definition print "Definition:", definition
def get_disambiguated_synset(sentence, word, pos):
    translated_pos = get_wordnet_pos(pos)
    synset = adapted_lesk(sentence, word, pos=translated_pos)
    return synset
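Unlike the get_disambiguated_definition variant earlier in this listing, this helper does not guard the pywsd call. A hypothetical defensive wrapper (not part of the original) might look like this, falling back to the first WordNet sense when disambiguation yields nothing.

from nltk.corpus import wordnet as wn

def get_synset_or_default(sentence, word, pos):
    try:
        synset = get_disambiguated_synset(sentence, word, pos)
    except Exception:
        synset = None
    if synset is None:
        # Fall back to the most common WordNet sense, if any exists.
        candidates = wn.synsets(word)
        synset = candidates[0] if candidates else None
    return synset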