import sys
import numpy as np
from nltk.tokenize import word_tokenize
from pywsd import disambiguate

# _synset_similarity, benchmark_similarity and gamma are defined elsewhere in the source module.
def getSimilarity(s1, s2, word_order=False):
    try:
        s1_wsd = disambiguate(s1)  # using default disambiguation
        s2_wsd = disambiguate(s2)
    except TypeError:
        print("s1:", s1, "s2:", s2)
        sys.exit(0)
    # remove None synsets
    s1_wsd = [syn for syn in s1_wsd if syn[1]]
    s2_wsd = [syn for syn in s2_wsd if syn[1]]
    # vector_length = max(len(s1_wsd), len(s2_wsd))
    try:
        L1, L2 = _synset_similarity(s1_wsd, s2_wsd)
        V1 = np.array([max(L1[key]) for key in L1.keys()])
        V2 = np.array([max(L2[key]) for key in L2.keys()])
        S = np.linalg.norm(V1) * np.linalg.norm(V2)
        C1 = sum(V1 >= benchmark_similarity)
        C2 = sum(V2 >= benchmark_similarity)
        Xi = (C1 + C2) / gamma
        if C1 + C2 == 0:
            Xi = max(V1.size, V2.size) / 2.0
        sem_similarity = S / Xi
    except ValueError:
        sem_similarity = 0

    # computing word order similarity
    word_ord_similarity = 0
    delta = 1.0
    if word_order:
        tokens1 = word_tokenize(s1)
        tokens2 = word_tokenize(s2)
        len1 = len(tokens1)
        len2 = len(tokens2)
        maxlen = max(len1, len2)
        r1 = list(range(maxlen))
        r2 = [0 for _ in range(maxlen)]
        if maxlen == len1:
            for i, v in enumerate(tokens2):
                r2[i] = tokens1.index(v) + 1 if v in tokens1 else i
        else:
            for i, v in enumerate(tokens1):
                r2[i] = tokens2.index(v) + 1 if v in tokens2 else i
        word_ord_similarity = (np.linalg.norm(np.array(r1) - np.array(r2)) /
                               np.linalg.norm(np.array(r1) + np.array(r2)))
        # set delta for the convex combination of semantic similarity and word order similarity
        delta = 0.8

    return delta * sem_similarity + (1 - delta) * word_ord_similarity
import time

import nltk
from nltk.tokenize import word_tokenize
from pywsd import disambiguate


def wordnetScore(sentence, namedEntities):
    nouns = ['NN', 'NNP', 'NNS', 'NNPS']
    verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adjectives = ['JJ', 'JJR', 'JJS']
    adverbs = ['RB', 'RBR', 'RBS']

    # keep only nouns, verbs, adverbs and adjectives that are not named entities
    posToConsider = nouns + verbs + adverbs + adjectives
    pos_tokens = nltk.pos_tag(word_tokenize(sentence))
    tokensToConsider = []
    for token in pos_tokens:
        if token[1] in posToConsider and token[0] not in namedEntities:
            tokensToConsider.append(token)

    t0_ = time.time()
    # 1. word sense disambiguation
    senses = {
        token[0]: token[1]
        for token in disambiguate(sentence)
        if token[1] is not None
    }

    # 2. similarity calculation (calculateSimilarity is defined elsewhere in the module)
    t1_ = time.time()
    for word, sense in senses.items():
        senses[word] = {}
        senses[word]['sense'] = sense
        calculateSimilarity(senses[word])
    print(senses)
    print(time.time() - t1_, t1_ - t0_)
from pywsd import disambiguate
from pywsd.similarity import max_similarity


# p (a tweet preprocessor), replace_word() and negate() are defined elsewhere in the module.
def GetDisambiguation(tweet_sentence):
    cleaned_tweet = p.clean(tweet_sentence)
    replaced_tweet = replace_word(cleaned_tweet)
    replaced_tweet_list = replaced_tweet.split(" ")

    # Other WSD options can be used instead (different Lesk algorithms / similarity options)
    da_token_pair_list = disambiguate(replaced_tweet, max_similarity, similarity_option='res')
    # da_token_pair_list = disambiguate(replaced_tweet, cosine_lesk)

    da_token_list = []
    for pair in da_token_pair_list:
        da_token_list.append(pair[0])
    token_negation_pair_list = negate(da_token_list)

    print(da_token_pair_list)
    print(len(da_token_pair_list))
    print(token_negation_pair_list)
    print(len(token_negation_pair_list))
    return da_token_pair_list, token_negation_pair_list
def process_jokes(joke_data, ref_dict):
    '''
    Return a DataFrame that contains sentences along with citations and
    information on detected heteronyms.
    '''
    sense_list = list(ref_dict['sense'])
    het_list = set(ref_dict['word'])
    word_duplicate_sense = set(ref_dict[ref_dict.duplicated(['sense'])]['word'])
    joke_sents = pd.DataFrame(columns=['sentence', 'citation', 'heteronym'])

    for i, row in joke_data.iterrows():
        # List of sentences in one joke. Jokes sometimes lack proper punctuation,
        # so a single row may yield 2-3 sentences in the output if sentence splitting fails.
        sents = nltk.sent_tokenize(row['sentence'])
        for sent in sents:
            het_in_row = []
            text_token = [
                w.lower() for w in nltk.word_tokenize(sent)
                if (w not in string.punctuation) and (w.lower() not in stopset)
            ]
            het_occur = set(text_token).intersection(het_list)
            if not het_occur:
                continue
            for (word, synset) in disambiguate(sent):
                if (word in het_list) and synset and (synset in sense_list):
                    # Take care of sense-duplicated heteronyms (rare),
                    # e.g. "project" and "projects" can share a sense but differ in pronunciation.
                    if word.lower() in word_duplicate_sense:
                        pron = list(ref_dict[(ref_dict['word'] == word.lower())
                                             & (ref_dict['sense'] == synset)]['pronunciation'])
                        if pron:
                            het_in_row.append((word.lower(), synset, pron[0]))
                    # If the sense is not duplicated, the mapping to a pronunciation is one-to-one
                    else:
                        pron = list(ref_dict[ref_dict['sense'] == synset]['pronunciation'])[0]
                        word_in_ref = list(ref_dict[ref_dict['sense'] == synset]['word'])[0]
                        if word.lower() == word_in_ref:
                            het_in_row.append((word_in_ref, synset, pron))
            if het_in_row:
                new_row = {
                    'sentence': sent,
                    'citation': row['citation'],
                    'heteronym': het_in_row
                }
                # note: DataFrame.append was removed in pandas 2.0; pd.concat is the modern equivalent
                joke_sents = joke_sents.append(new_row, ignore_index=True)
    return joke_sents
def sentido_PYWSD(palabra, frase):
    frase_wsd = disambiguate(frase)
    for r in frase_wsd:
        if r[0] == palabra:
            if r[1] is not None:
                sentido = r[1]
                return (sentido, True)
    return (None, False)
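# A minimal usage sketch (not part of the original); the target word and
# sentence below are illustrative only.
sense, found = sentido_PYWSD("bank", "I deposited my money at the bank")
if found:
    print(sense.name(), "-", sense.definition())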
def get_disambiguated_synonym(sentence):
    for word_dis_sense in disambiguate(sentence, algorithm=maxsim,
                                       similarity_option='wup', keepLemmas=True):
        if word_dis_sense[2] is not None:
            for lemma in word_dis_sense[2].lemma_names():
                yield (word_dis_sense[0], lemma)
def classify(text, category, dictionary, total=0):
    # category is either "sense" or "token"
    # split the text into tokens
    if category == "sense":
        tokens = [word[1].name() for word in disambiguate(text) if word[1] is not None]
    else:
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop]
        tokens = filter(lambda word: word not in [',', '.', '!', '?', '``', "'ve", "''", "n't", "'s"],
                        tokens)
    return probability(tokens, category, dictionary, total)
def train_random_senses_set(title):
    global random_senses_dict
    # extract the words that are not None, keeping their synset names
    senses = [word[1].name() for word in disambiguate(str(title)) if word[1] is not None]
    for sense in senses:
        if sense in random_senses_dict:
            random_senses_dict[sense] += 1
        else:
            random_senses_dict[sense] = 1
    return random_senses_dict
def replace(self, sentence):
    # Note: the original loop reassigned its loop variable, which had no effect;
    # here each word with a resolved sense is replaced by its synset name.
    wording = []
    for word, lemma, synset in disambiguate(sentence, algorithm=maxsim,
                                            similarity_option='wup', keepLemmas=True):
        wording.append(synset.name() if synset is not None else word)
    return wording
def wsd(request):
    if request.method == 'POST':
        input_sentence = request.POST['input_sentence']
        output = disambiguate(input_sentence, algorithm=maxsim,
                              similarity_option='wup', keepLemmas=True)
        output = [(record[0], record[2].lexname()) if record[2] is not None
                  else (record[0], None) for record in output]
        return render(request, 'output_wsd.html', {'output': output})
    return render(request, 'form_wsd.html', {})
def disambiguate_pipe(df, name=None):
    """Return a list of 2-tuples (s1_disam, s2_disam), one for each sentence pair
    in the dataframe, where each element is a list of disambiguated 2-tuples
    (word, synset).

    Args:
        df: the source dataframe with columns [s1, s2].
        name (str, optional): the name of the dataframe. Defaults to None.

    Returns:
        list: the disambiguated sentence pairs, e.g.
            [
                (
                    [(word, wn_synset), (word, wn_synset), ...],  # s1
                    [(word, wn_synset), (word, wn_synset), ...],  # s2
                ),
                ...
            ]
    """
    from pywsd import disambiguate, max_similarity
    from pywsd.lesk import adapted_lesk

    print(f"Disambiguating {name}...")
    disambiguated = []
    for s1, s2 in zip(df["s1"], df["s2"]):
        s1_disam = disambiguate(s1, adapted_lesk, prefersNone=True)
        s2_disam = disambiguate(s2, adapted_lesk, prefersNone=True)
        disambiguated.append((s1_disam, s2_disam))
    return disambiguated
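# A minimal usage sketch (not part of the original); the two sentence pairs are
# illustrative only and assume pandas is available.
import pandas as pd

pairs = pd.DataFrame({
    "s1": ["I went to the bank to deposit money."],
    "s2": ["The river bank was muddy after the rain."],
})
for s1_disam, s2_disam in disambiguate_pipe(pairs, name="demo"):
    print(s1_disam)
    print(s2_disam)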
def get_expanded_query(self, q, args=None):
    res = []
    disamb = disambiguate(q)
    for i, t in enumerate(disamb):
        if t[1] is not None:
            if not self.replace:
                res.append(t[0])
            x = t[1].name().split('.')[0].split('_')
            if t[0].lower() != (' '.join(x)).lower() or self.replace:
                res.append(' '.join(x))
        else:
            res.append(t[0])
    return ' '.join(res)
def produce_single_repr(self, field_data: Union[List[str], str]) -> FeaturesBagField:
    """
    Produces a bag of features whose keys are WordNet synsets and whose values
    are the frequencies of those synsets in the field data text
    """
    field_data = check_not_tokenized(field_data)
    synsets = disambiguate(field_data)
    synsets = [synset for word, synset in synsets if synset is not None]
    return FeaturesBagField(Counter(synsets))
def getSimilarity(s1, s2):
    s1_wsd = disambiguate(s1)  # using default disambiguation
    s2_wsd = disambiguate(s2)
    # remove None synsets
    s1_wsd = [syn for syn in s1_wsd if syn[1]]
    s2_wsd = [syn for syn in s2_wsd if syn[1]]
    # vector_length = max(len(s1_wsd), len(s2_wsd))
    L1, L2 = _synset_similarity(s1_wsd, s2_wsd)
    V1 = np.array([max(L1[key]) for key in L1.keys()])
    V2 = np.array([max(L2[key]) for key in L2.keys()])
    S = np.linalg.norm(V1) * np.linalg.norm(V2)
    C1 = sum(V1 >= benchmark_similarity)
    C2 = sum(V2 >= benchmark_similarity)
    Xi = (C1 + C2) / gamma
    if C1 + C2 == 0:
        Xi = max(V1.size, V2.size) / 2
    return S / Xi
def produce_content(self, field_representation_name: str, field_data) -> FeaturesBagField:
    """
    Produces a bag of features whose keys are WordNet synsets and whose values
    are the frequencies of those synsets in the field data text
    """
    field_data = check_not_tokenized(field_data)
    synsets = disambiguate(field_data)
    synsets = [synset for word, synset in synsets if synset is not None]
    return FeaturesBagField(field_representation_name, Counter(synsets))
def wsd(self, sentence):
    result = disambiguate(sentence)
    list_item = []
    for item in result:
        item = {
            "word": item[0],
            "synset": str(item[1]).replace("Synset('", "").replace("')", "")
        }
        list_item.append(item)
    return list_item
def word_sense_disambiguate(query):
    query = query[0]
    print(query)
    res = []
    disamb = disambiguate(query)
    print(disamb)
    for i, t in enumerate(disamb):
        print((i, t))
        if t[1] is not None:
            res.append(t[0])
            x = t[1].name().split('.')
            y = x[0].split('_')
            if t[0].lower() != (' '.join(y)).lower():
                res = res + y
        else:
            res.append(t[0])
    # print(' '.join(res))
    return ' '.join(res)
def translate(text, year):
    parsing = disambiguate(text)
    translated = []
    for word in parsing:
        if word[-1] is not None:
            # synms = getNgramsWrapper(','.join(word[-1].lemma_names()), year)
            print(','.join(word[-1].lemma_names()))
            print()
            synms = {"hi": "bye"}  # placeholder frequencies while getNgramsWrapper is disabled
            maxWord = list(synms.keys())[0]
            max_count = list(synms.values())[0]
            for synm in synms:
                if synms[synm] > max_count:
                    max_count = synms[synm]
                    maxWord = synm
            translated.append(maxWord)
        else:
            translated.append(word[0])
    return " ".join(translated)
def main(args):
    file_path = args.file_path
    sentences = MySentences(file_path)
    # for i in sentences:
    #     print(i)
    with open(args.pwd_file_path, 'w', encoding='utf-8') as f:
        for i in sentences:
            if len(i) > 5:
                ls = []
                try:
                    for word_sen in disambiguate(' '.join(i), algorithm=cosine_lesk):
                        if word_sen[1] is None:
                            ls.append(word_sen[0])
                        else:
                            ls.append(word_sen[0] + '|' + word_sen[1].name())
                    f.write(' '.join(ls))
                    f.write('\n')
                except:
                    print(' '.join(i))
def extract_global_bag_of_synsets(commentList):
    corpus = []
    global_synset_set = set()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    # ISSUE: throws away named entities
    i = 0
    for art in commentList.items():
        for comm in art[1]:
            filtered_words = []
            for sentence in sent_detector.tokenize(comm.body.strip()):
                # print(sentence)
                dis = disambiguate(sentence, algorithm=maxsim, similarity_option='wup')
                for w in dis:
                    # keep only resolved senses, and only nouns and verbs
                    if w[1] is None:
                        continue
                    if not w[1].pos() == wn.NOUN and not w[1].pos() == wn.VERB:
                        continue
                    # print(w[0], " - ", w[1], " - ", w[1].definition())
                    filtered_words.append(w[1])
                    global_synset_set.add(w[1])

            corpus.append(filtered_words)
            i += 1
            print(i)
            if i % 1000 == 0:
                print(i, "processed")
                break
        if i % 1000 == 0:
            print(i, "processed")
            break
    return global_synset_set, corpus
def tokenize_corpus(self, corpus, tokenize=True):
    """
    Tokenizes the corpus prior to training. For each word in the corpus we
    compute the sense of that word and replace the word with word_sense.
    For example: "cat" can become "cat.n.01".
    :param corpus: the corpus as a string.
    :param tokenize: True if the corpus should be tokenized beforehand.
    :return: the tokenized corpus.
    """
    # convert the corpus to sentences
    corpus = [' '.join(sentence) for sentence in corpus]
    if not tokenize:
        return corpus

    print('Starting to tag corpus')
    corpus_tags = []
    counter = 0.0
    for sentence in corpus:
        if (counter % 100000) == 0:
            print(counter / len(corpus) * 100, " percent complete \r")
        try:
            # get the sense of each word in the sentence
            tagged_sentence = disambiguate(sentence)
            corpus_tags.append(tagged_sentence)
        except IndexError:
            print("pywsd can't handle the sentence: " + sentence)
        counter += 1

    # build a dictionary mapping each word to all the senses it was assigned
    for sentence in corpus_tags:
        for tag in sentence:
            if tag[1] is None:
                continue
            cur_set = self.token_dict.get(tag[0], set())
            cur_set.add(tag[1].name())
            self.token_dict[tag[0]] = cur_set

    # create the tagged corpus in a format ready for training
    tagged_corpus = [[word[1].name() for word in sentence if word[1] is not None]
                     for sentence in corpus_tags]
    return tagged_corpus
async def extract_wsd(request, target):
    """
    $ curl -d '{"sents":"The sheet is twenty centimeters."}' \
      -H "Content-Type: application/json" -X POST \
      localhost:1700/en/wsd/default | json
    :param request:
    :return:
    """
    from pywsd import disambiguate
    from pywsd.similarity import max_similarity as maxsim
    from pywsd.lesk import simple_lesk

    rd = request.json
    sents = rd['sents']
    extract_syn = lambda r: (r[0], r[1].name(), r[1].definition())

    def extract_sents():
        rs = disambiguate(sents)
        return [extract_syn(r) for r in rs if r[1]]

    fn_map = {
        'default': lambda: extract_sents(),
        'maxsim': lambda: [
            extract_syn(r)
            for r in disambiguate(sents, algorithm=maxsim,
                                  similarity_option='wup', keepLemmas=False)
            if r[1]
        ],
        'lesk': lambda: simple_lesk(sents, rd['word']),
    }
    result = fn_map[target]() if target in fn_map else []
    return json(result)
def disambiguateWordSenses2(self, sentence):
    # disambiguation without simple_lesk
    synsets = disambiguate(sentence)
    print(synsets)
    lst = []
    for word in synsets:
        if word[1]:
            pos = word[1].pos()
            offset = word[1].offset()
            print("$$$$$$$$$$$$$$$$")
            print(word[1], pos, offset)
            pos_score = 0.0
            neg_score = 0.0
            if (pos, offset) in self.db:
                pos_score, neg_score = self.db[(pos, offset)]
                # print(word[0], pos_score, neg_score)
            obj = 1.0 - (pos_score + neg_score)
        else:
            pos = None
            obj = 1.0
            pos_score = 0.0
            neg_score = 0.0
        lst.append((word[0], obj, pos, pos_score, neg_score))
    return lst
def translate(text, year):
    parsing = disambiguate(text)
    print(parsing)
    translated = []
    for word in parsing:
        if word[-1] is not None:
            print(word[-1].lemma_names())
            synms = word[-1].lemma_names()
            maxWord = synms[0]
            max_count = 0
            for synm in synms:
                freq_count = get_freq(str(synm), year)
                if freq_count > max_count:
                    max_count = freq_count
                    maxWord = synm
            translated.append(maxWord)
        else:
            translated.append(word[0])
    print('input text: ', text)
    print()
    print("final result: ", ' '.join(translated))
    print()
    return ' '.join(translated)
from pandas import DataFrame
from pywsd import disambiguate


def wordSense(text):
    # Use a dictionary to collect the data, then convert it to a pandas DataFrame
    dic = {}
    word = []
    sense = []
    definition = []
    # apply the disambiguation function
    answer = disambiguate(text)
    # iterate over each element returned by disambiguate()
    for t in answer:
        if all(t):
            word.append(t[0])
            sense.append(t[1])
            definition.append(t[1].definition())
    # store the lists in the dictionary
    dic['word'] = word
    dic['sense'] = sense
    dic['definition'] = definition
    # return as a DataFrame
    return DataFrame(dic)
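# A minimal usage sketch (not part of the original); the sentence is illustrative only.
df = wordSense("I went to the bank to deposit my money")
print(df[['word', 'definition']])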
def thesaurus_expand(query, wikimedia, size=3, threshold=4.23):
    """
    WordNet hierarchy:
    - hyponyms: concepts that are more specific (immediate); navigate down the tree
    - hypernyms: more general concepts; navigate up the hierarchy
    - meronyms: components, e.g. a tree has a trunk as a meronym
    - holonyms: things that contain meronyms (i.e. the tree)

    Query expansion requires good relevance-feedback methods. Thesaurus-based query
    expansion might decrease performance and has query-drift problems with polysemic
    words. This method picks up keywords from the gloss of the synsets and uses a
    Lesk algorithm to disambiguate terms from each other.
    :param query:
    :return:
    """
    analyzer = ThesaurusExpansionAnalyzer()
    wikimedia_analyzer = WikimediaAnalyzer()
    original_tokens = [i.text for i in analyzer(query)]
    # original_tokens = set([i.text for i in query.all_tokens()])
    print(original_tokens)

    synonyms = set()
    rule = r"""
        NBAR:
            {<NN>}
            {<JJ>}
            # {<JJS>}
            {<NNS>}
            # {<NNP>}
    """
    synsets = []

    # for i in original_tokens:
    #     for s in wordnet.synsets(i):
    #         for h in s.hypernyms():
    #             print(s, h, s.wup_similarity(h))

    # for i in original_tokens:
    #     for s in wordnet.synsets(i):
    #         print(s.definition())

    for w, s in disambiguate(" ".join(original_tokens), algorithm=adapted_lesk):
        if s:
            definition = s.definition()
            pke_text = definition + ' ' + ' '.join(s.lemma_names())
            # print(pke_key_phrase_extract(pke_text))
            tokens = [i.text for i in wikimedia_analyzer(definition)]
            synsets.append((w, wordnet.synset(s.name()), tokens))

    for word, sense, definition in synsets:
        if sense:
            synonyms = synonyms.union(
                noun_groups(word_tokenize(sense.definition()), chunk_size=1, rule=rule))
            text = " ".join([i.name() for i in sense.lemmas()])
            for lemma in wikimedia_analyzer(text):
                if lemma.text not in original_tokens:
                    synonyms.add(lemma.text)
                # for tok in wikimedia_analyzer(lemma.text):
                #     print(tok.text)
                #     if tok.text not in original_tokens:
                #         synonyms.add(tok.text)

    # for token in tokens:
    #     for _, original_sense, _ in synsets:
    #         for child_synset in wordnet.synsets(token):
    #             if child_synset:
    #                 definition = [i.text for i in analyzer(child_synset.definition())]
    #                 score = wordnet.synset(original_sense.name()).path_similarity(
    #                     child_synset, simulate_root=False)
    #                 print(child_synset, child_synset.definition(), original_sense, score)

    # print(tokens)
    # print([j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.simple_lesk)], '\n',
    #       [j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.adapted_lesk)], '\n',
    #       [j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.cosine_lesk)], '\n',
    #       [j.definition() for i, j in pywsd.disambiguate(query, algorithm=pywsd.max_similarity)])

    # if len(_concept) > 0:
    #     concept, similarity_strength = _concept[0]
    #     if similarity_strength > 0.7:
    #         __retrieve_definition_groupings(synsets)
    # else:
    #     print(__retrieve_definition_groupings(synsets))

    # disambiguated_senses = disambiguate(query, algorithm=adapted_lesk)
    # print(disambiguated_senses, '\n\n', simple_lesk, '\n\n', resnik_wsd(word_tokenize(query)), '\n')

    # for token in original_tokens:
    #     senses = wordnet.synsets(token, 'n')
    #     if len(senses) == 1:
    #         synonyms = synonyms.union(set(senses[0].lemma_names()))
    #     else:
    #         # tokens += [i.text for i in analyzer(' '.join(list(synonyms)))]
    # return original_tokens + [i for i in tokens if i not in original_tokens]

    reader = wikimedia.reader
    terms_vec = {}
    for syn in synonyms:
        score = calc_syn_score(syn, reader)
        terms_vec[syn] = score
        # else:
        #     terms_vec[syn] = 0

    ranked_terms = sorted(terms_vec, key=lambda c: terms_vec[c], reverse=True)

    print('***Ranked terms')
    for i in list(map(lambda q: (q, terms_vec[q]), ranked_terms)):
        print(i[0], ' ', i[1], '\n')

    return list(map(lambda q: q[0],
                    filter(lambda v: v[1] >= threshold, terms_vec.items())))
import nltk
from pywsd.lesk import simple_lesk
from pywsd.lesk import adapted_lesk
from pywsd.lesk import cosine_lesk
from pywsd import disambiguate
from pywsd.similarity import max_similarity as maxsim
from pywsd.utils import has_synset

simplelesk_answer = []
adaptedlesk_answer = []
cosinelesk_answer = []

print("\nSentence Context Disambiguation\n============================== \n")
raw_sentence = "Some people are happy this sentence."
words = nltk.word_tokenize(raw_sentence)

print("\nChecking synsets of each word . . .\n==========================================\n")
print(disambiguate(raw_sentence))

print("\nDisambiguating your sentence word by word using Simple Lesk algorithm. Hold on. \n======================================================")
for eachword in words:
    if has_synset(eachword):
        answer = simple_lesk(raw_sentence, eachword)
        simplelesk_answer.append(answer)
        print("Sense :", answer)
        print(eachword + ": " + answer.definition() + "\n")
    else:
        print(eachword + ": " + eachword + "\n")
        simplelesk_answer.append(eachword)

"""
print("\nDisambiguating your sentence word by word using Adapted Lesk algorithm. Hold on. \n======================================================")
for eachword in words:
"""
for tmp2, txtuntkn in enumerate(AcText[act]):  # for each actor's untokenized text in txtuntkn
    # txtuntkn = AcText[act][0]
    # print(txtuntkn['texte'])
    txttkn = tokenize.sent_tokenize(txtuntkn['texte'])
    # print(txttkn)  # tokenized text
    for tmp3, sent in enumerate(txttkn):  # for each sentence in the tokenized text of each actor
        # sent = txttkn[0]
        # wordlist = sent.split()
        # for wrd in wordlist:
        try:
            # if counter < 2:
            dissent = disambiguate(sent, algorithm=maxsim,
                                   similarity_option='wup', keepLemmas=True)
            counter = counter + 1
            print(counter)
            print(dissent)
            for tmp4, diswrd in enumerate(dissent):
                print(diswrd)
                if diswrd[2] is not None:
                    diswrd_brief = (diswrd[0], diswrd[1], diswrd[2].name())
                    txtuntkn['Textwsd'].append(diswrd_brief)
        except:
            pass
        # sent = sent.replace('\n', ' ')
        # print(tmp3)
        # print(disambiguate(sent, algorithm=maxsim, similarity_option='wup', keepLemmas=True))
from pywsd import disambiguate
from nltk import sent_tokenize

text = "Python is a widely used general-purpose, high-level programming language. Its design philosophy emphasizes code readability, and its syntax allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java. The language provides constructs intended to enable clear programs on both a small and large scale. Python supports multiple programming paradigms, including object-oriented, imperative and functional programming or procedural styles."

for sent in sent_tokenize(text):
    print(disambiguate(sent, prefersNone=True))
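# A small hedged sketch (not part of the original) showing how the algorithm
# argument seen in other examples can be swapped between pywsd's Lesk variants;
# the sentence is illustrative only.
from pywsd import disambiguate
from pywsd.lesk import adapted_lesk, cosine_lesk

sentence = "The plant needs water and sunlight to grow."
for algorithm in (adapted_lesk, cosine_lesk):
    print(algorithm.__name__, disambiguate(sentence, algorithm=algorithm, prefersNone=True))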
def nominalized_verb_detection(docID, doc, sent):
    sentences = tokenize.sent_tokenize(sent)
    result = []
    result1 = []
    true_word = []
    false_word = []
    # word count for each sentence
    word_count = []
    # number of nominalizations in each sentence
    nomi_count = []
    sen_id = -1
    # the sentences, for printing in the output
    sentence = []
    # the nominalizations found in each sentence
    nomi_sen = []
    nomi_sen_ = ""

    def is_pos(s, pos):
        # print(s)
        return s.split('.')[1] == pos

    for each_sen in sentences:
        sen_id += 1
        nomi_count.append(0)
        word_count.append(0)
        sentence.append(each_sen)
        words_with_tags = disambiguate(each_sen)
        for tup in words_with_tags:
            word, syns = tup
            if (word in string.punctuation) or (word == "\"") or (word[0] == "\'") or (word[0] == "`"):
                continue
            word_count[sen_id] += 1
            derivationals = []
            word = word.lower()
            if word in true_word:
                nomi_count[sen_id] += 1
                if nomi_sen_ == "":
                    nomi_sen_ = word
                else:
                    nomi_sen_ = nomi_sen_ + "; " + word
                noun_cnt[word] += 1
                nominalized_cnt[word] += 1
                continue
            if word in false_word:
                noun_cnt[word] += 1
                continue
            if syns:
                # look only at nouns
                if not is_pos(syns.name(), 'n'):
                    result.append([word, False])
                    false_word.append(word)
                    noun_cnt[word] += 1
                    continue
                if wn.lemmas(word):
                    for lemma in wn.lemmas(word):
                        derive = lemma.derivationally_related_forms()
                        if derive not in derivationals and derive:
                            derivationals.append(derive)
                else:
                    try:
                        derivationals = syns.lemmas()[0].derivationally_related_forms()
                    except:
                        pass
                stem = first_section.match(str(syns.name())).group(1)
                found = False
                for deriv in derivationals:
                    if is_pos(str(deriv), 'v'):
                        deriv_str = str(deriv)[7:-3].split('.')[3]
                        if len(word) <= len(deriv_str):
                            continue
                        result.append([word, True])
                        true_word.append(word)
                        noun_cnt[word] += 1
                        if nomi_sen_ == "":
                            nomi_sen_ = word
                        else:
                            nomi_sen_ = nomi_sen_ + "; " + word
                        nominalized_cnt[word] += 1
                        found = True
                        break
                if found:
                    nomi_count[sen_id] += 1
                    continue
            else:
                result.append([word, False])  # includes word='NO NOMINALIZATION'
                noun_cnt[word] += 1
        nomi_sen.append(nomi_sen_)
        nomi_sen_ = ""

    for i in range(sen_id + 1):
        # Columns: 'Document ID', 'Document', 'Sentence ID', 'Sentence',
        # 'Number of words in sentence', 'Nominalized verbs',
        # 'Number of nominalizations in sentence', 'Percentage of nominalizations in sentence'
        if word_count[i] > 0:
            result1.append([
                docID,
                IO_csv_util.dressFilenameForCSVHyperlink(doc), i + 1, sentence[i],
                word_count[i], nomi_sen[i], nomi_count[i],
                100.0 * nomi_count[i] / word_count[i]
            ])
        else:
            result1.append([
                docID,
                IO_csv_util.dressFilenameForCSVHyperlink(doc), i + 1, sentence[i],
                word_count[i], nomi_sen[i], nomi_count[i]
            ])
    # print(result1)
    # result contains, for each word, a TRUE/FALSE value for nominalization
    # result1 contains docID, docName, sentence-level statistics, ...
    return result, result1
def text_to_emoji(text):
    tokens = disambiguate(text, algorithm=max_similarity,
                          similarity_option='wup', keepLemmas=True)
    return " ".join([token_to_emoji(t) for t in tokens])
def disambiguate_max(self, sentence):
    from pywsd.similarity import max_similarity as maxsim
    return disambiguate(sentence, algorithm=maxsim,
                        similarity_option='wup', keepLemmas=True)
def compare_defin(defin1, defin2):
    return len([item for item in set(disambiguate(defin1)).intersection(disambiguate(defin2))
                if filter_stopwords(item)])
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pywsd

# p = input()
# p = 'I was walking around a river bank while looking at fishes in the water.'
p = 'I am going to bank to deposit my money'

new = [
    word for word in word_tokenize(p)
    if word.lower() not in stopwords.words('English') and word not in string.punctuation
]
new = ' '.join(new)

for word, context in pywsd.disambiguate(new):
    print(word.ljust(15, ' '), ': ', context.definition() if context else 'Not Found')
        # tail of the negate() helper; the beginning of the function is not part of this excerpt
        token_pair_list.append((word, negation))
        org_tokens.append(word)
    return org_tokens, token_pair_list


## get all definitions from all synsets
synset_list = list(wn.all_synsets())
all_pairs_from_definition = []
# for ss in tqdm([wn.synset('amazing.s.02'), wn.synset('good.a.01')]):
for ss in tqdm(synset_list):
    df = ss.definition()
    curr_df_pair_list = disambiguate(df, cosine_lesk)
    df_pair_txt_list = []
    for curr_df_pair in curr_df_pair_list:
        if curr_df_pair[1] is None:
            df_pair_txt_list.append(curr_df_pair)
        else:
            df_pair_txt_list.append((curr_df_pair[0], curr_df_pair[1].name()))
    all_pairs_from_definition.append((ss.name(), df_pair_txt_list))

with open('all_wn_synset_definition_da_cosine.txt', 'w') as fp:
    # write() needs a string, not a list
    fp.write(str(all_pairs_from_definition))

pickle.dump(all_pairs_from_definition,
            open("all_wn_synset_definition_da_cosine.p", "wb"))

# ## Prepare graphical model
def lemmatized_pos_selected_overlap(question, story):
    q = question["question"]
    punct = set(string.punctuation)
    q_sent_token = nltk.sent_tokenize(q)
    q_word_token = [nltk.word_tokenize(word) for word in q_sent_token]
    q_word_tagged = [nltk.pos_tag(word) for word in q_word_token]
    stop_words = nltk.corpus.stopwords.words("english")
    key_question_words = set([
        (tup[0].lower(), tup[1]) for ls in q_word_tagged for tup in ls
        if tup[0].lower() not in stop_words and tup[0].lower() not in punct and '\'' not in tup[0]
    ])
    # print(key_question_words)
    # print(q_word_tagged)

    sentence_id_sent = [(dic["sentenceid"], nltk.sent_tokenize(dic["sentence"])) for dic in story]
    # print(sentence_id_sent)
    # story_sentences = list(map(lambda x: nltk.sent_tokenize(x), [dic["sentence"] for dic in story]))
    # print(story_sentences)
    # sentences = [nltk.word_tokenize(word) for sent in story_sentences for word in sent]
    # sentences_tagged = [nltk.pos_tag(ls) for ls in sentences]
    sentences = [(tup[0], nltk.word_tokenize(word), tup[1])
                 for tup in sentence_id_sent for word in tup[1]]
    sentences_tagged = [(tup[0], nltk.pos_tag(tup[1]), tup[2]) for tup in sentences]

    lemmatizer = WordNetLemmatizer()
    pos_match = {"NN": 'n', "JJ": 'a', "VB": 'v', "RB": 'r'}
    pos_match.setdefault('n')
    # key_question_words = set(map(lambda w: lemmatizer.lemmatize(w[0], pos=pos_match.get(re.match('^(..?)\w*', w[1]).group(0), 'n')), key_question_words))
    stemmer = SnowballStemmer('english')
    key_question_words = set(map(lambda w: stemmer.stem(w[0]), key_question_words))
    # key_question_w_posDict = {}
    # for stem, pos in key_question_words:
    #     key_question_w_posDict[stem] = pos
    # key_question_words = set(map(lambda w: w[0], key_question_words))

    question_word = get_question_word(q)
    q_disambiguated = disambiguate(q)
    set_q_synsets = set(map(lambda w: w[1], q_disambiguated))
    set_q_synsets.discard(None)  # discard() avoids a KeyError when no None is present

    answers = []
    for sent in sentences_tagged:
        key_sentence_words = set([
            (tup[0].lower(), tup[1]) for tup in sent[1]
            if tup[0].lower() not in stop_words and tup[0].lower() not in punct and '\'' not in tup[0]
        ])
        # key_sentence_words = set(map(lambda w: lemmatizer.lemmatize(w[0], pos=pos_match.get(re.match('^(..?)\w*', w[1]).group(0), 'n')), key_sentence_words))
        key_sentence_words = set(map(lambda w: (stemmer.stem(w[0]), w[1]), key_sentence_words))
        key_sentence_w_posDict = {}
        for stem, pos in key_sentence_words:
            key_sentence_w_posDict[stem] = pos

        sen_disambiguated = disambiguate(sent[2][0])
        set_sen_synsets = set(map(lambda w: w[1], sen_disambiguated))
        set_sen_synsets.discard(None)
        # set_sen_synsets = set(map(lambda w: re.match()))
        # print(disambiguate('Where did Andrew and his dad go'))
        # for word, syn in sen_disambiguated:
        #     if syn is not None:
        #         print(syn)
        #         print(wn.synset('circus.n.05').definition())
        #         print(wn.synset('circus.n.05')._lexname)
        #         print(syn._pos)
        #         print(syn._lemmas)
        #         synRe = re.match('(\w+)\.(\w+)\.(\w+)', syn)

        key_sentence_words = set(map(lambda w: w[0], key_sentence_words))
        overlap = 0
        overlapList = (key_question_words & key_sentence_words)
        for word in overlapList:
            if "nn" in key_sentence_w_posDict[word].lower():
                overlap += .5
            elif "vb" in key_sentence_w_posDict[word].lower():
                overlap += 2.5
            elif "rb" in key_sentence_w_posDict[word].lower():
                overlap += .25
            else:
                overlap += 1

        synsetOverlap = (set_q_synsets & set_sen_synsets)
        synsetOverlap = set(filter(lambda q: q is not None, synsetOverlap))
        overlap += len(synsetOverlap)
        # print(key_sentence_words)
        answers.append((overlap, (sent[0], key_sentence_words)))

    answers = sorted(answers, key=operator.itemgetter(0), reverse=True)
    best_answer = (answers[0])[1]
    # print("answer:", " ".join(tup[0] for tup in best_answer))
    # print(best_answer)
    # if question_word == 'why':
    #     bestAnswerIndex = 0
    #     if (answers[0][0] / len(answers[0][1]) >= .9):
    #         for i in range(len(sentence_id_sent)):
    #             if sentence_id_sent[i][0] == best_answer[0]:
    #                 bestAnswerIndex = i
    #         if bestAnswerIndex < len(sentence_id_sent):
    #             # print(sentence_id_sent[i][0])
    #             return sentence_id_sent[i][0]
    answerid = best_answer[0]
    return answerid  # , " ".join(tup[0] for tup in best_answer[1])
        ):
            if example == examplelist:
                context_syn.append(synsetelem)
    # meaning specified -> map meaning to pronunciation
    classified_pronun = map_meaning2pronun(context_syn, targetword, targettag)
    result_hetero.setdefault(targetword, [])
    result_hetero[targetword].append(classified_pronun)
# noun case
elif 'NN' in targettag:
    result_syn = []
    hetero_num_in_sentence_noun += 1
    iteration = 0
    # pywsd used
    wsdlist = disambiguate(sentence)
    for wsdentry in wsdlist:
        if wsdentry[0].lower() == targetword and wsdentry[1] is not None:
            if wsdentry[1].pos() == 'n':
                iteration += 1
                if hetero_num_in_sentence_noun == iteration:
                    result_syn.append(wsdentry[1])
    classified_pronun = map_meaning2pronun(result_syn, targetword, targettag)
    result_hetero.setdefault(targetword, [])
    result_hetero[targetword].append(classified_pronun)

# pattern heteronym
if len(pos_heterolist) > 0: