from itertools import chain

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

# lemmatize(), sim() and EN_STOPWORDS are assumed to be defined elsewhere in
# this module (a WordNet lemmatizer, a synset-to-synset similarity function,
# and the English stopword list).


def synsetSignatures(ss: "wn.Synset") -> set:
    signature = []
    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                        ss.instance_hyponyms() + ss.instance_hypernyms())
    signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk:
    # lemma_names from holonyms, meronyms and similar_tos.
    related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                         ss.substance_holonyms() +
                         ss.member_meronyms() + ss.part_meronyms() +
                         ss.substance_meronyms() +
                         ss.similar_tos())
    signature += set(chain(*[i.lemma_names() for i in related_senses]))

    # Lowercase.
    signature = set(s.lower() for s in signature)
    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS)
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(s) for s in signature]
    # Keep only the unique bag-of-words.
    return set(signature)
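# A minimal usage sketch for synsetSignatures (not part of the original
# module; 'bank' is just an illustrative ambiguous word, and the NLTK
# 'wordnet' and 'punkt' data are assumed to be downloaded):
if __name__ == "__main__":
    bank_ss = wn.synsets("bank")[0]  # First sense of 'bank'.
    # The signature is a bag of lemmatized, lowercased content words drawn
    # from the gloss, examples, lemma names and related-synset lemma names.
    print(sorted(synsetSignatures(bank_ss))[:10])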
def max_similarity(context_sentence: str, ambiguous_word: str, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None,
                   best=True) -> "wn.Synset":
    """
    Perform WSD by maximizing the sum of maximum similarity between the
    possible synsets of all words in the context sentence and the possible
    synsets of the ambiguous word (see http://goo.gl/XMq2BI):

        \\arg\\max_{synset(a)} \\sum_{i=1}^{n} \\max_{synset(i)} sim(i, a)

    :param context_sentence: String, a sentence.
    :param ambiguous_word: String, a single word.
    :return: If best, returns only the best Synset, else returns a list of
             (similarity score, Synset) tuples sorted by score.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If the ambiguous word is not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]

    result = {}
    for i in wn.synsets(ambiguous_word, pos=pos):
        result[i] = 0
        for j in context_sentence:
            _result = [0]
            for k in wn.synsets(j):
                _result.append(sim(i, k, option))
            result[i] += max(_result)

    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)

    return result[0][1] if best else result
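# A hedged usage sketch for max_similarity (illustrative only; the example
# sentence is an assumption, not from the original source). With pos="n",
# only noun senses of the ambiguous word are scored against the context:
if __name__ == "__main__":
    sent = "I went to the bank to deposit my money"
    # Should typically select a financial-institution sense of 'bank'.
    print(max_similarity(sent, "bank", option="path", pos="n"))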
def synset_signatures(ss, hyperhypo=True, adapted=False, remove_stopwords=True,
                      to_lemmatize=True, remove_numbers=True, lowercase=True,
                      original_lesk=False, from_cache=True):
    """
    :param ss: A WordNet synset.
    :type ss: nltk.corpus.wordnet.Synset
    """
    if from_cache:
        return synset_signatures_from_cache(ss, hyperhypo, adapted, original_lesk)

    # Collects the signatures from WordNet.
    signature = []
    # Adds the definition.
    signature += word_tokenize(ss.definition())

    # If the original Lesk signature is requested, skip the other signatures.
    if original_lesk:
        return set(signature)

    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    if hyperhypo:
        hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                            ss.instance_hyponyms() + ss.instance_hypernyms())
        signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk:
    # lemma_names from holonyms, meronyms and similar_tos.
    if adapted:
        related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                             ss.substance_holonyms() +
                             ss.member_meronyms() + ss.part_meronyms() +
                             ss.substance_meronyms() +
                             ss.similar_tos())
        signature += set(chain(*[i.lemma_names() for i in related_senses]))

    # Lowercase.
    signature = set(s.lower() for s in signature) if lowercase else signature
    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS) if remove_stopwords else signature

    # Lemmatized context is preferred over stemmed context.
    if to_lemmatize:
        # (Lowercasing, if requested, already happened above, so a single
        # lemmatize() call suffices for both cases.)
        signature = [lemmatize(s)
                     for s in signature
                     # Throw a token away only if remove_numbers is set and it is a digit.
                     if not (remove_numbers and s.isdigit())]

    # Keep only the unique bag-of-words.
    return set(signature)
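# A usage sketch for synset_signatures (an assumption, not part of the
# original module). from_cache=False forces a fresh signature build, and
# adapted=True adds the holonym/meronym/similar_to lemma names as in
# Adapted Lesk:
if __name__ == "__main__":
    dog_ss = wn.synsets("dog")[0]
    sig = synset_signatures(dog_ss, hyperhypo=True, adapted=True,
                            from_cache=False)
    print(len(sig), sorted(sig)[:10])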