# Imports assumed from the surrounding pywsd module (pywsd/lesk.py);
# simple_signatures is defined alongside this function in that module.
from nltk.corpus import wordnet as wn

from pywsd.cosine import cosine_similarity as cos_sim
from pywsd.utils import lemmatize, lemmatize_sentence


def cosine_lesk(context_sentence, ambiguous_word, pos=None,
                lemma=True, stem=True, hyperhypo=True, stop=True,
                context_is_lemmatized=False, nbest=False, from_cache=True):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If the ambiguous word is not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem,
                                hyperhypo, stop, from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    # Sort on the score only: Synset objects are not orderable, so a plain
    # tuple sort would raise TypeError on tied scores.
    scores = sorted(scores, key=lambda s: s[0], reverse=True)
    return scores if nbest else scores[0][1]
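# `cos_sim` above comes from the host project and computes cosine similarity
# between two strings. A minimal self-contained sketch of that measure,
# assuming whitespace tokenization and raw term counts (the real pywsd
# implementation may differ in tokenization and weighting):
import math
from collections import Counter

def cos_sim_sketch(a: str, b: str) -> float:
    """Cosine similarity between two strings over raw token counts."""
    va, vb = Counter(a.split()), Counter(b.split())
    dot = sum(va[t] * vb[t] for t in va if t in vb)
    norm = (math.sqrt(sum(c * c for c in va.values()))
            * math.sqrt(sum(c * c for c in vb.values())))
    return dot / norm if norm else 0.0

# e.g. cos_sim_sketch("deposit money in the bank", "bank money deposit")
# returns a value in [0, 1]; identical bags of words score 1.0.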
# A variant of the same function: signature preprocessing (punctuation and
# stopword removal, lemmatization, stemming) happens inline rather than
# inside the signature builder. Assumed helpers from the surrounding module:
# `string` (stdlib), nltk's `word_tokenize`, an EN_STOPWORDS list, and a
# `porter` stemmer (e.g. nltk.stem.PorterStemmer()).
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If the ambiguous word is not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma,
                                          stem, hyperhypo)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Remove punctuation.
        signature = [i for i in word_tokenize(signature)
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: lemmatize the tokens.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    # Sort on the score only (Synsets are not orderable on ties).
    scores = sorted(scores, key=lambda s: s[0], reverse=True)
    if not nbest:
        return scores[0][1]
    # For nbest, return (synset, score) pairs, best first.
    return [(ss, score) for score, ss in scores]
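# A self-contained sketch of the per-signature normalization performed in
# the loop above, assuming plain NLTK components rather than the host
# project's helpers (requires the NLTK punkt, stopwords, and wordnet data):
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def normalize_signature(text: str) -> list:
    """Tokenize, drop punctuation and stopwords, lemmatize, then stem."""
    stops = set(stopwords.words("english"))
    wnl, stemmer = WordNetLemmatizer(), PorterStemmer()
    tokens = [t for t in word_tokenize(text.lower().replace("_", " "))
              if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stops]
    tokens = [wnl.lemmatize(t) for t in tokens]
    return [stemmer.stem(t) for t in tokens]

# e.g. normalize_signature("depository financial_institution a bank")
# yields roughly ['depositori', 'financi', 'institut', 'bank'].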
# Extended-inventory variant (identifiers are Portuguese: "inventario
# estendido" = extended inventory, "busca_ampla" = broad search,
# "chave_assinatura" = signature cache key). Signatures are built once per
# parameter combination and memoized in DesWordnet.cache_assinaturas.
def cosine_lesk_inventario_estendido(context_sentence, ambiguous_word,
                                     pos=None, lemma=True, stem=True,
                                     hyperhypo=True, stop=True,
                                     context_is_lemmatized=False, nbest=False,
                                     synsets_signatures=None, busca_ampla=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that the ambiguous word is a lemma.
    if lemma:
        ambiguous_word = lemmatize(ambiguous_word)
    # If the ambiguous word is not in the extended inventory, return None
    # (this replaces the plain wn.synsets(ambiguous_word) check).
    if not criar_inventario_des_wn(ambiguous_word, busca_ampla=busca_ampla):
        return None
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    chave_assinatura = "%s.%s.%s.%s.%s.%s" % (ambiguous_word, pos, lemma,
                                              stem, hyperhypo, busca_ampla)

    if chave_assinatura not in DesWordnet.cache_assinaturas:
        # Cache miss: build the signatures and store the normalized result.
        synsets_signatures = simple_signature(ambiguous_word, pos, lemma,
                                              stem, hyperhypo,
                                              busca_ampla=busca_ampla)
        DesWordnet.cache_assinaturas[chave_assinatura] = []
        for ss, signature in synsets_signatures.items():
            # Lowercase and replace "_" with spaces.
            signature = " ".join(map(str, signature)).lower().replace("_", " ")
            # Remove punctuation.
            signature = [i for i in Util.word_tokenize(signature)
                         if i not in string.punctuation]
            # Normalize: stopword removal, lemmatization, stemming.
            signature = Util.normalizar_ctx(signature, stop=stop,
                                            lematizar=lemma, stem=stem)
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))
            DesWordnet.cache_assinaturas[chave_assinatura].append((ss, signature))
    else:
        # Cache hit: reuse the stored, already-normalized signatures.
        synsets_signatures = DesWordnet.cache_assinaturas[chave_assinatura]
        for ss, signature in synsets_signatures:
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    # Sort on the score only (Synsets are not orderable on ties).
    scores = sorted(scores, key=lambda s: s[0], reverse=True)
    if not nbest:
        return scores[0][1]
    # For nbest, return (synset, score) pairs, best first.
    return [(ss, score) for score, ss in scores]
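# The cache above is project-specific. A minimal self-contained sketch of
# the same memoization pattern, assuming a module-level dict and a
# hypothetical build_fn standing in for simple_signature(...):
_SIGNATURE_CACHE = {}

def cached_signatures(word, pos, lemma, stem, hyperhypo, busca_ampla,
                      build_fn):
    """Build signatures once per parameter combination, then reuse them."""
    # A tuple key avoids the collisions a "%s.%s..." string key can suffer
    # when a field itself contains a dot.
    key = (word, pos, lemma, stem, hyperhypo, busca_ampla)
    if key not in _SIGNATURE_CACHE:
        # build_fn is assumed to return {synset: [signature tokens, ...]}.
        _SIGNATURE_CACHE[key] = list(build_fn(word).items())
    return _SIGNATURE_CACHE[key]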