def max_similarity(context_sentence, ambiguous_word, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible
    synsets of all words in the context sentence and the possible synsets of
    the ambiguous word (see http://goo.gl/XMq2BI):

        \argmax_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:  # Older NLTK versions expose pos as an attribute, not a method.
            if pos and pos != str(i.pos):
                continue
        result[i] = sum(max([sim(i, k, option) for k in wn.synsets(j)] + [0])
                        for j in context_sentence)
    if option in ["res", "resnik"]:  # Lower score = more similar.
        result = sorted([(v, k) for k, v in result.items()])
    else:  # Higher score = more similar.
        result = sorted([(v, k) for k, v in result.items()], reverse=True)
    if best:
        return result[0][1]
    return result

def max_similarity(context_sentence: str, ambiguous_word: str, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None,
                   best=True) -> "wn.Synset":
    """
    Perform WSD by maximizing the sum of maximum similarity between possible
    synsets of all words in the context sentence and the possible synsets of
    the ambiguous word (see http://goo.gl/XMq2BI):

        \argmax_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)

    :param context_sentence: String, a sentence.
    :param ambiguous_word: String, a single word.
    :return: If best, returns only the best Synset, else returns a dict.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word, pos=pos):
        result[i] = 0
        for j in context_sentence:
            _result = [0]
            for k in wn.synsets(j):
                _result.append(sim(i, k, option))
            result[i] += max(_result)
    if option in ["res", "resnik"]:  # Lower score = more similar.
        result = sorted([(v, k) for k, v in result.items()])
    else:  # Higher score = more similar.
        result = sorted([(v, k) for k, v in result.items()], reverse=True)
    return result[0][1] if best else result

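# A minimal usage sketch for max_similarity above, assuming this module also
# exposes the pywsd-style helpers it relies on (wn, sim, lemmatize,
# word_tokenize) and that the NLTK WordNet data is installed. The sentence and
# the choice of option="path" are illustrative only.
def _demo_max_similarity():
    sent = "I went to the bank to deposit my money"
    best = max_similarity(sent, "bank", option="path", pos="n", best=True)
    if best is not None:
        print(best, "-", best.definition())
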
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos.
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms + ss_simto))

        signature = list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                             for i in related_senses])
                          if j not in EN_STOPWORDS])

        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense

def synset_signatures(ss, hyperhypo=True, adapted=False, remove_stopwords=True,
                      to_lemmatize=True, remove_numbers=True, lowercase=True,
                      original_lesk=False, from_cache=True):
    """
    :param ss: A WordNet synset.
    :type ss: nltk.corpus.wordnet.Synset
    """
    if from_cache:
        return synset_signatures_from_cache(ss, hyperhypo, adapted, original_lesk)
    # Collects the signatures from WordNet.
    signature = []
    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())
    # If the original Lesk signature is requested, skip the other signatures.
    if original_lesk:
        return set(signature)
    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()
    # Includes lemma_names of hyper-/hyponyms.
    if hyperhypo:
        hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                            ss.instance_hyponyms() + ss.instance_hypernyms())
        signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))
    # Includes signatures from related senses as in Adapted Lesk,
    # i.e. lemma_names from holonyms, meronyms and similar_tos.
    if adapted:
        related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() +
                             ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() +
                             ss.similar_tos())
        signature += set(chain(*[i.lemma_names() for i in related_senses]))
    # Lowercase.
    signature = set(s.lower() for s in signature) if lowercase else signature
    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS) if remove_stopwords else signature
    # Lemmatized context is preferred over stemmed context.
    if to_lemmatize:
        signature = [lemmatize(s)
                     for s in signature
                     # Only drop a token if remove_numbers is set and it is a digit string.
                     if not (remove_numbers and s.isdigit())]
    # Keep only the unique bag-of-words.
    return set(signature)

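# A minimal sketch for synset_signatures above, assuming NLTK's WordNet is
# importable as wn. With original_lesk=True and from_cache=False only the
# tokenized definition is returned, while the default/adapted settings also pull
# in examples, lemma names and related senses; the word "bank" is illustrative.
def _demo_synset_signatures():
    ss = wn.synsets("bank")[0]
    gloss_only = synset_signatures(ss, original_lesk=True, from_cache=False)
    full = synset_signatures(ss, adapted=True, from_cache=False)
    print(len(gloss_only), "vs", len(full), "signature words")
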
def simple_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk sits between the original Lesk algorithm (1986) and Adapted
    Lesk (Banerjee and Pederson, 2002): it uses more signature words than the
    former and fewer than the latter.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense

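# A minimal usage sketch for simple_lesk above, assuming the surrounding
# pywsd-style helpers (lemmatize, simple_signature, lemmatize_sentence,
# compare_overlaps) are importable and the NLTK data is installed. The return
# value is expected to be a WordNet Synset; the sentence is illustrative.
def _demo_simple_lesk():
    sent = "I went to the bank to deposit my money"
    sense = simple_lesk(sent, "bank", pos="n")
    if sense is not None:
        print(sense, "-", sense.definition())
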
def synsetSignatures(ss: "wn.Synset"):
    signature = []
    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()
    # Includes lemma_names of hyper-/hyponyms.
    hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                        ss.instance_hyponyms() + ss.instance_hypernyms())
    signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))
    # Includes signatures from related senses as in Adapted Lesk,
    # i.e. lemma_names from holonyms, meronyms and similar_tos.
    related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() +
                         ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() +
                         ss.similar_tos())
    signature += set(chain(*[i.lemma_names() for i in related_senses]))
    # Lowercase.
    signature = set(s.lower() for s in signature)
    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS)
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(s) for s in signature]
    # Keep only the unique bag-of-words.
    return set(signature)

def cosine_lesk(context_sentence, ambiguous_word, pos=None,
                lemma=True, stem=True, hyperhypo=True, stop=True,
                context_is_lemmatized=False, nbest=False, from_cache=True):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores if nbest else scores[0][1]

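# A minimal sketch of how cosine_lesk above is typically called, assuming
# cos_sim and simple_signatures are available in this module. With nbest=True
# it returns the full (score, Synset) ranking instead of a single Synset; the
# sentence and the score filter are purely illustrative.
def _demo_cosine_lesk():
    ranking = cosine_lesk("I sat on the river bank", "bank", nbest=True)
    for score, ss in ranking:
        if score > 0:
            print("%.3f" % score, ss)
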
def adapted_lesk(context_sentence, ambiguous_word, pos=None,
                 lemma=True, stem=False, hyperhypo=True, stop=True,
                 context_is_lemmatized=False, nbest=False, keepscore=False,
                 normalizescore=False, from_cache=True):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet hierarchies
    to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = signatures(ambiguous_word, pos=pos, hyperhypo=hyperhypo, adapted=True,
                         remove_stopwords=stop, to_lemmatize=lemma,
                         remove_numbers=True, lowercase=True, to_stem=stem,
                         from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized \
        else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)

def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature)
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]

def simple_signature(ss, stem=False):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    signature = []
    # Includes definition.
    ss_definition = synset_properties(ss, 'definition')
    signature += word_tokenize(ss_definition)
    # Includes examples.
    ss_examples = synset_properties(ss, 'examples')
    signature += list(chain(*[i.split() for i in ss_examples]))
    # Includes lemma_names.
    ss_lemma_names = synset_properties(ss, 'lemma_names')
    signature += ss_lemma_names

    # Optional: includes lemma_names of hypernyms and hyponyms.
    ss_hyponyms = synset_properties(ss, 'hyponyms')
    ss_hypernyms = synset_properties(ss, 'hypernyms')
    ss_hypohypernyms = ss_hypernyms + ss_hyponyms
    signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

    '''
    # Includes holonyms.
    ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
    ss_part_holonyms = synset_properties(ss, 'part_holonyms')
    ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
    # Includes meronyms.
    ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
    ss_part_meronyms = synset_properties(ss, 'part_meronyms')
    ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
    # Includes similar_tos.
    ss_simto = synset_properties(ss, 'similar_tos')

    related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                              ss_sub_holonyms + ss_mem_meronyms +
                              ss_part_meronyms + ss_sub_meronyms + ss_simto))

    signature += list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                          for i in related_senses])
                       if j not in EN_STOPWORDS])
    '''

    # Optional: removes stopwords.
    signature = [i for i in signature if i.lower() not in EN_STOPWORDS]
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(i) for i in signature]
    # Matching exact words may cause sparsity, so optional matching for stems.
    if stem == True:
        signature = [porter.stem(i) for i in signature]
    return signature

def normalizar_ctx(lista_ctx, stop=True, lematizar=True, stem=True):
    # Normalizes a context token list: optional stopword removal,
    # lemmatization and stemming.
    if stop:
        lista_ctx = [i for i in lista_ctx if i not in stopwords.words('english')]
    if lematizar:
        lista_ctx = [lemmatize(i) for i in lista_ctx]
    if stem:
        lista_ctx = [porter.stem(i) for i in lista_ctx]
    return lista_ctx

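# A minimal sketch for normalizar_ctx ("normalize context") above, assuming
# the NLTK English stopword list, lemmatize and porter are available; the
# input token list is illustrative.
def _demo_normalizar_ctx():
    tokens = ["the", "rivers", "flowing", "banks"]
    print(normalizar_ctx(tokens, stop=True, lematizar=True, stem=True))
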
def original_lesk(context_sentence, ambiguous_word, dictionary=None, from_cache=True):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WordNet definitions.
        dictionary = signatures(ambiguous_word, original_lesk=True, from_cache=from_cache)
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense

def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WordNet definitions.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense

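# A minimal sketch of calling original_lesk above with a hand-rolled
# dictionary, mirroring the fallback branch in the function body where each
# sense maps to its definition text. The sense keys, glosses and the behaviour
# of compare_overlaps_greedy on custom keys are assumptions for illustration.
def _demo_original_lesk():
    toy_dictionary = {
        "bank#finance": "a financial institution that accepts deposits",
        "bank#river": "sloping land beside a body of water",
    }
    sense = original_lesk("I deposit money at the bank", "bank",
                          dictionary=toy_dictionary)
    print(sense)
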
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:  # Older NLTK versions expose pos as an attribute, not a method.
            if pos and str(ss.pos) != pos:
                continue

        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        signature += word_tokenize(ss_definition)
        # Includes examples.
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature

    return synsets_signatures

def lemmatize_sentence(sentence: str, neverstem=False, keepWordPOS=False,
                       tokenizer=word_tokenize, postagger=pos_tag,
                       lemmatizer=wnl, stemmer=porter) -> list:
    words, lemmas, poss = [], [], []
    for word, pos_ in postagger(tokenizer(sentence)):
        pos = penn2morphy(pos_)
        lemmas.append(lemmatize(word.lower(), pos, neverstem, lemmatizer, stemmer))
        poss.append(pos_)
        words.append(word)
    if keepWordPOS:
        return words, lemmas, poss
    return lemmas

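# A minimal sketch for lemmatize_sentence above, assuming word_tokenize,
# pos_tag, penn2morphy, wnl and porter are set up as in pywsd. With
# keepWordPOS=True the original tokens and Penn Treebank tags are returned
# alongside the lemmas; the sentence is illustrative.
def _demo_lemmatize_sentence():
    words, lemmas, tags = lemmatize_sentence("The dogs were running quickly",
                                             keepWordPOS=True)
    for w, l, t in zip(words, lemmas, tags):
        print(w, "->", l, "(%s)" % t)
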
def adapted_lesk(context_sentence: str, ambiguous_word: str,
                 pos: str = None) -> "wn.Synset":
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    # Note: wn.synsets() returns an empty list, never None, when nothing is found.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = getSignaturesForWord(ambiguous_word, pos=pos)
    # Disambiguate the sense in context.
    context_sentence = context_sentence.split()
    return compare_overlaps_greedy(context_sentence, ss_sign)

def simple_lesk(context_sentence: str, ambiguous_word: str, pos: str = None,
                lemma=True, stem=False, hyperhypo=True, stop=True,
                context_is_lemmatized=False, nbest=False, keepscore=False,
                normalizescore=False, from_cache=True) -> "wn.Synset":
    """
    Simple Lesk sits between the original Lesk algorithm (1986) and Adapted
    Lesk (Banerjee and Pederson, 2002): it uses more signature words than the
    former and fewer than the latter.

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word, pos=pos)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized \
        else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)

def getSignaturesForWord(ambiguous_word: str, pos: str = None) -> dict:
    # Ensure that the POS is supported.
    pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None
    # If the specified POS isn't found but the word exists in WordNet with
    # another POS, fall back to all POS.
    if not wn.synsets(ambiguous_word, pos) and wn.synsets(ambiguous_word):
        pos = None
    # Holds the synset -> signature dictionary.
    ss_sign = {}
    for ss in wn.synsets(ambiguous_word, pos):
        ss_sign[ss] = synsetSignatures(ss)
    ss_sign = {ss: [lemmatize(s) for s in signature]
               for ss, signature in ss_sign.items()}
    return ss_sign

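# A minimal sketch of inspecting the signatures produced by
# getSignaturesForWord above; signature sizes vary with the installed WordNet
# version, so the printed numbers are only indicative.
def _demo_signatures_for_word():
    for ss, signature in getSignaturesForWord("bank", pos="n").items():
        print(ss.name(), len(signature), "signature words")
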
def simple_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False):
    # Ensure that the ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense

def original_lesk(context_sentence: str, ambiguous_word: str,
                  dictionary=None, from_cache=True) -> "wn.Synset":
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :return: A Synset for the estimated best sense.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WordNet definitions.
        dictionary = signatures(ambiguous_word, original_lesk=True, from_cache=from_cache)
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense

def signatures(ambiguous_word, pos=None, hyperhypo=True, adapted=False,
               remove_stopwords=True, to_lemmatize=True):
    # Ensure that the POS is supported.
    pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None
    # Holds the synset -> signature dictionary.
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word, pos=pos):
        signature = []
        # Adds the definition, example sentences and lemma_names.
        signature += word_tokenize(ss.definition())
        signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
        signature += ss.lemma_names()
        # Optional: includes lemma_names of hyper-/hyponyms.
        # (Collected in a separate variable so the hyperhypo flag is not shadowed.)
        if hyperhypo:
            hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                                ss.instance_hyponyms() + ss.instance_hypernyms())
            signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))
        # Optional: includes signatures from related senses as in Adapted Lesk,
        # i.e. lemma_names from holonyms, meronyms and similar_tos.
        if adapted:
            related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                                 ss.substance_holonyms() + ss.member_meronyms() +
                                 ss.part_meronyms() + ss.substance_meronyms() +
                                 ss.similar_tos())
            signature += set(chain(*[i.lemma_names() for i in related_senses]))
        # Optional: removes stopwords.
        if remove_stopwords:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if to_lemmatize:
            signature = [lemmatize(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures

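# A minimal sketch contrasting the plain and adapted signatures built by the
# signatures() function above; the adapted signature should be at least as
# large because it also pulls in holonyms, meronyms and similar_tos. The word
# "bank" is illustrative.
def _demo_adapted_vs_plain_signatures():
    plain = signatures("bank", pos="n", adapted=False)
    adapted = signatures("bank", pos="n", adapted=True)
    for ss in plain:
        print(ss.name(), len(plain[ss]), "->", len(adapted[ss]))
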
def assinatura_significado(self, lema, lematizar=True,
                           stem=False, stop=True,
                           extrair_relacao_semantica=False,
                           usar_exemplos=False):
    resultado = BaseOx.construir_objeto_unificado(self.base_ox, lema)

    if not resultado:
        resultado = {}

    lema = lemmatize(lema)

    assinaturas_significados = []  # (name, definition, examples)

    for pos in resultado.keys():
        todos_significados = resultado[pos].keys()

        indice = 1
        for sig in todos_significados:
            nome_sig = "%s.%s.%d" % (lema, pos, indice)
            indice += 1

            if usar_exemplos:
                exemplos = resultado[pos][sig]['exemplos']
            else:
                exemplos = []

            # name, definition, examples, signature
            definicao_corrente = [nome_sig, sig, exemplos, []]
            assinaturas_significados.append(definicao_corrente)

            # Putting the examples into the signature.
            definicao_corrente[len(definicao_corrente) - 1] += \
                self.assinatura_significado_aux(lema, pos, sig, exemplos)

            sig_secundarios = resultado[pos][sig]['def_secs']

            for ss in sig_secundarios:
                nome_sig_sec = "%s.%s.%d" % (lema, pos, indice)

                if usar_exemplos:
                    exemplos_secundarios = resultado[pos][sig]['def_secs'][ss]['exemplos']
                else:
                    exemplos_secundarios = []

                definicao_corrente_sec = [nome_sig_sec, ss, exemplos_secundarios, []]
                assinaturas_significados.append(definicao_corrente_sec)

                definicao_corrente_sec[len(definicao_corrente) - 1] += \
                    self.assinatura_significado_aux(lema, pos, ss, exemplos_secundarios)

                indice += 1

    for sig in assinaturas_significados:
        sig[3] = Util.normalizar_ctx(sig[3], stop=True, lematizar=True, stem=True)

    return [tuple(a) for a in assinaturas_significados]

def des_exemplos(self, ctx,
                 ambigua, pos, nbest=True,
                 lematizar=True, stem=True, stop=True,
                 normalizar_pont=True):
    cfgs = self.cfgs
    dir_bases = self.cfgs['caminho_bases']
    base_ox = self.base_ox
    rep_vet = self.rep_vetorial
    alvaro = Alvaro.Alvaro.INSTANCE

    dir_cache_rel_sinonimia = cfgs['caminho_bases'] + '/' + cfgs['oxford']['cache']['sinonimia']
    chave_cache_relacao_sin = "%s-%s.json" % (ambigua, pos)
    dir_obj = dir_cache_rel_sinonimia + '/' + chave_cache_relacao_sin

    if not chave_cache_relacao_sin in Util.list_arqs(dir_cache_rel_sinonimia):
        rel_definicoes = alvaro.construir_relacao_definicoes(ambigua, pos, fontes='oxford')
        Util.salvar_json(dir_obj, rel_definicoes)
    else:
        rel_definicoes = Util.abrir_json(dir_obj, criarsenaoexiste=False)

    res_des_tmp = []
    pontuacao_somada = 0.00

    for def_ambigua in rel_definicoes:
        uniao_palavras_sem_duplicatas = set()
        uniao_palavras_com_duplicatas = list()
        exemplos_blob = []
        palavras_tf = {}

        try:
            maximo_exemplos = self.cfgs['params_exps']['qtde_exemplos'][0]
            lista_exemplos = BaseOx.obter_atributo(ambigua, pos, def_ambigua, 'exemplos')
            # Adding lemmas.
            lista_exemplos.append(" ".join(BaseOx.obter_sins(ambigua, def_ambigua, pos)))
            # Adding the definition.
            lista_exemplos.append(def_ambigua)

            for ex in lista_exemplos[:maximo_exemplos]:
                ex_blob = TextBlob(ex)
                exemplos_blob.append(ex_blob)
                for token in ex_blob.words:
                    if Util.is_stop_word(token.lower()) == False:
                        token_lematizado = lemmatize(token)
                        uniao_palavras_sem_duplicatas.add(token_lematizado)
                        uniao_palavras_com_duplicatas.append(token_lematizado)
        except Exception as caminho:
            exemplos = []

        textblob_vocab = TextBlob(" ".join(uniao_palavras_com_duplicatas))

        palavras_latentes = []
        for p in textblob_vocab.word_counts:
            if textblob_vocab.word_counts[p] > 1:
                palavras_latentes.append(p)

        palavras_derivadas = []

        for p in uniao_palavras_sem_duplicatas:
            tf = alvaro.tf(p, textblob_vocab)
            palavras_tf[p] = tf

        pontuacao = 0.00

        for t in Util.tokenize(Util.resolver_en(ctx).lower()):
            try:
                pontuacao += palavras_tf[t]
            except:
                pontuacao += 0.00

        pontuacao_somada += pontuacao

        try:
            if normalizar_pont:
                reg_pont = pontuacao / sum(palavras_tf.values())
            else:
                reg_pont = pontuacao
        except ZeroDivisionError as zde:
            reg_pont = 0.00

            if sins_defcaminho:
                lista_exemplos.append(" ".join(sins_defcaminho))
            # Adding the definition.
            lista_exemplos.append(def_caminho)

            for ex_iter in lista_exemplos[:maximo_exemplos]:
                ex = ex_iter
                ex_blob = TextBlob(ex)
                exemplos_blob.append(ex_blob)
                for token in ex_blob.words:
                    if Util.is_stop_word(token.lower()) == False:
                        token_lematizado = lemmatize(token)
                        uniao_palavras_sem_duplicatas.add(token_lematizado)
                        uniao_palavras_com_duplicatas.append(token_lematizado)
        except Exception as caminho:
            import traceback
            traceback.print_stack()
            traceback.print_exc()
            exemplos = []

        tb_vocab_duplicatas = TextBlob(" ".join(uniao_palavras_com_duplicatas))

def wsd(summpath, wsdpath):
    summpathlist = os.listdir(summpath)
    for idp, dirpath in enumerate(summpathlist[0:1]):
        print(idp)
        '''
        idp=0
        dirpath=summpathlist[idp]
        '''
        comppath = summpath + dirpath + '/'
        comppathlist = os.listdir(comppath)
        for idc, compathdir in enumerate(comppathlist):
            '''
            idc=0
            compathdir=comppathlist[idc]
            '''
            fpath = comppath + compathdir
            f = open(fpath, 'r+', encoding='utf-8')
            text = f.readlines()
            text = [re.sub(r'^\s+|\s+$', '', x) for x in text if len(x) > 3]
            nertext = []
            sentencewsd = copy.deepcopy(text)
            for ids, sentence in enumerate(text):
                '''
                ids=0
                sentence=text[ids]
                '''
                word = sentence.split()
                word = [re.sub(r'[\,]', '', x) for x in word]
                word = [re.sub(r'(?<=\w)[\.]', '', x) for x in word]
                ner = st.tag(word)
                ner = [x[0] for x in ner if x[1] == 'O']
                ner = [x for x in ner if x not in stops]
                sentence = ' '.join(ner)
                nertext.append(sentence)
            for zipf in zipf_freq:
                '''
                zipf=zipf_freq[0]
                '''
                textwsd = []
                for ids, sentence in enumerate(nertext):
                    '''
                    ids=0
                    sentence=text[ids]
                    '''
                    ambiguity = disambiguate(sentence, adapted_lesk, keepLemmas=True, zipf=zipf)
                    for idy, syn in enumerate(ambiguity):
                        '''
                        idy=3
                        syn=ambiguity[idy]
                        '''
                        if syn[2] is not None:
                            syn_lemma = syn[2].lemma_names()
                            syn_lemma = [[zipf_frequency(x, 'en'), x] for x in syn_lemma]
                            syn_lemma = sorted(syn_lemma, reverse=True)
                            if syn_lemma[0][0] == 0:
                                syn_lemma = [[len(x[1]), x[1]] for x in syn_lemma]
                                syn_lemma = sorted(syn_lemma, reverse=False)
                            if lemmatize(syn[0].lower()) != syn_lemma[0][1]:
                                sentencewsd[ids] = re.sub(r'' + syn[0], syn_lemma[0][1], sentencewsd[ids])
                    textwsd.append(sentencewsd[ids])
                outDirectory = wsdpath + dirpath + '/'
                if not os.path.exists(outDirectory):
                    os.makedirs(outDirectory)
                fout = open(outDirectory + str(zipf) + '-' + compathdir, 'w', encoding='utf-8')
                fout.writelines(textwsd)
                fout.flush()
                fout.close()

def max_similarity(context_sentence, ambiguous_word, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None,
                   best=True, data=None):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible
    synsets of all words in the context sentence and the possible synsets of
    the ambiguous word (see http://goo.gl/XMq2BI):

        \argmax_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet, return None.
    if not wn.synsets(ambiguous_word):
        print("no synsets found in wordnet")
        return None

    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]

    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:  # Older NLTK versions expose pos as an attribute, not a method.
            if pos and pos != str(i.pos):
                continue
        res = 0
        for j in context_sentence:
            # If j is already disambiguated (e.g. 'bank.n.01'), use that synset only.
            if re.search(r'[a-z]+\.[nvsar]\.[0-9]{2}', j) is not None:
                mysynsets = [wn.synset(j)]
            else:
                mysynsets = wn.synsets(j)
            sims = [0]
            for k in mysynsets:
                sims.append(sim(i, k, option, data))
            res += max(sims)
        result[i] = res

    if len(result) == 0:
        return None

    if option in ["res", "resnik"]:  # Lower score = more similar.
        result = sorted([(v, k) for k, v in result.items()])
    else:  # Higher score = more similar.
        result = sorted([(v, k) for k, v in result.items()], reverse=True)

    if best:
        return result[0][1]
    return result

def lemmatization(stem_array):
    lemmatized = []
    for stems in stem_array:
        lemmas = [lemmatize(x) for x in stems if x not in stop_words]
        lemmatized.append(lemmas)
    return lemmatized

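# A minimal sketch for the lemmatization helper above, assuming lemmatize and a
# stop_words collection are defined in this module; the token lists are
# illustrative placeholders.
def _demo_lemmatization():
    batches = [["dogs", "running", "the"], ["geese", "flew"]]
    print(lemmatization(batches))
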
def simple_signature(word, ss, lexicon):
    """
    Returns the signature of a sense, built from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    and filtered against the requested lexicon.
    """
    signature = [wn.morphy(word), word, ss.name().split('.')[0]] + findRelatedForms(ss)

    ss_definition = synset_properties(ss, 'definition')
    signature += word_tokenize(ss_definition)

    ss_lemma_names = synset_properties(ss, 'lemma_names')
    signature += ss_lemma_names

    ss_hyponyms = synset_properties(ss, 'hyponyms')
    # ss_hypernyms = synset_properties(ss, 'hypernyms')
    ss_hypohypernyms = ss_hyponyms  # + ss_hypernyms
    signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

    # Includes holonyms.
    ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
    ss_part_holonyms = synset_properties(ss, 'part_holonyms')
    ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
    # Includes meronyms.
    ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
    ss_part_meronyms = synset_properties(ss, 'part_meronyms')
    ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
    # Includes similar_tos.
    # ss_simto = synset_properties(ss, 'similar_tos')

    related_senses = list(set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
                              ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms))

    signature += list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                          for i in related_senses])
                       if j not in EN_STOPWORDS])

    # Removes non-strings and stopwords.
    signature = [i for i in signature if type(i) == str]
    signature = [i for i in signature if i.lower() not in EN_STOPWORDS]
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(i) for i in signature]
    # Matching exact words may cause sparsity, so optional matching for stems.
    # signature = [porter.stem(i) for i in signature]

    # Keeps only signature words that agree with the requested lexicon polarity.
    ss_sign = set(signature)
    to_remove = []
    for word in ss_sign:
        morphy = wn.morphy(word)
        if lexicon == '+-':
            if word not in lexiconDict and morphy not in lexiconDict:
                to_remove.append(word)
        else:
            if word in lexiconDict:
                if lexiconDict[word] != lexicon:
                    to_remove.append(word)
            elif (morphy is not None) and morphy in lexiconDict:
                if lexiconDict[morphy] != lexicon:
                    to_remove.append(word)
            else:
                to_remove.append(word)
    to_remove = set(to_remove)
    ss_sign = ss_sign - to_remove
    signature = list(ss_sign)

    sim = []
    for ss in ss_sign:
        synsets = wn.synsets(ss)
        if len(synsets) > 0:
            if synsets[0] not in sim:
                synset = synsets[0]
                pos = synset.pos()
                morphy = wn.morphy(ss, pos)
                if ss in lexiconDict or ss in disgustingWords:
                    sim.append(synset)
                elif (morphy is not None) and morphy in lexiconDict:
                    sim.append(synset)
                else:
                    signature.remove(ss)
        else:
            signature.remove(ss)
    return [signature, sim]

def cosine_lesk_inventario_estendido(context_sentence, ambiguous_word,
                                     pos=None, lemma=True, stem=True, hyperhypo=True,
                                     stop=True, context_is_lemmatized=False,
                                     nbest=False, synsets_signatures=None, busca_ampla=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that the ambiguous word is a lemma.
    if lemma:
        ambiguous_word = lemmatize(ambiguous_word)

    # If ambiguous word not in WordNet, return None.
    # if not wn.synsets(ambiguous_word):
    if not criar_inventario_des_wn(ambiguous_word, busca_ampla=busca_ampla):
        return None

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []

    chave_assinatura = "%s.%s.%s.%s.%s.%s" % (ambiguous_word, pos, lemma, stem,
                                              hyperhypo, busca_ampla)

    if not chave_assinatura in DesWordnet.cache_assinaturas:
        synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                              hyperhypo, busca_ampla=busca_ampla)
        DesWordnet.cache_assinaturas[chave_assinatura] = []
        for ss, signature in synsets_signatures.items():
            # Lowercase and replace "_" with spaces.
            signature = " ".join(map(str, signature)).lower().replace("_", " ")
            # Removes punctuation.
            signature = [i for i in Util.word_tokenize(signature)
                         if i not in string.punctuation]
            signature = Util.normalizar_ctx(signature, stop=stop, lematizar=lemma, stem=stem)
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))
            DesWordnet.cache_assinaturas[chave_assinatura].append((ss, signature))
    else:
        synsets_signatures = DesWordnet.cache_assinaturas[chave_assinatura]
        for ss, signature in synsets_signatures:
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]