Ejemplo n.º 1
0
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, from_cache=True):
    """
    Pick the sense whose signature is most cosine-similar to the context.

    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.

    :param context_sentence: Context string for the ambiguous word. It is
        lemmatized with ``lemmatize_sentence`` unless
        ``context_is_lemmatized`` is true, in which case only its whitespace
        is normalized.
    :param ambiguous_word: Word to disambiguate; it is always lemmatized
        before the WordNet lookup.
    :param pos: Forwarded unchanged to ``simple_signatures``.
    :param lemma: Forwarded unchanged to ``simple_signatures``.
    :param stem: Forwarded unchanged to ``simple_signatures``.
    :param hyperhypo: Forwarded unchanged to ``simple_signatures``.
    :param stop: Forwarded unchanged to ``simple_signatures``.
    :param context_is_lemmatized: Skip lemmatization of the context.
    :param nbest: Return every scored candidate instead of only the best.
    :param from_cache: Forwarded unchanged to ``simple_signatures``.
    :return: If ``nbest`` is false, the key of the best-scoring signature
        (presumably a WordNet synset -- confirm against
        ``simple_signatures``), or ``None`` when the word has no synsets or
        no signature is available. If ``nbest`` is true, the list of
        ``(score, key)`` tuples sorted in descending order (possibly empty).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    if nbest:
        return scores
    # The signature mapping can be empty even though the word is in WordNet;
    # report "no sense found" the same way as the early exit above instead
    # of raising IndexError.
    return scores[0][1] if scores else None
Ejemplo n.º 2
0
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """
    Pick the sense whose signature is most cosine-similar to the context.

    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.

    :param context_sentence: Context string for the ambiguous word. It is
        lemmatized with ``lemmatize_sentence`` unless
        ``context_is_lemmatized`` is true, in which case only its whitespace
        is normalized.
    :param ambiguous_word: Word to disambiguate; it is always lemmatized
        before the WordNet lookup.
    :param pos: Forwarded unchanged to ``simple_signature``.
    :param lemma: Forwarded to ``simple_signature``; when truthy the
        signature tokens are also lemmatized here.
    :param stem: Forwarded to ``simple_signature``; when truthy the
        signature tokens are also stemmed with ``porter``.
    :param hyperhypo: Forwarded unchanged to ``simple_signature``.
    :param stop: When truthy, tokens found in ``EN_STOPWORDS`` are dropped
        from each signature.
    :param context_is_lemmatized: Skip lemmatization of the context.
    :param nbest: Return every scored candidate instead of only the best.
    :return: If ``nbest`` is false, the key of the best-scoring signature
        (presumably a WordNet synset -- confirm against
        ``simple_signature``), or ``None`` when the word has no synsets or
        no signature is available. If ``nbest`` is true, a list of
        ``(key, score)`` tuples ordered from best to worst (possibly empty).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                          hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature)
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    # Rank only after every sense has been scored; ranking inside the loop
    # would return the first sense regardless of its score.
    ranked = sorted(scores, reverse=True)
    if nbest:
        return [(j, i) for i, j in ranked]
    # No signature at all: report "no sense found" like the early exit above.
    return ranked[0][1] if ranked else None
def cosine_lesk_inventario_estendido(context_sentence, ambiguous_word,
                                     pos=None, lemma=True, stem=True,
                                     hyperhypo=True, stop=True,
                                     context_is_lemmatized=False,
                                     nbest=False, synsets_signatures=None,
                                     busca_ampla=False):
    """
    Cosine-Lesk disambiguation over the extended sense inventory.

    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.

    Normalized signatures are memoized in ``DesWordnet.cache_assinaturas``
    under a key built from every argument that influences them.

    :param context_sentence: Context string for the ambiguous word. It is
        lemmatized with ``lemmatize_sentence`` unless
        ``context_is_lemmatized`` is true, in which case only its whitespace
        is normalized.
    :param ambiguous_word: Word to disambiguate; lemmatized first when
        ``lemma`` is truthy.
    :param pos: Forwarded unchanged to ``simple_signature``.
    :param lemma: Forwarded to ``simple_signature`` and to
        ``Util.normalizar_ctx`` (as ``lematizar``).
    :param stem: Forwarded to ``simple_signature`` and to
        ``Util.normalizar_ctx``.
    :param hyperhypo: Forwarded unchanged to ``simple_signature``.
    :param stop: Forwarded to ``Util.normalizar_ctx``.
    :param context_is_lemmatized: Skip lemmatization of the context.
    :param nbest: Return every scored candidate instead of only the best.
    :param synsets_signatures: Ignored. The signatures are always computed
        or read from the cache; the parameter is kept so existing callers
        keep working.
    :param busca_ampla: Forwarded to ``criar_inventario_des_wn`` and
        ``simple_signature``.
    :return: ``None`` when ``criar_inventario_des_wn`` yields nothing for
        the word. Otherwise, if ``nbest`` is false, the key of the
        best-scoring signature (or ``None`` when there are no signatures);
        if ``nbest`` is true, a list of ``(key, score)`` tuples ordered
        from best to worst (possibly empty).
    """
    # Ensure that ambiguous word is a lemma.
    if lemma:
        ambiguous_word = lemmatize(ambiguous_word)

    # If the ambiguous word has no entry in the inventory return None
    if not criar_inventario_des_wn(ambiguous_word, busca_ampla=busca_ampla):
        return None

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    # `stop` changes the normalized tokens, so it must be part of the key;
    # otherwise a call with a different `stop` would reuse stale signatures.
    chave_assinatura = "%s.%s.%s.%s.%s.%s.%s" % (ambiguous_word, pos, lemma,
                                                 stem, hyperhypo, stop,
                                                 busca_ampla)

    cache = DesWordnet.cache_assinaturas
    if chave_assinatura not in cache:
        assinaturas = []
        sinais = simple_signature(ambiguous_word,
                                  pos,
                                  lemma,
                                  stem,
                                  hyperhypo,
                                  busca_ampla=busca_ampla)

        for ss, signature in sinais.items():
            # Lowercase and replace "_" with spaces.
            signature = " ".join(map(str, signature)).lower().replace("_", " ")
            # Removes punctuation.
            signature = [i for i in Util.word_tokenize(signature)
                         if i not in string.punctuation]

            signature = Util.normalizar_ctx(signature,
                                            stop=stop,
                                            lematizar=lemma,
                                            stem=stem)

            assinaturas.append((ss, signature))

        # Publish the entry only once it is complete, so an exception while
        # normalizing cannot leave a truncated list in the cache.
        cache[chave_assinatura] = assinaturas

    scores = [(cos_sim(context_sentence, " ".join(signature)), ss)
              for ss, signature in cache[chave_assinatura]]

    ranked = sorted(scores, reverse=True)
    if nbest:
        return [(j, i) for i, j in ranked]
    # No signature at all: report "no sense found" like the early exit above.
    return ranked[0][1] if ranked else None