Example #1
File: lesk.py Project: dav009/pywsd
def cosine_lesk(context_sentence, ambiguous_word, stem=False, stop=True,
                nbest=False):
  """ 
  In line with vector space models, we can use cosine to calculate overlaps
  instead of using raw overlap counts. Essentially, the idea of using 
  signatures (aka 'sense paraphrases') is lesk-like.
  """
  import string
  from nltk.tokenize import word_tokenize
  from nltk.corpus import stopwords
  from nltk.stem import PorterStemmer
  from cosine import cosine_similarity as cos_sim

  porter = PorterStemmer()
  # `simple_signature` is defined elsewhere in pywsd's lesk.py.
  synsets_signatures = simple_signature(ambiguous_word, stem=stem, stop=stop)
  
  scores = []
  for ss, signature in synsets_signatures.items():
    # Lowercase and replace "_" with spaces.
    signature = " ".join(map(str, signature)).lower().replace("_", " ")
    # Removes punctuation.
  signature = [i for i in word_tokenize(signature)
               if i not in string.punctuation]
    # Optional: remove stopwords.
    if stop:
      signature = [i for i in signature if i not in stopwords.words('english')]
    # Optional: stem the tokens.
    if stem:
      signature = [porter.stem(i) for i in signature]
    scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

  if not nbest:
    return sorted(scores, reverse=True)[0][1]
  else:
    return [(j,i) for i,j in sorted(scores, reverse=True)]
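The `cos_sim` helper imported from the local `cosine` module above is not shown on this page. Below is a minimal bag-of-words sketch of such a cosine similarity, assuming whitespace tokens and raw counts; it is a stand-in for illustration, not pywsd's confirmed implementation:

from collections import Counter
from math import sqrt

def cosine_similarity(query, document):
    # Hypothetical stand-in for pywsd's `cosine.cosine_similarity`:
    # cosine of the angle between token-count vectors of the two strings.
    a = Counter(query.split())
    b = Counter(document.split())
    # Dot product over the shared vocabulary only.
    dot = sum(a[t] * b[t] for t in a.keys() & b.keys())
    norm = sqrt(sum(v * v for v in a.values())) * sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0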
Example #2
def cosine_lesk(context_sentence, ambiguous_word, stem=False, stop=True,
                nbest=False):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    
    
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    synsets_signatures = simple_signature(ambiguous_word, stem=stem, stop=stop)
    
    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature)
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in stopwords.words('english')]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))
        
    # Score every sense before ranking; highest cosine similarity wins.
    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
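A quick usage sketch, assuming NLTK and its WordNet data are installed; the sentence, target word, and expected sense below are illustrative only:

# Hypothetical call: disambiguate "bank" in a financial context.
sense = cosine_lesk("I went to the bank to deposit my money", "bank")
if sense is not None:
    # e.g. a money-related synset such as
    # Synset('depository_financial_institution.n.01')
    print(sense, sense.definition())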
Example #3
def cosine_lesk(
    context_sentence,
    ambiguous_word,
    pos=None,
    lemma=True,
    stem=True,
    hyperhypo=True,
    stop=True,
    context_is_lemmatized=False,
    nbest=False,
):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # `wn`, `lemmatize`, `lemmatize_sentence`, `word_tokenize`, `string`,
    # `EN_STOPWORDS`, `porter`, and `cos_sim` are module-level names in pywsd
    # (not shown in this excerpt).
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    # Score every sense before ranking; highest cosine similarity wins.
    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
Example #4
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                          hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature)
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    # Score every sense before ranking; highest cosine similarity wins.
    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
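All of the variants above delegate signature construction to `simple_signature`, which is defined elsewhere in pywsd and not shown on this page. Below is a rough sketch of what such a builder might do, assuming each synset's signature is drawn from its gloss, usage examples, and lemma names, widened with hypernym/hyponym lemmas when `hyperhypo` is set; the exact fields, and the elided `lemma`/`stem` handling, are assumptions:

from nltk.corpus import wordnet as wn

def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True):
    # Hypothetical sketch: map each candidate synset to its signature tokens.
    # (`lemma` and `stem` post-processing is elided here; the callers above
    # apply those steps to the joined signature themselves.)
    signatures = {}
    for ss in wn.synsets(ambiguous_word, pos=pos):
        tokens = ss.definition().split()       # gloss words
        for example in ss.examples():          # usage examples
            tokens += example.split()
        tokens += ss.lemma_names()             # surface lemmas of this sense
        if hyperhypo:                          # widen with related senses
            for related in ss.hypernyms() + ss.hyponyms():
                tokens += related.lemma_names()
        signatures[ss] = tokens
    return signatures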