Code example #1
from itertools import chain

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# lemmatize is pywsd's own WordNet-based lemmatizer; EN_STOPWORDS mirrors
# the stopword list pywsd builds from NLTK.
from pywsd.utils import lemmatize

EN_STOPWORDS = stopwords.words('english')


def synsetSignatures(ss: "wn.Synset"):
    signature = []
    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                        ss.instance_hyponyms() + ss.instance_hypernyms())
    signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk.
    # Includes lemma_names from holonyms, meronyms and similar_tos
    related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                         ss.substance_holonyms() + ss.member_meronyms() +
                         ss.part_meronyms() + ss.substance_meronyms() +
                         ss.similar_tos())
    signature += set(chain(*[i.lemma_names() for i in related_senses]))
    # Lowercase.
    signature = set(s.lower() for s in signature)

    # Removes stopwords.
    signature = signature.difference(EN_STOPWORDS)

    # A lemmatized signature is preferred over a stemmed one.
    signature = [lemmatize(s) for s in signature]
    # Keep only the unique bag-of-words
    return set(signature)
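
A minimal usage sketch for the snippet above (assuming the NLTK wordnet, punkt and stopwords data are downloaded, plus the imports and definitions shown with the function; the word "bank" is only an illustration):

from nltk.corpus import wordnet as wn

# Take the first WordNet sense of "bank" and inspect its Lesk signature.
ss = wn.synsets('bank')[0]
print(sorted(synsetSignatures(ss))[:10])  # a few lowercased, lemmatized signature words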
Code example #2
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# In pywsd, sim() is defined alongside this function in similarity.py and
# lemmatize() lives in pywsd.utils; imported here so the snippet stands alone.
from pywsd.similarity import sim
from pywsd.utils import lemmatize


def max_similarity(context_sentence: str, ambiguous_word: str, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True) -> "wn.Synset":
    """
    Perform WSD by maximizing the sum of maximum similarities between the
    possible synsets of all words in the context sentence and the possible
    synsets of the ambiguous word (see http://goo.gl/XMq2BI):

        argmax_{synset(a)} \sum_{i=1}^{n} max_{synset(i)} sim(i, a)

    :param context_sentence: String, a sentence.
    :param ambiguous_word: String, a single word.
    :return: If best, returns only the best Synset; otherwise a list of
        (score, Synset) tuples sorted with the most similar first.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word, pos=pos):
        result[i] = 0
        for j in context_sentence:
            _result = [0]
            for k in wn.synsets(j):
                _result.append(sim(i,k,option))
            result[i] += max(_result)

    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)

    if not result:  # e.g. the POS filter matched no synsets.
        return None if best else result
    return result[0][1] if best else result
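
A usage sketch (assuming the function and imports above, plus the NLTK wordnet data; the sentence and target word are only illustrations):

sent = "I went to the bank to deposit my money"
best_sense = max_similarity(sent, "bank", option="path")
print(best_sense)  # the wn.Synset for "bank" with the highest summed path similarity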
Code example #3
File: lesk.py  Project: gauravjuvekar/pywsd
from itertools import chain

from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

from pywsd.utils import lemmatize

EN_STOPWORDS = stopwords.words('english')


def synset_signatures(ss,
                      hyperhypo=True,
                      adapted=False,
                      remove_stopwords=True,
                      to_lemmatize=True,
                      remove_numbers=True,
                      lowercase=True,
                      original_lesk=False,
                      from_cache=True):
    """
    :param ss: A WordNet synset.
    :type ss: nltk.corpus.wordnet.Synset
    """
    if from_cache:
        # synset_signatures_from_cache is defined elsewhere in pywsd's lesk.py
        # and returns a previously computed signature for this synset and flags.
        return synset_signatures_from_cache(ss, hyperhypo, adapted,
                                            original_lesk)
    # Collects the signatures from WordNet.
    signature = []
    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())
    # If the original lesk signature is requested, skip the other signatures.
    if original_lesk:
        return set(signature)
    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    if hyperhypo:
        hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                            ss.instance_hyponyms() + ss.instance_hypernyms())
        signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk.
    if adapted:
        # Includes lemma_names from holonyms, meronyms and similar_tos
        related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                             ss.substance_holonyms() + ss.member_meronyms() +
                             ss.part_meronyms() + ss.substance_meronyms() +
                             ss.similar_tos())
        signature += set(chain(*[i.lemma_names() for i in related_senses]))

    # Lowercase.
    signature = set(s.lower() for s in signature) if lowercase else signature
    # Removes stopwords.
    signature = set(signature).difference(
        EN_STOPWORDS) if remove_stopwords else signature

    # A lemmatized signature is preferred over a stemmed one.
    if to_lemmatize:
        signature = [
            lemmatize(s)
            for s in signature
            # Throw a token away only when both remove_numbers is set
            # and the token is a digit string.
            if not (remove_numbers and s.isdigit())
        ]
    # Keep only the unique bag-of-words
    return set(signature)
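
A usage sketch contrasting the flags (assuming the imports above; from_cache=False forces a fresh computation instead of the cached lookup):

ss = wn.synsets('bank')[0]
plain = synset_signatures(ss, hyperhypo=False, adapted=False, from_cache=False)
expanded = synset_signatures(ss, hyperhypo=True, adapted=True, from_cache=False)
# The adapted-Lesk signature also pulls in lemmas from hyper-/hyponyms,
# holonyms, meronyms and similar_tos, so it is at least as large.
print(len(plain), len(expanded))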