Code example #1
def max_similarity(context_sentence, ambiguous_word, option="path", 
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between the possible
    synsets of all words in the context sentence and the possible synsets of the
    ambiguous word (see http://goo.gl/XMq2BI):
    \arg\max_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue 
        result[i] = sum(max([sim(i,k,option) for k in wn.synsets(j)]+[0]) \
                        for j in context_sentence)
    
    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)
    return result[0][1] if best else result
Code example #2
def max_similarity(context_sentence: str, ambiguous_word: str, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True) -> "wn.Synset":
    """
    Perform WSD by maximizing the sum of maximum similarity between the possible
    synsets of all words in the context sentence and the possible synsets of the
    ambiguous word (see http://goo.gl/XMq2BI):
    \arg\max_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)

    :param context_sentence: String, a sentence.
    :param ambiguous_word: String, a single word.
    :return: If best is True, returns only the best Synset; otherwise returns a
        sorted list of (score, Synset) tuples.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word, pos=pos):
        result[i] = 0
        for j in context_sentence:
            _result = [0]
            for k in wn.synsets(j):
                _result.append(sim(i,k,option))
            result[i] += max(_result)

    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)

    return result[0][1] if best else result
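A minimal usage sketch for the max_similarity function above. It assumes pywsd is installed with the NLTK WordNet data available and that the function is exposed as pywsd.similarity.max_similarity; the example sentence and the sense shown in the comment are illustrative, not guaranteed outputs.

# Hedged usage sketch; assumes pywsd and the NLTK 'wordnet' corpus are installed.
from pywsd.similarity import max_similarity

sent = 'I went to the bank to deposit my money'
# Pick the sense of 'bank' whose summed maximum similarity to the context words is highest.
best_sense = max_similarity(sent, 'bank', option='path', pos='n')
print(best_sense)  # e.g. Synset('bank.n.01') (illustrative)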
Code example #3
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm
    described in Banerjee and Pedersen (2002). It uses the lexical
    items from semantically related senses within the WordNet
    hierarchies to generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(
            set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
                ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms +
                ss_simto))

        signature = list([
            j for j in chain(
                *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS
        ])

        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
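A short usage sketch for adapted_lesk, assuming the pywsd package layout (pywsd.lesk) and downloaded WordNet data; the sense in the comment is illustrative.

# Hedged usage sketch; assumes adapted_lesk is importable from pywsd.lesk.
from pywsd.lesk import adapted_lesk

context = 'I went to the bank to deposit my money'
sense = adapted_lesk(context, 'bank', pos='n')
if sense is not None:
    print(sense, '-', sense.definition())  # e.g. Synset('depository_financial_institution.n.01') (illustrative)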
Code example #4
File: lesk.py  Project: gauravjuvekar/pywsd
def synset_signatures(ss,
                      hyperhypo=True,
                      adapted=False,
                      remove_stopwords=True,
                      to_lemmatize=True,
                      remove_numbers=True,
                      lowercase=True,
                      original_lesk=False,
                      from_cache=True):
    """
    :param ss: A WordNet synset.
    :type ss: nltk.corpus.wordnet.Synset
    """
    if from_cache:
        return synset_signatures_from_cache(ss, hyperhypo, adapted,
                                            original_lesk)
    # Collects the signatures from WordNet.
    signature = []
    # Adds the definition tokens.
    signature += word_tokenize(ss.definition())
    # If the original lesk signature is requested, skip the other signatures.
    if original_lesk:
        return set(signature)
    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    if hyperhypo:
        hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                            ss.instance_hyponyms() + ss.instance_hypernyms())
        signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk.
    if adapted:
        # Includes lemma_names from holonyms, meronyms and similar_tos
        related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() + \
                             ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() + \
                             ss.similar_tos())
        signature += set(chain(*[i.lemma_names() for i in related_senses]))

    # Lowercase.
    signature = set(s.lower() for s in signature) if lowercase else signature
    # Removes stopwords.
    signature = set(signature).difference(
        EN_STOPWORDS) if remove_stopwords else signature

    # Lemmatized context is preferred over stemmed context.
    if to_lemmatize:
        signature = [
            lemmatize(s) for s in signature
            # We only throw away s if both remove_numbers and s.isdigit() are true.
            if not (remove_numbers and s.isdigit())
        ]
    # Keep only the unique bag-of-words
    return set(signature)
Code example #5
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk sits between the original Lesk algorithm (1986) and
    Adapted Lesk (Banerjee and Pedersen, 2002): it uses more signature
    words than the former and fewer than the latter.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
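A usage sketch for simple_lesk under the same assumptions (pywsd.lesk importable, WordNet data present); output is illustrative.

# Hedged usage sketch; assumes simple_lesk is importable from pywsd.lesk.
from pywsd.lesk import simple_lesk

sense = simple_lesk('I went to the bank to deposit my money', 'bank', pos='n')
print(sense.definition() if sense else None)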
Code example #6
def synsetSignatures(ss: "wn.Synset"):
    signature = []
    # Adds the definition, example sentences and lemma_names.
    signature += word_tokenize(ss.definition())
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                        ss.instance_hyponyms() + ss.instance_hypernyms())
    signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk.
    # Includes lemma_names from holonyms, meronyms and similar_tos
    related_senses = set(ss.member_holonyms() + ss.part_holonyms() + ss.substance_holonyms() + \
                             ss.member_meronyms() + ss.part_meronyms() + ss.substance_meronyms() + \
                             ss.similar_tos())
    signature += set(chain(*[i.lemma_names() for i in related_senses]))
    # Lowercase.
    signature = set(s.lower() for s in signature)

    # Removes stopwords.
    signature = set(signature).difference(EN_STOPWORDS)

    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(s) for s in signature]
    # Keep only the unique bag-of-words
    return set(signature)
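To see what such a signature looks like, here is a hedged sketch that builds the same kind of bag-of-words for one synset directly with NLTK (it assumes the NLTK wordnet, stopwords and punkt data are downloaded, and skips the module-level lemmatize/EN_STOPWORDS helpers the function above relies on).

# Hedged sketch of a synset 'signature' built with plain NLTK calls.
from itertools import chain
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

ss = wn.synset('bank.n.01')
signature = []
signature += word_tokenize(ss.definition())                                      # definition tokens
signature += chain(*[word_tokenize(eg) for eg in ss.examples()])                 # example sentences
signature += ss.lemma_names()                                                    # lemma names
signature += chain(*[h.lemma_names() for h in ss.hypernyms() + ss.hyponyms()])   # hyper-/hyponyms
signature = {s.lower() for s in signature} - set(stopwords.words('english'))
print(sorted(signature)[:10])  # a sample of the signature tokens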
Code example #7
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, from_cache=True):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem, hyperhypo, stop,
                                from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores if nbest else scores[0][1]
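A hedged usage sketch for cosine_lesk, assuming it is importable from pywsd.lesk and WordNet data is available.

# Hedged usage sketch; assumes cosine_lesk is importable from pywsd.lesk.
from pywsd.lesk import cosine_lesk

sense = cosine_lesk('I went to the bank to deposit my money', 'bank')
print(sense)  # the Synset whose signature has the highest cosine similarity to the context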
Code example #8
def adapted_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False,
                from_cache=True):
    """
    This function is the implementation of the Adapted Lesk algorithm
    described in Banerjee and Pedersen (2002). It uses the lexical
    items from semantically related senses within the WordNet
    hierarchies to generate more lexical items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = signatures(ambiguous_word, pos=pos, hyperhypo=hyperhypo, adapted=True,
                         remove_stopwords=stop, to_lemmatize=lemma,
                         remove_numbers=True, lowercase=True, to_stem=stem,
                         from_cache=from_cache)

    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)
Code example #9
def cosine_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                          hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) \
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
Code example #10
File: lesk.py  Project: nnishu2901/emotion_tweets
def simple_signature(ss, stem=False):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    signature = []
    # print("ss: ",ss)
    # Includes definition.
    ss_definition = synset_properties(ss, 'definition')
    signature += word_tokenize(ss_definition)
    # Includes examples
    ss_examples = synset_properties(ss, 'examples')
    signature += list(chain(*[i.split() for i in ss_examples]))
    # Includes lemma_names.
    ss_lemma_names = synset_properties(ss, 'lemma_names')
    signature += ss_lemma_names
    # Optional: includes lemma_names of hypernyms and hyponyms.

    ss_hyponyms = synset_properties(ss, 'hyponyms')
    ss_hypernyms = synset_properties(ss, 'hypernyms')
    ss_hypohypernyms = ss_hypernyms + ss_hyponyms
    signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
    # Removes stopwords.
    signature = [i for i in signature if i.lower() not in EN_STOPWORDS]
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(i) for i in signature]
    # Matching exact words may cause sparsity, so optional matching for stems.
    if stem == True:
        signature = [porter.stem(i) for i in signature]

    return signature
Code example #11
    def normalizar_ctx(lista_ctx, stop=True, lematizar=True, stem=True):
        if stop:
            lista_ctx = [
                i for i in lista_ctx if i not in stopwords.words('english')
            ]
        if lematizar:
            lista_ctx = [lemmatize(i) for i in lista_ctx]
        if stem:
            lista_ctx = [porter.stem(i) for i in lista_ctx]

        return lista_ctx
Code example #12
def original_lesk(context_sentence, ambiguous_word, dictionary=None, from_cache=True):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definitions of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary: # If dictionary is not provided, use the WN definitions.
        dictionary = signatures(ambiguous_word, original_lesk=True, from_cache=from_cache)
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense
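A hedged usage sketch for original_lesk: with no dictionary supplied it falls back to the WordNet definitions, as the code above shows. Assumes original_lesk is importable from pywsd.lesk.

# Hedged usage sketch; assumes original_lesk is importable from pywsd.lesk.
from pywsd.lesk import original_lesk

sense = original_lesk('I went to the bank to deposit my money', 'bank')
print(sense)
# A custom {sense: definition} dictionary can be passed instead of the WordNet fallback.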
Code example #13
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definitions of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WN definitions.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense
Code example #14
File: combined.py  Project: chehaknayar/combining
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False, \
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except:
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition.
        ss_definition = synset_properties(ss, 'definition')
        # Tokenize the definition string; appending the raw string would add single characters.
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(
                chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop == True:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem == True:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature

    return synsets_signatures
Code example #15
File: pywsdmod.py  Project: joesdesk/themecrafter
def lemmatize_sentence(sentence: str, neverstem=False, keepWordPOS=False,
                       tokenizer=word_tokenize, postagger=pos_tag,
                       lemmatizer=wnl, stemmer=porter) -> list:

    words, lemmas, poss = [], [], []
    for word, pos_ in postagger(tokenizer(sentence)):
        pos = penn2morphy(pos_)
        lemmas.append(lemmatize(word.lower(), pos, neverstem,
                                lemmatizer, stemmer))
        poss.append(pos_)
        words.append(word)

    if keepWordPOS:
        return words, lemmas, poss

    return lemmas
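A hedged usage sketch for lemmatize_sentence; the exact output depends on the POS tagger and lemmatizer, so the list in the comment is illustrative. Assumes the function is importable from pywsd.utils.

# Hedged usage sketch; assumes lemmatize_sentence is importable from pywsd.utils.
from pywsd.utils import lemmatize_sentence

print(lemmatize_sentence('The dogs are barking at the cats'))
# e.g. ['the', 'dog', 'be', 'bark', 'at', 'the', 'cat'] (illustrative)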
Code example #16
def adapted_lesk(context_sentence: str,
                 ambiguous_word: str,
                 pos: str = None) -> "wn.Synset":

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)

    # If ambiguous word not in WordNet return None (wn.synsets returns an empty list, not None).
    if not wn.synsets(ambiguous_word):
        return None

    # Get the signatures for each synset.
    ss_sign = getSignaturesForWord(ambiguous_word, pos=pos)

    # Disambiguate the sense in context.
    context_sentence = context_sentence.split()
    return compare_overlaps_greedy(context_sentence, ss_sign)
Code example #17
File: lesk.py  Project: zhanghyy/pywsd
def simple_lesk(context_sentence: str,
                ambiguous_word: str,
                pos: str = None,
                lemma=True,
                stem=False,
                hyperhypo=True,
                stop=True,
                context_is_lemmatized=False,
                nbest=False,
                keepscore=False,
                normalizescore=False,
                from_cache=True) -> "wn.Synset":
    """
    Simple Lesk sits between the original Lesk algorithm (1986) and
    Adapted Lesk (Banerjee and Pedersen, 2002): it uses more signature
    words than the former and fewer than the latter.

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word, pos=pos)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signatures(ambiguous_word,
                                pos,
                                lemma,
                                stem,
                                hyperhypo,
                                stop,
                                from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = context_sentence.split() if context_is_lemmatized \
        else lemmatize_sentence(context_sentence)
    return compare_overlaps(context_sentence,
                            ss_sign,
                            nbest=nbest,
                            keepscore=keepscore,
                            normalizescore=normalizescore)
Code example #18
def getSignaturesForWord(ambiguous_word: str, pos: str = None) -> dict:

    # Ensure that the POS is supported.
    pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None

    # If the POS specified isn't found but other POS is in wordnet.
    if not wn.synsets(ambiguous_word, pos) and wn.synsets(ambiguous_word):
        pos = None

    # Holds the synset->signature dictionary.
    ss_sign = {}
    for ss in wn.synsets(ambiguous_word, pos):
        ss_sign[ss] = synsetSignatures(ss)

    ss_sign = {
        ss: [lemmatize(s) for s in signature]
        for ss, signature in ss_sign.items()
    }

    return ss_sign
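A hedged sketch of how getSignaturesForWord might be called inside its module; it depends on the module-level synsetSignatures and lemmatize helpers, so this is not a standalone script.

# Hedged usage sketch; assumes the surrounding module (wn, synsetSignatures, lemmatize) is in scope.
ss_sign = getSignaturesForWord('bank', pos='n')
for ss, signature in ss_sign.items():
    # Each entry maps a candidate Synset of 'bank' to its lemmatized signature tokens.
    print(ss, sorted(signature)[:5])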
Code example #19
File: combined.py  Project: chehaknayar/combining
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
Code example #20
File: lesk.py  Project: zhanghyy/pywsd
def original_lesk(context_sentence: str,
                  ambiguous_word: str,
                  dictionary=None,
                  from_cache=True) -> "wn.Synset":
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definitions of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :return: A Synset for the estimated best sense.
    """

    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WN definitions.
        dictionary = signatures(ambiguous_word,
                                original_lesk=True,
                                from_cache=from_cache)
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)

    return best_sense
Code example #21
File: lesk.py  Project: tashengjinsheng/pywsd
def signatures(ambiguous_word,
               pos=None,
               hyperhypo=True,
               adapted=False,
               remove_stopwords=True,
               to_lemmatize=True):
    # Ensure that the POS is supported.
    pos = pos if pos in ['a', 'r', 's', 'n', 'v', None] else None
    # Holds the synset->signature dictionary.
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word, pos=pos):
        signature = []
        # Adds the definition, example sentences and lemma_names.
        signature += word_tokenize(ss.definition())
        signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
        signature += ss.lemma_names()
        # Optional: includes lemma_names of hyper-/hyponyms.
        if hyperhypo:
            hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                                ss.instance_hyponyms() + ss.instance_hypernyms())
            signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))
        # Optional: Includes signatures from related senses as in Adapted Lesk.
        if adapted:
            # Includes lemma_names from holonyms, meronyms and similar_tos
            related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                                 ss.substance_holonyms() +
                                 ss.member_meronyms() + ss.part_meronyms() +
                                 ss.substance_meronyms() + ss.similar_tos())
            signature += set(chain(*[i.lemma_names() for i in related_senses]))
        # Optional: removes stopwords.
        if remove_stopwords:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if to_lemmatize:
            signature = [lemmatize(i) for i in signature]
        synsets_signatures[ss] = signature
    return synsets_signatures
Code example #22
    def assinatura_significado(self, lema, lematizar=True,\
                            stem=False, stop=True,\
                            extrair_relacao_semantica=False,\
                            usar_exemplos=False):

        resultado = BaseOx.construir_objeto_unificado(self.base_ox, lema)

        if not resultado:
            resultado = {}

        lema = lemmatize(lema)

        assinaturas_significados = []  # (name, definition, examples)

        for pos in resultado.keys():
            todos_significados = resultado[pos].keys()

            indice = 1
            for sig in todos_significados:
                nome_sig = "%s.%s.%d" % (lema, pos, indice)
                indice += 1

                if usar_exemplos:
                    exemplos = resultado[pos][sig]['exemplos']
                else:
                    exemplos = []

                # name, definition, examples, signature
                definicao_corrente = [nome_sig, sig, exemplos, []]
                assinaturas_significados.append(definicao_corrente)

                # Put the examples into the signature.
                definicao_corrente[len(definicao_corrente) -
                                   1] += self.assinatura_significado_aux(
                                       lema, pos, sig, exemplos)

                sig_secundarios = resultado[pos][sig]['def_secs']

                for ss in sig_secundarios:
                    nome_sig_sec = "%s.%s.%d" % (lema, pos, indice)

                    if usar_exemplos:
                        exemplos_secundarios = resultado[pos][sig]['def_secs'][
                            ss]['exemplos']
                    else:
                        exemplos_secundarios = []

                    definicao_corrente_sec = [
                        nome_sig_sec, ss, exemplos_secundarios, []
                    ]
                    assinaturas_significados.append(definicao_corrente_sec)

                    definicao_corrente_sec[
                        len(definicao_corrente) -
                        1] += self.assinatura_significado_aux(
                            lema, pos, ss, exemplos_secundarios)

                    indice += 1

        for sig in assinaturas_significados:
            sig[3] = Util.normalizar_ctx(sig[3],
                                         stop=True,
                                         lematizar=True,
                                         stem=True)

        return [tuple(a) for a in assinaturas_significados]
Code example #23
    def des_exemplos(self, ctx,\
                    ambigua, pos, nbest=True,\
                    lematizar=True, stem=True, stop=True,\
                    normalizar_pont=True):

        cfgs = self.cfgs
        dir_bases = self.cfgs['caminho_bases']
        base_ox = self.base_ox

        rep_vet = self.rep_vetorial
        alvaro = Alvaro.Alvaro.INSTANCE

        dir_cache_rel_sinonimia = cfgs['caminho_bases'] + '/' + cfgs['oxford'][
            'cache']['sinonimia']
        chave_cache_relacao_sin = "%s-%s.json" % (ambigua, pos)
        dir_obj = dir_cache_rel_sinonimia + '/' + chave_cache_relacao_sin

        if not chave_cache_relacao_sin in Util.list_arqs(
                dir_cache_rel_sinonimia):
            rel_definicoes = alvaro.construir_relacao_definicoes(
                ambigua, pos, fontes='oxford')
            Util.salvar_json(dir_obj, rel_definicoes)
        else:
            rel_definicoes = Util.abrir_json(dir_obj, criarsenaoexiste=False)

        res_des_tmp = []
        pontuacao_somada = 0.00

        for def_ambigua in rel_definicoes:
            uniao_palavras_sem_duplicatas = set()
            uniao_palavras_com_duplicatas = list()
            exemplos_blob = []

            palavras_tf = {}

            try:
                maximo_exemplos = self.cfgs['params_exps']['qtde_exemplos'][0]
                lista_exemplos = BaseOx.obter_atributo(ambigua, pos,
                                                       def_ambigua, 'exemplos')
                # Adding lemmas (synonyms)
                lista_exemplos.append(" ".join(
                    BaseOx.obter_sins(ambigua, def_ambigua, pos)))
                # Adding the definition
                lista_exemplos.append(def_ambigua)

                for ex in lista_exemplos[:maximo_exemplos]:
                    ex_blob = TextBlob(ex)
                    exemplos_blob.append(ex_blob)
                    for token in ex_blob.words:
                        if Util.is_stop_word(token.lower()) == False:
                            token_lematizado = lemmatize(token)
                            uniao_palavras_sem_duplicatas.add(token_lematizado)
                            uniao_palavras_com_duplicatas.append(
                                token_lematizado)

            except Exception as caminho:
                exemplos = []

            textblob_vocab = TextBlob(" ".join(uniao_palavras_com_duplicatas))

            palavras_latentes = []
            for p in textblob_vocab.word_counts:
                if textblob_vocab.word_counts[p] > 1:
                    palavras_latentes.append(p)

            palavras_derivadas = []

            for p in uniao_palavras_sem_duplicatas:
                tf = alvaro.tf(p, textblob_vocab)
                palavras_tf[p] = tf

            pontuacao = 0.00

            for t in Util.tokenize(Util.resolver_en(ctx).lower()):
                try:
                    pontuacao += palavras_tf[t]
                except:
                    pontuacao += 0.00

            pontuacao_somada += pontuacao

            try:
                if normalizar_pont:
                    reg_pont = pontuacao / sum(palavras_tf.values())
                else:
                    reg_pont = pontuacao
            except ZeroDivisionError as zde:
                reg_pont = 0.00
Code example #24
                    if sins_defcaminho:
                        lista_exemplos.append(" ".join(sins_defcaminho))

                    # Adding the definition
                    lista_exemplos.append(def_caminho)

                for ex_iter in lista_exemplos[:maximo_exemplos]:
                    ex = ex_iter

                    ex_blob = TextBlob(ex)
                    exemplos_blob.append(ex_blob)

                    for token in ex_blob.words:
                        if Util.is_stop_word(token.lower()) == False:
                            token_lematizado = lemmatize(token)
                            uniao_palavras_sem_duplicatas.add(token_lematizado)
                            uniao_palavras_com_duplicatas.append(
                                token_lematizado)

            except Exception as caminho:
                import traceback

                traceback.print_stack()
                traceback.print_exc()

                exemplos = []

            tb_vocab_duplicatas = TextBlob(
                " ".join(uniao_palavras_com_duplicatas))
Code example #25
def wsd(summpath, wsdpath):
    summpathlist = os.listdir(summpath)
    for idp,dirpath in enumerate(summpathlist[0:1]):
        print(idp)
        comppath = summpath+dirpath+'/'
        comppathlist = os.listdir(comppath)
        for idc,compathdir in enumerate(comppathlist):
            fpath = comppath+compathdir
            f = open(fpath,'r+',encoding='utf-8')
            text = f.readlines()
            text = [re.sub(r'^\s+|\s+$','',x) for x in text if len(x)>3]
            nertext = []
            sentencewsd=copy.deepcopy(text)
            for ids,sentence in enumerate(text):
                word = sentence.split()
                word = [re.sub(r'[\,]','',x) for x in word]
                word = [re.sub(r'(?<=\w)[\.]','',x) for x in word]
                ner =  st.tag(word)
                ner = [x[0] for x in ner if x[1]=='O']
                ner = [x for x in ner if x not in stops]
                sentence = ' '.join(ner)
                nertext.append(sentence)
            for zipf in zipf_freq:
                textwsd = []
                for ids,sentence in enumerate(nertext):
                    ambiguity = disambiguate(sentence, adapted_lesk, keepLemmas=True, zipf=zipf)
                    for idy,syn in enumerate(ambiguity):
                        if syn[2] is not None:
                            syn_lemma = syn[2].lemma_names()
                            syn_lemma = [[zipf_frequency(x, 'en'),x] for x in syn_lemma ]
                            syn_lemma = sorted(syn_lemma , reverse=True)
                            if syn_lemma[0][0]==0:
                                syn_lemma = [[len(x[1]),x[1]] for x in syn_lemma]
                                syn_lemma = sorted(syn_lemma , reverse=False)
                            if(lemmatize(syn[0].lower())!=syn_lemma[0][1]):
                                sentencewsd[ids] = re.sub(r''+syn[0],syn_lemma[0][1],sentencewsd[ids] )
                    textwsd.append(sentencewsd[ids])
                outDirectory = wsdpath+dirpath+'/'
                if not os.path.exists(outDirectory):
                    os.makedirs(outDirectory)
                fout = open(outDirectory+str(zipf)+'-'+compathdir,'w',encoding='utf-8')
                fout.writelines(textwsd)
                fout.flush()
                fout.close()
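For reference, a hedged sketch of a plain sentence-level call to pywsd's disambiguate with adapted_lesk (the zipf keyword used above appears to be project-specific and is omitted here). With keepLemmas=True the result is a list of (surface word, lemma, synset) triples, which is the layout the loop above indexes into.

# Hedged usage sketch; assumes pywsd exposes disambiguate and adapted_lesk as below.
from pywsd import disambiguate
from pywsd.lesk import adapted_lesk

tagged = disambiguate('I went to the bank to deposit my money',
                      adapted_lesk, keepLemmas=True)
for surface, lemma, synset in tagged:  # synset is None for words left undisambiguated
    print(surface, lemma, synset)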
Code example #26
File: similarity.py  Project: katadh/tropical-models
def max_similarity(context_sentence, ambiguous_word, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True, data=None):
    """
    Perform WSD by maximizing the sum of maximum similarity between the possible
    synsets of all words in the context sentence and the possible synsets of the
    ambiguous word (see http://goo.gl/XMq2BI):
    \arg\max_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        print("no synsets found in wordnet")
        return None
    # print("its synsets are %s" % wn.synsets(ambiguous_word))
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]

    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue

        res = 0
        for j in context_sentence:
            # If j is already a disambiguated synset name (e.g. 'bank.n.01'), use only that synset.
            if re.search(r'[a-z]+\.[nvsar]\.[0-9]{2}', j) is not None:
                mysynsets = [wn.synset(j)]
            else:
                mysynsets = wn.synsets(j)
            sims = [0]
            for k in mysynsets:
                sims.append(sim(i, k, option, data))
            res += max(sims)
        result[i] = res

    if len(result) == 0:
        return None
    # print("printing results")
    #print result.items()
    if option in ["res","resnik"]: # lower score = more similar
        result = sorted([(v,k) for k,v in result.items()])
    else: # higher score = more similar
        result = sorted([(v,k) for k,v in result.items()],reverse=True)
        # print("sorted; printing results again")
    #print result
    if best: return result[0][1];
    return result
Code example #27
def lemmatization(stem_array):
    lemmatized = []
    for stems in stem_array:
        lemmas = [lemmatize(x) for x in stems if x not in stop_words]
        lemmatized.append(lemmas)
    return lemmatized
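A hedged sketch of the pywsd lemmatize helper that the snippets in this collection keep calling; the outputs in the comments are illustrative and may differ by pywsd version. Assumes it is importable from pywsd.utils.

# Hedged usage sketch; assumes lemmatize is importable from pywsd.utils.
from pywsd.utils import lemmatize

print(lemmatize('books'))             # e.g. 'book' (illustrative)
print(lemmatize('running', pos='v'))  # e.g. 'run' (illustrative)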
Code example #28
File: lesk.py  Project: parthness/emotion_tweets
def simple_signature(word, ss, lexicon):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    signature = [wn.morphy(word), word,
                 ss.name().split('.')[0]] + findRelatedForms(ss)

    ss_definition = synset_properties(ss, 'definition')
    signature += word_tokenize(ss_definition)

    ss_lemma_names = synset_properties(ss, 'lemma_names')
    signature += ss_lemma_names

    ss_hyponyms = synset_properties(ss, 'hyponyms')
    #ss_hypernyms = synset_properties(ss, 'hypernyms')
    ss_hypohypernyms = ss_hyponyms  #+ss_hypernyms
    signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
    #  print(signature)

    # Includes holonyms.
    ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
    ss_part_holonyms = synset_properties(ss, 'part_holonyms')
    ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
    # Includes meronyms.
    ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
    ss_part_meronyms = synset_properties(ss, 'part_meronyms')
    ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
    # Includes similar_tos
    #ss_simto = synset_properties(ss, 'similar_tos')

    related_senses = list(
        set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
            ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms))

    signature += list([
        j for j in chain(
            *[synset_properties(i, 'lemma_names') for i in related_senses])
        if j not in EN_STOPWORDS
    ])
    #  print(signature)
    # Optional: removes stopwords.
    signature = [i for i in signature if type(i) == str]
    signature = [i for i in signature if i.lower() not in EN_STOPWORDS]
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(i) for i in signature]
    # Matching exact words may cause sparsity, so optional matching for stems.
    #signature = [porter.stem(i) for i in signature]

    ss_sign = set(signature)
    to_remove = []

    for word in ss_sign:
        morphy = wn.morphy(word)
        if lexicon == '+-':
            if word not in lexiconDict and morphy not in lexiconDict:
                to_remove.append(word)
        else:
            if word in lexiconDict:
                if lexiconDict[word] != lexicon:
                    to_remove.append(word)
            elif (morphy is not None) and morphy in lexiconDict:
                if lexiconDict[morphy] != lexicon:
                    to_remove.append(word)
            else:
                to_remove.append(word)

    to_remove = set(to_remove)
    ss_sign = ss_sign - to_remove
    signature = list(ss_sign)

    sim = []
    for ss in ss_sign:
        synsets = wn.synsets(ss)
        if len(synsets) > 0:
            if synsets[0] not in sim:
                synset = synsets[0]
                pos = synset.pos()
                morphy = wn.morphy(ss, pos)
                if ss in lexiconDict or ss in disgustingWords:
                    sim.append(synset)
                elif (morphy is not None) and morphy in lexiconDict:
                    sim.append(synset)
                else:
                    signature.remove(ss)
        else:
            signature.remove(ss)
    return [signature, sim]
Code example #29
def cosine_lesk_inventario_estendido(context_sentence, ambiguous_word, \
    pos=None, lemma=True, stem=True, hyperhypo=True, \
    stop=True, context_is_lemmatized=False, \
    nbest=False, synsets_signatures=None, busca_ampla=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """

    # Ensure that ambiguous word is a lemma.
    if lemma:
        ambiguous_word = lemmatize(ambiguous_word)

    # If ambiguous word not in WordNet return None
    #if not wn.synsets(ambiguous_word):
    if not criar_inventario_des_wn(ambiguous_word, busca_ampla=busca_ampla):
        return None

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []

    chave_assinatura = "%s.%s.%s.%s.%s.%s" % (ambiguous_word, pos, lemma, stem,
                                              hyperhypo, busca_ampla)

    if not chave_assinatura in DesWordnet.cache_assinaturas:
        synsets_signatures = simple_signature(ambiguous_word,
                                              pos,
                                              lemma,
                                              stem,
                                              hyperhypo,
                                              busca_ampla=busca_ampla)

        DesWordnet.cache_assinaturas[chave_assinatura] = []

        for ss, signature in synsets_signatures.items():
            # Lowercase and replace "_" with spaces.
            signature = " ".join(map(str, signature)).lower().replace("_", " ")
            # Removes punctuation.
            signature = [i for i in Util.word_tokenize(signature) \
               if i not in string.punctuation]

            signature = Util.normalizar_ctx(signature,
                                            stop=stop,
                                            lematizar=lemma,
                                            stem=stem)

            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

            DesWordnet.cache_assinaturas[chave_assinatura].append(
                (ss, signature))

    else:
        synsets_signatures = DesWordnet.cache_assinaturas[chave_assinatura]

        for ss, signature in synsets_signatures:
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]