def longest_common_subsequence(s1, s2):
    """
    Normalized longest-common-subsequence score between two sentences.
    Both sentences are lowercased, lemmatized and stripped of stopwords;
    the LCS is then computed character-wise over the joined token strings
    and divided by the length of the shorter string.
    """
    lemmas_sentence_1, _ = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, _ = lemmatize_sentence(s2.lower())
    sent1 = [w for w in lemmas_sentence_1 if w not in stop_words]
    sent2 = [w for w in lemmas_sentence_2 if w not in stop_words]

    ss1 = ' '.join(sent1)
    ss2 = ' '.join(sent2)
    m = len(ss1)
    n = len(ss2)

    if m == 0 or n == 0:
        return 0
    # DP table: L[i][j] holds the LCS length of ss1[0..i-1] and ss2[0..j-1],
    # built bottom-up below.
    L = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif ss1[i - 1] == ss2[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    # L[m][n] holds the LCS length of ss1 and ss2; normalize by the shorter string.
    normalizer = min(m, n)

    return L[m][n] / normalizer
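# Usage sketch (hedged): assumes the helpers used above are available, e.g.
#   stop_words = set(stopwords.words('english'))   # NLTK stopwords
# and a lemmatize_sentence() that returns (lemmas, tagged_tokens).
# The sentences are illustrative only.
print(longest_common_subsequence("The cat sat on the mat.",
                                 "A cat was sitting on the mat."))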
def simple_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=False, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk is somewhere in between using more than the 
    original Lesk algorithm (1986) and using less signature 
    words than adapted Lesk (Banerjee and Pederson, 2002)
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
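# Usage sketch (hedged): disambiguate "bank" in context. Assumes the pywsd-style
# helpers above (lemmatize, simple_signature, compare_overlaps) and NLTK's
# WordNet (wn) are loaded; the result is expected to be a wn.Synset or None.
answer = simple_lesk("I went to the bank to deposit my money.", "bank", pos="n")
print(answer, answer.definition() if answer else None)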
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, option=False,lemma=True,hyperhypo=True, \
                stop=True):
    """
    This function is the implementation of the Adapted Lesk algorithm, 
    described in Banerjee and Pederson (2002). It makes use of the lexical 
    items from semantically related senses within the wordnet 
    hierarchies and to generate more lexical items for each sense. 
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf‎
    """
    # Ensure that ambiguous word is a lemma.
    #ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.

    ss_sign = simple_signature(ambiguous_word, pos=pos, lemma=lemma,
                               hyperhypo=hyperhypo)
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                  ss.part_meronyms() + ss.part_holonyms() +
                                  ss.similar_tos() + ss.substance_holonyms() +
                                  ss.substance_meronyms()))
        try:
            # NLTK >= 3: Synset.lemma_names is a method.
            signature = [j for j in chain(*[i.lemma_names() for i in related_senses])
                         if j not in stopwords.words('english')]
        except TypeError:
            # Older NLTK: lemma_names is a plain attribute.
            signature = [j for j in chain(*[i.lemma_names for i in related_senses])
                         if j not in stopwords.words('english')]
        # Extend each sense's signature inside the loop, not just the last one.
        ss_sign[ss] += signature
  
    context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign)
    return best_sense
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm, 
    described in Banerjee and Pederson (2002). It makes use of the lexical 
    items from semantically related senses within the wordnet 
    hierarchies and to generate more lexical items for each sense. 
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf‎
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(
            set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
                ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms +
                ss_simto))

        signature = list([
            j for j in chain(
                *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS
        ])

        # Lemmatization of signature words is preferred over stemming.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optionally match stems instead.
        if stem:
            signature = [porter.stem(i) for i in signature]
        # Add the extended signature to this sense's simple signature
        # (inside the loop, so every sense gets extended, not just the last).
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
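# Usage sketch (hedged): same calling convention as simple_lesk above;
# nbest/keepscore/normalizescore are passed straight through to compare_overlaps.
# The sentence is illustrative only.
print(adapted_lesk("I went to the bank to deposit my money.", "bank", pos="n"))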
def information_content_similarity(s1, s2):
    """ 
    Compute the sentence similairty using information content from wordnet
    (words are disambiguated first to Synsets by means of Lesk algorithm) 
    """
    lemmas_sentence_1, tagged_sentence_1 = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, tagged_sentence_2 = lemmatize_sentence(s2.lower())

    # Disambiguate words and build a list of synsets.
    synsets_sentence_1 = []
    for (lemma, word_tag) in zip(lemmas_sentence_1, tagged_sentence_1):
        synset = lesk(lemmas_sentence_1, lemma, wordnet_pos_code(word_tag[1]))
        if synset is not None:
            synsets_sentence_1.append(synset)
        else:
            found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(found) > 0:
                synsets_sentence_1.append(found[0])
                #print("Warn: lemma [%s] returned no disambiguation...using synset : %s" % (lemma, found[0]))
    synsets_sentence_2 = []
    for (lemma, word_tag) in zip(lemmas_sentence_2, tagged_sentence_2):
        synset = lesk(lemmas_sentence_2, lemma, wordnet_pos_code(word_tag[1]))
        if synset is not None:
            synsets_sentence_2.append(synset)
        else:
            found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(found) > 0:
                synsets_sentence_2.append(found[0])
                #print("Warn: lemma [%s] returned no disambiguation...using synset : %s" % (lemma, found[0]))

    score, count = 0.0, 0
    # For each word in the first sentence
    for synset in synsets_sentence_1:
        L = []
        for ss in synsets_sentence_2:
            try:
                L.append(synset.lin_similarity(ss, brown_ic))
            except Exception:
                # lin_similarity can fail, e.g. for cross-POS pairs or missing IC values.
                continue
        if L:
            best_score = max(L)
            score += best_score
            count += 1
    # Average the values
    if count > 0: score /= count
    return score
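# Usage sketch (hedged): requires an information-content corpus, e.g.
#   from nltk.corpus import wordnet_ic
#   brown_ic = wordnet_ic.ic('ic-brown.dat')
# plus the lesk/wordnet_pos_code/lemmatize_sentence helpers used above.
print(information_content_similarity("A man is playing a guitar.",
                                     "A person plays a musical instrument."))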
def synsets_similarity(s1, s2):
    """
    Find the jaccard similarity between two sentences synsets using lesk algorithm
    to disambiguate words given their context.
    """
    lemmas_sentence_1, tagged_sentence_1 = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, tagged_sentence_2 = lemmatize_sentence(s2.lower())

    # Disambiguate words and build a list of synsets.
    synsets_sentence_1 = []
    for (lemma, word_tag) in zip(lemmas_sentence_1, tagged_sentence_1):
        if lemma in stop_words:
            continue
        synset = lesk(lemmas_sentence_1, lemma, wordnet_pos_code(word_tag[1]))
        if synset is not None:
            synsets_sentence_1.append(synset)
        else:
            found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(found) > 0:
                synsets_sentence_1.append(found[0])
                #print("Warn: lemma [%s] returned no disambiguation...using synset : %s" % (lemma, found[0]))

    synsets_sentence_2 = []
    for (lemma, word_tag) in zip(lemmas_sentence_2, tagged_sentence_2):
        if lemma in stop_words:
            continue
        synset = lesk(lemmas_sentence_2, lemma, wordnet_pos_code(word_tag[1]))
        if synset is not None:
            synsets_sentence_2.append(synset)
        else:
            found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(found) > 0:
                synsets_sentence_2.append(found[0])
                #print("Warn: lemma [%s] returned no disambiguation...using synset : %s" % (lemma, found[0]))

    # Compute similarity
    if len(synsets_sentence_1) != 0 and len(synsets_sentence_2) != 0:
        similarity = 1 - jaccard_distance(set(synsets_sentence_1),
                                          set(synsets_sentence_2))
        return similarity
    else:
        return 0
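# Usage sketch (hedged): returns 1 - Jaccard distance over the two synset sets,
# or 0 if either sentence yields no synsets. Assumes nltk's jaccard_distance
# and the lesk/wordnet helpers above are imported.
print(synsets_similarity("The dog chased the cat.", "A dog was chasing a cat."))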
def disambiguate(sentence,
                 algorithm=simple_lesk,
                 context_is_lemmatized=False,
                 similarity_option='path',
                 keepLemmas=False,
                 prefersNone=True,
                 similarity_data=None):
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD.
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: POS information is lost here; how to recover it?
        # Fall back to the lemmas as surface words so the loop below still works.
        surface_words = lemmas = lemma_sentence.split()
        morphy_poss = [None] * len(lemmas)
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Only disambiguate content words.
            try:
                wn.synsets(lemma)[0]  # IndexError here means the lemma is not in WordNet.
                if algorithm == original_lesk:  # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence,
                                       lemma,
                                       pos=pos,
                                       option=similarity_option,
                                       data=similarity_data)
                else:
                    synset = algorithm(lemma_sentence,
                                       lemma,
                                       pos=pos,
                                       context_is_lemmatized=True)
            except:  # In case the content word is not in WordNet
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#') else
                           (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#') else
                           (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
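# Usage sketch (hedged): tag every content word in a sentence with its predicted
# Synset (None for stopwords/punctuation and out-of-WordNet tokens). Assumes the
# pywsd-style helpers above (simple_lesk, lemmatize_sentence, stopwords) exist.
print(disambiguate("I went to the bank to deposit my money."))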
def cosine_lesk(
    context_sentence,
    ambiguous_word,
    pos=None,
    lemma=True,
    stem=True,
    hyperhypo=True,
    stop=True,
    context_is_lemmatized=False,
    nbest=False,
):
    """ 
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using 
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature) if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: Lemmatize the tokens.
        if lemma == True:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    # Rank only after all signatures have been scored.
    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
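# Usage sketch (hedged): cosine_lesk ranks senses by cosine similarity between
# the context and each sense's signature string (via the cos_sim helper above).
# With nbest=True it returns the full (Synset, score) ranking instead.
print(cosine_lesk("I went to the bank to deposit my money", "bank", pos="n"))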
def adapted_lesk(context_sentence, ambiguous_word, \
                pos=None, lemma=True, stem=True, hyperhypo=True, \
                stop=True, context_is_lemmatized=False, \
                nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet
    hierarchies and to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf‎
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        related_senses = list(set(ss.member_holonyms() + ss.member_meronyms() +
                                 ss.part_meronyms() + ss.part_holonyms() +
                                 ss.similar_tos() + ss.substance_holonyms() +
                                 ss.substance_meronyms()))

        try:
            # NLTK >= 3: Synset.lemma_names is a method.
            signature = [j for j in chain(*[i.lemma_names() for i in related_senses])
                         if j not in stopwords.words('english')]
        except TypeError:
            # Older NLTK: lemma_names is a plain attribute.
            signature = [j for j in chain(*[i.lemma_names for i in related_senses])
                         if j not in stopwords.words('english')]
        # Lemmatization of signature words is preferred over stemming.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optionally match stems instead.
        if stem:
            signature = [porter.stem(i) for i in signature]
        # Extend each sense's signature inside the loop, not just the last one.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign, \
                                    nbest=nbest, keepscore=keepscore, \
                                    normalizescore=normalizescore)
    return best_sense
def disambiguate(sentence, algorithm=simple_lesk, 
                 context_is_lemmatized=False, similarity_option='path',
                 keepLemmas=False, prefersNone=True):
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD.
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: POS information is lost here; how to recover it?
        # Fall back to the lemmas as surface words so the loop below still works.
        surface_words = lemmas = lemma_sentence.split()
        morphy_poss = [None] * len(lemmas)
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Only disambiguate content words.
            try:
                wn.synsets(lemma)[0]
                if algorithm == original_lesk: # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma)
                elif algorithm == max_similarity:                    
                    synset = algorithm(lemma_sentence, lemma, pos=pos, option=similarity_option)
                else:
                    synset = algorithm(lemma_sentence, lemma, pos=pos, context_is_lemmatized=True)
            except: # In case the content word is not in WordNet
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#') 
                           else (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#') 
                           else (word, lemma, tag) for word, lemma, tag in tagged_sentence]
    return tagged_sentence
    
def disambiguate_new(sentence,
                     algorithm=simple_lesk,
                     extra_words=None,
                     context_is_lemmatized=False,
                     similarity_option='path',
                     keepLemmas=False,
                     prefersNone=True,
                     similarity_data=None):
    # adds option of extra words, e.g. from LDA output, though not required
    # also checks if a word has 0 or 1 synsets, and doesn't run WSD in those cases
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(
            sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: POS information is lost here; how to recover it?
        # Fall back to the lemmas as surface words so the loop below still works.
        surface_words = lemmas = lemma_sentence.split()
        morphy_poss = [None] * len(lemmas)
    # print lemma_sentence
    if extra_words:
        #print("changing sentence to add LDA words:")
        #print(lemma_sentence)
        lemma_sentence = lemma_sentence.rstrip('.') + ' ' + " ".join(
            extra_words)
        #print(lemma_sentence)
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            try:
                if re.search(r'[a-z]+\.[nvsar]\.[0-9]{2}',
                             lemma) is not None:  # lemma is already disambiguated
                    synset = wn.synset(lemma)
                    # print("single synset for %s" % lemma)
                else:
                    syns = wn.synsets(lemma)
                    if len(syns) == 0:
                        #print("no synsets for %s: returning None" % lemma)
                        synset = None
                    elif len(syns) == 1:
                        #print("just one synset for %s: returning %s" % (lemma, syns[0]))
                        synset = syns[0]
                    elif algorithm == original_lesk:  # Note: Original doesn't care about lemmas
                        # print("running original_lesk on %s" % lemma)
                        synset = algorithm(lemma_sentence, lemma)
                        # print("succeeded; returning %s" % synset)
                    elif algorithm == max_similarity:
                        #print("running max_similarity on %s %s" % (lemma, pos))
                        synset = algorithm(lemma_sentence,
                                           lemma,
                                           pos=pos,
                                           option=similarity_option,
                                           data=similarity_data)
                        # print("succeeded at max_sim; returning %s" % synset)
                    else:
                        # print("running alg %s on %s" % (algorithm.__name__, lemma))
                        synset = algorithm(lemma_sentence,
                                           lemma,
                                           pos=pos,
                                           context_is_lemmatized=True)
                        # print("succeeded; returning %s" % synset)
            except Exception:  # In case the content word is not in WordNet.
                #exc_type, exc_obj, exc_tb = sys.exc_info()
                #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                #print(exc_type, fname, exc_tb.tb_lineno)
                print("threw error on:", word, lemma, pos)
                tb = traceback.format_exc()
                print(tb)
                synset = '#NOT_IN_WN#'
                # print("\ntry/except caught %s while trying alg %s and is returning #NOT_IN_WN#\n" % (lemma, algorithm.__name__))
        else:
            #print lemma, " in stop words"
            synset = '#STOPWORD/PUNCTUATION#'

        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
        # print word, synset
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#') else
                           (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#') else
                           (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
def extract_overlap_pen(s1, s2):
    """
    :param s1:
    :param s2:
    :return: overlap_pen score
    """
    lemmas_sentence_1, _ = lemmatize_sentence(s1.lower())
    lemmas_sentence_2, _ = lemmatize_sentence(s2.lower())
    ss1 = [w for w in lemmas_sentence_1 if not w in stop_words]
    ss2 = [w for w in lemmas_sentence_2 if not w in stop_words]

    ovlp_cnt = 0
    for w1 in ss1:
        ovlp_cnt += ss2.count(w1)
    score = 2 * ovlp_cnt / (len(ss1) + len(ss2) + .001)
    return score
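# Usage sketch (hedged): an overlap ratio in roughly [0, 1]; the 0.001 term only
# guards against division by zero when both cleaned token lists are empty.
print(extract_overlap_pen("The cat sat on the mat.",
                          "A cat was sitting on the mat."))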


# def sif_embeddings(sentences, alpha=1e-3):
#     """Compute the SIF embeddings for a list of sentences
#     Parameters
#     ----------
#     sentences : list
#         The sentences to compute the embeddings for
#     model : `~gensim.models.base_any2vec.BaseAny2VecModel`
#         A gensim model that contains the word vectors and the vocabulary
#     alpha : float, optional
#         Parameter which is used to weigh each individual word based on its probability p(w).
#     Returns
#     -------
#     numpy.ndarray
#         SIF sentence embedding matrix of dim len(sentences) * dimension
#     """
#     global glove_model

#     vlookup = glove_model.wv.vocab  # Gives us access to word index and count
#     vectors = glove_model.wv        # Gives us access to word vectors
#     size = glove_model.vector_size  # Embedding size

#     Z = 0
#     for k in vlookup:
#         Z += vlookup[k].count # Compute the normalization constant Z

#     output = []

#     # Iterate all sentences
#     for s in sentences:
#         count = 0
#         v = numpy.zeros(size, dtype=REAL) # Summary vector
#         # Iterate all words
#         for w in s:
#             # A word must be present in the vocabulary
#             if w in vlookup:
#                 for i in range(size):
#                     v[i] += ( alpha / (alpha + (vlookup[w].count / Z))) * vectors[w][i]
#                 count += 1

#         if count > 0:
#             for i in range(size):
#                 v[i] *= 1/count
#         output.append(v)
#     return numpy.vstack(output)