Example #1
    def test_path_similarities(self):
        from nltk.corpus import wordnet as nltk_wn
        nltk_cat = nltk_wn.synset('cat.n.1')
        nltk_dog = nltk_wn.synset('dog.n.1')
        nltk_bus = nltk_wn.synset('bus.n.1')

        our_cat = our_wn.synset('cat.n.1')
        our_dog = our_wn.synset('dog.n.1')
        our_bus = our_wn.synset('bus.n.1')
        assert nltk_wn.path_similarity(nltk_cat,
                                       nltk_dog) == our_wn.path_similarity(
                                           our_cat, our_dog)
        assert nltk_wn.wup_similarity(nltk_cat,
                                      nltk_dog) == our_wn.wup_similarity(
                                          our_cat, our_dog)
        assert nltk_wn.lch_similarity(nltk_cat,
                                      nltk_dog) == our_wn.lch_similarity(
                                          our_cat, our_dog)

        assert nltk_wn.path_similarity(nltk_cat,
                                       nltk_bus) == our_wn.path_similarity(
                                           our_cat, our_bus)
        assert nltk_wn.wup_similarity(nltk_cat,
                                      nltk_bus) == our_wn.wup_similarity(
                                          our_cat, our_bus)
        assert nltk_wn.lch_similarity(nltk_cat,
                                      nltk_bus) == our_wn.lch_similarity(
                                          our_cat, our_bus)
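For reference, the NLTK WordNet howto documents the following values for the cat/dog pair, so a reimplementation like the one under test above can be sanity-checked directly (exact numbers can shift slightly across WordNet versions):

from nltk.corpus import wordnet as wn

cat, dog = wn.synset('cat.n.01'), wn.synset('dog.n.01')
print(wn.path_similarity(cat, dog))  # 0.2
print(wn.wup_similarity(cat, dog))   # 0.8571428571428571
print(wn.lch_similarity(cat, dog))   # 2.0281482472922856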
Example #2
def get_similarity_score_1(word, given_list):
    max_similarity = 0

    if len(given_list) > 1:
        if word.lower() in given_list:
            max_similarity = 1
        else:
            current_verb_list = wn.synsets(word.lower())
            for verb in given_list:
                related_verbs = wn.synsets(verb.lower())
                for a, b in product(related_verbs, current_verb_list):
                    # wup_similarity returns None when the senses share no
                    # common ancestor; treat that as zero similarity
                    d = wn.wup_similarity(a, b) or 0
                    if d > max_similarity:
                        max_similarity = d
    else:
        if word.lower() == given_list[0].lower():
            max_similarity = 1
        else:
            current_verb_list = wn.synsets(word.lower())
            related_verbs = wn.synsets(given_list[0].lower())
            for a, b in product(related_verbs, current_verb_list):
                d = wn.wup_similarity(a, b) or 0
                if d > max_similarity:
                    max_similarity = d
    return max_similarity
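A quick usage sketch of the helper above (the word list is hypothetical; the printed scores depend on the installed WordNet version):

print(get_similarity_score_1('sprint', ['run', 'walk']))  # best Wu-Palmer score over all sense pairs
print(get_similarity_score_1('run', ['run', 'walk']))     # 1, by the exact-membership shortcut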
Example #3
 def wup_similarity(self, word1, word2):
     score = 0
     for syn1 in wn.synsets(word1):
         for syn2 in wn.synsets(word2):
             # compute once; wup_similarity returns None for unrelated senses
             sim = wn.wup_similarity(syn1, syn2)
             if sim:
                 score += sim
     self.wup.append(score)
Example #4
def similarityWordNet(word1, word2):
    """
    Similarity between two words with nltk
    Input: word1, word2 (String)
    Return: True if the words are similar, False otherwise
    """
    # morphy returns None for words unknown to WordNet; fall back to the
    # original string rather than the literal string "None"
    word1 = wn.morphy(word1) or word1
    word2 = wn.morphy(word2) or word2

    palabras = wn.synsets(word1)
    if len(palabras) == 0:
        print("does not exist")
        return False
    word1 = palabras[0]  # take the first (most common) sense directly

    palabras = wn.synsets(word2)
    if len(palabras) == 0:
        print("does not exist")
        return False
    word2 = palabras[0]
    """
    Return a score denoting how similar two word senses are,
    based on the shortest path that connects the senses in the is-a
    (hypernym/hyponym) taxonomy. The score is in the range 0 to 1.
    """
    #similarity1 = word1.path_similarity(word2)
    #similarity1 = wn.path_similarity(word1, word2)
    try:
        print(wn.wup_similarity(word1, word2))
        if wn.wup_similarity(word1, word2) > 0.5:  # Wu-Palmer Similarity
            return True
    except TypeError:  # wup_similarity returned None
        pass
    # Other measures computed in the original, kept for reference:
    # Leacock-Chodorow: based on the shortest path between the senses and
    #   the maximum depth of the taxonomy in which they occur; range ~3.6.
    #   similarity2 = wn.lch_similarity(word1, word2)
    # Wu-Palmer: based on the depth of the two senses and of their Least
    #   Common Subsumer (most specific ancestor node); range ~0.92.
    #   similarity3 = wn.wup_similarity(word1, word2)
    return False
Example #5
def most_similar_wup(synsets_dict, verb):
    best_similarity = -1
    most_similar = str()
    verb_synset = wn.synsets(verb, pos=wn.VERB)[0]

    # the loop variable is renamed so it does not shadow the `verb`
    # parameter; wup_similarity is computed once per pair (None -> 0)
    for candidate, synset in synsets_dict.items():
        similarity = wn.wup_similarity(synset, verb_synset) or 0
        if similarity > best_similarity:
            best_similarity = similarity
            most_similar = candidate

    return most_similar
Example #6
def similarity_by_path(sense1, sense2, option="path"):
    """ Returns maximum path similarity between two senses. """
    if option.lower() in ["path", "path_similarity"]:  # Path similarity
        # path_similarity returns None when no connecting path exists
        return wn.path_similarity(sense1, sense2) or 0
    elif option.lower() in ["wup", "wupa", "wu-palmer"]:  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2) or 0
    elif option.lower() in ['lch', "leacock-chordorow"]:  # Leacock-Chodorow
        if sense1.pos() != sense2.pos():  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)
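A usage sketch, assuming wn is the NLTK WordNet reader as in the other examples (values per the NLTK howto):

from nltk.corpus import wordnet as wn

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(similarity_by_path(dog, cat, option="path"))  # 0.2
print(similarity_by_path(dog, cat, option="wup"))   # ~0.857
print(similarity_by_path(dog, cat, option="lch"))   # ~2.028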
Example #7
def similar(all_words):
    all_words = sorted(all_words)
    search_results = sorted(all_words, key=len)
    CONCEPTS = []
    while search_results:
        super_concepts = []
        some_word = search_results.pop(0)
        super_concepts.append(some_word)
        for x in search_results:
            #ratio = difflib.SequenceMatcher(None, some_word, x).ratio()
            #if ratio > 0.9:
            #    super_concepts.append(x)
            word1 = wordnet.synsets(some_word)
            word2 = wordnet.synsets(x)
            for sense1, sense2 in product(word1, word2):
                d = wordnet.wup_similarity(sense1, sense2) or 0  # None -> 0
                if d > 0.7:
                    super_concepts.append(x)
                    break  # stop after the first matching sense pair
        CONCEPTS.append(super_concepts)
        for x in super_concepts:
            if x in search_results:
                search_results.remove(x)

    return sorted(CONCEPTS)
Example #8
    def _relevance_wordnet(cls, c_0, c_1):
        # print(str(c_0) + "\t" + str(c_1))
        if not isinstance(c_0[0], tuple):
            allsyns1 = set(
                ss for word in c_0
                for ss in wn.synsets(word[0], pos=transform_POS(word[1])))
        else:
            allsyns1 = set(
                ss for ss in wn.synsets(c_0[0], pos=transform_POS(c_0[1])))
        if not isinstance(c_1[0], tuple):
            allsyns2 = set(
                ss for word in c_1
                for ss in wn.synsets(word[0], pos=transform_POS(word[1]))
                if ss not in allsyns1)
        else:
            allsyns2 = set(
                ss for ss in wn.synsets(c_1[0], pos=transform_POS(c_1[1])))

        if len(allsyns1) == 0 or len(allsyns2) == 0:
            return 0
        list_sim = [
            wn.wup_similarity(s1, s2) or 0
            for s1, s2 in product(allsyns1, allsyns2)
        ]
        # note: despite being named "best", this is the mean similarity
        # over all sense pairs
        best = sum(list_sim) / len(list_sim)
        # print(best)
        return best
Example #9
def compare_allsynsets(method, word1, word2):
    ss1 = wordnet.synsets(word1)
    ss2 = wordnet.synsets(word2)
    simi, simi_value = 0.0, 0.0
    for (s1, s2) in product(ss1, ss2):
        # if SYNpos and s1.pos() != s2.pos():  # SYN-POS
        #     continue
        # if TWpos and s1.pos() != pos:  # Target word POS
        #     continue
        if method == "PATH":
            simi = s1.path_similarity(s2) or 0.0  # None -> 0
        elif method == "LCH":
            if s1.pos() != s2.pos():  # lch raises on different POS
                continue
            simi = wordnet.lch_similarity(s1, s2) or 0.0
        elif method == "WUP":
            simi = wordnet.wup_similarity(s1, s2) or 0.0  # None -> 0
        elif method == "RES":
            simi = wordnet.res_similarity(s1, s2, brown_ic)
        elif method == "JCN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v']:  # can't do diff POS
                simi = wordnet.jcn_similarity(s1, s2, brown_ic)
        elif method == "LIN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v']:  # can't do diff POS
                simi = wordnet.lin_similarity(s1, s2, brown_ic)
        else:
            sys.exit("Error! No similarity methods!")

        if simi > simi_value:
            simi_value = simi
    return simi_value
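This example assumes a brown_ic information-content dictionary is already in scope; the RES, JCN, and LIN branches all require one. It can be loaded from NLTK's wordnet_ic corpus (the same pattern appears in a later example on this page):

from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')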
Example #10
def get_distance(w1, w2, pos=None, def_cutoff: int = 2):
    # https://stackoverflow.com/questions/30829382/check-the-similarity-between-two-words-with-nltk-with-python
    if not isinstance(w1, list): w1 = [w1]
    if not isinstance(w2, list): w2 = [w2]
    sims = list()
    kwargs = {'pos': pos} if pos else {}

    def common_senses(word):
        # keep only senses whose sense number (the digits in, e.g.,
        # Synset('dog.n.01')) is at most def_cutoff, i.e. the most
        # common senses of the word
        return [
            sn for sn in wordnet.synsets(word, **kwargs)
            if int('0' + ''.join(c for c in repr(sn) if c.isdigit())) <= def_cutoff
        ]

    for word1, word2 in itertools.product(w1, w2):
        syns1 = common_senses(word1)
        syns2 = common_senses(word2)
        if len(syns1) * len(syns2) == 0:
            continue
        for sense1, sense2 in itertools.product(syns1, syns2):
            d = wordnet.wup_similarity(sense1, sense2)
            if d is None:
                d = 0
            sims.append((1 - d, sense1, sense2))
    if len(sims) <= 0:
        return (0, None, None)
    return min(sims, key=lambda x: x[0])
Example #11
def get_similarity_score_1(word, given_list):
    max_similarity = 0
    if word.lower() in given_list:
        max_similarity = 1
    else:
        current_verb_list = wn.synsets(word.lower())
        for verb in given_list:
            related_verbs = wn.synsets(verb)
            for a, b in itertools.product(related_verbs, current_verb_list):
                # None means no common ancestor; treat as zero and avoid
                # calling wup_similarity twice per pair
                d = wn.wup_similarity(a, b) or 0
                if d > max_similarity:
                    max_similarity = d
    return max_similarity
Example #12
def relation1_old(a, b):
    ''' This method takes two words as arguments and returns their similarity based on
    the wup_similarity method of nltk wordnet.

    Parameters
    ----------
    a : string
    b : string

    Returns
    -------
    float
        relation between two strings

    References
    ----------
    .. [1] NLTK WordNet <http://www.nltk.org/howto/wordnet.html>

    '''
    syna = wn.synsets(a, pos=wn.NOUN)
    synb = wn.synsets(b, pos=wn.NOUN)
    mx = 0
    mxa = None
    mxb = None
    # only the first (most common) sense of each word is compared
    for i in syna[:1]:
        for j in synb[:1]:
            temp = wn.wup_similarity(i, j)
            if temp is not None and temp > mx:
                mx = temp
                mxa = i
                mxb = j
    return mx
Example #13
 def max_entity_similarity(self):
     from itertools import product
     allsyns1 = set(ss for ss in wn.synsets(self.e1))
     allsyns2 = set(ss for ss in wn.synsets(self.e2))
     sim_values = [wn.wup_similarity(s1, s2) for s1, s2 in product(allsyns1, allsyns2)]
     sim_values = list(filter(None, sim_values))
     self.max_entity_sim = max(sim_values) if len(sim_values) > 0 else 0
Example #14
def max_wup_similarity(word1, word2, pos=wn.NOUN):
    """
    Computes the maximum Wu-Palmer Similarity between two words.
    Returns the highest similarity score between the words' synsets

    Args:
    word1: A string denoting a lexical item in English.
    word2: A string denoting a lexical item in English.
    pos: A WordNet part-of-speech.  By default it only considers nouns.

    Returns:
    maxScore: a real number between [0,1.0]

    If maxScore == 0, at least one of the words does not exist in WordNet.
    """
    #grab the synsets of each word
    word1Synsets = wn.synsets(word1, pos)
    word2Synsets = wn.synsets(word2, pos)
    maxScore = 0
    #find the pair (synset1, synset2) which maximizes the wup metric
    for synset1 in word1Synsets:
        for synset2 in word2Synsets:
            currScore = wn.wup_similarity(synset1, synset2) or 0  # None -> 0
            if currScore > maxScore:
                maxScore = currScore
    return maxScore
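Usage sketch: the function works at the word level, so the caller never touches synsets directly (scores depend on the installed WordNet version):

print(max_wup_similarity('dog', 'cat'))    # best Wu-Palmer score over all noun sense pairs
print(max_wup_similarity('dog', 'qwxyz'))  # 0: the second word has no synsets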
Example #15
def detect_similarity(term):

	#print
	#print value, wn.synsets(value)[0]
	try:
		wn_term = wn.synsets(term)[0]
	except IndexError:  # term not in WordNet
		#print term
		return

	max_score = -100
	max_sim = ''
	for cat, value in synset_categories.items():
		sim_score = wn.wup_similarity(wn_term, value) or 0  # None -> 0
		#print cat, term, sim_score
		if sim_score > max_score:
			max_score = sim_score
			max_sim = cat

	temp = []
	temp.append(max_sim)
	temp.append(max_score)

	if temp[1] >= 0.5:
		return temp
	else:
		return
Example #16
def obtainrhyme(sentence1, sentence2):  # creates rhymes
    lastword = last_word(sentence1)
    syllable = syllables(lastword)
    rhymes_list = matchrhymes(syllable)  # list of rhyming words

    print(rhymes_list)

    lastword2 = last_word(sentence2)
    to_match = wn.synsets(lastword2)[0]

    rhymes_synsets = [
        wn.synsets(word) for word in rhymes_list
    ]  # now find the rhyming word most similar to last word in sentence2
    flattened = [synset for sublist in rhymes_synsets for synset in sublist]

    synset_scores = {}
    for synset in flattened:
        score = wn.wup_similarity(synset, to_match) or 0  # None -> 0
        synset_scores[synset] = score
    print("using wup_similarity")
    for i in synset_scores:
        print(i, synset_scores[i])

    # max() over a dict iterates its keys, so select by score explicitly
    most_similar = max(synset_scores, key=synset_scores.get)
    print(most_similar)
    print(synset_scores[most_similar])

    most_similarnames = most_similar.lemma_names()
    print(most_similarnames)
Example #17
def word_similarity_wup(word1, word2, makemap=False):
    if makemap:
        if (word1, word2) in sim_map:
            return sim_map[(word1, word2)]
        if (word2, word1) in sim_map:
            return sim_map[(word2, word1)]
    else:
        if (word1, word2) in sim_map_training:
            return sim_map_training[(word1, word2)]
        if (word2, word1) in sim_map_training:
            return sim_map_training[(word2, word1)]
    synset_list1 = wordnet.synsets(word1)
    synset_list2 = wordnet.synsets(word2)
    if len(synset_list1) == 0 or len(synset_list2) == 0:
        return 0
    ans = 0.0
    for synset1 in synset_list1:
        for synset2 in synset_list2:
            sim = wordnet.wup_similarity(synset1, synset2) or 0  # None -> 0
            ans = max(ans, sim)
    if ans > 0.99:
        ans = 1.25  # boost near-exact matches above the usual [0, 1] range
    if makemap:
        sim_map[(word1, word2)] = ans
        filewrite.write("('" + word1 + "','" + word2 + "'):" + str(ans) +
                        ',\n')
    return ans
Example #18
def definitional_score(synset1, synset2):
    keynouns1 = [
        t[0] for t in nltk.pos_tag(nltk.word_tokenize(synset1.definition()))
        if t[1] in ['NN', 'NNP', 'NNPS', 'NNS']
    ]
    synset_nouns1 = []
    for keynoun in keynouns1:
        all_synset_nouns = [s for s in wn.synsets(keynoun) if s.pos() == 'n']
        if len(all_synset_nouns) > 0:
            synset_nouns1.append(all_synset_nouns[0])
    keynouns2 = [
        t[0] for t in nltk.pos_tag(nltk.word_tokenize(synset2.definition()))
        if t[1] in ['NN', 'NNP', 'NNPS', 'NNS']
    ]
    synset_nouns2 = []
    for keynoun in keynouns2:
        all_synset_nouns = [s for s in wn.synsets(keynoun) if s.pos() == 'n']
        if len(all_synset_nouns) > 0:
            synset_nouns2.append(all_synset_nouns[0])
    score = 0.
    for synset_noun1 in synset_nouns1:
        for synset_noun2 in synset_nouns2:
            score += wn.wup_similarity(synset_noun1, synset_noun2) or 0  # defensive: None -> 0
    # Note to normalize by size of synset_noun set (both)
    if (len(synset_nouns1) + len(synset_nouns2)) != 0:
        return score / (len(synset_nouns1) + len(synset_nouns2))
    else:
        return 0.
Example #19
 def get_similarity(self,a,b):
     s1 = wordnet.synsets(a)
     s2 = wordnet.synsets(b)
     score = 0
     if len(s1) >= 1 and len(s2) >= 1:
         score = wordnet.wup_similarity(s1[0], s2[0]) or 0  # None -> 0
     return score
Example #20
def categorise(weighted_words, catwords):
    if len(weighted_words) > 0 and len(catwords) > 0:
        # Find all synsets for all the words in each list
        allsyns1 = [ss for word in weighted_words for ss in wn.synsets(word)]
        allsyns2 = [ss for word in catwords for ss in wn.synsets(word)]

        # Find the best-matched synsets in the two lists (product is used to
        # make every pairwise comparison); key on the score so tied scores
        # don't fall back to comparing Synset objects
        best_syn_match = max([(wn.wup_similarity(s1, s2) or 0, s1, s2)
                              for s1, s2 in product(allsyns1, allsyns2)],
                             key=lambda t: t[0])

        # Function to relate the synonym back to parent word
        def parent(word_list, loc):
            # Create a list with the number of syns per parent word. Once the
            # index is found, take value and return parent word (see function
            # input parameters in following lines)
            syn_size = [len(wn.synsets(word)) for word in word_list]
            i = 0
            j = 0
            while i <= loc:
                i += syn_size[j]
                j += 1
            return word_list[j - 1]

        # Call above function, inputs are lists and index for best match synonym
        p1 = parent(weighted_words, allsyns1.index(best_syn_match[1]))
        p2 = parent(catwords, allsyns2.index(best_syn_match[2]))

        return [p1, p2]
    else:
        return [None, None]
Example #21
def chain_gen(sentence):
    words = tokenize_data(sentence)
    lex_chain = []

    for i, word in enumerate(words):
        chain = []
        word_count = dict(Counter(words))

        for j in range(i + 1, len(words)):
            # avoiding comparison of the word to itself
            if i != j:
                item1 = words[i] + "(" + str(word_count[words[i]]) + ")"
                item2 = words[j] + "(" + str(word_count[words[j]]) + ")"

                w = 0
                # generating synsets of the two words
                if words[i] != words[j]:
                    syn1 = wn.synsets(words[i])
                    syn2 = wn.synsets(words[j])

                    # comparing the synsets using wordnet.wup_similarity;
                    # take the best score over all sense pairs (None -> 0)
                    w = max((wn.wup_similarity(s1, s2) or 0
                             for s1, s2 in product(syn1, syn2)),
                            default=0)

                if (w >= 0.8) and (item2 not in chain):
                    chain.append(item2)
                if (w < 0.8) and (item1 not in chain):
                    chain.append(item1)
        lex_chain.append(chain)

    return lex_chain
Example #23
def compute_wn_sim(qw, dw, i, j, k):
    allsyns1 = wordnet.synsets(qw)
    allsyns2 = wordnet.synsets(dw)
    if len(allsyns2) == 0 or len(allsyns1) == 0:
        return 0, i, j, k
    # key on the score so tied scores don't fall back to comparing Synsets
    best = max(((wordnet.wup_similarity(s1, s2) or 0, s1, s2)
                for s1, s2 in product(allsyns1, allsyns2)),
               key=lambda t: t[0])[0]
    return best, i, j, k
Example #25
def wup_similarity(synsets1, synsets2):
    """
        This function returns Wu-Palmer similarity (WUP) between two synsets,
        based on the depth of the two senses in the taxonomy and their
        Least Common Subsumer (most specific ancestor node).

        :param `Synset` synsets1: first synset supplied to measure
                                  the WUP similarity
        :param `Synset` synsets2: second synset supplied to measure
                                  the WUP similarity

        :return: WUP similarity between two synsets
        :rtype: float

        :Example:

            >>> from pythainlp.corpus.wordnet import wup_similarity, synset
            >>>
            >>> entity = synset('entity.n.01')
            >>> obj = synset('object.n.01')
            >>> cat = synset('cat.n.01')
            >>>
            >>> wup_similarity(entity, obj)
            0.5
            >>> wup_similarity(entity, cat)
            0.13333333333333333
            >>> wup_similarity(obj, cat)
            0.35294117647058826
    """
    return wordnet.wup_similarity(synsets1, synsets2)
Example #26
def wup_sim(word1, word2):

    """
    Wu-Palmer Similarity: Return a score denoting how similar
    two word senses are, based on the depth of the two senses in
    the taxonomy and that of their Least Common Subsumer (most specific ancestor node).
    range 0.92

    Note that at this time the scores given do _not_ always agree with those given by Pedersen's
    Perl implementation of Wordnet Similarity.
    The LCS does not necessarily feature in the shortest path connecting the two senses,
    as it is by definition the common ancestor deepest in the taxonomy, not closest to
    the two senses. Typically, however, it will so feature. Where multiple candidates for
    the LCS exist, that whose shortest path to the root node is the longest will be selected.
    Where the LCS has multiple paths to the root, the longer path is used for the purposes
    of the calculation.
    """
    try:
        #print(wn.path_similarity(word1, word2))
        #if wn.path_similarity(word1, word2) > 0.5:  # (hypernym/hyponym) taxonomy
        # wup_similarity returns a float or None; floats have no .isnumeric()
        # method (that is a str method), so test against None instead
        value = wn.wup_similarity(word1, word2)
        if value is not None:
            return value
        return 0
    except Exception:
        return 0
Example #27
def semantic_similarity(word1, word2):
    words1 = word1.split('_')
    words2 = word2.split('_')
    if fast_semantic_similarity(word1, word2) == 1:
        return 1
    max_p = 0
    word1_sim = set([])
    for s1 in wn.synsets(word1):
        word1_sim.add(s1)
        word1_sim.update(s1.similar_tos())
        # for st1 in [s1] + s1.similar_tos():
        #     word1_sim.append(st1)

    word2_sim = set([])
    for s2 in wn.synsets(word2):
        word2_sim.add(s2)
        word2_sim.update(s2.similar_tos())

    for st1 in word1_sim:
        for st2 in word2_sim:
            p = wn.wup_similarity(st1, st2) or 0  # None -> 0
            if p == 1:
                return p
            if p > max_p:
                max_p = p
    if len(words1) > 1 or len(words2) > 1:
        sub_similarity = .9 * semantic_similarity(words1[-1], words2[-1])
    else:
        sub_similarity = 0
    return max(max_p, sub_similarity)
Example #28
def intersect_value(s1, s2):
    s1 = regToken.tokenize(s1)
    s2 = regToken.tokenize(s2)
    normalizer = (len(s1) + len(s2)) / 2
    intersect = 0
    # Find an intersection, but also look at similarities
    for w1 in s1:
        synw1 = wn.synsets(w1)
        if len(synw1) > 0:
            synw1 = synw1[0]
        else:
            continue
        for w2 in s2:
            synw2 = wn.synsets(w2)
            if len(synw2) > 0:
                synw2 = synw2[0]
            else:
                continue
            similar = wn.wup_similarity(synw1, synw2)
            if similar is not None:
                similar *= similar
                if similar == 1:
                    similar *= 2
                intersect += similar / (len(s1 + s2) / 2)

    #normalize to the size of each sentence
    return intersect  # / normalizer
Example #29
def max_sum(w: list, T: list) -> float:
    """
    As mentioned by:
        Mihalcea R. , Corley C. & Strapparava C. (2006).
        Corpus-based and Knowledge-based Measures of Text Semantic Similarity.
        In Proceedings, The Twenty-First National Conference on Artificial Intelligence
            and the Eighteenth Innovative Applications of Artificial Intelligence Conference,
        July 16-20, 2006, Boston, Massachusetts, USA.

    :param w: A Word represented as a triple of the word, penn-treebank-tag, list of its synsets.
    :param T: List representation of a text in triples like w.
    :return: max_sum(w, T)
    """
    w_synsets: list = w[2]
    similarities: set = set()
    similarities.add(0)
    for t in T:
        t_synsets = t[2]
        for w_synset in w_synsets:
            for t_synset in t_synsets:
                try:
                    # wup_similarity takes only the two synsets; an
                    # information-content corpus is only used by the
                    # res/jcn/lin measures, so brown_ic is dropped here
                    similarity: float = wordnet.wup_similarity(
                        w_synset, t_synset)
                    if similarity is None:
                        similarities.add(0)
                    else:
                        similarities.add(similarity)
                except Exception:
                    similarities.add(0)
    return max(similarities)
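The (word, tag, synsets) triples the docstring describes can be built with a small sketch like this (to_triples is a hypothetical helper; NLTK's tokenizer and tagger models must be downloaded):

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet

def to_triples(text):
    # one (token, penn-treebank-tag, synset list) triple per token
    return [[tok, tag, wordnet.synsets(tok)]
            for tok, tag in pos_tag(word_tokenize(text))]

T = to_triples("the cat sat on the mat")
w = to_triples("dog")[0]
print(max_sum(w, T))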
Example #30
def sentence_similarity(wordSense1, wordSense2, similarity_metric = 'path'):
    '''
    Calculating sentence similarity measurement.
    
    Parameters:
        wordSense1 (list): a list of extracted sense for the first sentence.
        wordSense2 (list): a list of extracted sense for the second sentence.
        similarity_metric (str): which algorithm to use for similarity measurement. Defaults to path similarity. Available choices include 
            path similarity (path) and Wu-Palmer Similarity (lcs). See the official definitions here: http://www.nltk.org/howto/wordnet.html
        
    Return:
        the similarity score (float).
    '''
    similarity = 0.0
    total = 0.0
    if len(wordSense1) == 0 or len(wordSense2) == 0:
        return 0
    
    for sense1 in wordSense1:
        for sense2 in wordSense2:
            total += 1.0
            cur_sim = None
            if similarity_metric == 'path':
                cur_sim = wn.path_similarity(sense1, sense2)
            elif similarity_metric == 'lcs':
                cur_sim = wn.wup_similarity(sense1, sense2)
            else:
                raise ValueError('ERROR: given similarity metric is not defined.')
            if cur_sim:
                similarity += cur_sim

    return similarity / total
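Usage sketch, with the sense lists picked per word beforehand (here simply named senses; any sense-extraction scheme works):

from nltk.corpus import wordnet as wn

senses1 = [wn.synset('dog.n.01'), wn.synset('run.v.01')]
senses2 = [wn.synset('cat.n.01'), wn.synset('walk.v.01')]
print(sentence_similarity(senses1, senses2, similarity_metric='lcs'))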
Example #31
def spreadItemsIntoGroups(groupNames, groupItemList):
    res = []
    for groupItem in groupItemList:
        maxPsSims = []
        maxWupSims = []
        maxMixSims = []
        for groupName in groupNames:
            groupNameSynsets = wordnet.synsets(groupName)
            groupItemSynsets = wordnet.synsets(groupItem)
            # seed with a zero entry so max() below cannot fail when a
            # word has no synsets
            psSims = [(0, groupItem, groupName)]
            wupSims = [(0, groupItem, groupName)]
            for nameSyns, itemSyns in product(groupNameSynsets, groupItemSynsets):
                ps = wordnet.path_similarity(nameSyns, itemSyns) or 0
                psSims.append((ps, groupItem, groupName))
                wup = wordnet.wup_similarity(nameSyns, itemSyns) or 0
                wupSims.append((wup, groupItem, groupName))
            maxPsSims.append(max(psSims))
            maxWupSims.append(max(wupSims))
            maxMixSims.append((max(psSims)[0] * max(wupSims)[0], groupItem, groupName))
        print('     path:', sorted(maxPsSims, key=lambda item: item[0], reverse=True))
        print('     wup:', sorted(maxWupSims, key=lambda item: item[0], reverse=True))
        print('     mix:', sorted(maxMixSims, key=lambda item: item[0], reverse=True))
        print('path:', max(maxPsSims))
        print('wup:', max(maxWupSims))
        maxMix = max(maxMixSims)[0]
        print('mix:', max(maxMixSims))
        res.append((maxMix, groupItem, [maxMixSim[2] for maxMixSim in maxMixSims if maxMixSim[0] == maxMix]))
    return res



#---------------------------------------------------------------------------------------------
# Sample output for the 'Apple' group item:
     path: [(0.3333333333333333, 'Apple', 'Fruit'), (0.3333333333333333, 'Apple', 'Berry'), (0.25, 'Apple', 'Vegetable'), (0.2, 'Apple', 'Mushroom'), (0.125, 'Apple', 'Plant')]
     wup: [(0.9, 'Apple', 'Fruit'), (0.8571428571428571, 'Apple', 'Berry'), (0.8, 'Apple', 'Vegetable'), (0.75, 'Apple', 'Mushroom'), (0.6666666666666666, 'Apple', 'Plant')]
     mix: [(0.3, 'Apple', 'Fruit'), (0.2857142857142857, 'Apple', 'Berry'), (0.2, 'Apple', 'Vegetable'), (0.15000000000000002, 'Apple', 'Mushroom'), (0.08333333333333333, 'Apple', 'Plant')]
Example #32
def main():
    # temp sentence
    temp_s = """why is the sky so blue and not red"""
    # timer
    #start = timeit.default_timer()
    # tokenize
    token_s = nltk.word_tokenize(temp_s)
    # remove stopwords
    stopWords = set(stopwords.words('english'))
    filtered_tokens = [word for word in token_s if word not in stopWords]
    # ini print
    print('tokens:', token_s)
    print('filtered:', filtered_tokens)
    # loops through for comparisons
    allsyns1 = set(ss for word in token_s for ss in wn.synsets(word))
    for elem in filtered_tokens:
        # synsets of the whole token; iterating the string itself would
        # iterate its characters
        allsyns2 = set(wn.synsets(elem))
        full_list = [(wn.wup_similarity(s1, s2) or 0, s1, s2)
                     for s1, s2 in product(allsyns1, allsyns2)]
        total = sum(score for score, s1, s2 in full_list)
        if full_list:
            print('word:', elem, 'weighted avg:', total / len(full_list))
Example #33
def measure_by_method(action, list_objects, method, embedding_index):
    nouns = [token for token, pos in pos_tag(word_tokenize(action)) if pos.startswith('N')]
    list_word_action_all = action.split(" ")
    list_word_action_nouns = nouns

    if list_objects == [] or list_word_action_all == [] or (list_word_action_nouns == [] and (
            method == 'wup_sim' or method == 'cos_sim nouns')):
        best = 0

    else:
        if method == 'wup_sim':
            # speed it up by collecting the synsets for all words in list_objects and list_word_action once, and taking the product of the synsets.
            allsyns1 = set(ss for word in list_objects for ss in wordnet.synsets(word))
            allsyns2 = set(ss for word in list_word_action_nouns for ss in wordnet.synsets(word))

            if not allsyns1 or not allsyns2:
                best = 0
            else:
                # key on the score so tied scores don't try to order Synsets
                best, s1, s2 = max(((wordnet.wup_similarity(s1, s2) or 0, s1, s2)
                                    for s1, s2 in product(allsyns1, allsyns2)),
                                   key=lambda t: t[0])

        elif method == 'cos_sim all':
            best = cosine_similarity_lists(list_objects, list_word_action_all, embedding_index)
        elif method == 'cos_sim nouns':
            best = cosine_similarity_lists(list_objects, list_word_action_nouns, embedding_index)
        else:
            raise ValueError("wrong similarity method name")
    return best
Example #35
def wnsensesim(synset1, synset2, metric):

    if metric == 'path_similarity':
        return wn.path_similarity(synset1, synset2)
    elif metric == 'lch_similarity':
        return wn.lch_similarity(synset1, synset2)
    elif metric == 'wup_similarity':
        return wn.wup_similarity(synset1, synset2)
    else:  # add more similarity measures e.g., jcn
        print("Unsupported wn similarity measure requested")
Example #36
 def __init__(self, obs_corpus, target_corpus, metric="path", aggregation_mode_prev="", aggregation_mode=""):
     super().__init__(obs_corpus, target_corpus, aggregation_mode, None, aggregation_mode_prev)
     self.metric = metric
     if self.metric == "path":
         self.metric_func = lambda syn1, syn2: wn.path_similarity(syn1, syn2)
     elif self.metric == "lch":
         self.metric_func = lambda syn1, syn2: wn.lch_similarity(syn1, syn2)
     elif self.metric == "wup":
         self.metric_func = lambda syn1, syn2: wn.wup_similarity(syn1, syn2)
     else:
         raise(ValueError("Wrong similarity metric: %s, should be one of path/lch/wup."%self.metric))
Example #37
    def similarityScore(self,wordList):
        similarityJson = {}
        i=0
        while (i<len(wordList)):
            j = i+1
            while(j<len(wordList)):
                key = wordList[i]+'-'+wordList[j]
                if len(wordnet.synsets(wordList[i]))> 0  and len(wordnet.synsets(wordList[j]))> 0:
                    similarityJson[key] = wordnet.wup_similarity(wordnet.synsets(wordList[i])[0],wordnet.synsets(wordList[j])[0])
                j = j+1
            i = i+1

        return similarityJson
Example #38
def similarity_by_path(sense1, sense2, option="path"):
  """ Returns maximum path similarity between two senses. """
  if option.lower() in ["path", "path_similarity"]: # Path similarity
    # take the larger of the two directions (None -> 0)
    return max(wn.path_similarity(sense1, sense2) or 0,
               wn.path_similarity(sense2, sense1) or 0)
  elif option.lower() in ["wup", "wupa", "wu-palmer"]: # Wu-Palmer
    return wn.wup_similarity(sense1, sense2)
  elif option.lower() in ['lch', "leacock-chordorow"]: # Leacock-Chodorow
    if sense1.pos() != sense2.pos(): # lch can't do diff POS
      return 0
    return wn.lch_similarity(sense1, sense2)
  # fallback for any other option: Lin similarity with the BNC IC corpus
  return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
Example #39
	def get_tensor_neuron_potential_for_relation(self,synset_vertex,synset_r):
		smt=0.0
		similarity=0.0
		for s1, s2 in product(synset_vertex, synset_r):
			if self.Similarity=="WordNet":
				smt=wn.wup_similarity(s1,s2) or 0.0  # None -> 0
			if self.Similarity=="ConceptNet":
				s1_lemma_names=s1.lemma_names()
				s2_lemma_names=s2.lemma_names()
				smt=self.conceptnet.conceptnet_distance(s1_lemma_names[0], s2_lemma_names[0])
			#print "similarity=",smt
			if smt > similarity and smt != 1.0:
				similarity = float(smt)
		return similarity
Example #40
def wnsim(synset1, synset2, method='all'):
    synset_patt = re.compile(r'^.+\..+\.\d+$')

    if synset_patt.match(synset1):
        s1 = wn.synset(synset1)
    else:
        s1 = wn_synset(synset1)

    if synset_patt.match(synset2):
        s2 = wn.synset(synset2)
    else:
        s2 = wn_synset(synset2)

    if s1 is None or s2 is None:
        return 0

    if method == 'lin':
        return wn.lin_similarity(s1, s2, wn_ic)
    elif method == 'res':
        return wn.res_similarity(s1, s2, wn_ic)
    elif method == 'jcn':
        return wn.jcn_similarity(s1, s2, wn_ic)
    elif method == 'wup':
        return wn.wup_similarity(s1, s2)
    elif method == 'path':
        return wn.path_similarity(s1, s2)
    elif method == 'lch':
        return wn.lch_similarity(s1, s2)
    elif method == 'all':
        return [
            ('lin', wn.lin_similarity(s1, s2, wn_ic)),
            ('res', wn.res_similarity(s1, s2, wn_ic)),
            ('jcn', wn.jcn_similarity(s1, s2, wn_ic)),
            ('wup', wn.wup_similarity(s1, s2)),
            ('path', wn.path_similarity(s1, s2)),
            ('lch', wn.lch_similarity(s1, s2))
        ]
Example #41
def get_sim_values(synset_list):
    sim_values = defaultdict(int)
    for pair in itertools.combinations(synset_list, 2):
        if pair[0] is None or pair[1] is None:
            sim_values[pair] = 0
        else:
            # wn.wup_similarity() returns None if no path connecting the
            # two synsets is found; replace None with 0 (the inverse,
            # 1 - similarity, is taken later in gen_sim_matrix)
            sim_values[pair] = wn.wup_similarity(pair[0], pair[1]) or 0
    return dict(sim_values)
Example #42
def gen_sim_matrix(synset_list):
    # get similarity for each pair of nouns
    sim_values = get_sim_values(synset_list)

    # initialize 2d matrix
    matrix = [[0 for i in range(len(synset_list))] for i in range(len(synset_list))]
    for (i1, syn1) in enumerate(synset_list):
        for (i2, syn2) in enumerate(synset_list):
            if syn1 is None or syn2 is None:
                matrix[i1][i2] = 0
            elif syn1 is syn2:
                matrix[i1][i2] = 1 - wn.wup_similarity(syn1, syn2)  # identical synsets: distance 0
            else:
                try:
                    matrix[i1][i2] = 1 - sim_values[(syn1, syn2)]
                except KeyError:
                    matrix[i1][i2] = 1 - sim_values[(syn2, syn1)]
    return matrix
Example #43
def _first_wup(word1, word2, pos=wn.NOUN):
    """
    Computes the Wu-Palmer similarity between two words.
    Compares the two most common synsets of each word.

    Args:
    word1: A string representing a word.
    word2: A string representing a word.
    pos: The part-of-speech of both words.

    Returns:
    score: A real number between [0, 1.0]
    """
    try:
        synset1 = wn.synsets(word1, pos)[0]
        synset2 = wn.synsets(word2, pos)[0]
        return wn.wup_similarity(synset1, synset2) or 0  # None -> 0, keeping the promised [0, 1] range
    except IndexError:
        return 0
Example #44
def average_wup_similarity(sub_tuples, obj_tuples):
	'''
	Get a list of subject tuples and a list of object tuples, and calculate
	the average wup_similarity for all combinations between them
	
	Example formats:
	sub_words = [('three', 'CD'), ('percent', 'NN')]

	obj_words = [(').', 'NNP'), ('==', 'NNP'), ('Northern', 'NNP'), ('Virginia', 'NNP'), ('Campaign', 'NNP'), ('==', 'NNP'), ('The', 'NNP'), ('Coast', 'NNP'), ('Division', 'NNP')]
	
	'''
	print(sub_tuples, obj_tuples)
	# get WordNet SynSets for all nouns
	sub_syn = [wn.synsets(word)[0] for (word, det) in sub_tuples if det.startswith('NN') and len(wn.synsets(word))>0]
	obj_syn = [wn.synsets(word)[0] for (word, det) in obj_tuples if det.startswith('NN') and len(wn.synsets(word))>0]

	# get all subject-object combinations (None -> 0)
	all_combins_values = [wn.wup_similarity(sub, obj) or 0 for sub in sub_syn for obj in obj_syn]

	# calculate average WUP similarity, guarding against an empty list
	if not all_combins_values:
		return 0.0
	return sum(all_combins_values) / float(len(all_combins_values))
Example #45
def semantic_similarity(word1, word2):
    words1 = word1.split('_')
    words2 = word2.split('_')
    if len(words1) > 1 or len(words2) > 1:
        sub_similarity = .9 * semantic_similarity(words1[-1], words2[-1])
    else:
        sub_similarity = 0
    if sub_similarity == 1:
        return sub_similarity
    if fast_semantic_similarity(word1, word2) == 1:
        return 1
    max_p = 0
    for s1 in wn.synsets(word1):
        for st1 in [s1] + s1.similar_tos():
            for s2 in wn.synsets(word2):
                for st2 in [s2] + s2.similar_tos():
                    p = wn.wup_similarity(st1, st2) or 0  # None -> 0
                    if p == 1:
                        return p
                    if p > max_p:
                        max_p = p
    return max(max_p, sub_similarity)
Example #46
def similarity(words: list)->list:
    '''Calculates similarity based on the given synsets'''
    results = []
    synsets = ask_for_word_defs(words)
    print("\n{}\n".format('*'*80))
    for i in range(int(len(synsets)/2)):
        print("{:30}{}".format(str(synsets[2*i]), str(synsets[2*i + 1])))
    print("\n{}\n".format('*'*80))
    print("Running comparisons...")
    for i in range(int(len(synsets)/2)):
        if synsets[2*i] is None or synsets[2*i + 1] is None:
            results.append(["Undefined", "Undefined", -1, -1, -1, "None", "None"])
            continue
        result = [words[2*i], words[2*i + 1], 0, 0, 0, synsets[2*i].definition(), synsets[2*i + 1].definition()]
        result[2] = wordnet.lch_similarity(synsets[2*i], synsets[2*i + 1])
        result[3] = wordnet.wup_similarity(synsets[2*i], synsets[2*i + 1])
        result[4] = wordnet.path_similarity(synsets[2*i], synsets[2*i + 1])
        results.append(result)
    print("\n{}\n".format('*'*80))
    return results
Example #47
  distances_jcn_bnc.append(sim8)
  distances_lin_bnc.append(sim9)

#Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

#For each pair of synsets, compute distance
for s1 in synsets:
  syn1 = wn.of2ss(s1)
  for s2 in synsets:
    syn2 = wn.of2ss(s2)
    distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.path_similarity(syn1,syn2)
    distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lch_similarity(syn1,syn2)
    distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.wup_similarity(syn1,syn2)
    distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1,syn2,brown_ic)
    distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1,syn2,brown_ic)
    distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1,syn2,brown_ic)
    distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1,syn2,bnc_ic)
    distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1,syn2,bnc_ic)
    distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1,syn2,bnc_ic)
    #distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] =1/(labelsNLTK.index(s2)+1) 
    #distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)
    #distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
Example #48
                    syns.append(wordnet.synsets(top20[j][0], pos=wordnet.NOUN))
                else:
                    syns.append((1,1))
            
            confs = [()]
            for x in syns:
                confs = [i + (y,) for y in x for i in confs]
                
            max_conf=0
            max_sim=0
            for conf in confs:
                combinations = list(itertools.combinations(conf,2))
                sim = 0
                for pair in combinations:
                    if pair[0] != 1 and pair[1] != 1:  # identity checks against literals are unreliable
                        sim += wordnet.wup_similarity(pair[0], pair[1]) or 0  # None -> 0

                sim = float(sim)/float(len(combinations))
                if(sim >= max_sim):
                    max_sim = sim
                    max_conf = confs.index(conf)
                    
            j=0  
            for element in confs[max_conf]:
                if pointer[j] not in WSD:
                    WSD[pointer[j]] = []
                WSD[pointer[j]].append(element)
                j += 1
            step += 1

       
Example #49
def process_per_sentence(f, context, tgt_words, max_dist, d_factor):

    print(context)
    word_synsets = {}
    synset_index = {}
    index = 0
        
    for i in range(len(context)):
        t_context = tuple([i,context[i]])
#         word = wn.morphy(context[i])
#         if( word == None ) :
#             word = context[i]
        word_synsets[t_context] = wn.synsets(context[i])
        for synset in word_synsets[t_context]:
            t_synset = tuple([i, synset])
            synset_index[t_synset] = index
#             print synset, index
            index += 1
            
    graph_matrix = [[0 for i in range(index)] for j in range(index)]
    #print indexprint [word for word in words]
       
    for i in range(len(context)):
        for j in range(len(context)):
            if i != j:
                 
                #check how far the 2 words are from each other
                if abs(i - j) <= max_dist:
                     
                    for synset1 in word_synsets[ tuple([i, context[i]]) ]:
                        t_s1 = tuple([i, synset1])
                        for synset2 in word_synsets[ tuple([j, context[j]]) ]:
                            t_s2 = tuple([j, synset2])
#                             sim = synset1.wup_similarity(synset2)
                            sim1 = wn.wup_similarity(synset1, synset2)
                            sim2 = wn.wup_similarity(synset2, synset1)
                            if isinstance(sim1, numbers.Number) == False: sim1 = 0
                            if isinstance(sim2, numbers.Number) == False: sim2 = 0
                            
                            graph_matrix[synset_index[t_s1]][synset_index[t_s2]] = sim1
                            graph_matrix[synset_index[t_s2]][synset_index[t_s1]] = sim2
 
    ranked_sense = pr.get_pagerank(graph_matrix, d_factor)
#     print ranked_sense
   
#process only for target_words
    for i in range(len(tgt_words)):
        if not isinstance(tgt_words[i], numbers.Number):  # fixed: t_words was undefined
            word_t = tuple([i, context[i]])
#         targeword_t_id = 
            synsets = word_synsets[word_t]
            max_r = 0
            res_offset = -1
            chosen_synset = None
            for synset in synsets:
                t_synset = tuple([i, synset])
                if ranked_sense[synset_index[t_synset]] >= max_r:
                    max_r = ranked_sense[synset_index[t_synset]]
                    res_offset = synset.offset()  # NLTK 3: offset is a method
                    chosen_synset = synset  
                    offset_str=pad_zeros(res_offset)
                    answer_line=tgt_words[i][0:3]+" "+tgt_words[i]+" eng-30-"+offset_str+"-"+chosen_synset.pos()+"\n"
                    print(answer_line)
                    f.write(answer_line)
Example #50
def similarities(synsets):
    # similarity of every synset to the first one (None -> 0)
    return [wn.wup_similarity(item, synsets[0]) or 0 for item in synsets]
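For instance (assuming wn is the NLTK WordNet reader; the first score is always 1.0 because each synset is compared against synsets[0]):

from nltk.corpus import wordnet as wn

syns = [wn.synset('dog.n.01'), wn.synset('cat.n.01'), wn.synset('bus.n.01')]
print(similarities(syns))  # [1.0, 0.857..., a lower score for bus]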
Example #51
def computeSemanticSimilarityFeatures(sentence1, sentence2):
    features = [0] * 9

    if (sentence1 + sentence2) not in semanticsimilarity_lookuptable:
        def prepareSentence(sentence):
            return sentence.replace('-', ' ').replace('$', ' ')

        tt = TreeTagger(language='english')
        tags1 = [a for a in tt.tag(prepareSentence(sentence1)) if len(a) > 1]
        tags2 = [a for a in tt.tag(prepareSentence(sentence2)) if len(a) > 1]

        semanticsimilarity_lookuptable[sentence1 + sentence2] = [tags1, tags2]

    tags1 = copy.deepcopy(semanticsimilarity_lookuptable[sentence1 + sentence2][0])
    tags2 = copy.deepcopy(semanticsimilarity_lookuptable[sentence1 + sentence2][1])

    # Feature: noun/web semantic similarity
    # Get Synonym set
    def synSet(tags):
        for word in tags:
            # Only compare Nouns or Verbs; the conditional expressions
            # guard against POS tags shorter than the slices need
            if (word[1][0] != 'N' if len(word[1]) >= 1 else 1) and (word[1][:2] != 'VV' if len(word[1]) >= 2 else 1):
                continue

            word.append(wordnet.synsets(word[2]))

    synSet(tags=tags1)
    synSet(tags=tags2)

    simsMaxNoun = []
    simsAvgNoun = []
    simsMaxVerb = []
    simsAvgVerb = []

    for word1, word2 in product(tags1, tags2):
        type1 = word1[1]
        type2 = word2[1]

        if (type1[0] != 'N' and type1[:2] != 'VV') or type1 != type2:
            continue

        similarityMax = 0
        similarityAvg = 0
        if word1[2] == word2[2]:
            similarityAvg = 1
            similarityMax = 1
        else:
            for sense1, sense2 in product(word1[3], word2[3]):
                sim = wordnet.wup_similarity(sense1, sense2) or 0  # None -> 0
                similarityMax = max(similarityMax, sim)
                similarityAvg += sim

        if type1[0] == 'N':  # fixed: the original test was inverted, putting verb scores in the noun lists
            simsMaxNoun.append(similarityMax)
            simsAvgNoun.append(similarityAvg / (len(word1[3]) + len(word2[3])) if len(word1[3]) + len(word2[3]) > 0 else 0)
        else:
            simsMaxVerb.append(similarityMax)
            simsAvgVerb.append(similarityAvg / (len(word1[3]) + len(word2[3])) if len(word1[3]) + len(word2[3]) > 0 else 0)


    features[0] = np.sum(simsMaxNoun) / len(simsMaxNoun) if len(simsMaxNoun) > 0 else 0
    features[1] = np.sum(simsAvgNoun) / len(simsAvgNoun) if len(simsAvgNoun) > 0 else 0

    features[2] = np.sum(simsMaxVerb) / len(simsMaxVerb) if len(simsMaxVerb) > 0 else 0
    features[3] = np.sum(simsAvgVerb) / len(simsAvgVerb) if len(simsAvgVerb) > 0 else 0

    # Feature: Cardinal number similarity
    def findCardinals(tags):
        cardinals = []
        for index, word1 in enumerate(tags):
            if word1[1] == 'CD':
                # is "more", "over" or "above" before?
                before = [a[0] for a in tags[max(index-2, 0):index]]

                try:
                    val = float(word1[0])
                except ValueError:
                    val = t2i.text2int(word1[0])

                maxValue = minValue = val

                if ("more" in before) or ("over" in before) or ("above" in before) or ("greater" in before):
                    maxValue = sys.maxint
                    minValue += 1
                elif ("less" in before) or ("under" in before) or ("below" in before) or ("smaller" in before):
                    minValue = -sys.maxint - 1
                    maxValue -= 1

                cardinals.append([minValue, maxValue])
        return cardinals

    cardinals1 = findCardinals(tags=tags1)
    cardinals2 = findCardinals(tags=tags2)

    def countCDMatches(cardinals1, cardinals2):
        count = 0
        for cd1 in cardinals1:
            for cd2 in cardinals2:
                if cd1[0] == cd2[0] and cd1[1] == cd2[1]:
                    count += 1
                    break
        return count

    features[4] = (countCDMatches(cardinals1, cardinals2) + countCDMatches(cardinals2, cardinals1)) / (len(cardinals1) + len(cardinals2)) if len(cardinals1) + len(cardinals2) > 0 else 1
    #features[2] = countCDMatches(cardinals1, cardinals2) / len(cardinals1) if len(cardinals1) > 0 else 1
    #features[3] = countCDMatches(cardinals2, cardinals1) / len(cardinals2) if len(cardinals2) > 0 else 1


    # Feature: Proper Name
    def findProperNouns(tags):
        nouns = []
        for word in tags:
            if word[1] == 'NPS':
                nouns.append(word[0])
        return nouns

    def countNounMatches(nouns1, nouns2):
        count = 0
        for noun1 in nouns1:
            for noun2 in nouns2:
                if noun1 == noun2:
                    count += 1
                    break
        return count

    nouns1 = findProperNouns(tags1)
    nouns2 = findProperNouns(tags2)

    features[5] = (countNounMatches(nouns1, nouns2) + countNounMatches(nouns2, nouns1)) / (len(nouns1) + len(nouns2)) if len(nouns1) + len(nouns2) > 0 else 1
    # features[4] = countNounMatches(nouns1, nouns2) / len(nouns1) if len(nouns1) > 0 else 1
    # features[5] = countNounMatches(nouns2, nouns1) / len(nouns2) if len(nouns2) > 0 else 1

    # Feature: Word2Vec (all)
    meaning1 = np.zeros(model.vectors.shape[1])
    for word in tags1:
        if word[2] in model:
            meaning1 += model[word[2]]

    meaning2 = np.zeros(model.vectors.shape[1])
    for word in tags2:
        if word[2] in model:
            meaning2 += model[word[2]]

    diffMeaning = meaning1 - meaning2
    features[6] = np.linalg.norm(diffMeaning)
    features[7] = scipy.spatial.distance.cosine(meaning1, meaning2)

    similarityMatrix = [0] * len(tags1)
    for index1, word1 in enumerate(tags1):
        row = [0]*len(tags2)
        for index2, word2 in enumerate(tags2):
            similarityMax = 0
            if len(word1) > 3 and len(word2) > 3:
                for sense1, sense2 in product(word1[3], word2[3]):
                    sim = wordnet.wup_similarity(sense1, sense2) or 0  # None -> 0
                    similarityMax = max(similarityMax, sim)
                similarityMax = 1 - similarityMax
            else:
                similarityMax = 1

            row[index2] = similarityMax
        similarityMatrix[index1] = row
    m = Munkres()
    totalCost = 0
    indices = m.compute(similarityMatrix)
    for row, column in indices:
        totalCost += similarityMatrix[row][column]

    features[8] = totalCost / len(indices)

    return features
Example #52
    # decay coeffs
    #coeffs['eat'] = float(sys.argv[4])
    #coeffs['drink'] = 1.0-float(sys.argv[4])
    coeffs['eat'] = 1.0
    coeffs['drink'] = 1.0
    for s in sentences:
        s = s.split()
        if s[1] not in verbs_gref:
            verbs_gref[s[1]] = np.zeros((num_basis, num_basis))
        verbs_gref[s[1]] = verbs_gref[s[1]]+np.outer(matrix[s[0]],matrix[s[2]])

        verbs[s[1]] = np.zeros((num_basis, num_basis))
        simweights = {}
        for v in verbs:
            if sys.argv[3]=='wup':
                simweights[v] = wn.wup_similarity(wn.synset(s[1]+'.v.01'), wn.synset(v+'.v.01'))
            elif sys.argv[3]=='lch':
                simweights[v] = wn.lch_similarity(wn.synset(s[1]+'.v.01'), wn.synset(v+'.v.01'))
            elif sys.argv[3]=='path':
                simweights[v] = wn.path_similarity(wn.synset(s[1]+'.v.01'), wn.synset(v+'.v.01'))
            verbs[s[1]] += coeffs[v]*simweights[v]*verbs_gref[v]
        verbs[s[1]] /= float(sum(simweights.values()))
        
    # Learn the reference matrices using Grefenstette for swallow consume and gulp
    gold_verbs = ['swallow','consume','gulp']
    for gv in gold_verbs:
        with open('train/'+gv+'_train') as f:
            sentences = f.readlines()
        for s in sentences:
            s = s.split()
            if s[1] not in verbs_gref:
Example #53
def loadTestset(folder_path, folder_names, wordsd, subd, digramsd, trigramsd):
	w = []
	testSet = []
	numbers=[]
	dollars=[]
	urls=[]
	emails=[]
	count_number=[]
	count_dollar=[]
	count_mail=[]
	count_url=[]
	all_files = []
	counter=0	
	subject_weight = 4
	similarity_cutoff = 0.8
	stop=stopwords.words("english")

	token_dict = get_token_dict(folder_path, folder_names)
	tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
	tfs = tfidf.fit_transform(token_dict.values())
	
	for loop_var in range(len(folder_names)):
		mypath = folder_path + folder_names[loop_var]
		os.chdir(mypath)

		for fo in listdir(mypath):
			if isfile(join(mypath,fo)):
				all_files.append(mypath + '/' + fo)
				temp_list = []
				subd_temp={}
				wordsd_temp={}
				digramsd_temp={}
				trigramsd_temp={}
				f=open(fo,"rU")
				fr=f.read()
				fr=fr.lower()
				subl=[]
				count_number.append(0)
				
				response = tfidf.transform([fr])
				feature_names = tfidf.get_feature_names() # get_feature_names_out() in scikit-learn >= 1.0

				match=re.search('subject:(.+)\n',fr)
				if match:
					subl=subl+[w for w in re.split('\W',match.group(1)) if w]
					
					for i in subl:
						if i.isdigit():
							count_number[counter]+=1

					# drop stop words (build a new list; removing while iterating skips items)
					subl=[w for w in subl if w not in stop]

					for i,j in zip(subl,subl[1:]):
						i=stem(i)
						j=stem(j)
						if i+" "+j in digramsd_temp:
							digramsd_temp[i+" "+j]+=1
						else:
							digramsd_temp[i+" "+j]=1

					for i,j,k in zip(subl,subl[1:],subl[2:]):
						i=stem(i)
						j=stem(j)
						k=stem(k)
						if i+" "+j+" "+k in trigramsd_temp:
							trigramsd_temp[i+" "+j+" "+k]+=1
						else:
							trigramsd_temp[i+" "+j+" "+k]=1

					for elements in subl:
						element=stem(elements)
						if element in subd_temp:
							subd_temp[element]+=1
						else:
							subd_temp[element]=1
				f.close()

				f=open(fo,"rU")
				flag=0
				wordsl=[]
				for line in f:
					if flag==0 and not re.search(r'x-filename',line.lower()):
						continue
					elif flag==0 and re.search(r'x-filename',line.lower()):
						flag=1
						continue
					elif not ( re.search(r'forwarded by',line.lower()) or re.search(r'original message',line.lower()) ):
						wordsl=wordsl+[w for w in re.split('\W',line.lower()) if w]
					elif re.search(r'forwarded by',line.lower()) or re.search(r'original message',line.lower()):
						break

				for i in wordsl:
					if i.isdigit():
						count_number[counter]+=1

				# drop stop words (build a new list; removing while iterating skips items)
				wordsl=[w for w in wordsl if w not in stop]

				for i,j in zip(wordsl,wordsl[1:]):
					i=stem(i)
					j=stem(j)
					if i+" "+j in digramsd_temp:
						digramsd_temp[i+" "+j]+=1
					else:
						digramsd_temp[i+" "+j]=1

				for i,j,k in zip(wordsl,wordsl[1:],wordsl[2:]):
					i=stem(i)
					j=stem(j)
					k=stem(k)
					if i+" "+j+" "+k in trigramsd_temp:
						trigramsd_temp[i+" "+j+" "+k]+=1
					else:
						trigramsd_temp[i+" "+j+" "+k]=1

				for elements in wordsl:
					element=stem(elements)
					if element in wordsd_temp:
						wordsd_temp[element]+=1
					else:
						wordsd_temp[element]=1
				f.close()

				f=open(fo,"rU")
				count_mail.append(0)
				count_url.append(0)
				for line in f:
					if not ( re.search(r'forwarded by',line.lower()) or re.search(r'original message',line.lower()) ):
						emails = re.findall(r'\w+[.|\w]\w+@\w+[.]\w+[.|\w+]\w+',line)
						urls = re.findall(r'www\.',line) # escape the dot so only a literal "www." matches
						count_mail[counter] += len(emails)
						count_url[counter] += len(urls)
					else:
						break
				f.close()


				for word in wordsd:
					if word in wordsd_temp:
						if word in feature_names:
							temp_list.append(wordsd_temp[word]*response[0,feature_names.index(word)])
						else:
							temp_list.append(wordsd_temp[word])
					else:
						# Back off through WordNet: try lemma synonyms first, then the
						# first synset pair whose WuP similarity clears the cutoff.
						synonyms, flag = [], False
						for syn in wordnet.synsets(word):
							for l in syn.lemmas():
								synonyms.append(l.name())
						for synonym in synonyms:
							if synonym in wordsd_temp:
								temp_list.append(wordsd_temp[synonym])
								flag = True
								break
						if not flag:
							syn1 = wordnet.synsets(word)
							for word2 in wordsd_temp:
								syn2 = wordnet.synsets(word2)
								for sense1, sense2 in product(syn1, syn2):
									sim = wordnet.wup_similarity(sense1, sense2)
									if sim and sim >= similarity_cutoff:
										temp_list.append(wordsd_temp[word2])
										flag = True
									break # check only the first synset pair
								if flag:
									break
							if not flag:
								temp_list.append(0)
				
				for word in subd:
					if word in subd_temp:
						if word in feature_names:
							temp_list.append(subd_temp[word]*response[0,feature_names.index(word)]*subject_weight)
						else:
							temp_list.append(subd_temp[word]*subject_weight)
					else:
						# Same WordNet back-off as above, applied to subject terms.
						synonyms, flag = [], False
						for syn in wordnet.synsets(word):
							for l in syn.lemmas():
								synonyms.append(l.name())
						for synonym in synonyms:
							if synonym in subd_temp:
								temp_list.append(subd_temp[synonym]*subject_weight)
								flag = True
								break
						if not flag:
							syn1 = wordnet.synsets(word)
							for word2 in subd_temp:
								syn2 = wordnet.synsets(word2)
								for sense1, sense2 in product(syn1, syn2):
									sim = wordnet.wup_similarity(sense1, sense2)
									if sim and sim >= similarity_cutoff:
										# weight by subject_weight here too, matching the synonym branch
										temp_list.append(subd_temp[word2]*subject_weight)
										flag = True
									break # check only the first synset pair
								if flag:
									break
							if not flag:
								temp_list.append(0)
				
				for digram in digramsd:
					if digram in digramsd_temp:
						temp_list.append(digramsd_temp[digram])
					else:
						temp_list.append(0)

				for trigram in trigramsd:
					if trigram in trigramsd_temp:
						temp_list.append(trigramsd_temp[trigram])
					else:
						temp_list.append(0)
				
				temp_list.append(count_url[counter])
				temp_list.append(count_mail[counter])
				temp_list.append(count_number[counter])
				temp_list.append(loop_var)

				counter += 1

				testSet.append(temp_list)
	return testSet, all_files
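
# Stand-alone sketch of the back-off used above when a vocabulary word is
# absent from a document: exact match, then lemma synonyms, then the first
# synset pair with WuP similarity above a cutoff. Names are illustrative.
from nltk.corpus import wordnet


def backoff_count(word, counts, cutoff=0.8):
    if word in counts:                        # 1) exact match
        return counts[word]
    for syn in wordnet.synsets(word):         # 2) lemma synonyms
        for lemma in syn.lemmas():
            if lemma.name() in counts:
                return counts[lemma.name()]
    syn1 = wordnet.synsets(word)              # 3) similarity back-off
    for other in counts:
        syn2 = wordnet.synsets(other)
        if syn1 and syn2:
            sim = wordnet.wup_similarity(syn1[0], syn2[0])
            if sim and sim >= cutoff:
                return counts[other]
    return 0


print(backoff_count('car', {'automobile': 3, 'train': 1}))
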
Example #54
0
from nltk.corpus import wordnet as wn

# <codecell>

[(synset, synset.definition()) for synset in wn.synsets('dog')]  # definition() is a method in NLTK 3

# <codecell>

dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
toaster = wn.synset('toaster.n.01')

# <codecell>

wn.wup_similarity(dog, cat)

# <codecell>

wn.wup_similarity(cat, toaster)

# <codecell>

wn.morphy('dogs'), wn.morphy('barked')

# <codecell>

[(synset, synset.definition()) for synset in wn.synsets('bark')]

# <codecell>
#print aspects
# l= [u'vodka', u'vodka', u'cocktail']
list1=[u'food', u'ambience', u'price', u'anecdotes/miscellaneous', u'service']
cats_dict={}
for sid in aspects.keys():
	l=aspects[sid]
	similarities=[]
	inner=[]
	categories=[]
	for asp in l:
		print asp
		inner=[]
		sense1=wordnet.synsets(asp)
		#print "sense1:",sense1
		for cat in list1:
			print cat
			sense2=wordnet.synsets(cat)
			#print "sense2",sense2
			for s1,s2 in product(sense1,sense2):
				score=wordnet.wup_similarity(s1,s2)
				# print score,s1,s2
				if score is not None: # skip sense pairs with no similarity score
					inner.append((score,s2))
		print "\n"
		if len(inner)>0:
			topcat=sorted(inner,key=itemgetter(0),reverse=True)[0]
			categories.append(topcat[1].name()[:topcat[1].name().index('.')])
	cats_dict[sid]=list(set(categories))
print cats_dict
pickle.dump(cats_dict,open('cats_dump.p','wb'))
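
# Compact sketch of the aspect-to-category mapping above: score every
# (aspect sense, category sense) pair with WuP similarity and keep the
# category of the best-scoring pair. The aspect term is a toy example.
from itertools import product

from nltk.corpus import wordnet

categories = ['food', 'ambience', 'price', 'service']


def best_category(aspect):
    scored = []
    for cat in categories:
        for s1, s2 in product(wordnet.synsets(aspect), wordnet.synsets(cat)):
            sim = wordnet.wup_similarity(s1, s2)
            if sim is not None:
                scored.append((sim, cat))
    return max(scored)[1] if scored else None


print(best_category('vodka'))  # a drink term is expected to map to 'food'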

print "Running all classifications..."
for i in range(len(all_files)):
	print "Working on file: " + str(i) + ".txt..."
	classification = semantic_classifier.run(directory+str(i)+".txt", 2, 99, 0.5)
	hypernyms[str(i)] = classification.hypernyms
	#print hypernyms[str(i)]


print "Computing closest hypernyms..."
total_accuracy = 0.0
for i in range(len(all_files)):
	accuracy = 0.0
	for tag in tags[str(i)]:
		syn_hyp = hypernyms[str(i)][0]
		syn_tag = wn.synsets(tag)[0]
		maxSim = wn.wup_similarity(syn_hyp, syn_tag)
		if maxSim is None:
			maxSim = 0.0
		closestHypernym[(tag, str(i))] = hypernyms[str(i)][0]
		for hypernym in hypernyms[str(i)]:
			syn_hyp = hypernym
			sim = wn.wup_similarity(syn_tag, syn_hyp)
			if sim is None:
				sim = 0.0
			if sim > maxSim:
				maxSim = sim
				closestHypernym[(tag, str(i))] = hypernym
		print "File " + str(i) + ".txt: ",
		print "Tag: " + str(tag) + "; ",
		print "Closest hypernym: " + closestHypernym[(tag, str(i))].name().split(".")[0] + " = " + str(maxSim)
		accuracy += maxSim
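
# Sketch of the closest-hypernym selection above: for a tag's first synset,
# keep the candidate hypernym with the highest WuP similarity. The candidate
# list here is made up for illustration.
from nltk.corpus import wordnet as wn


def closest_hypernym(tag, candidates):
    syn_tag = wn.synsets(tag)[0]
    best, best_sim = None, -1.0
    for hyp in candidates:
        sim = wn.wup_similarity(syn_tag, hyp) or 0.0  # treat None as 0
        if sim > best_sim:
            best, best_sim = hyp, sim
    return best, best_sim


print(closest_hypernym('dog', [wn.synset('animal.n.01'), wn.synset('artifact.n.01')]))
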
#opens document
from nltk.corpus import wordnet as wn

f = open('30.txt', 'r')

rep = 'defined'

total = 0
count = 0
for line in f:
  #reads in each line of the document; stores its first word
  words = line.split()
  firstWord = words[0]
  # Get a collection of synsets (synonym sets) for the words
  synsets1 = wn.synsets(rep)
  synsets2 = wn.synsets(firstWord[1:])
  maximum = 0

  if synsets1 != [] and synsets2 != []:
    for synset1 in synsets1:
      for synset2 in synsets2:
        num = wn.wup_similarity(synset1, synset2, verbose=False, simulate_root=True)
        if num is not None and num > maximum:  # guard against None scores
          maximum = num
    total = total + maximum
    count = count + 1
    
print total
print count
if count != 0:
  print total/count
Example #58
0
def wup_similarity(synset1, synset2):
	# Thin wrapper around NLTK's Wu-Palmer similarity; expects two Synsets.
	return wordnet.wup_similarity(synset1, synset2)
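
# Hypothetical usage of the wrapper above; it expects two Synset objects.
from nltk.corpus import wordnet

print(wup_similarity(wordnet.synset('cat.n.01'), wordnet.synset('dog.n.01')))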