Code Example #1
def create_graphs(doc_list):
    documents = doc_list
    if documents is None:
        documents = default_document_list()

    distance_functions = [
        (wn.lch_similarity(SYNSETS[0], SYNSETS[0]), 'lch',
         lambda sense_1, sense_2: wn.lch_similarity(sense_1, sense_2)),
        (1.0, 'lin',
         lambda sense_1, sense_2: wn.lin_similarity(sense_1, sense_2, CORPUS)),
        (10.636958516573292, 'res',
         lambda sense_1, sense_2: wn.res_similarity(sense_1, sense_2, CORPUS)),
        (wn.jcn_similarity(SYNSETS[0], SYNSETS[0], CORPUS), 'jcn',
         lambda sense_1, sense_2: wn.jcn_similarity(sense_1, sense_2, CORPUS)),
        (1.0, 'path',
         lambda sense_1, sense_2: wn.path_similarity(sense_1, sense_2)),
    ]
    all_senses = []
    for doc in documents:
        for sense in doc.top_senses():
            all_senses.append((sense, doc.name))
    against_colors = ['r', 'b', 'g']
    against_to = [
        wn.synset(word)
        for word in ["economy.n.01", "philosophy.n.02", "politics.n.01"]
    ]
    create_against_graph('phyl_eco_pol', documents, all_senses, against_to,
                         distance_functions, against_colors)

    against_to = SYNSETS

    against_colors = [(random(), random(), random())
                      for _i in range(0, len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, against_to,
                         distance_functions, against_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
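The first element of each distance_functions tuple appears to be the metric's maximum, used for normalization: lin and path are bounded by 1.0, the lch and jcn ceilings are computed as a synset's similarity with itself, and the hard-coded res value plays the same role. A minimal sketch of where those ceilings come from, assuming NLTK's wordnet and wordnet_ic data, with the Brown IC file standing in for the snippet's CORPUS:

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')   # stands in for CORPUS
dog = wn.synset('dog.n.01')                # stands in for SYNSETS[0]

print(wn.lch_similarity(dog, dog))             # lch ceiling, ~3.64 for nouns
print(wn.res_similarity(dog, dog, brown_ic))   # res self-similarity: the synset's own IC
print(wn.jcn_similarity(dog, dog, brown_ic))   # 1e+300, NLTK's jcn cap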
Code Example #2
def wn_similarity(synset_1, synset_2):
    # jcn similarity is defined only when both synsets share a POS that the
    # IC file covers (nouns and verbs; adjectives and adverbs have no IC data)
    if synset_1.pos() == synset_2.pos() and synset_1.pos() not in ["a", "s", "r"]:
        return wn.jcn_similarity(synset_1, synset_2, brown_ic)
    else:
        return None
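brown_ic here is an information-content dictionary. A minimal sketch of the usual setup, assuming NLTK's wordnet and wordnet_ic data are installed:

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
print(wn_similarity(wn.synset('car.n.01'), wn.synset('bus.n.01')))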
Code Example #3
def similarity_by_infocontent(sense1, sense2, option):
    """ Returns similarity scores by information content. """
    # pos() must be called: comparing the bound .pos methods never matches
    if sense1.pos() != sense2.pos():  # infocontent sim can't do diff POS.
        return 0

    info_contents = [
        'ic-bnc-add1.dat', 'ic-bnc-resnik-add1.dat', 'ic-bnc-resnik.dat',
        'ic-bnc.dat', 'ic-brown-add1.dat', 'ic-brown-resnik-add1.dat',
        'ic-brown-resnik.dat', 'ic-brown.dat', 'ic-semcor-add1.dat',
        'ic-semcor.dat', 'ic-semcorraw-add1.dat',
        'ic-semcorraw-resnik-add1.dat', 'ic-semcorraw-resnik.dat',
        'ic-semcorraw.dat', 'ic-shaks-add1.dat', 'ic-shaks-resnik.dat',
        'ic-shaks-resnik-add1.dat', 'ic-shaks.dat', 'ic-treebank-add1.dat',
        'ic-treebank-resnik-add1.dat', 'ic-treebank-resnik.dat',
        'ic-treebank.dat'
    ]

    if option in ['res', 'resnik']:
        return wn.res_similarity(sense1, sense2,
                                 wnic.ic('ic-bnc-resnik-add1.dat'))
    #return min(wn.res_similarity(sense1, sense2, wnic.ic(ic)) \
    #             for ic in info_contents)

    elif option in ['jcn', "jiang-conrath"]:
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))

    elif option in ['lin']:
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))
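A minimal usage sketch, assuming the imports this snippet relies on (nltk.corpus.wordnet as wn and nltk.corpus.wordnet_ic as wnic):

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(similarity_by_infocontent(dog, cat, 'res'))
print(similarity_by_infocontent(dog, cat, 'jcn'))
print(similarity_by_infocontent(dog, cat, 'lin'))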
Code Example #4
def internal_sentence_max_WSD(sentence, word):
    """
    Auxiliary function for sem_wsd()

    Input: a sentence and a word in the sentence,
            sentence is a list of words, not a string
    
    Return: synset(sense) of the word that maximize similarity with all other synsets in the sentence
    """    
    # brown_ic = wordnet_ic.ic('ic-brown.dat')
    wordsynsets = wn.synsets(word)
    bestScore = 0.0
    result = None
    for synset in wordsynsets:
        score = 0.0
        for w in sentence:
            for wsynset in wn.synsets(w):
                # sim = wn.path_similarity(wsynset, synset)
                # sim = wn.wup_similarity(wsynset, synset)
                try:
                    # jcn_similarity requires both synsets to share a POS and
                    # raises an error otherwise, so guard the call
                    sim = wn.jcn_similarity(wsynset, synset, ic=brown_ic)
                except Exception:
                    sim = None
                if sim is None:
                    continue
                score += sim
        if (score > bestScore):
            bestScore = score
            result = synset
    return result
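A hypothetical call, assuming brown_ic was loaded as in the commented-out line at the top of the function:

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
sentence = ['I', 'deposited', 'cash', 'at', 'the', 'bank']
# picks the sense of 'bank' whose summed jcn similarity to the other
# words' senses is highest
print(internal_sentence_max_WSD(sentence, 'bank'))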
Code Example #5
    def get_jcn_min(self, sentence1, sentence2):
        sentence1_unique, sentence2_unique = self.sentence_difference(
            sentence1, sentence2)
        min_similarity = maxint
        # Measure similarity for each unique word in A to each unique word in B
        for sentence1_word in sentence1_unique:
            for sentence2_word in sentence2_unique:
                sentence1_word_tag = sentence1.get_tag(sentence1_word)
                sentence2_word_tag = sentence2.get_tag(sentence2_word)
                synsets_word1 = wordnet.synsets(sentence1_word,
                                                sentence1_word_tag)
                synsets_word2 = wordnet.synsets(sentence2_word,
                                                sentence2_word_tag)

                if len(synsets_word1) == 0:
                    synsets_word1 = wordnet.synsets(sentence1_word)
                if len(synsets_word2) == 0:
                    synsets_word2 = wordnet.synsets(sentence2_word)

                if len(synsets_word1) > 0 and len(synsets_word2) > 0:
                    # Skip words with different tags
                    if synsets_word1[0].pos() != synsets_word2[0].pos():
                        continue
                    # Try find similarity from corpus
                    try:
                        similarity = wordnet.jcn_similarity(
                            synsets_word1[0], synsets_word2[0], self.brown_ic)
                    except:
                        continue
                    if similarity != None:
                        min_similarity = min(similarity, min_similarity)
        if min_similarity == maxint or min_similarity == 1e-300:
            return 0
        return min_similarity
Code Example #6
def compare_allsynsets(method, word1, word2):
    ss1 = wordnet.synsets(word1)
    ss2 = wordnet.synsets(word2)
    simi, simi_value = 0.0, 0.0
    for (s1, s2) in product(ss1, ss2):
        # if SYNpos and s1.pos() != s2.pos():  # SYN-POS
        #     continue
        # if TWpos and s1.pos() != pos:  # Target word POS
        #     continue
        if method == "PATH":
            simi = s1.path_similarity(s2)
        elif method == "LCH":
            simi = wordnet.lch_similarity(s1, s2)
        elif method == "WUP":
            simi = wordnet.wup_similarity(s1, s2)
        elif method == "RES":
            simi = wordnet.res_similarity(s1, s2, brown_ic)
        elif method == "JCN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v'
                                                     ]:  # can't do diff POS
                simi = wordnet.jcn_similarity(s1, s2, brown_ic)
        elif method == "LIN":
            if s1.pos() == s2.pos() and s1.pos() in ['n', 'a', 'v'
                                                     ]:  # can't do diff POS
                simi = wordnet.lin_similarity(s1, s2, brown_ic)
        else:
            sys.exit("Error! No similarity methods!")

        if simi is not None and simi > simi_value:
            simi_value = simi
    return simi_value
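A usage sketch; wordnet is nltk.corpus.wordnet, product comes from itertools, and brown_ic is assumed loaded as in the earlier examples:

from itertools import product
from nltk.corpus import wordnet, wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
print(compare_allsynsets("JCN", "car", "bus"))   # best score over all sense pairs
print(compare_allsynsets("PATH", "car", "bus"))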
Code Example #7
    def jcn(self, synset_a, synset_b, ic):
        return (
            self.normalize(
                self.MAX_VALUE,
                wordnet.jcn_similarity(synset_a, synset_b, ic),
            )
            if synset_a.pos() == synset_b.pos()
            else 0
        )
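normalize and MAX_VALUE are not shown in this snippet. A plausible reconstruction (an assumption, not the project's actual code) caps the unbounded jcn score, whose self-similarity reaches NLTK's 1e+300 sentinel, at MAX_VALUE and rescales it into [0, 1]:

    # hypothetical reconstruction -- the real helper is not shown
    MAX_VALUE = 1.0

    def normalize(self, max_value, value):
        # cap the raw score at max_value, yielding a value in [0, 1]
        return min(value, max_value) / max_value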
Code Example #8
def checksimil(synsets1,synsets2, simFactor):
	simold = -1
	sim = -1
	similset = []
	unsimilset = []
	for ss1 in synsets1:
		for ss2 in synsets2:
			# pos() must be called in NLTK 3; also require a matching POS with IC data
			if ss1.pos() == ss2.pos() and ss2.pos() not in ("s", "r", "a"):
				if max(sim,wn.jcn_similarity(ss1,ss2,semcor_ic)) > simFactor:
					similset.append(ss1)
				else:
					unsimilset.append(ss1)
	return (similset,unsimilset)
Code Example #9
def compute_similarities(s1, s2, sim):
    if sim == "path":
        return wn.path_similarity(s1, s2)
    elif sim == "lch":
        return wn.lch_similarity(s1, s2)
    elif sim == "wup":
        return wn.wup_similarity(s1, s2)
    elif sim == "res":
        return wn.res_similarity(s1, s2, genesis_ic)
    elif sim == "jcn":
        return wn.jcn_similarity(s1, s2, genesis_ic)
    elif sim == "lin":
        return wn.lin_similarity(s1, s2, genesis_ic)
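genesis_ic is an information-content table computed from raw text rather than from a pre-packaged .dat file; a minimal sketch following the NLTK WordNet how-to:

from nltk.corpus import genesis, wordnet as wn

genesis_ic = wn.ic(genesis, False, 0.0)
print(compute_similarities(wn.synset('dog.n.01'), wn.synset('cat.n.01'), 'jcn'))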
Code Example #10
    def test_wordnet_ic(self):
        from nltk.corpus import wordnet as nltk_wn
        from nltk.corpus import wordnet_ic as nltk_wnic
        nltk_car = nltk_wn.synset('car.n.1')
        nltk_bus = nltk_wn.synset('bus.n.1')
        our_bnc_resnik_add1 = WordNetInformationContent('bnc', resnik=True, add1=True)

        our_car = our_wn.synset('car.n.1')
        our_bus = our_wn.synset('bus.n.1')
        nltk_bnc_resnik_add1 = nltk_wnic.ic('ic-bnc-resnik-add1.dat')
        assert our_wn.res_similarity(our_car, our_bus, our_bnc_resnik_add1) == nltk_wn.res_similarity(nltk_car, nltk_bus, nltk_bnc_resnik_add1)
        assert our_wn.jcn_similarity(our_car, our_bus, our_bnc_resnik_add1) == nltk_wn.jcn_similarity(nltk_car, nltk_bus, nltk_bnc_resnik_add1)
        assert our_wn.lin_similarity(our_car, our_bus, our_bnc_resnik_add1) == nltk_wn.lin_similarity(nltk_car, nltk_bus, nltk_bnc_resnik_add1)
Code Example #11
def avg_wn_dist_dict(dict_file, disambiguated):
    sim_data = wnic.ic('ic-bnc-add1.dat')

    words = file(dict_file).readlines()
    synsets = get_synsets(words, disambiguated, data=sim_data)

    total_dist = 0.0
    num_synsets = len(synsets) / 3
    for i in range(num_synsets):
        for j in range(i + 1, num_synsets):
            sim = wn.jcn_similarity(synsets[i], synsets[j], sim_data)
            # NLTK caps jcn similarity at 1e+300 (identical synsets), so
            # subtracting from the cap converts the similarity into a distance
            total_dist += 1.0e+300 - sim

    return total_dist / (num_synsets * (num_synsets - 1) / 2.0)
Code Example #13
File: utilities.py Project: fashandge/deja
def wnsim(synset1, synset2, method='all'):
    synset_patt = re.compile(r'^.+\..+\.\d+$')

    if synset_patt.match(synset1):
        s1 = wn.synset(synset1)
    else:
        s1 = wn_synset(synset1)

    if synset_patt.match(synset2):
        s2 = wn.synset(synset2)
    else:
        s2 = wn_synset(synset2)

    if s1 is None or s2 is None:
        return 0

    if method == 'lin':
        return wn.lin_similarity(s1, s2, wn_ic)
    elif method == 'res':
        return wn.res_similarity(s1, s2, wn_ic)
    elif method == 'jcn':
        return wn.jcn_similarity(s1, s2, wn_ic)
    elif method == 'wup':
        return wn.wup_similarity(s1, s2)
    elif method == 'path':
        return wn.path_similarity(s1, s2)
    elif method == 'lch':
        return wn.lch_similarity(s1, s2)
    elif method == 'all':
        return [
            ('lin', wn.lin_similarity(s1, s2, wn_ic)),
            ('res', wn.res_similarity(s1, s2, wn_ic)),
            ('jcn', wn.jcn_similarity(s1, s2, wn_ic)),
            ('wup', wn.wup_similarity(s1, s2)),
            ('path', wn.path_similarity(s1, s2)),
            ('lch', wn.lch_similarity(s1, s2))
        ]
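A usage sketch; wn_ic is assumed to be an information-content dictionary (e.g. wordnet_ic.ic('ic-brown.dat')) and wn_synset the project's fallback lookup for plain words:

print(wnsim('dog.n.01', 'cat.n.01', method='jcn'))
print(wnsim('dog.n.01', 'cat.n.01'))  # method='all': a list of (name, score) pairs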
Code Example #14
def avg_wn_dist_topic(topic, disambiguated, sim_data):
    words = [w[0] for w in topic]
    synsets = get_synsets(words, disambiguated, data=sim_data)

    total_dist = 0.0
    for i in range(len(synsets)):
        for j in range(i + 1, len(synsets)):
            sim = wn.jcn_similarity(synsets[i], synsets[j], sim_data)
            if sim is None:
                print "syn1: " + str(synsets[i]) + " syn2: " + str(synsets[j])
                continue
            total_dist += 1.0e+300 - sim

    num_comparisons = (len(synsets) * (len(synsets) - 1) / 2.0)
    return total_dist / num_comparisons, num_comparisons
Code Example #15
def wordnet_distances(synsets):
    print "calculating distances"
    word_dists = np.zeros(len(synsets)**2,
                          dtype='float64').reshape(len(synsets), len(synsets))

    sim_data = wnic.ic('ic-bnc-add1.dat')
    for i in range(len(synsets)):
        for j in range(i + 1, len(synsets)):
            sim = wn.jcn_similarity(synsets[i], synsets[j], sim_data)
            if sim is None:
                print "syn1: " + str(synsets[i]) + " syn2: " + str(synsets[j])
                continue
            word_dists[i, j] = 1.0e+300 - sim
            word_dists[j, i] = 1.0e+300 - sim

    return word_dists
Code Example #16
    def get_jcn_average(self, sentence1, sentence2):
        sentence1_unique, sentence2_unique = self.sentence_difference(
            sentence1, sentence2)
        avg_similarity = 0
        total_count = 0
        # Measure similarity for each unique word in A to each unique word in B
        for sentence1_word in sentence1_unique:
            for sentence2_word in sentence2_unique:
                sentence1_word_tag = sentence1.get_tag(sentence1_word)
                sentence2_word_tag = sentence2.get_tag(sentence2_word)
                synsets_word1 = wordnet.synsets(sentence1_word,
                                                sentence1_word_tag)
                synsets_word2 = wordnet.synsets(sentence2_word,
                                                sentence2_word_tag)

                if len(synsets_word1) == 0:
                    synsets_word1 = wordnet.synsets(sentence1_word)
                if len(synsets_word2) == 0:
                    synsets_word2 = wordnet.synsets(sentence2_word)

                if len(synsets_word1) > 0 and len(synsets_word2) > 0:
                    # Skip words with different tags
                    if synsets_word1[0].pos() != synsets_word2[0].pos():
                        continue
                    # Try find similarity from corpus
                    try:
                        similarity = wordnet.jcn_similarity(
                            synsets_word1[0], synsets_word2[0], self.brown_ic)
                    except:
                        continue
                    if similarity == 1e-300:
                        similarity = 0.0
                    if similarity == 1e+300:
                        similarity = 1.0
                    if similarity != None:
                        avg_similarity += similarity
                        total_count += 1
        if total_count == 0:
            return 0
        return float(avg_similarity) / float(total_count)
Code Example #17
    def word_similarity(self, w1, w2, syns, loc, thr_sim):
        # wn.NOUN or wn.ADJ evaluates to just wn.NOUN, so look up both POS
        # explicitly
        syn1 = wn.synsets(w1, wn.NOUN) + wn.synsets(w1, wn.ADJ)
        syn2 = wn.synsets(w2, wn.NOUN) + wn.synsets(w2, wn.ADJ)

        if len(syn1) > 0 and len(syn2) > 0:
            score = 0
            max_score = 0
            count = 0
            sns1 = syn1[0]
            sns2 = syn2[0]
            for i in range(0, len(syn1)):
                for j in range(0, len(syn2)):
                    if self.wordnet_metric == 'j':  # Jiang-Conrath Similarity
                        # jcn and lin need an information-content dict; brown_ic
                        # is assumed to be loaded elsewhere, e.g. via
                        # wordnet_ic.ic('ic-brown.dat')
                        score = wn.jcn_similarity(syn1[i], syn2[j], brown_ic)
                    elif self.wordnet_metric == 'le':  # Leacock-Chodorow Similarity
                        score = wn.lch_similarity(syn1[i],
                                                  syn2[j],
                                                  simulate_root=False)
                    elif self.wordnet_metric == 'li':  # Lin Similarity
                        score = wn.lin_similarity(syn1[i], syn2[j], brown_ic)
                    elif self.wordnet_metric == 'p':  # Path Similarity
                        score = wn.path_similarity(syn1[i], syn2[j])
                    elif self.wordnet_metric == 'w':  # Wu-Palmer Similarity; never 0, ranges in (0, 1]
                        score = wn.wup_similarity(syn1[i], syn2[j])

                    if score > max_score:  # Finding the maximum score
                        max_score = score
                        sns1 = syn1[i]
                        sns2 = syn2[j]
                        if max_score >= thr_sim:  # Storing all the synset pairs that have scores > threshold
                            syns, loc = self.merging_synsets(
                                syns, w1, w2, sns1, sns2, max_score, loc)
                            count = count + 1
            if count == 0:  # Storing the synset that has maximum score but the score < threshold
                syns, loc = self.merging_synsets(syns, w1, w2, sns1, sns2,
                                                 max_score, loc)
        return syns, loc
Code Example #18
def jcn_sim_fun(vq_words=[]):
    l1 = knowledge = [
        'recite', 'review', 'point', 'recognize', 'describe', 'choose',
        'examine', 'identify', 'enumerate', 'find', 'select', 'what',
        'memorize', 'collect', 'sequence', 'when', 'duplicate', 'who', 'label',
        'write', 'indicate', 'state', 'tabulate', 'which', 'relate', 'show',
        'arrange', 'cite', 'match', 'define', 'locate', 'draw', 'repeat',
        'remember', 'trace', 'read', 'quote', 'spell', 'memorise', 'how',
        'observe', 'recognise', 'copy', 'why', 'outline', 'count', 'name',
        'recall', 'study', 'omit', 'list', 'tell', 'reproduce', 'record',
        'retell', 'meet', 'listen', 'where', 'order', 'view'
    ]

    l2 = comprehension = [
        'compare', 'cite', 'give', 'predict', 'recognize', 'describe',
        'articulate', 'detail', 'order', 'characterize', 'generalize',
        'factor', 'summarize', 'select', 'illustrate', 'visualize', 'group',
        'trace', 'purpose', 'defend', 'rewrite', 'relate', 'approximate',
        'demonstrate', 'indicate', 'add', 'interact', 'tell', 'extrapolate',
        'show', 'rephrase', 'paraphrase', 'infer', 'contrast', 'locate',
        'picture', 'extend', 'associate', 'conclude', 'express', 'interpolate',
        'generalise', 'clarify', 'observe', 'understand', 'differentiate',
        'review', 'distinguish', 'estimate', 'subtract', 'discuss',
        'interpret', 'summarise', 'convert', 'translate', 'compute', 'outline',
        'identify', 'elaborate', 'ask', 'example', 'classify', 'report',
        'restate', 'explain', 'match'
    ]

    l3 = application = [
        'represent', 'show', 'identify', 'participate', 'derive', 'group',
        'calculate', 'graph', 'dramatize', 'choose', 'factor', 'include',
        'allocate', 'handle', 'practice', 'relate',
        'schedule', 'report', 'assess', 'collect', 'investigate', 'categorise',
        'ascertain', 'round', 'sketch', 'transcribe', 'sequence', 'imitate',
        'discover', 'connect', 'tabulate', 'employ', 'avoid', 'experiment',
        'manipulate', 'exercise', 'extend', 'associate', 'modify',
        'personalize', 'dramatise', 'explore', 'teach', 'change', 'perform',
        'summarise', 'act', 'implement', 'assign', 'alphabetize', 'relate',
        'articulate', 'administer', 'subscribe', 'instruct', 'determine',
        'apply', 'establish', 'select', 'illustrate', 'plot', 'use', 'prepare',
        'paint', 'transfer', 'construct', 'process', 'interpret', 'translate',
        'depreciate', 'complete', 'expose', 'acquire', 'adapt', 'link',
        'simulate', 'diminish', 'compute', 'project', 'demonstrate', 'control',
        'predict', 'contribute', 'examine', 'attain', 'capture', 'develop',
        'provide', 'utilize', 'write', 'build', 'interview', 'organise',
        'classify', 'draw', 'express', 'customize', 'price', 'chart',
        'produce', 'plan', 'inform', 'solve', 'correlation', 'model',
        'operate', 'convert'
    ]

    l4 = analysis = [
        'find', 'focus', 'identify', 'query', 'debate', 'relationships',
        'derive', 'group', 'calculate', 'explain', 'theme', 'choose', 'reason',
        'proof', 'reorganise', 'point', 'interrupt', 'difference', 'arrange',
        'list', 'investigate', 'classify', 'discover', 'motive', 'deduce',
        'connect', 'advertise', 'detect', 'confirm', 'research', 'experiment',
        'size', 'cause', 'contrast', 'inspect', 'explore', 'distinguish',
        'layout', 'optimize', 'interpret', 'question', 'omit', 'depth',
        'ensure', 'distinction', 'inference', 'divide', 'relate', 'manage',
        'rank', 'maximize', 'categorize', 'establish', 'select', 'illustrate',
        'subdivide', 'transform', 'comparing', 'assumption', 'analyze',
        'function', 'analyse', 'train', 'differentiate', 'breadboard',
        'dissect', 'see', 'limit', 'highlight', 'appraise', 'diagnose',
        'blueprint', 'compare', 'recognize', 'characterize', 'examine', 'file',
        'discriminate', 'discussion', 'isolate', 'inventory', 'test', 'survey',
        'document', 'infer', 'categorise', 'breakdown', 'separate', 'effect',
        'diagram', 'simplify', 'point', 'audit', 'criticize', 'outline',
        'correlate', 'minimize', 'prioritize', 'organise', 'model', 'order',
        'test'
    ]

    l5 = synthesis = [
        'incorporate', 'code', 'reorganize', 'invent', 'generalize', 'compose',
        'overhaul', 'explain', 'hypothesize', 'program', 'combine', 'choose',
        'frame', 'integrate', 'collaborate', 'handle', 'format', 'propose',
        'express', 'progress', 'reconstruct', 'speculate', 'discuss', 'comply',
        'arrange', 'intervene', 'collect', 'hypothesise', 'debug', 'enhance',
        'anticipate', 'originate', 'formulate', 'discover', 'reinforce',
        'design', 'animate', 'substitute', 'network', 'join', 'experiment',
        'adapt', 'lecture', 'contrast', 'extend', 'visualise', 'modify',
        'makeup', 'prescribe', 'imagine', 'interface', 'estimate', 'generate',
        'change', 'improve', 'convert', 'elaborate', 'initiate',
        'individualize', 'think', 'revise', 'organize', 'relate', 'assemble',
        'synthesize', 'categorize', 'summarize', 'prepare', 'create',
        'transform', 'construct', 'predict', 'theorise', 'minimise', 'tell',
        'cope', 'maximise', 'innovate', 'specify', 'communicate', 'setup',
        'pretend', 'budget', 'compile', 'suppose', 'tabulate', 'delete',
        'compare', 'rewrite', 'devise', 'abstract', 'dictate', 'cultivate',
        'happen', 'portray', 'depict', 'develop', 'perform', 'make', 'write',
        'build', 'test', 'negotiate', 'rearrange', 'simplify', 'produce',
        'plan', 'validate', 'structure', 'add', 'outline', 'facilitate',
        'correspond', 'solve', 'model', 'original'
    ]

    l6 = evaluation = [
        'validate', 'compare', 'deduct', 'useful', 'consider', 'conclude',
        'predict', 'relate', 'describe', 'influence', 'rank', 'assess', 'rate',
        'persuade', 'determine', 'measure', 'critique', 'mark', 'summarize',
        'select', 'discuss', 'discriminate', 'prove', 'verify', 'defend',
        'support', 'debate', 'grade', 'argue', 'disprove', 'recommend', 'test',
        'infer', 'contrast', 'choose', 'attach', 'good', 'importance',
        'evaluate', 'criteria', 'prescribe', 'hire', 'award', 'perceive',
        'dispute', 'know', 'decide', 'opinion', 'judge', 'estimate', 'why',
        'interpret', 'counsel', 'criticize', 'effective', 'prioritize',
        'value', 'agree', 'bad', 'convince', 'prioritise', 'release', 'frame',
        'appraise', 'explain', 'criticise', 'justify'
    ]

    cl_listoflist = []
    cl_listoflist.append(l1)
    cl_listoflist.append(l2)
    cl_listoflist.append(l3)
    cl_listoflist.append(l4)
    cl_listoflist.append(l5)
    cl_listoflist.append(l6)

    cnt_log = 0

    final_level_of_ques = -1
    final_sim_of_ques_with_all_levels = [0, 0, 0, 0, 0, 0]
    final_area_sim_of_ques_with_all_levels = [0, 0, 0, 0, 0, 0]
    for vq_word in vq_words:
        # calculating sum and avg of sim of word with each list
        # print("\n\ndoing for word -----" , vq_word)
        sum_of_sim_all_levels = []
        avg_of_sim_all_levels = []
        for i, list_i in enumerate(cl_listoflist):
            # print("list number  : " , i)
            sum_of_sim = 0
            for l_word in list_i:
                # print("two words " , vq_word , l_word)
                if len(wordnet.synsets(vq_word)) == 0:
                    # print vq_word
                    break
                vq_word_syn = wordnet.synsets(vq_word)[0]
                # print("l_word => wordnet.synsets(l_word)",l_word, "=>" ,wordnet.synsets(l_word))
                if len(wordnet.synsets(l_word)) == 0:
                    # print l_word
                    continue
                l_word_syn = wordnet.synsets(l_word)[0]
                try:
                    # despite its name, wup_sim holds a Jiang-Conrath score here
                    wup_sim = wordnet.jcn_similarity(vq_word_syn, l_word_syn,
                                                     brown_ic)
                except:
                    # jcn raises for mixed-POS pairs; skip them
                    # print vq_word_syn,l_word_syn,"->exception"
                    continue
                # wup_sim=(vq_word_syn).jcn_similarity(l_word_syn)
                if (type(wup_sim) != type(None)):
                    sum_of_sim = sum_of_sim + wup_sim
                    # sum_of_sim += 1
                    # print(" counted ",vq_word,l_word , "synset " , vq_word_syn , l_word_syn)
                else:
                    cnt_log = cnt_log + 1
                    # print("Not counted             ",vq_word,l_word , "synset " , vq_word_syn , l_word_syn)
                # input()
            sum_of_sim_all_levels.append(sum_of_sim)
            avg_of_sim_all_levels.append(sum_of_sim / len(list_i))

        # print("\n\n printing all lists")
        # for l in cl_listoflist:
        # 	print(l)

        # QUES WORK BEGIN
        # print ("Sim")
        for i in range(0, 6):
            final_sim_of_ques_with_all_levels[i] += avg_of_sim_all_levels[i]
        # 	print (final_sim_of_ques_with_all_levels[i],",")
        # print("\n")

        # print("area sim")
        for i in range(0, 6):
            final_area_sim_of_ques_with_all_levels[i] += sum_of_sim_all_levels[
                i]
        # 	print (final_area_sim_of_ques_with_all_levels[i],",")
        # print("\n")
        # print ("cnt_log",cnt_log)

    # print ("Final Sim")
    # for i in range(0,6):
    # 	print (final_sim_of_ques_with_all_levels[i],",")
    # print("\n")

    # print ("Final Area Sim")
    # for i in range(0,6):
    # 	print (final_area_sim_of_ques_with_all_levels[i],",")
    # print("\n")

    #	maximum of all similarities values to find cl level
    final_level = 0
    max_sim = final_sim_of_ques_with_all_levels[0]
    for index, sim in enumerate(final_sim_of_ques_with_all_levels):
        if sim > max_sim:
            max_sim = sim
            final_level = index

    # print("\n")
    # print("avg wali list: " , avg_of_sim_all_levels)

    # print( "sum wali list: " , sum_of_sim_all_levels)

    # 	finding if word will be classified in  more than two levels
    count = 0
    indices_of_same_sim = []
    for i, sim in enumerate(final_sim_of_ques_with_all_levels):
        if sim == max_sim:
            count += 1
            indices_of_same_sim.append(i)

    # 	if word is in more than two levels
    if len(indices_of_same_sim) > 1:
        # print ("ques is in more than two levels")
        same_sim_list = []
        for index in indices_of_same_sim:
            same_sim_list.append(final_area_sim_of_ques_with_all_levels[index])

        max_sim_area = same_sim_list[0]
        for sim_area, index_of_max_sim in zip(same_sim_list,
                                              indices_of_same_sim):
            if sim_area > max_sim_area:
                max_sim_area = sim_area
                final_level = index_of_max_sim

    # print("final_level ",final_level)
    return final_level
Code Example #19
def jcn_similarity(synsets1, synsets2):
    similarity_function = lambda ss1, ss2: wn.jcn_similarity(ss1, ss2, corpus)
    return __max_similarity(synsets1, synsets2, similarity_function)
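__max_similarity is not shown in this snippet; a plausible minimal version (an assumption, not the project's actual helper) takes the best score over all synset pairs, skipping pairs the metric rejects:

def __max_similarity(synsets1, synsets2, similarity_function):
    scores = []
    for ss1 in synsets1:
        for ss2 in synsets2:
            try:
                score = similarity_function(ss1, ss2)
            except Exception:  # e.g. mixed-POS pairs, which jcn rejects
                continue
            if score is not None:
                scores.append(score)
    return max(scores) if scores else 0.0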
Code Example #20
def jiang_conrath_similarity(a, b):
	# the original second call passed genesis_ic to wn.synsets() and dropped it
	# from jcn_similarity; compute the score once and reuse it instead
	sim = wn.jcn_similarity(wn.synsets(a)[0], wn.synsets(b)[0], genesis_ic)
	return 0 if sim is None else sim
Code Example #21
        for sid in range(len(sslist)):
            prevs.append(0.0)

        for p2term,sim in sims:
            if pterm == p2term:
                continue
            sim_p_p2 = sim
            ss2list = pterm_to_synsets.get(p2term, [])

            # Calculate normalizing wnss
            norm_wnss = 0.0
            for sid in range(len(sslist)):
                max_wnss = 0.0
                for s2id in range(len(ss2list)):
                    s_s2_wnss = wordnet.jcn_similarity(sslist[sid],\
                                                       ss2list[s2id],\
                                                       ic)
                    if s_s2_wnss > max_wnss:
                        max_wnss = s_s2_wnss
                norm_wnss += max_wnss
            if norm_wnss <= 0.0:
                continue
            
            # Increment prevalence score for each sense
            for sid in range(len(sslist)):
                max_wnss = 0.0
                for s2id in range(len(ss2list)):
                    s_s2_wnss = wordnet.jcn_similarity(sslist[sid],\
                                                       ss2list[s2id],\
                                                       ic)
                    if s_s2_wnss > max_wnss:
Code Example #22
def computeInfContSimilarity():
	## Load an information content file from the wordnet_ic corpus
	brown_ic = wordnet_ic.ic('ic-brown.dat')

	print "computing Information Content Similarity..."
	tStart = time.time()
	## Compute the similarity between nouns
	ALLnouns_sim = []
	for subSent1, subSent2 in zip(nouns_text1, nouns_text2):

		## if-else to use the longer sentence
		if (len(subSent1) > len(subSent2)):
			nounSim = np.zeros(len(subSent1)) 
			for i, noun1 in enumerate(subSent1):
				for noun2 in subSent2:
					try:
						w1 = noun1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = noun2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > nounSim[i]:
							nounSim[i] = sim
					except:
						continue
			# print nounSim
		else:
			nounSim = np.zeros(len(subSent2))
			for i, noun2 in enumerate(subSent2):
				for noun1 in subSent1:
					try:
						w1 = noun1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = noun2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > nounSim[i]:
							nounSim[i] = sim
					except:
						continue	
		
		ALLnouns_sim.append(nounSim)


	## Compute the similarity between verbs
	ALLverbs_sim = []
	for subSent1, subSent2 in zip(verbs_text1, verbs_text2):

		## if-else to use the longer sentence
		if (len(subSent1) > len(subSent2)):
			verbSim = np.zeros(len(subSent1)) 
			for i, verb1 in enumerate(subSent1):
				for verb2 in subSent2:
					try:
						## note: the ".n.01" suffix looks verbs (and, below,
						## adjectives) up as nouns; wn.synset() raises for words
						## without such a noun sense, and the bare except skips them
						w1 = verb1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = verb2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > verbSim[i]:
							verbSim[i] = sim
					except:
						continue
		else:
			verbSim = np.zeros(len(subSent2))
			for i, verb2 in enumerate(subSent2):
				for verb1 in subSent1:
					try:
						w1 = verb1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = verb2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > verbSim[i]:
							verbSim[i] = sim
					except:
						continue	
		
		ALLverbs_sim.append(verbSim)


	## Compute the similarity between adjectives
	ALLadjs_sim = []
	for subSent1, subSent2 in zip(adj_text1, adj_text2):

		## if-else to use the longer sentence
		if (len(subSent1) > len(subSent2)):
			adjSim = np.zeros(len(subSent1)) 
			for i, adj1 in enumerate(subSent1):
				for adj2 in subSent2:
					try:
						w1 = adj1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = adj2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > adjSim[i]:
							adjSim[i] = sim
					except:
						continue
			# print nounSim
		else:
			adjSim = np.zeros(len(subSent2))
			for i, adj2 in enumerate(subSent2):
				for adj1 in subSent1:
					try:
						w1 = adj1 + ".n.01"
						w1 = wn.synset(w1)
						w2 = adj2 + ".n.01"
						w2 = wn.synset(w2)
						sim = wn.jcn_similarity(w1, w2, brown_ic)
						if sim > adjSim[i]:
							adjSim[i] = sim
					except:
						continue	
		
		ALLadjs_sim.append(adjSim)

	tEnd = time.time()
	print "..done. Time taken (InformationContentSimilarity): ", tEnd-tStart
	return ALLnouns_sim, ALLverbs_sim, ALLadjs_sim
Code Example #23
def jcn_process(xfpg, keywds):
    #fpg_l=list(set1)
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    x = []
    s_g = []
    result = []
    #print (len(l))
    #print (len(l2))
    #for k in list(keywds):
    #   for j in list(set1):
    #        s1 = wn.synsets(k)
    #        s2 = wn.synsets(j)
    #       print (k,j)
    #      x.append(wn.jcn_similarity(s1[i], s2[i], brown_ic))
    #print(len(set1))
    print("TFIDF KEYWORDS NUMBER " + str(len(keywds)))

    for j in (list(xfpg['itemsets'])):
        s = []
        result = []
        #print()
        #print(j)
        #print(len(j))
        #print()

        #print("FPGROWRH ITEMSET length "+str(len(list(xfpg['itemsets']))))
        for j1 in j:
            #print("FPGROWRH ITEMSET  "+str((j1)))
            for i in (keywds[:20]):

                s1 = wn.synsets(i)
                s2 = wn.synsets(j1)

                #print (s1,s2)
                #print(s1)
                #print(s2)

                if (len(s1) == 0 or len(s2) == 0):
                    x = 0
                elif (s1[0].pos() != s2[0].pos()):
                    x = 0
                else:
                    x = wn.jcn_similarity(s1[0], s2[0], brown_ic)
                    if (x == (1e300)):
                        x = 1
                    result.append(x)

                #print ("jcn of {} and {} is {}".format(str(j1),str(i),str(x)))
            score = sum(result)  # use a fresh name; s1 above holds synsets
            #print("score of {} is {}".format(j1,str(score)))
        s.append(score)

        #print("score of {} is {}".format(str(j),str(sum(s))))
        #print(result)
        s_g.append(s)

        #print("Length of result"+str(len(s_g)))

    #res=s/len(result)
    #print("resultat" + str(s_g))

    #print("Resultat" + str(res*100))
    res = np.asarray(s_g)
    #get the element with the higest score
    if (res.size != 0):
        res_index = res.argmax()
        print("Highest score is {} with {}".format(
            str(xfpg['itemsets'][res_index]), str(s_g[res_index])))
        #Converting frozen set to list
        sets = [xfpg['itemsets'][res_index]]

        final_list = ([list(x) for x in sets])

        return final_list

    else:
        return None
Code Example #24
                wn.synset(ani_list[indcolum][0][8:-2]), brown_ic)
        if results_res[indrow][indcolum] == 1:
            word_pairs.append(ani_list[indrow][0] + " vs. " +
                              ani_list[indcolum][0])
print(results_res)

#Jiang-Conrath Similarity
results_jcn = np.zeros((len(ani_list), len(ani_list)))
word_pairs = []
for indrow in range(0, len(ani_list)):
    print("word: " + ani_list[indrow][0][8:-2])
    for indcolum in range(0, len(ani_list)):
        print(ani_list[indrow][0][8:-2] + " vs. " +
              ani_list[indcolum][0][8:-2])
        results_jcn[indrow][indcolum] = wn.jcn_similarity(
            wn.synset(ani_list[indrow][0][8:-2]),
            wn.synset(ani_list[indcolum][0][8:-2]), brown_ic)
        if results_jcn[indrow][indcolum] == 1:
            word_pairs.append(ani_list[indrow][0] + " vs. " +
                              ani_list[indcolum][0])
print(results_jcn)

#Lin Similarity
results_lin = np.zeros((len(ani_list), len(ani_list)))
word_pairs = []
for indrow in range(0, len(ani_list)):
    print("word: " + ani_list[indrow][0][8:-2])
    for indcolum in range(0, len(ani_list)):
        print(ani_list[indrow][0][8:-2] + " vs. " +
              ani_list[indcolum][0][8:-2])
        results_lin[indrow][indcolum] = wn.lin_similarity(
Code Example #25
File: graph_wsd_test_v2.py Project: lancercat/OSOCR
def jcn_similarity(synset1, synset2):
    return wn.jcn_similarity(synset1, synset2, info_content)
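info_content is assumed to be an information-content dictionary loaded elsewhere in the file; a minimal sketch of the usual setup:

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

info_content = wordnet_ic.ic('ic-brown.dat')  # any wordnet_ic .dat file works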
Code Example #26
#Import IC calculation
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

#For each pair of synsets, compute distance
for s1 in synsets:
  syn1 = wn.of2ss(s1)
  for s2 in synsets:
    syn2 = wn.of2ss(s2)
    distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.path_similarity(syn1,syn2)
    distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lch_similarity(syn1,syn2)
    distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.wup_similarity(syn1,syn2)
    distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1,syn2,brown_ic)
    distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1,syn2,brown_ic)
    distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1,syn2,brown_ic)
    distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.res_similarity(syn1,syn2,bnc_ic)
    distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.jcn_similarity(syn1,syn2,bnc_ic)
    distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1 - wn.lin_similarity(syn1,syn2,bnc_ic)
    #distances_path[labelsNLTK.index(s1)][labelsNLTK.index(s2)] =1/(labelsNLTK.index(s2)+1) 
    #distances_lch[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)
    #distances_wup[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_res[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_jcn[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_lin[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_res_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_jcn_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  
    #distances_lin_bnc[labelsNLTK.index(s1)][labelsNLTK.index(s2)] = 1/(labelsNLTK.index(s2)+1)  

print 'done computing wordnet distances'