Beispiel #1
0
def compare_allsynsets(method, word1, word2):
    """Return the best similarity score over all synset pairs of two words.

    Every synset of ``word1`` is compared with every synset of ``word2``
    using the WordNet measure selected by ``method``; the largest score
    found is returned.

    :param method: one of "PATH", "LCH", "WUP", "RES", "JCN" or "LIN"
    :param word1: first word to look up in WordNet
    :param word2: second word to look up in WordNet
    :return: the highest score found, or 0.0 when no pair produced a score
        above zero (including when either word has no synsets)

    An unknown ``method`` terminates the program through ``sys.exit`` as soon
    as the first synset pair is examined.
    """
    ic_pos = ('n', 'a', 'v')  # POS tags the JCN/LIN branches accept
    ss1 = wordnet.synsets(word1)
    ss2 = wordnet.synsets(word2)
    simi_value = 0.0
    for s1, s2 in product(ss1, ss2):
        # Start each pair without a score: a pair skipped by the JCN/LIN
        # POS filter must not be judged on the previous pair's value.
        simi = None
        if method == "PATH":
            simi = s1.path_similarity(s2)
        elif method == "LCH":
            simi = wordnet.lch_similarity(s1, s2)
        elif method == "WUP":
            simi = wordnet.wup_similarity(s1, s2)
        elif method == "RES":
            simi = wordnet.res_similarity(s1, s2, brown_ic)
        elif method == "JCN":
            # can't do diff POS
            if s1.pos() == s2.pos() and s1.pos() in ic_pos:
                simi = wordnet.jcn_similarity(s1, s2, brown_ic)
        elif method == "LIN":
            # can't do diff POS
            if s1.pos() == s2.pos() and s1.pos() in ic_pos:
                simi = wordnet.lin_similarity(s1, s2, brown_ic)
        else:
            sys.exit("Error! No similarity methods!")

        # The path-based measures may yield None when two synsets are not
        # connected; comparing None with a float raises on Python 3.
        if simi is not None and simi > simi_value:
            simi_value = simi
    return simi_value
def semantic_similarity(word1, word2, speech, measure):
    """
    Return the best similarity score between any sense of two words.

    Every sense of ``word1`` is scored against every sense of ``word2`` and
    the largest score is kept.

    :param word1: First word in the pair of words
    :param word2: Second word in the pair of words
    :param speech: part of speech handed to ``wn.synsets`` for both words, e.g. wn.NOUN
    :param measure: name of the similarity measure ("path" = path ; "res" = Resnik ; "lin" = Lin)
    :return: The highest score over all sense pairs, or 0 when no pair yields a score above zero
    :raises ValueError: if ``measure`` is not one of the three supported names
    """
    supported = ["path", "res", "lin"]
    # reject unknown measures before touching WordNet
    if measure not in supported:
        raise ValueError("Not a valid similarity type \n Must be 'path'(path), 'res'(Resnik) or 'lin'(Lin)")

    scorers = {
        "path": lambda first, second: wn.path_similarity(first, second),
        "res": lambda first, second: wn.res_similarity(first, second, brown_ic),
        "lin": lambda first, second: wn.lin_similarity(first, second, brown_ic),
    }
    score_pair = scorers[measure]

    best = 0
    senses_a = wn.synsets(word1, speech)
    senses_b = wn.synsets(word2, speech)
    # score every combination of senses
    for sense_a in senses_a:
        for sense_b in senses_b:
            score = score_pair(sense_a, sense_b)
            # a pair without a defined score is ignored
            if score is not None and score > best:
                best = score
    return best
def create_graphs(doc_list):
    """Draw the "against" graphs and the top-senses graph for some documents.

    :param doc_list: documents to plot; when None, the result of
        ``default_document_list()`` is used instead

    Three graphs are requested: one against three fixed synsets
    ('phyl_eco_pol'), one against the module-level ``SYNSETS``
    ('handpicked', with random colours) and one over the documents' top
    senses.
    """
    documents = default_document_list() if doc_list is None else doc_list

    def lch(sense_1, sense_2):
        return wn.lch_similarity(sense_1, sense_2)

    def lin(sense_1, sense_2):
        return wn.lin_similarity(sense_1, sense_2, CORPUS)

    def res(sense_1, sense_2):
        return wn.res_similarity(sense_1, sense_2, CORPUS)

    def jcn(sense_1, sense_2):
        return wn.jcn_similarity(sense_1, sense_2, CORPUS)

    def path(sense_1, sense_2):
        return wn.path_similarity(sense_1, sense_2)

    # Each entry is (number, measure name, scoring function). For lch and jcn
    # the number is the measure applied to SYNSETS[0] against itself.
    # NOTE(review): the number looks like the measure's maximum, used for
    # scaling by the graph helpers - confirm against create_against_graph.
    reference = SYNSETS[0]
    lch_self = wn.lch_similarity(reference, reference)
    jcn_self = wn.jcn_similarity(reference, reference, CORPUS)
    distance_functions = [
        (lch_self, 'lch', lch),
        (1.0, 'lin', lin),
        (10.636958516573292, 'res', res),
        (jcn_self, 'jcn', jcn),
        (1.0, 'path', path),
    ]

    # Pair every top sense with the name of the document it came from.
    all_senses = [(sense, doc.name)
                  for doc in documents
                  for sense in doc.top_senses()]

    fixed_names = ["economy.n.01", "philosophy.n.02", "politics.n.01"]
    fixed_targets = [wn.synset(name) for name in fixed_names]
    create_against_graph('phyl_eco_pol', documents, all_senses, fixed_targets,
                         distance_functions, ['r', 'b', 'g'])

    # One random RGB colour per hand-picked synset.
    random_colors = [(random(), random(), random())
                     for _i in range(len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, SYNSETS,
                         distance_functions, random_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
    def get_lin_min(self, sentence1, sentence2):
        """Return the smallest Lin similarity between the sentences' unique words.

        The word lists come from ``self.sentence_difference``. For every
        pair (one word from each list) the first synset of each word is
        compared with ``wordnet.lin_similarity`` using ``self.brown_ic``.

        :param sentence1: first sentence; must provide ``get_tag(word)``
        :param sentence2: second sentence; must provide ``get_tag(word)``
        :return: the minimum similarity over all comparable pairs, or 0 when
            no pair could be compared
        """
        sentence1_unique, sentence2_unique = self.sentence_difference(
            sentence1, sentence2)
        # None means "no comparable pair seen yet"; this replaces the
        # Python-2-only ``maxint`` sentinel.
        min_similarity = None
        # Measure similarity for each unique word from A to each unique word to B
        for sentence1_word in sentence1_unique:
            for sentence2_word in sentence2_unique:
                sentence1_word_tag = sentence1.get_tag(sentence1_word)
                sentence2_word_tag = sentence2.get_tag(sentence2_word)
                synsets_word1 = wordnet.synsets(sentence1_word,
                                                sentence1_word_tag)
                synsets_word2 = wordnet.synsets(sentence2_word,
                                                sentence2_word_tag)

                # Fall back to an untagged lookup when the tagged one is empty
                if not synsets_word1:
                    synsets_word1 = wordnet.synsets(sentence1_word)
                if not synsets_word2:
                    synsets_word2 = wordnet.synsets(sentence2_word)

                if not synsets_word1 or not synsets_word2:
                    continue
                # Skip words with different tags
                if synsets_word1[0].pos() != synsets_word2[0].pos():
                    continue
                # Try find similarity from corpus; pairs the measure rejects
                # are skipped. ``Exception`` (not a bare except) keeps
                # KeyboardInterrupt/SystemExit propagating.
                try:
                    similarity = wordnet.lin_similarity(
                        synsets_word1[0], synsets_word2[0], self.brown_ic)
                except Exception:
                    continue
                if similarity is None:
                    continue
                if min_similarity is None or similarity < min_similarity:
                    min_similarity = similarity
        return 0 if min_similarity is None else min_similarity
Beispiel #5
0
def similarity_by_infocontent(sense1, sense2, option):
    """ Returns similarity scores by information content.

    :param sense1: first synset
    :param sense2: second synset
    :param option: 'res'/'resnik', 'jcn'/'jiang-conrath' or 'lin'
    :return: the similarity from the chosen measure; 0 when the two senses
        have different parts of speech; None for an unrecognised option

    Resnik uses the 'ic-bnc-resnik-add1.dat' counts, the other two measures
    use 'ic-bnc-add1.dat'. The file is loaded through ``wnic.ic`` on every
    call.
    """
    def _pos(sense):
        # ``pos`` is a method in NLTK 3 and a plain attribute in older
        # releases; comparing the bound methods themselves is never equal
        # for two distinct synsets.
        tag = sense.pos
        return tag() if callable(tag) else tag

    if _pos(sense1) != _pos(sense2):  # infocontent sim can't do diff POS.
        return 0

    if option in ['res', 'resnik']:
        return wn.res_similarity(sense1, sense2,
                                 wnic.ic('ic-bnc-resnik-add1.dat'))

    elif option in ['jcn', "jiang-conrath"]:
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))

    elif option in ['lin']:
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))

    return None
def linWordnetSimilarity(word1, word2, ic):
    """Return the highest Lin similarity between any senses of two words.

    Each synset of both words is first passed through ``smoothed``. Only
    pairs that share a part of speech which is also a key of ``ic`` are
    scored.

    :param word1: first word
    :param word2: second word
    :param ic: information-content mapping, tested with ``pos in ic``
    :return: the largest non-negative score, or None when no pair qualifies
    """
    def _pos(synset):
        # ``pos`` is a method in NLTK 3 and an attribute in older releases;
        # a bound method would never be found as a key of ``ic``.
        tag = synset.pos
        return tag() if callable(tag) else tag

    senses1 = [smoothed(s) for s in wordnet.synsets(word1)]
    # Built once instead of once per sense of word1.
    senses2 = [smoothed(s) for s in wordnet.synsets(word2)]

    possibleValues = []
    for synset1 in senses1:
        pos1 = _pos(synset1)
        if pos1 not in ic:
            continue
        for synset2 in senses2:
            if pos1 == _pos(synset2):
                possibleValues.append(
                    wordnet.lin_similarity(synset1, synset2, ic))

    possibleValues = [v for v in possibleValues if v is not None and v >= 0]
    return max(possibleValues) if possibleValues else None
def get_similarity(synsets, c, ic):
    """Build a matrix of Lin similarities between two synset collections.

    :param synsets: synsets that form the columns of the matrix
    :param c: synsets that form the rows of the matrix
    :param ic: information-content object handed to ``wn.lin_similarity``
    :return: list of rows, one per element of ``c``; each row holds one
        score per element of ``synsets``. A pair whose score cannot be
        computed contributes 0.
    """
    def _lin_or_zero(si, ci):
        # Best effort: a failing pair scores 0. Catching ``Exception``
        # rather than everything lets KeyboardInterrupt/SystemExit through.
        try:
            return wn.lin_similarity(si, ci, ic)
        except Exception:
            return 0

    return [[_lin_or_zero(si, ci) for si in synsets] for ci in c]
Beispiel #8
0
 def lin(self, synset_a, synset_b, ic):
     """Return the normalized Lin similarity of two synsets.

     The raw ``wordnet.lin_similarity`` value is passed to
     ``self.normalize`` together with ``self.MAX_VALUE``. The result is 0
     when the synsets differ in part of speech, when normalization returns
     None, or when a ZeroDivisionError occurs along the way.
     """
     same_pos = synset_a.pos() == synset_b.pos()
     if not same_pos:
         return 0
     try:
         raw = wordnet.lin_similarity(synset_a, synset_b, ic)
         scaled = self.normalize(self.MAX_VALUE, raw)
     except ZeroDivisionError:
         return 0
     return 0 if scaled is None else scaled
Beispiel #9
0
def similarity_by_path(sense1, sense2, option="path"):
    """ Returns maximum path similarity between two senses.

    :param sense1: first synset
    :param sense2: second synset
    :param option: measure name, case-insensitive: "path"/"path_similarity",
        "wup"/"wupa"/"wu-palmer", or "lch"/"leacock-chodorow" (the historic
        misspelling "leacock-chordorow" is still accepted)
    :return: the similarity score; for "path" the larger of the two
        directions with a missing score counted as 0; for "lch" 0 when the
        senses differ in part of speech; None for an unrecognised option
    """
    def _pos(sense):
        # ``pos`` is a method in NLTK 3 and an attribute in older releases.
        tag = sense.pos
        return tag() if callable(tag) else tag

    choice = option.lower()
    if choice in ["path", "path_similarity"]:  # Path similarity
        # Score both directions; the original scored (sense1, sense2) twice.
        forward = wn.path_similarity(sense1, sense2)
        backward = wn.path_similarity(sense2, sense1)
        # path_similarity yields None when no path exists; max() cannot
        # order None on Python 3.
        return max(0 if forward is None else forward,
                   0 if backward is None else backward)
    elif choice in ["wup", "wupa", "wu-palmer"]:  # Wu-Palmer
        return wn.wup_similarity(sense1, sense2)
    elif choice in ['lch', "leacock-chordorow", "leacock-chodorow"]:  # Leacock-Chodorow
        if _pos(sense1) != _pos(sense2):  # lch can't do diff POS
            return 0
        return wn.lch_similarity(sense1, sense2)
    return None
Beispiel #10
0
    def test_wordnet_ic(self):
        """Check that our IC-based similarities equal NLTK's for car/bus.

        Both sides use the BNC counts in the Resnik, add-1 variant; the
        res, jcn and lin scores must match exactly.
        """
        from nltk.corpus import wordnet as nltk_wn
        from nltk.corpus import wordnet_ic as nltk_wnic

        reference_car = nltk_wn.synset('car.n.1')
        reference_bus = nltk_wn.synset('bus.n.1')
        own_ic = WordNetInformationContent('bnc', resnik=True, add1=True)

        own_car = our_wn.synset('car.n.1')
        own_bus = our_wn.synset('bus.n.1')
        reference_ic = nltk_wnic.ic('ic-bnc-resnik-add1.dat')

        for measure in ('res_similarity', 'jcn_similarity', 'lin_similarity'):
            ours = getattr(our_wn, measure)(own_car, own_bus, own_ic)
            theirs = getattr(nltk_wn, measure)(
                reference_car, reference_bus, reference_ic)
            assert ours == theirs
Beispiel #11
0
def compute_similarities(s1, s2, sim):
    """Score two synsets with the WordNet measure named by ``sim``.

    :param s1: first synset
    :param s2: second synset
    :param sim: "path", "lch", "wup", "res", "jcn" or "lin"
    :return: the value from the matching ``wn.<sim>_similarity`` call; the
        last three measures also receive ``genesis_ic``. None is returned
        for any other name.
    """
    if sim in ("path", "lch", "wup"):
        # measures that need only the two synsets
        return getattr(wn, sim + "_similarity")(s1, s2)
    if sim in ("res", "jcn", "lin"):
        # measures that also need information content
        return getattr(wn, sim + "_similarity")(s1, s2, genesis_ic)
    return None
Beispiel #12
0
def similarity2(lverb, el):
    """Return, for each synset in ``lverb``, its best Lin score against ``el``.

    Scores come from ``wn.lin_similarity`` with ``brown_ic``. A synset keeps
    the starting value -1 when ``el`` is empty or nothing scores higher.
    After all synsets are processed a single 0 is appended whenever the
    result does not contain exactly two entries.
    """
    scores = []
    for verb in lverb:
        # -1 comes first so it wins any tie, like the strict ">" it replaces
        candidates = [-1] + [
            wn.lin_similarity(verb, other, brown_ic, verbose=False)
            for other in el
        ]
        scores.append(max(candidates))

    if len(scores) != 2:
        scores.append(0)
    return scores
Beispiel #13
0
def wnsim(synset1, synset2, method='all'):
    """Score two synsets, given by name, with one or all WordNet measures.

    A name matching ``<something>.<something>.<digits>`` is resolved with
    ``wn.synset``; any other string goes through ``wn_synset``.

    :param synset1: name or other identifier of the first synset
    :param synset2: name or other identifier of the second synset
    :param method: 'lin', 'res', 'jcn', 'wup', 'path', 'lch' or 'all'
    :return: 0 when either synset resolves to None; a single score for a
        named measure; for 'all' a list of ``(name, score)`` tuples in the
        order lin, res, jcn, wup, path, lch; None for any other method
    """
    full_name = re.compile(r'^.+\..+\.\d+$')

    def _resolve(ref):
        if full_name.match(ref):
            return wn.synset(ref)
        return wn_synset(ref)

    s1 = _resolve(synset1)
    s2 = _resolve(synset2)
    if s1 is None or s2 is None:
        return 0

    ic_measures = ('lin', 'res', 'jcn')      # need information content
    plain_measures = ('wup', 'path', 'lch')  # need only the synsets

    def _score(measure):
        scorer = getattr(wn, measure + '_similarity')
        if measure in ic_measures:
            return scorer(s1, s2, wn_ic)
        return scorer(s1, s2)

    if method == 'all':
        return [(measure, _score(measure))
                for measure in ic_measures + plain_measures]
    if method in ic_measures or method in plain_measures:
        return _score(method)
    return None
def computeLinSimilarity(term1, term2):
    """Return the highest Lin similarity between any senses of two terms.

    The module-level ``ic`` is filled with the 'ic-brown.dat' counts on
    first use. Pairs for which scoring or comparison raises are skipped.

    :return: the best score found, or 0 when nothing scores above zero
    """
    global ic
    if not ic:
        # lazily load the Brown information content once
        ic = wordnet_ic.ic('ic-brown.dat')

    senses1 = wn.synsets(term1)
    senses2 = wn.synsets(term2)
    best = 0
    for sense1, sense2 in ((a, b) for a in senses1 for b in senses2):
        try:
            score = wn.lin_similarity(sense1, sense2, ic)
            best = score if score > best else best
        except Exception:
            # best effort: an incomparable pair is ignored
            continue
    return best
Beispiel #15
0
    def word_similarity(self, w1, w2, syns, loc, thr_sim):
        """Find the best-scoring synset pair for two words and record it.

        All noun synsets of ``w1`` are scored against all noun synsets of
        ``w2`` with the measure selected by ``self.wordnet_metric``
        ('j', 'le', 'li', 'p' or 'w'). Each time a new running maximum
        reaches ``thr_sim`` the pair is handed to ``self.merging_synsets``.
        If that never happens, the best pair found (or the first synsets of
        both words) is handed over once at the end.

        :param w1: first word
        :param w2: second word
        :param syns: passed to and replaced by ``self.merging_synsets``
        :param loc: passed to and replaced by ``self.merging_synsets``
        :param thr_sim: score threshold for recording a pair immediately
        :return: ``(syns, loc)``, unchanged when either word has no noun synset
        """
        # The original passed ``wn.NOUN or wn.ADJ``, which always evaluates
        # to ``wn.NOUN``; adjectives were never looked up.
        # NOTE(review): if adjectives were meant to be included, the two
        # lookups need to be combined explicitly - confirm the intent.
        syn1 = wn.synsets(w1, wn.NOUN)
        syn2 = wn.synsets(w2, wn.NOUN)

        if not syn1 or not syn2:
            return syns, loc

        score = 0
        max_score = 0
        count = 0
        sns1 = syn1[0]
        sns2 = syn2[0]
        for cand1 in syn1:
            for cand2 in syn2:
                if self.wordnet_metric == 'j':  # Jiang-Conrath Similarity
                    # NOTE(review): NLTK's jcn_similarity also takes an
                    # information-content argument; none is passed - confirm.
                    score = wn.jcn_similarity(cand1, cand2)
                elif self.wordnet_metric == 'le':  # Leacock-Chodorow Similarity
                    score = wn.lch_similarity(cand1,
                                              cand2,
                                              simulate_root=False)
                elif self.wordnet_metric == 'li':  # Lin Similarity
                    # NOTE(review): NLTK's lin_similarity also takes an
                    # information-content argument; none is passed - confirm.
                    score = wn.lin_similarity(cand1, cand2)
                elif self.wordnet_metric == 'p':  # Path Similarity
                    score = wn.path_similarity(cand1, cand2)
                elif self.wordnet_metric == 'w':  # Wu-Palmer Similarity. It can not be '0'. It ranges in (0,1]
                    score = wn.wup_similarity(cand1, cand2)

                # A measure may return None for an unconnected pair;
                # comparing None with a number raises on Python 3.
                if score is None:
                    continue

                if score > max_score:  # Finding the maximum score
                    max_score = score
                    sns1 = cand1
                    sns2 = cand2
                    if max_score >= thr_sim:  # Record every new maximum that reaches the threshold
                        syns, loc = self.merging_synsets(
                            syns, w1, w2, sns1, sns2, max_score, loc)
                        count = count + 1
        if count == 0:  # Nothing reached the threshold: record the best pair found
            syns, loc = self.merging_synsets(syns, w1, w2, sns1, sns2,
                                             max_score, loc)
        return syns, loc
def create_graphs(doc_list):
    """Produce the comparison graphs for a set of documents.

    :param doc_list: documents to plot; ``default_document_list()`` supplies
        them when this is None

    The senses returned by each document's ``top_senses()`` are plotted
    against three fixed synsets ('phyl_eco_pol'), against the module-level
    ``SYNSETS`` with random colours ('handpicked'), and finally on their
    own via ``create_graph_top_senses``.
    """
    if doc_list is None:
        documents = default_document_list()
    else:
        documents = doc_list

    def by_lch(sense_1, sense_2):
        return wn.lch_similarity(sense_1, sense_2)

    def by_lin(sense_1, sense_2):
        return wn.lin_similarity(sense_1, sense_2, CORPUS)

    def by_res(sense_1, sense_2):
        return wn.res_similarity(sense_1, sense_2, CORPUS)

    def by_jcn(sense_1, sense_2):
        return wn.jcn_similarity(sense_1, sense_2, CORPUS)

    def by_path(sense_1, sense_2):
        return wn.path_similarity(sense_1, sense_2)

    # (number, label, scorer) triples; lch and jcn use their own score of
    # SYNSETS[0] against itself as the number.
    # NOTE(review): presumably the number is an upper bound used for
    # normalisation by the plotting helpers - confirm.
    anchor = SYNSETS[0]
    lch_anchor_score = wn.lch_similarity(anchor, anchor)
    jcn_anchor_score = wn.jcn_similarity(anchor, anchor, CORPUS)
    distance_functions = [
        (lch_anchor_score, 'lch', by_lch),
        (1.0, 'lin', by_lin),
        (10.636958516573292, 'res', by_res),
        (jcn_anchor_score, 'jcn', by_jcn),
        (1.0, 'path', by_path),
    ]

    # Every top sense, tagged with the name of its source document.
    all_senses = []
    for doc in documents:
        all_senses.extend((sense, doc.name) for sense in doc.top_senses())

    topic_synsets = [
        wn.synset(name)
        for name in ("economy.n.01", "philosophy.n.02", "politics.n.01")
    ]
    create_against_graph('phyl_eco_pol', documents, all_senses, topic_synsets,
                         distance_functions, ['r', 'b', 'g'])

    # A random RGB triple for each hand-picked synset.
    handpicked_colors = [(random(), random(), random())
                         for _i in range(len(SYNSETS))]
    create_against_graph('handpicked', documents, all_senses, SYNSETS,
                         distance_functions, handpicked_colors)

    create_graph_top_senses(documents, all_senses, distance_functions)
    def get_lin_average(self, sentence1, sentence2):
        """Return the mean Lin similarity between the sentences' unique words.

        The word lists come from ``self.sentence_difference``. For every
        pair (one word from each list) the first synset of each word is
        compared with ``wordnet.lin_similarity`` using ``self.brown_ic``.

        :param sentence1: first sentence; must provide ``get_tag(word)``
        :param sentence2: second sentence; must provide ``get_tag(word)``
        :return: the average over all comparable pairs as a float, or 0 when
            no pair could be compared
        """
        sentence1_unique, sentence2_unique = self.sentence_difference(
            sentence1, sentence2)
        similarity_sum = 0
        total_count = 0
        # Measure similarity for each unique word from A to each unique word to B
        for sentence1_word in sentence1_unique:
            for sentence2_word in sentence2_unique:
                sentence1_word_tag = sentence1.get_tag(sentence1_word)
                sentence2_word_tag = sentence2.get_tag(sentence2_word)
                synsets_word1 = wordnet.synsets(sentence1_word,
                                                sentence1_word_tag)
                synsets_word2 = wordnet.synsets(sentence2_word,
                                                sentence2_word_tag)

                # Fall back to an untagged lookup when the tagged one is empty
                if not synsets_word1:
                    synsets_word1 = wordnet.synsets(sentence1_word)
                if not synsets_word2:
                    synsets_word2 = wordnet.synsets(sentence2_word)

                if not synsets_word1 or not synsets_word2:
                    continue
                # Skip words with different tags
                if synsets_word1[0].pos() != synsets_word2[0].pos():
                    continue
                # Try find similarity from corpus; pairs the measure rejects
                # are skipped. ``Exception`` (not a bare except) keeps
                # KeyboardInterrupt/SystemExit propagating.
                try:
                    similarity = wordnet.lin_similarity(
                        synsets_word1[0], synsets_word2[0], self.brown_ic)
                except Exception:
                    continue
                if similarity is not None:
                    similarity_sum += similarity
                    total_count += 1
        if total_count == 0:
            return 0
        return float(similarity_sum) / float(total_count)
Beispiel #18
0
def lin_sim_fun(vq_words=()):
    """Pick the word-list level that is most Lin-similar to the given words.

    Six fixed word lists (knowledge, comprehension, application, analysis,
    synthesis, evaluation) are compared with ``vq_words``. For every input
    word and every list, the Lin similarity (``wordnet.lin_similarity`` with
    ``brown_ic``) between the first synset of the input word and the first
    synset of each list word is summed; the sum and the sum divided by the
    list length are accumulated per level over all input words.

    :param vq_words: iterable of words to classify
    :return: 0-based index of the level with the largest accumulated
        average (knowledge = 0 ... evaluation = 5). Ties are broken by the
        largest accumulated sum, then by the lowest index. With no usable
        input words the result is 0.

    Input words without a synset contribute nothing; list words without a
    synset, and pairs for which the measure raises, are skipped.
    """
    knowledge = [
        'recite', 'review', 'point', 'recognize', 'describe', 'choose',
        'examine', 'identify', 'enumerate', 'find', 'select', 'what',
        'memorize', 'collect', 'sequence', 'when', 'duplicate', 'who', 'label',
        'write', 'indicate', 'state', 'tabulate', 'which', 'relate', 'show',
        'arrange', 'cite', 'match', 'define', 'locate', 'draw', 'repeat',
        'remember', 'trace', 'read', 'quote', 'spell', 'memorise', 'how',
        'observe', 'recognise', 'copy', 'why', 'outline', 'count', 'name',
        'recall', 'study', 'omit', 'list', 'tell', 'reproduce', 'record',
        'retell', 'meet', 'listen', 'where', 'order', 'view'
    ]

    comprehension = [
        'compare', 'cite', 'give', 'predict', 'recognize', 'describe',
        'articulate', 'detail', 'order', 'characterize', 'generalize',
        'factor', 'summarize', 'select', 'illustrate', 'visualize', 'group',
        'trace', 'purpose', 'defend', 'rewrite', 'relate', 'approximate',
        'demonstrate', 'indicate', 'add', 'interact', 'tell', 'extrapolate',
        'show', 'rephrase', 'paraphrase', 'infer', 'contrast', 'locate',
        'picture', 'extend', 'associate', 'conclude', 'express', 'interpolate',
        'generalise', 'clarify', 'observe', 'understand', 'differentiate',
        'review', 'distinguish', 'estimate', 'subtract', 'discuss',
        'interpret', 'summarise', 'convert', 'translate', 'compute', 'outline',
        'identify', 'elaborate', 'ask', 'example', 'classify', 'report',
        'restate', 'explain', 'match'
    ]

    # The comma after the first 'relate' was missing, which silently fused
    # it with 'schedule' into the non-word 'relateschedule'.
    application = [
        'represent', 'show', 'identify', 'participate', 'derive', 'group',
        'calculate', 'graph', 'dramatize', 'choose', 'factor', 'include',
        'allocate', 'handle', 'practice', 'relate',
        'schedule', 'report', 'assess', 'collect', 'investigate', 'categorise',
        'ascertain', 'round', 'sketch', 'transcribe', 'sequence', 'imitate',
        'discover', 'connect', 'tabulate', 'employ', 'avoid', 'experiment',
        'manipulate', 'exercise', 'extend', 'associate', 'modify',
        'personalize', 'dramatise', 'explore', 'teach', 'change', 'perform',
        'summarise', 'act', 'implement', 'assign', 'alphabetize', 'relate',
        'articulate', 'administer', 'subscribe', 'instruct', 'determine',
        'apply', 'establish', 'select', 'illustrate', 'plot', 'use', 'prepare',
        'paint', 'transfer', 'construct', 'process', 'interpret', 'translate',
        'depreciate', 'complete', 'expose', 'acquire', 'adapt', 'link',
        'simulate', 'diminish', 'compute', 'project', 'demonstrate', 'control',
        'predict', 'contribute', 'examine', 'attain', 'capture', 'develop',
        'provide', 'utilize', 'write', 'build', 'interview', 'organise',
        'classify', 'draw', 'express', 'customize', 'price', 'chart',
        'produce', 'plan', 'inform', 'solve', 'correlation', 'model',
        'operate', 'convert'
    ]

    analysis = [
        'find', 'focus', 'identify', 'query', 'debate', 'relationships',
        'derive', 'group', 'calculate', 'explain', 'theme', 'choose', 'reason',
        'proof', 'reorganise', 'point', 'interrupt', 'difference', 'arrange',
        'list', 'investigate', 'classify', 'discover', 'motive', 'deduce',
        'connect', 'advertise', 'detect', 'confirm', 'research', 'experiment',
        'size', 'cause', 'contrast', 'inspect', 'explore', 'distinguish',
        'layout', 'optimize', 'interpret', 'question', 'omit', 'depth',
        'ensure', 'distinction', 'inference', 'divide', 'relate', 'manage',
        'rank', 'maximize', 'categorize', 'establish', 'select', 'illustrate',
        'subdivide', 'transform', 'comparing', 'assumption', 'analyze',
        'function', 'analyse', 'train', 'differentiate', 'breadboard',
        'dissect', 'see', 'limit', 'highlight', 'appraise', 'diagnose',
        'blueprint', 'compare', 'recognize', 'characterize', 'examine', 'file',
        'discriminate', 'discussion', 'isolate', 'inventory', 'test', 'survey',
        'document', 'infer', 'categorise', 'breakdown', 'separate', 'effect',
        'diagram', 'simplify', 'point', 'audit', 'criticize', 'outline',
        'correlate', 'minimize', 'prioritize', 'organise', 'model', 'order',
        'test'
    ]

    synthesis = [
        'incorporate', 'code', 'reorganize', 'invent', 'generalize', 'compose',
        'overhaul', 'explain', 'hypothesize', 'program', 'combine', 'choose',
        'frame', 'integrate', 'collaborate', 'handle', 'format', 'propose',
        'express', 'progress', 'reconstruct', 'speculate', 'discuss', 'comply',
        'arrange', 'intervene', 'collect', 'hypothesise', 'debug', 'enhance',
        'anticipate', 'originate', 'formulate', 'discover', 'reinforce',
        'design', 'animate', 'substitute', 'network', 'join', 'experiment',
        'adapt', 'lecture', 'contrast', 'extend', 'visualise', 'modify',
        'makeup', 'prescribe', 'imagine', 'interface', 'estimate', 'generate',
        'change', 'improve', 'convert', 'elaborate', 'initiate',
        'individualize', 'think', 'revise', 'organize', 'relate', 'assemble',
        'synthesize', 'categorize', 'summarize', 'prepare', 'create',
        'transform', 'construct', 'predict', 'theorise', 'minimise', 'tell',
        'cope', 'maximise', 'innovate', 'specify', 'communicate', 'setup',
        'pretend', 'budget', 'compile', 'suppose', 'tabulate', 'delete',
        'compare', 'rewrite', 'devise', 'abstract', 'dictate', 'cultivate',
        'happen', 'portray', 'depict', 'develop', 'perform', 'make', 'write',
        'build', 'test', 'negotiate', 'rearrange', 'simplify', 'produce',
        'plan', 'validate', 'structure', 'add', 'outline', 'facilitate',
        'correspond', 'solve', 'model', 'original'
    ]

    evaluation = [
        'validate', 'compare', 'deduct', 'useful', 'consider', 'conclude',
        'predict', 'relate', 'describe', 'influence', 'rank', 'assess', 'rate',
        'persuade', 'determine', 'measure', 'critique', 'mark', 'summarize',
        'select', 'discuss', 'discriminate', 'prove', 'verify', 'defend',
        'support', 'debate', 'grade', 'argue', 'disprove', 'recommend', 'test',
        'infer', 'contrast', 'choose', 'attach', 'good', 'importance',
        'evaluate', 'criteria', 'prescribe', 'hire', 'award', 'perceive',
        'dispute', 'know', 'decide', 'opinion', 'judge', 'estimate', 'why',
        'interpret', 'counsel', 'criticize', 'effective', 'prioritize',
        'value', 'agree', 'bad', 'convince', 'prioritise', 'release', 'frame',
        'appraise', 'explain', 'criticise', 'justify'
    ]

    levels = [
        knowledge, comprehension, application, analysis, synthesis, evaluation
    ]

    # The same words are looked up for every input word and appear in
    # several lists, so remember each word's first synset.
    first_synsets = {}

    def first_synset(word):
        """Return the first WordNet synset of *word*, or None if it has none."""
        if word not in first_synsets:
            found = wordnet.synsets(word)
            first_synsets[word] = found[0] if found else None
        return first_synsets[word]

    avg_totals = [0] * len(levels)  # accumulated sum / list length, per level
    sum_totals = [0] * len(levels)  # accumulated raw sums, per level

    for vq_word in vq_words:
        vq_synset = first_synset(vq_word)
        if vq_synset is None:
            # A word unknown to WordNet contributes nothing to any level.
            continue
        for level, level_words in enumerate(levels):
            level_sum = 0
            for level_word in level_words:
                level_synset = first_synset(level_word)
                if level_synset is None:
                    continue
                try:
                    lin_sim = wordnet.lin_similarity(vq_synset, level_synset,
                                                     brown_ic)
                except Exception:
                    # Best effort: pairs the measure rejects are skipped.
                    continue
                if lin_sim is not None:
                    level_sum = level_sum + lin_sim
            sum_totals[level] += level_sum
            # The average is taken over the whole list, skipped words included.
            avg_totals[level] += level_sum / len(level_words)

    # Highest accumulated average wins; among equal averages the highest
    # accumulated sum wins, and max() keeps the lowest index on a full tie.
    best_avg = max(avg_totals)
    tied_levels = [
        level for level, total in enumerate(avg_totals) if total == best_avg
    ]
    return max(tied_levels, key=lambda level: sum_totals[level])
Beispiel #19
0
def get_best_synset_pair(word_1, word_2):
    """
    Choose the synset pair with the highest similarity among all pairs.

    The measure is selected by the module-level ``word_sim_algo`` ('wup',
    'lin', 'li', anything else = path similarity). Pairs with different
    parts of speech score 0; when the module-level ``pos`` is 'noun', only
    noun pairs are scored. Lin similarity is only applied to noun or verb
    pairs.

    :param word_1: first word
    :param word_2: second word
    :return: a ``(synset_1, synset_2, similarity)`` tuple. When either word
        has no synsets the synsets are None and the similarity is a string
        ratio (``string_sim == 'croft'``) or 0.0 (any other setting).
    """
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)

    # if zero synsets are found, assign string similarity
    if not synsets_1 or not synsets_2:
        if string_sim == 'croft':
            str_sim = SequenceMatcher(None, word_1, word_2).ratio()
            return None, None, str_sim
        # 'li' scores unknown words as 0.0. Any other setting used to fall
        # through and return a bare None, which breaks callers that unpack
        # three values; it now gets the same 0.0 result.
        return None, None, 0.0

    def _similarity(synset_1, synset_2):
        """Score one pair according to ``pos`` and ``word_sim_algo``."""
        shared_pos = synset_1.pos()
        # ignore if both words are from different POS
        if shared_pos != synset_2.pos():
            return 0
        if pos == 'noun' and shared_pos != 'n':
            return 0
        if word_sim_algo == 'wup':
            return wn.wup_similarity(synset_1, synset_2)
        if word_sim_algo == 'lin':
            # Lin is only attempted for nouns and verbs
            if shared_pos in ('n', 'v'):
                return wn.lin_similarity(synset_1, synset_2, brown_ic)
            return 0
        if word_sim_algo == 'li':
            return ws.li_similarity(synset_1, synset_2)
        return wn.path_similarity(synset_1, synset_2)

    max_sim = -1.0
    # Same arity as every other return value, even before a pair is scored.
    best_pair = None, None, 0.0
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            sim = _similarity(synset_1, synset_2)
            if sim is None:
                sim = 0
            if sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2, max_sim
    return best_pair
Beispiel #20
0
def lin_similarity(synsets1, synsets2):
    """Pass both synset collections and a Lin scorer to ``__max_similarity``.

    The scorer calls ``wn.lin_similarity`` with the module-level ``corpus``;
    the result of ``__max_similarity`` is returned unchanged.
    """
    def pairwise_lin(ss1, ss2):
        return wn.lin_similarity(ss1, ss2, corpus)

    return __max_similarity(synsets1, synsets2, pairwise_lin)
def l_similarity(a, b):
    """Return the Lin similarity between the first synsets of two words.

    :param a: first word
    :param b: second word
    :return: ``wn.lin_similarity`` of the two first synsets using
        ``genesis_ic``; 0 when the score is None or when either word has
        no synsets
    """
    senses_a = wn.synsets(a)
    senses_b = wn.synsets(b)
    # A word unknown to WordNet used to raise IndexError on ``[0]``.
    if not senses_a or not senses_b:
        return 0
    # Score once; the original computed the same similarity twice.
    score = wn.lin_similarity(senses_a[0], senses_b[0], genesis_ic)
    return 0 if score is None else score
Beispiel #22
0
            wn.synset(ani_list[indcolum][0][8:-2]), brown_ic)
        if results_jcn[indrow][indcolum] == 1:
            word_pairs.append(ani_list[indrow][0] + " vs. " +
                              ani_list[indcolum][0])
print(results_jcn)

# Lin similarity
# Fill a square matrix (one row and one column per ani_list entry) with the
# Lin score of every ordered pair, printing progress along the way, and
# collect the pairs whose stored score equals 1.
results_lin = np.zeros((len(ani_list), len(ani_list)))
word_pairs = []
for indrow in range(len(ani_list)):
    lin_row_label = ani_list[indrow][0]
    # The label minus its first 8 and last 2 characters is the name that is
    # printed and handed to wn.synset().
    lin_row_name = lin_row_label[8:-2]
    print("word: " + lin_row_name)
    for indcolum in range(len(ani_list)):
        lin_col_label = ani_list[indcolum][0]
        lin_col_name = lin_col_label[8:-2]
        print(lin_row_name + " vs. " + lin_col_name)
        lin_score = wn.lin_similarity(
            wn.synset(lin_row_name), wn.synset(lin_col_name), brown_ic)
        results_lin[indrow][indcolum] = lin_score
        # Compare the value as stored in the matrix, not the raw score.
        if results_lin[indrow][indcolum] == 1:
            word_pairs.append(lin_row_label + " vs. " + lin_col_label)
print(results_lin)

#path similarity
results_path = np.zeros((len(ani_list), len(ani_list)))
word_pairs = []
for indrow in range(0, len(ani_list)):
    print("word: " + ani_list[indrow][0][8:-2])
    for indcolum in range(0, len(ani_list)):
        print(ani_list[indrow][0][8:-2] + " vs. " +
              ani_list[indcolum][0][8:-2])
        results_path[indrow][indcolum] = wn.path_similarity(
# Import the information-content (IC) reader and load the two IC files that
# the res/jcn/lin similarity calls below receive as their third argument.
from nltk.corpus import wordnet_ic
# Used for distances_res, distances_jcn and distances_lin.
brown_ic = wordnet_ic.ic('ic-brown-resnik-add1.dat')
# Used for the distances_*_bnc variants.
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')

# For each ordered pair of synset identifiers, store (1 - similarity) in every
# distance matrix. Rows and columns are addressed by the identifier's position
# in labelsNLTK, which is not necessarily its position in synsets.
for s1 in synsets:
    syn1 = wn.of2ss(s1)
    # list.index is a linear scan, so look the row up once per outer pass
    # instead of once per matrix assignment.
    row = labelsNLTK.index(s1)
    for s2 in synsets:
        syn2 = wn.of2ss(s2)
        col = labelsNLTK.index(s2)

        # Measures that need no information-content data.
        distances_path[row][col] = 1 - wn.path_similarity(syn1, syn2)
        distances_lch[row][col] = 1 - wn.lch_similarity(syn1, syn2)
        distances_wup[row][col] = 1 - wn.wup_similarity(syn1, syn2)

        # Information-content measures using brown_ic.
        distances_res[row][col] = 1 - wn.res_similarity(syn1, syn2, brown_ic)
        distances_jcn[row][col] = 1 - wn.jcn_similarity(syn1, syn2, brown_ic)
        distances_lin[row][col] = 1 - wn.lin_similarity(syn1, syn2, brown_ic)

        # The same three measures using bnc_ic.
        distances_res_bnc[row][col] = 1 - wn.res_similarity(syn1, syn2, bnc_ic)
        distances_jcn_bnc[row][col] = 1 - wn.jcn_similarity(syn1, syn2, bnc_ic)
        distances_lin_bnc[row][col] = 1 - wn.lin_similarity(syn1, syn2, bnc_ic)

        # NOTE: a previously commented-out alternative filled every matrix
        # with 1/(col + 1) instead of the values computed above.

# A single parenthesised string prints the same text on Python 2 and Python 3.
print('done computing wordnet distances')
     
Beispiel #24
-1
def similarity_by_infocontent(sense1, sense2, option):
    """Return a similarity score based on information content.

    :param sense1: first sense; must expose ``pos`` either as an attribute
        or as a zero-argument method
    :param sense2: second sense, same requirements as ``sense1``
    :param option: which measure to use:
        ``'res'`` / ``'resnik'`` (with ``'ic-bnc-resnik-add1.dat'``),
        ``'jcn'`` / ``'jiang-conrath'`` (with ``'ic-bnc-add1.dat'``) or
        ``'lin'`` (with ``'ic-bnc-add1.dat'``)
    :return: 0 when the two senses have different parts of speech, otherwise
        the score of the selected measure; an unrecognised ``option`` falls
        through the branches below and yields ``None``
    """
    # ``pos`` is a method on some Synset implementations and a plain
    # attribute on others. Comparing two bound methods is unequal for any two
    # distinct objects, which would make this guard fire for every pair, so
    # resolve the actual POS tag first.
    pos1 = sense1.pos() if callable(sense1.pos) else sense1.pos
    pos2 = sense2.pos() if callable(sense2.pos) else sense2.pos
    if pos1 != pos2:  # infocontent sim can't do diff POS.
        return 0

    # NOTE(review): the IC file is requested from wnic on every call;
    # consider caching if this sits on a hot path.
    if option in ['res', 'resnik']:
        return wn.res_similarity(sense1, sense2, wnic.ic('ic-bnc-resnik-add1.dat'))

    elif option in ['jcn', "jiang-conrath"]:
        return wn.jcn_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))

    elif option in ['lin']:
        return wn.lin_similarity(sense1, sense2, wnic.ic('ic-bnc-add1.dat'))