Python jaccard_distanceの例、nltk.metrics.jaccard_distance Pythonの例

コード例 #1

0

ファイルを表示

ファイル: similarities.py プロジェクト: Danfoa/SemEval-2012-task6-project

def ngrams_similarity(s1, s2, filter_stop_words=True):
    """Return the Jaccard similarities of the 2-, 3- and 4-grams of two sentences.

    Both sentences are lower-cased and word-tokenized with NLTK.  When
    ``filter_stop_words`` is true, tokens found in ``stop_words`` are dropped
    before the n-grams are built.  For each order the similarity is
    ``1 - jaccard_distance`` over the two sets of n-grams, or ``0`` when
    either sentence yields no n-gram of that order.

    Returns a ``(sim2, sim3, sim4)`` tuple.
    """
    words_a = nltk.word_tokenize(s1.lower())
    words_b = nltk.word_tokenize(s2.lower())

    if filter_stop_words:
        words_a = [word for word in words_a if word not in stop_words]
        words_b = [word for word in words_b if word not in stop_words]

    similarities = []
    for order in (2, 3, 4):
        grams_a = list(nltk.ngrams(words_a, order))
        grams_b = list(nltk.ngrams(words_b, order))
        if grams_a and grams_b:
            similarities.append(
                1 - jaccard_distance(set(grams_a), set(grams_b)))
        else:
            # A sentence too short for this order cannot be compared.
            similarities.append(0)

    sim2, sim3, sim4 = similarities
    return sim2, sim3, sim4

コード例 #2

0

ファイルを表示

def fun_1_5_2():
    """Print the NLTK Jaccard distance between two small example sets.

    The nested ``jacc_similarity`` helper shows the complementary measure
    (size of the intersection over size of the union) computed by hand; it
    is defined for illustration only and is not called here.
    """
    def jacc_similarity(query, document):
        first = set(query).intersection(set(document))
        second = set(query).union(set(document))
        # float() forces true division on Python 2, where int / int would
        # truncate every similarity to 0 or 1.
        return float(len(first)) / len(second)

    from nltk.metrics import jaccard_distance
    X = {10, 20, 30, 40}
    Y = {20, 30, 60}
    # A parenthesised single-argument print works on Python 2 and Python 3.
    print(jaccard_distance(X, Y))

コード例 #3

0

ファイルを表示

def compute_feature5(frases1, frases2, X_train_or_test):
    """Append a Jaccard-distance feature column to ``X_train_or_test``.

    Each pair of sentences from ``frases1`` / ``frases2`` is passed through
    ``preprocess`` (with a WordNet lemmatizer and the English stop-word set);
    the Jaccard distance between the sets built from the two results is the
    feature value for that pair.

    Returns ``X_train_or_test`` with the feature concatenated as one extra
    column (axis 1).
    """
    sw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    feature = []
    for sent1, sent2 in zip(frases1, frases2):
        processed1 = preprocess(sent1, wnl, sw)
        processed2 = preprocess(sent2, wnl, sw)
        # Computed exactly once per pair (a duplicate call whose result was
        # thrown away has been removed).
        feature.append(jaccard_distance(set(processed1), set(processed2)))

    column = np.array(feature).reshape(len(feature), 1)
    return np.concatenate((X_train_or_test, column), axis=1)

コード例 #4

0

ファイルを表示

ファイル: iaa_utilities.py プロジェクト: nlgknowledge/commonsense

 def print_absolute_agreement(
         cls,
         dataframe: pd.DataFrame,
         iaa_by_column_dict: Optional[Dict] = None) -> None:
     """Print a pairwise agreement table for every closed-class column.

     When ``iaa_by_column_dict`` is not supplied it is obtained from
     ``cls.run_closed_class_jaccard_and_masi(dataframe)``.  For each column
     in ``cls.CLOSED_CLASS_COLUMNS`` a tab-separated table is printed: one
     header row with the unique ``source_spreadsheet`` values and one row per
     annotator.  A cell is the sum of ``1 - jaccard_distance`` over the two
     annotators' values (paired positionally) divided by the shorter of the
     two value lists.  The last figure on each row is
     ``(row sum - 1) / (row length - 1)`` -- presumably the row mean with the
     annotator's self-comparison (assumed to be 1) removed; confirm.
     """
     if iaa_by_column_dict is None:
         iaa_by_column_dict = cls.run_closed_class_jaccard_and_masi(
             dataframe)

     for column in cls.CLOSED_CLASS_COLUMNS:
         column_df = iaa_by_column_dict[column]['df']
         print(f"Interannotator agreement for {column}")
         annotators = dataframe.source_spreadsheet.unique()
         header = "\t".join([str(annotator) for annotator in annotators])
         print(" \t" + header)

         for row_annotator in annotators:
             row_vals = list(
                 column_df[column_df.source_spreadsheet == row_annotator][column])
             print(f"{row_annotator}", end="\t")
             row_scores = []
             for col_annotator in annotators:
                 col_vals = list(
                     column_df[column_df.source_spreadsheet == col_annotator][column])
                 total = 0
                 for left, right in zip(row_vals, col_vals):
                     total += 1 - jaccard_distance(left, right)
                 shorter = min(len(row_vals), len(col_vals))
                 row_scores.append(total / shorter)
                 print(f"{row_scores[-1]:.2f}", end="\t")
             print(
                 f"\t{(sum(row_scores) - 1) / (len(row_scores) - 1):.2f}"
             )
         print()
         print()

コード例 #5

0

ファイルを表示

ファイル: utils.py プロジェクト: pawankelkar/nl2sql-1

def jaccard_sim(word1, word2):
    """Return the Jaccard similarity of two iterables.

    Each argument is turned into a set of its elements (for a string, its
    characters) and the result is ``1 - jaccard_distance`` of those sets.
    """
    return 1 - jaccard_distance(set(word1), set(word2))

コード例 #6

0

ファイルを表示

ファイル: nominal_features.py プロジェクト: skvrahul/hackabout-2017

def jaccard_common(nominals):
    """Compare the 5-gram contexts of two nominals in ``brown.sents()``.

    Punctuation characters are stripped from every token and empty tokens
    are dropped.  Every 5-gram of the resulting word stream that contains
    ``nominals[0]`` contributes its other words to the first context;
    otherwise, if it contains ``nominals[1]``, to the second context.

    Returns ``([top4_first, top4_second], distance)`` where the top lists
    hold the four most common context words of each nominal and ``distance``
    is the Jaccard distance between the two sets of distinct context words.
    """
    # One flat stream of punctuation-free tokens over all sentences.
    stripped_tokens = [
        ''.join(ch for ch in token if ch not in string.punctuation)
        for sentence in brown.sents()
        for token in sentence
    ]
    corpus_words = [token for token in stripped_tokens if token]

    e1_words, e2_words = [], []
    for window in ngrams(corpus_words, 5):
        if nominals[0] in window:
            e1_words.extend(w for w in window if w != nominals[0])
        elif nominals[1] in window:
            e2_words.extend(w for w in window if w != nominals[1])

    e1_top = [word for word, _ in Counter(e1_words).most_common(4)]
    e2_top = [word for word, _ in Counter(e2_words).most_common(4)]

    distance = jaccard_distance(set(e1_words), set(e2_words))
    return [e1_top, e2_top], distance

コード例 #7

0

ファイルを表示

ファイル: ShallowSyntactic.py プロジェクト: anhtukhtn/Similarity

def jaccard_POS(sen_1, sen_2):
    """Return the Jaccard distance between the ``split_and_POS`` outputs.

    Each sentence is passed to ``split_and_POS`` and the two results are
    compared as sets.
    """
    tags_1 = split_and_POS(sen_1)
    tags_2 = split_and_POS(sen_2)
    return jaccard_distance(set(tags_1), set(tags_2))

コード例 #8

0

ファイルを表示

ファイル: map_wordnet_via_EVD.py プロジェクト: anhtukhtn/Similarity

def word_word_is_similarity(phrase_word_1, phrase_word_2):
  """Tell whether two phrases are near-identical by Jaccard distance.

  Both phrases are split with ``Util.split_unicode_words``.  Returns
  ``False`` when either split is empty, otherwise ``True`` exactly when the
  Jaccard distance of the two splits is below 0.1.

  NOTE(review): the splits are handed to ``jaccard_distance`` as-is, so
  ``Util.split_unicode_words`` presumably returns sets -- confirm.
  """
  words_1 = Util.split_unicode_words(phrase_word_1)
  words_2 = Util.split_unicode_words(phrase_word_2)
  if 0 in (len(words_1), len(words_2)):
    return False

  distance = jaccard_distance(words_1, words_2)
  return distance < 0.1

コード例 #9

0

ファイルを表示

ファイル: Ngrams.py プロジェクト: anhtukhtn/Similarity

def ngrams_word_for(sen_1, sen_2, n_grams):
  """Return a smoothed Jaccard similarity of two sentences' n-grams.

  The n-grams come from ``get_ngrams_for_sen``.  When either sentence has no
  n-gram the result is ``0.00001``; otherwise it is
  ``1.00001 - jaccard_distance`` of the two n-gram sets, so the value is
  never exactly zero.
  """
  grams_1 = get_ngrams_for_sen(sen_1, n_grams)
  grams_2 = get_ngrams_for_sen(sen_2, n_grams)
  unique_1 = set(grams_1)
  unique_2 = set(grams_2)
  if not unique_1 or not unique_2:
    return 0.00001
  return 1.00001 - jaccard_distance(unique_1, unique_2)

コード例 #10

0

ファイルを表示

    def score(self, lbl_types, ref_types, stemmed_word):
        """Return the Jaccard distance between ``lbl_types`` and ``ref_types``.

        An empty ``ref_types`` short-circuits to ``1``.  When ``stemmed_word``
        is truthy, ``ref_types`` is first replaced by the result of
        ``self.replace_stem(stemmed_word, ref_types, lbl_types)``.
        """
        # Hack: ref 23643 ends up empty after the rules are applied (the
        # "A*D" case from the csv file), so bail out before comparing.
        if len(ref_types) == 0:
            return 1

        reference = ref_types
        if stemmed_word:
            reference = self.replace_stem(stemmed_word, ref_types, lbl_types)

        return jaccard_distance(lbl_types, reference)

コード例 #11

0

ファイルを表示

ファイル: ShallowSyntactic.py プロジェクト: anhtukhtn/Similarity

def jaccard_POS_ngrams(sen_1, sen_2, n_grams):
    """Return the Jaccard distance between n-grams of ``split_and_POS`` output.

    Each sentence goes through ``split_and_POS``; the results are turned into
    n-grams by ``get_ngrams_for`` and the two n-gram collections are compared
    as sets.
    """
    tags_1 = split_and_POS(sen_1)
    tags_2 = split_and_POS(sen_2)

    grams_1 = get_ngrams_for(tags_1, n_grams)
    grams_2 = get_ngrams_for(tags_2, n_grams)

    return jaccard_distance(set(grams_1), set(grams_2))

コード例 #12

0

ファイルを表示

ファイル: distance_calculations.py プロジェクト: RJTK/kaggle_quora

def jaccard_distance_chunk(D):
    '''
    Store the per-row Jaccard distance in ``D[JACCARD_DISTANCE]``; return ``D``.

    For every row of ``D.loc[:, Q_WORD_TOKENIZED]`` the first two entries are
    parsed with ``literal_eval``, converted to sets and compared.  An empty
    ``D`` is returned untouched.  (``D`` is presumably a pandas DataFrame
    whose cells hold the string form of token lists -- confirm.)
    '''
    if len(D) == 0:
        return D

    def pair_distance(row):
        # Each cell holds a Python literal; evaluate it back into an iterable.
        first = set(literal_eval(row[0]))
        second = set(literal_eval(row[1]))
        return jaccard_distance(first, second)

    D[JACCARD_DISTANCE] = D.loc[:, Q_WORD_TOKENIZED].apply(
        pair_distance, axis=1)
    return D

コード例 #13

0

ファイルを表示

ファイル: ResumeParser.py プロジェクト: rubeenarustum/resume-parser

def getName(namesList, email):
    """Pick the candidate name that best matches the e-mail local parts.

    Candidates equal (case-insensitively) to either of the first two entries
    of ``email`` are discarded.  Each remaining name is scored against the
    part before ``@`` of every non-``None`` address: the character 2-, 3- and
    4-gram Jaccard similarities are combined with weights 0.2 / 0.5 / 0.3 and
    averaged over the addresses.

    Returns the best-scoring name, or ``None`` when there is no candidate,
    no usable address, or the best score is below 0.05.
    """
    candidates = [
        name for name in namesList
        if str(name).lower() != str(email[0]).lower()
        and str(name).lower() != str(email[1]).lower()
    ]

    best_score = 0.0
    person = None
    for name in candidates:
        total = 0.0
        compared = 0
        for mail in email:
            # Skip missing addresses *before* touching them.  The check used
            # to run after mail.split(), so a None address raised
            # AttributeError instead of being ignored.
            if mail is None:
                continue
            local_part = str(mail.split('@')[0])
            char2_sim = 1.0 - jaccard_distance(set(ngrams(local_part, 2)),
                                               set(ngrams(name, 2)))
            char3_sim = 1.0 - jaccard_distance(set(ngrams(local_part, 3)),
                                               set(ngrams(name, 3)))
            char4_sim = 1.0 - jaccard_distance(set(ngrams(local_part, 4)),
                                               set(ngrams(name, 4)))
            total += 0.2 * char2_sim + 0.5 * char3_sim + 0.3 * char4_sim
            compared += 1

        if compared == 0:
            # No usable address at all: there is nothing to match against.
            person = None
            break

        average = total / compared
        if average > best_score:
            best_score = average
            person = name

    # Reject matches too weak to be meaningful.
    if best_score < 0.05:
        person = None
    return person

コード例 #14

0

ファイルを表示

ファイル: similarities.py プロジェクト: Danfoa/SemEval-2012-task6-project

def ne_simmilarity(s1, s2):
    """Return the Jaccard similarity of the ``ner_transform`` outputs.

    Both sentences are passed through ``ner_transform``; if either result is
    empty the similarity is ``0``, otherwise it is ``1 - jaccard_distance``
    of the two results taken as sets.
    """
    entities_1 = ner_transform(s1)
    entities_2 = ner_transform(s2)
    if len(entities_1) == 0 or len(entities_2) == 0:
        return 0
    return 1 - jaccard_distance(set(entities_1), set(entities_2))

コード例 #15

0

ファイルを表示

    def predict(self, data_frame, maximum=5):
        """Return one scaled Jaccard similarity per row of ``data_frame``.

        For every row, ``sentence0`` and ``sentence1`` are each converted to
        a set of their elements (for strings, their characters) and
        ``(1 - jaccard_distance) * maximum`` is computed.  The results are
        returned as a list in row order.
        """
        return [
            (1 - jaccard_distance(set(row['sentence0']),
                                  set(row['sentence1']))) * maximum
            for _, row in data_frame.iterrows()
        ]

コード例 #16

0

ファイルを表示

def compute_feature10(frases1, frases2, X_train_or_test):
    """Append a Jaccard-similarity feature column to ``X_train_or_test``.

    Every sentence pair from ``frases1`` / ``frases2`` is transformed with
    ``method``; the feature value is ``1 - jaccard_distance`` of the two
    results taken as sets.

    Returns ``X_train_or_test`` with the feature concatenated as one extra
    column (axis 1).
    """
    feature = [
        1 - jaccard_distance(set(method(sent1)), set(method(sent2)))
        for sent1, sent2 in zip(frases1, frases2)
    ]
    column = np.array(feature).reshape(len(feature), 1)
    return np.concatenate((X_train_or_test, column), axis=1)

コード例 #17

0

ファイルを表示

ファイル: util.py プロジェクト: KindYAK/NLPMonitor-DAGs

def validator(mappings_dict, client, index_theta_one, index_theta_two, datetime_from_tm_2, datetime_to_tm_1,
              number_of_topics):
    """
    Score topic mappings by the Jaccard similarity of their document sets.

    ``mappings_dict`` maps each threshold to a dict of
    ``{parent topic: [child topics]}``.  For every parent/child pair, the
    document ids returned by ``search`` for the parent (in
    ``index_theta_one``) and for the child (in ``index_theta_two``) are
    compared with ``1 - jaccard_distance``; both queries use the same
    datetime bounds and a ``topic_weight__gte`` of 0.05.

    Returns a dict keyed by threshold.  Each value is the list
    ``[len(map_dict) / number_of_topics, average Jaccard score]`` (the
    average is 0 when the threshold has no parent/child pair), extended with
    one min-max-scaled score.

    NOTE(review): thresholds without any pair add nothing to
    ``scores_for_normalization``, yet the final loop indexes the scaled
    scores by each threshold's position in ``scores`` -- the two look like
    they can get out of step (or run past the end); confirm.
    """
    from sklearn.preprocessing import MinMaxScaler
    from nltk.metrics import jaccard_distance
    scaler = MinMaxScaler()
    # Running sum of Jaccard scores per threshold; replaced by a list below.
    scores = dict(zip(mappings_dict.keys(), [0] * len(mappings_dict)))
    # Average scores, in threshold order, for thresholds with >= 1 pair.
    scores_for_normalization = []
    for threshhold, map_dict in mappings_dict.items():
        cnt_matches_for_threshhold = 0
        for topic_parent, topic_childs_list in map_dict.items():

            # Documents of the parent topic inside the datetime window.
            theta_1 = search(client=client, index=index_theta_one,
                             query={'datetime__gte': datetime_from_tm_2, 'datetime__lte': datetime_to_tm_1,
                                    'topic_id': topic_parent, 'topic_weight__gte': 0.05},
                             source=['document_es_id'],
                             start=0,
                             end=1000000,
                             get_scan_obj=True
                             )
            scanned_parent = set([elem.document_es_id for elem in theta_1])

            for topic_child in topic_childs_list:
                # Same query against the second index for the child topic.
                theta_2 = search(client=client, index=index_theta_two,
                                 query={'datetime__gte': datetime_from_tm_2, 'datetime__lte': datetime_to_tm_1,
                                        'topic_id': topic_child, 'topic_weight__gte': 0.05},
                                 source=['document_es_id'],
                                 start=0,
                                 end=1000000,
                                 get_scan_obj=True
                                 )
                jaccard_score = 1 - jaccard_distance(scanned_parent, set([elem.document_es_id for elem in theta_2]))

                scores[threshhold] += jaccard_score
                cnt_matches_for_threshhold += 1
        try:
            avg_score = scores[threshhold] / cnt_matches_for_threshhold

            scores_for_normalization.append(avg_score)
            scores[threshhold] = [len(map_dict) / number_of_topics, avg_score]

        except ZeroDivisionError:
            # No parent/child pair for this threshold: average defaults to 0.
            scores[threshhold] = [len(map_dict) / number_of_topics, 0]

    # Min-max scale the averages as a single column, then flatten back.
    scores_normalized = [score[0] for score in scaler.fit_transform(np.array(scores_for_normalization).reshape(-1, 1))]

    # Append the scaled score to each threshold's list, matched by position.
    for i, items in enumerate(scores.items()):
        scores[items[0]] += [scores_normalized[i]]

    return scores

コード例 #18

0

ファイルを表示

ファイル: Literal.py プロジェクト: anhtukhtn/Similarity

def jaccard(sen_1, sen_2):
  """Return the Jaccard distance between the content words of two sentences.

  Each sentence is tokenized with ``nltk.wordpunct_tokenize`` and tagged by
  ``POSWrapper.pos_tag``; only words tagged NN, NNS, JJ, VB, VBN, VBD, RB or
  with an empty tag are kept, and the two word sets are compared.
  """
  kept_tags = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')

  def content_words(sentence):
    tagged = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sentence))
    return set(word for word, tag in tagged if tag in kept_tags)

  sen_set_1 = content_words(sen_1)
  sen_set_2 = content_words(sen_2)
  return jaccard_distance(sen_set_1, sen_set_2)

コード例 #19

0

ファイルを表示

 def jaccard_similarity_stemmer(self, config, sentence1, sentence2):
     """
     Computes stem unigram similarity.

     Element ``[1]`` of each sentence is word-tokenized, tokens found in
     ``self.punctuation_set`` are dropped and the rest are stemmed with
     ``self.stemmer``.  Returns ``1 - jaccard_distance`` of the two stem
     sets.  ``config`` is not used.
     """
     def stem_list(sentence):
         return [
             self.stemmer.stem(tok) for tok in word_tokenize(sentence[1])
             if tok not in self.punctuation_set
         ]

     stems1 = stem_list(sentence1)
     stems2 = stem_list(sentence2)
     return 1 - jaccard_distance(set(stems1), set(stems2))

コード例 #20

0

ファイルを表示

ファイル: SimOxWnDefinition.py プロジェクト: anhtukhtn/Similarity

def cal_ngrams_by_jacc(wn_grams, ox_grams):
  """Return the matrix of Jaccard similarities between two n-gram lists.

  Row ``i`` / column ``j`` holds ``1 - jaccard_distance`` between the set of
  ``wn_grams[i]`` and the set of ``ox_grams[j]``.  The result is a list of
  ``len(wn_grams)`` rows, each with ``len(ox_grams)`` entries.
  """
  matrix_similarity_jaccard = []
  for wn_index in range(len(wn_grams)):
    wn_set = set(wn_grams[wn_index])
    row = [
      1 - jaccard_distance(wn_set, set(ox_grams[ox_index]))
      for ox_index in range(len(ox_grams))
    ]
    matrix_similarity_jaccard.append(row)

  return matrix_similarity_jaccard

コード例 #21

0

ファイルを表示

ファイル: similarities.py プロジェクト: Danfoa/SemEval-2012-task6-project

def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word tokens of two sentences.

    Both sentences are lower-cased and tokenized with ``nltk.word_tokenize``.
    If that fails (for instance because an input is not a string) the inputs
    are reported on stdout and ``0`` is returned.  ``0`` is also returned
    when either sentence has no tokens; otherwise the result is
    ``1 - jaccard_distance`` of the two token sets.
    """
    try:
        tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
        tokenized_sentence_2 = nltk.word_tokenize(s2.lower())
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("Error: S1[%s] \n S2[%s]" % (s1, s2))
        return 0

    if tokenized_sentence_1 and tokenized_sentence_2:
        return 1 - jaccard_distance(set(tokenized_sentence_1),
                                    set(tokenized_sentence_2))
    return 0

コード例 #22

0

ファイルを表示

ファイル: extract.py プロジェクト: gatech/fmap

    def computeDistances(self):
        """Flatten ``self.keywords`` and fill the pairwise distance matrix.

        Keywords are visited in sorted key order.  Every tuple of a keyword's
        value is appended to ``self.allItems`` as ``(tup[0], tup[1], key)``,
        and each pair of consecutive items of the same keyword is recorded in
        ``self.allSeq`` as ``[previous index, index, key]``.  ``self.dist``
        then becomes an n x n array holding the Jaccard distance between the
        sets built from element ``[1]`` of every two items; a pair whose
        comparison raises ``ZeroDivisionError`` gets distance 0.
        """
        # items() exists on both Python 2 and Python 3 dicts (iteritems() is
        # Python 2 only); sorted() accepts either result.
        for k, v in sorted(self.keywords.items()):
            prev = None
            for tup in v:
                self.allItems.append((tup[0], tup[1], k))
                cnt = len(self.allItems) - 1
                if prev is not None:
                    self.allSeq.append([prev, cnt, k])
                prev = cnt

        n = len(self.allItems)
        self.dist = numpy.zeros(shape=(n, n))
        # Build each item's set once instead of once per matrix cell.
        item_sets = [set(item[1]) for item in self.allItems]
        for i in range(n):
            for j in range(n):
                try:
                    self.dist[i, j] = jaccard_distance(item_sets[i], item_sets[j])
                except ZeroDivisionError:
                    self.dist[i, j] = 0  # sys.maxint

コード例 #23

0

ファイルを表示

def get_dists(keyword):
    """Return the three entries of ``words_preprocessed`` closest to ``keyword``.

    For every word the edit distance, Jaro similarity, Jaro-Winkler
    similarity and character-set Jaccard distance to ``keyword`` are
    collected.  The rows are sorted by edit distance and the first three are
    returned as a DataFrame.
    """
    def measures(word):
        return {
            "edit_dist": edit_distance(word, keyword),
            "jaro_simi": jaro_similarity(word, keyword),
            "jaro_winkler_simi": jaro_winkler_similarity(word, keyword),
            "jaccard_dist": jaccard_distance(set(word), set(keyword)),
            "word": word,
            "keyword": keyword,
        }

    table = pd.DataFrame([measures(word) for word in words_preprocessed])
    ranked = table.sort_values("edit_dist")
    return ranked.iloc[0:3, :]

コード例 #24

0

ファイルを表示

ファイル: SimOxWnDefinition.py プロジェクト: anhtukhtn/Similarity

def similarity_by_jaccard(ox_defis, wn_defis):
  """Return the Jaccard-similarity matrix between two lists of definitions.

  Every definition is tokenized with ``nltk.wordpunct_tokenize`` and tagged
  by ``POSWrapper.pos_tag``; words tagged NN, NNS, JJ, VB, VBN, VBD, RB or
  with an empty tag are kept and lemmatized with ``wordnet_lemmatizer``.
  Row ``i`` / column ``j`` of the result is ``1 - jaccard_distance`` between
  the lemma sets of ``wn_defis[i]`` and ``ox_defis[j]``.
  """
  kept_tags = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')

  def lemma_set(definition):
    tagged = POSWrapper.pos_tag(nltk.wordpunct_tokenize(definition))
    kept = [word for word, tag in tagged if tag in kept_tags]
    return set(wordnet_lemmatizer.lemmatize(word) for word in kept)

  matrix_similarity_jaccard = []
  for wn_index in range(len(wn_defis)):
    wn_set = lemma_set(wn_defis[wn_index])
    row = []
    for ox_index in range(len(ox_defis)):
      # The ox definition is re-processed for every row, as before.
      dict_set = lemma_set(ox_defis[ox_index])
      row.append(1 - jaccard_distance(wn_set, dict_set))
    matrix_similarity_jaccard.append(row)

  return matrix_similarity_jaccard

コード例 #25

0

ファイルを表示

ファイル: similarities.py プロジェクト: Danfoa/SemEval-2012-task6-project

def synsets_similarity(s1, s2):
    """
    Find the Jaccard similarity between the synsets of two sentences, using
    the lesk algorithm to disambiguate words given their context.

    Each lower-cased sentence is lemmatized with ``lemmatize_sentence``.
    Lemmas found in ``stop_words`` are skipped; for the others ``lesk`` picks
    a synset, falling back to the first result of ``wordnet.synsets`` when
    lesk returns ``None``.  Returns ``1 - jaccard_distance`` of the two
    synset sets, or ``0`` when either sentence yields no synset.
    """
    def disambiguate(lemmas, tagged):
        # One synset per non-stop-word lemma, when any can be found.
        chosen = []
        for lemma, word_tag in zip(lemmas, tagged):
            if lemma in stop_words:
                continue
            sense = lesk(lemmas, lemma, wordnet_pos_code(word_tag[1]))
            if sense is not None:
                chosen.append(sense)
                continue
            fallback = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
            if len(fallback) > 0:
                chosen.append(fallback[0])
        return chosen

    lemmas_1, tagged_1 = lemmatize_sentence(s1.lower())
    lemmas_2, tagged_2 = lemmatize_sentence(s2.lower())

    synsets_1 = disambiguate(lemmas_1, tagged_1)
    synsets_2 = disambiguate(lemmas_2, tagged_2)

    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return 0
    return 1 - jaccard_distance(set(synsets_1), set(synsets_2))

コード例 #26

0

ファイルを表示

    def computeDistances(self):
        """Flatten ``self.keywords`` and fill the pairwise distance matrix.

        Keywords are visited in sorted key order.  Every tuple of a keyword's
        value is appended to ``self.allItems`` as ``(tup[0], tup[1], key)``,
        and each pair of consecutive items of the same keyword is recorded in
        ``self.allSeq`` as ``[previous index, index, key]``.  ``self.dist``
        then becomes an n x n array holding the Jaccard distance between the
        sets built from element ``[1]`` of every two items; a pair whose
        comparison raises ``ZeroDivisionError`` gets distance 0.
        """
        # items() exists on both Python 2 and Python 3 dicts (iteritems() is
        # Python 2 only); sorted() accepts either result.
        for k, v in sorted(self.keywords.items()):
            prev = None
            for tup in v:
                self.allItems.append((tup[0], tup[1], k))
                cnt = len(self.allItems) - 1
                if prev is not None:
                    self.allSeq.append([prev, cnt, k])
                prev = cnt

        n = len(self.allItems)
        self.dist = numpy.zeros(shape=(n, n))
        # Build each item's set once instead of once per matrix cell.
        item_sets = [set(item[1]) for item in self.allItems]
        for i in range(n):
            for j in range(n):
                try:
                    self.dist[i, j] = jaccard_distance(item_sets[i],
                                                       item_sets[j])
                except ZeroDivisionError:
                    self.dist[i, j] = 0  # sys.maxint

コード例 #27

0

ファイルを表示

 def number_overlap(self, config, sentence1, sentence2):
     """
     Computes the Jaccard similarity (1 - Jaccard distance) between the sets
     of cardinal numbers that appear in the two sentences.

     A token counts as a cardinal number when the matching entry of element
     ``[3]`` of the sentence is 'CD'.  Each such token from element ``[2]``
     is parsed, in order, as a number expressed in words (for example 'two',
     'three hundreds and forty-nine'), as the same with ',' removed (for
     example '16,432,970'), and as a float with ',' removed (for example
     '45,233.4123').  If no attempt succeeds, the token itself is added to
     the set.  Returns 0 when the comparison raises ZeroDivisionError.
     ``config`` is not used.
     """
     def parse(token, fallback):
         attempts = (
             lambda: w2n.word_to_num(token),
             lambda: w2n.word_to_num(token.replace(',', '')),
             lambda: float(token.replace(',', '')),
         )
         for attempt in attempts:
             try:
                 return attempt()
             except ValueError:
                 pass
         return fallback

     def cardinals(sentence):
         found = []
         for i, word in enumerate(sentence[2]):
             if sentence[3][i] == 'CD':
                 found.append(parse(sentence[2][i], word))
         return found

     numbers1 = cardinals(sentence1)
     numbers2 = cardinals(sentence2)
     try:
         return 1 - jaccard_distance(set(numbers1), set(numbers2))
     except ZeroDivisionError:
         return 0

コード例 #28

0

ファイルを表示

ファイル: similarities.py プロジェクト: Danfoa/SemEval-2012-task6-project

def dependency_similarity(s1, s2):
    """
    Find the Jaccard similarity between the dependency triples of two
    sentences.

    Each sentence is parsed with ``parser.raw_parse`` and the triples of the
    first parse are collected.  Returns ``1 - jaccard_distance`` of the two
    triple sets, or ``0`` when either sentence has no triples.
    """
    parses_1 = parser.raw_parse(s1)
    parses_2 = parser.raw_parse(s2)

    # Only the first parse of each sentence is used.
    first_tree_1 = next(parses_1)
    first_tree_2 = next(parses_2)

    triples1 = list(first_tree_1.triples())
    triples2 = list(first_tree_2.triples())

    if len(triples1) == 0 or len(triples2) == 0:
        return 0
    return 1 - jaccard_distance(set(triples1), set(triples2))

コード例 #29

0

ファイルを表示

def compute_feature8(frases1, frases2, X_train_or_test):
    """Append a trigram Jaccard-distance feature column to ``X_train_or_test``.

    Each pair of sentences from ``frases1`` / ``frases2`` is passed through
    ``preprocess`` (with a WordNet lemmatizer and the English stop-word set)
    and turned into trigrams.  The feature value is the Jaccard distance of
    the two trigram sets, or ``0`` when either sentence has no trigram.

    Returns ``X_train_or_test`` with the feature concatenated as one extra
    column (axis 1).

    NOTE(review): ``0`` for the no-trigram case reads as "identical" on a
    distance scale -- confirm this is the intended default.
    """
    sw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    feature = []
    for sent1, sent2 in zip(frases1, frases2):
        processed1 = preprocess(sent1, wnl, sw)
        processed2 = preprocess(sent2, wnl, sw)

        trigrams1 = set(nltk.trigrams(processed1))
        trigrams2 = set(nltk.trigrams(processed2))
        if not trigrams1 or not trigrams2:
            feature.append(0)
        else:
            feature.append(jaccard_distance(trigrams1, trigrams2))

    column = np.array(feature).reshape(len(feature), 1)
    return np.concatenate((X_train_or_test, column), axis=1)

コード例 #30

0

ファイルを表示

ファイル: similarities.py プロジェクト: Danfoa/SemEval-2012-task6-project

def lemmas_similarity(s1, s2, filter_stop_words=True):
    """
    Jaccard similarity of the lemmatized sentences.

    Both sentences are lower-cased and word-tokenized; when
    ``filter_stop_words`` is true, tokens found in ``stop_words`` are
    removed.  The remaining tokens are POS-tagged with ``pos_tag`` and
    lemmatized with ``lemmatize``.  Returns ``1 - jaccard_distance`` of the
    two lemma sets, or ``0`` when either sentence has no lemma.
    """
    # Tokenize by sentences into words in lower case
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    # The flag used to be tested negated, so stop words were removed only
    # when the caller asked NOT to filter them.
    if filter_stop_words:
        tokenized_sentence_1 = [
            token for token in tokenized_sentence_1 if token not in stop_words
        ]
        tokenized_sentence_2 = [
            token for token in tokenized_sentence_2 if token not in stop_words
        ]

    tagged_sentence_1 = pos_tag(
        tokenized_sentence_1)  # [ (word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(
        tokenized_sentence_2)  # [ (word, POS_TAG), ...]

    lemmas_sentence_1 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1
    ]
    lemmas_sentence_2 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2
    ]  # [LEMMA_1, ...]

    # Compute similarity
    if lemmas_sentence_1 and lemmas_sentence_2:
        return 1 - jaccard_distance(set(lemmas_sentence_1),
                                    set(lemmas_sentence_2))
    return 0

コード例 #31

0

ファイルを表示

ファイル: wh4t_webgraph_creator.py プロジェクト: 2mh/wahatttt

def create_graph():  
    g = nx.Graph()
    xml_docs = Collection()
    xml_docs_subset = xml_docs.get_docs(author="Wau Holland")
    docs_no = len(xml_docs_subset)
    id_dict = dict()
    stems_dict = dict()
    doc_id = 1
    
    print "Put stems into a dict for each document (with an uniq id) ..."
    print "Create nodes with all the documents' relevant information ..."
    pb = ProgressBar(maxval=docs_no).start()
    
    for xml_doc in xml_docs_subset:
        pb.update(doc_id)
        id_dict[xml_doc.get_xml_filename()] = doc_id
        g.add_node(doc_id, 
                   id = xml_doc.get_id(),
                   rawlen = xml_doc.get_rawlen(),
                   subj = xml_doc.get_subj(),
                   author = xml_doc.get_author(),
                   date = xml_doc.get_date(),
                   words = xml_doc.get_words(),
                   uniq_stems = list(xml_doc.get_stems(uniq=True, 
                                                       relev=True)),
                   rawcontent = xml_doc.get_rawcontent()
                   )
        doc_id += 1
        # It seems sometimes a list (-> set conversion) gets returned 
        # ... ugly. XXX
        stems_dict[doc_id] = set(xml_doc.get_stems(uniq=True, relev=True))
        
    print "Create undirected, weighted graph based on Jaccard similarity ..."
    no_of_edges = docs_no * (docs_no - 1) / 2
    pb = ProgressBar(maxval=no_of_edges).start()
    count = 1
    for doc_idx1 in stems_dict.keys():
        doc_idx2 = doc_idx1 + 1
        # Nothing left to compare
        if (doc_idx1 == docs_no):
            break
    
        while True:
            # print "Comparing: ", doc_idx1, doc_idx2
            
            # Find longer doc
            doc1_len, doc2_len = len(stems_dict[doc_idx1]), \
                                    len(stems_dict[doc_idx2])
            long_doc_len = max((doc1_len, doc2_len))
            short_doc_len = min((doc1_len, doc2_len))
            
            # In case a document has no useful stems to classify
            edge_weight = 0
            alias_coeff = 0
            if long_doc_len == 0 or short_doc_len == 0:
                pass
            else:
                alias_coeff = float(long_doc_len) / short_doc_len
            
                edge_weight = (1 - jaccard_distance(stems_dict[doc_idx1],
                                           stems_dict[doc_idx2])) \
                           * alias_coeff
                           
            print alias_coeff, edge_weight
            
            # Still redundant, only for testing
            if (edge_weight > 0.3):
                cluster_stems = stems_dict[doc_idx1].intersection(
                               stems_dict[doc_idx2])
                try: 
                    g.node[doc_idx1]['cluster_stems']
                except KeyError:
                    g.node[doc_idx1]['cluster_stems'] = cluster_stems
                else:
                    for stem in cluster_stems:
                        g.node[doc_idx1]['cluster_stems'].add(stem)
                try: 
                    g.node[doc_idx2]['cluster_stems']
                except KeyError:
                    g.node[doc_idx2]['cluster_stems'] = cluster_stems
                else:
                    for stem in cluster_stems:
                        g.node[doc_idx2]['cluster_stems'].add(stem)
            
            # To be made more flexible
            if edge_weight > 0.3:
                g.add_edge(doc_idx1, doc_idx2, weight=edge_weight)
            doc_idx2 += 1
            pb.update(count)
            count += 1
            if doc_idx2 > docs_no:
                break
    
    print "Draw graph showing possible clusters  ..."
    
    elarge = [(u,v) for (u,v,d) in g.edges(data=True) if d['weight'] > 0.4]
    emedium = [(u,v) for (u,v,d) in g.edges(data=True) 
              if d['weight'] > 0.2 and d['weight'] < 0.4]
    esmall = [(u,v) for (u,v,d) in g.edges(data=True) if d['weight'] <= 0.2]
    print "elarge: ", len(elarge)
    print "emedium: ", len(emedium)
    print "esmall: ", len(esmall)
       
    pos = nx.spring_layout(g, scale=20)
    #pos = nx.random_layout(g)
    
    dlarge = [n for n,d in g.degree_iter() if d >= 20]
    dmedium = [n for n,d in g.degree_iter() if d > 1 and d < 20]
    dsmall = [n for n,d in g.degree_iter() if d == 1]
    dnone = [n for n,d in g.degree_iter() if d == 0]
    print "dlarge: ", len(dlarge)
    print "dmedium: ", len(dmedium)
    print "dsmall: ", len(dsmall)
    print "dnone: ", len(dnone)
    
    # Draw nodes
    # nx.draw_networkx_nodes(g, pos, node_size=5, linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dlarge, node_size=20,
                           node_color='b',
                           linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dmedium, node_size=10,
                           node_color='g',
                           alpha=0.8, 
                           linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dsmall, node_size=5,
                           node_color='b',
                           alpha=0.2,
                           linewidths=0,
                           )
    nx.draw_networkx_nodes(g, pos, nodelist=dnone, node_size=5,
                           node_color='b',
                           alpha=0.2, 
                           linewidths=0)
    
    # Draw edges
    nx.draw_networkx_edges(g, pos, edgelist=elarge, width=0.4)
    nx.draw_networkx_edges(g, pos, edgelist=emedium, edge_color='g', 
                           alpha=0.8, width=0.2)
    nx.draw_networkx_edges(g, pos, edgelist=esmall, width=0.1,
                           alpha=0.1, edge_color='b')
    
    # Draw labels
    # nx.draw_networkx_labels(g, pos, font_size=1, font_family='sans-serif')
    
    plt.axis('off')
    plt.figure(1, figsize=(20,20))
    """
    print "Print PNG"
    plt.savefig("graph.png", dpi=600)
    """
    # plt.show()
    nx.write_yaml(g, get_graph_file())
    d3_js.export_d3_js(g)

コード例 #32

0

ファイルを表示

ファイル: features.py プロジェクト: YacineGACI/subjective_attributes

def jaccard_distance_char(s1, s2):
    """Return the Jaccard distance between the character sets of two strings.

    Each argument is collapsed into the set of its elements (the distinct
    characters, for a string) before being passed to ``jaccard_distance``.
    """
    chars_first, chars_second = set(s1), set(s2)
    return jaccard_distance(chars_first, chars_second)

コード例 #33

0

ファイルを表示

ファイル: features.py プロジェクト: YacineGACI/subjective_attributes

def jaccard_distance_word(s1, s2):
    """Return the Jaccard distance between the word sets of two strings.

    Words are obtained by splitting on a single space character, so runs of
    spaces yield empty-string tokens that take part in the comparison.
    """
    tokens_first, tokens_second = (set(s.split(" ")) for s in (s1, s2))
    return jaccard_distance(tokens_first, tokens_second)

コード例 #34

0

ファイルを表示

# Lemmatizer POS code for every tag that is handled explicitly. Any tag that
# is not listed here is lemmatized as a verb ('v').
_NGRAMS_POS_CODES = {
    'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
    'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
    'RB': 'r', 'RBR': 'r', 'RBS': 'r',
    'VB': 'v', 'VBG': 'v', 'VBN': 'v', 'VBZ': 'v', 'VBP': 'v', 'VBD': 'v',
}


def _tags_and_lemmas(words):
    """POS-tag ``words`` and return ``(tags, lemmas)`` as two parallel lists."""
    tagged = nltk.pos_tag(words)
    tags = [tag for _, tag in tagged]
    lemmas = [
        lemmatizer.lemmatize(word, _NGRAMS_POS_CODES.get(tag, 'v'))
        for word, tag in tagged
    ]
    return tags, lemmas


def _ngram_similarity(seq_1, seq_2, n):
    """Return ``1 - jaccard_distance`` between the n-gram sets of two sequences."""
    return 1.0 - jaccard_distance(set(ngrams(seq_1, n)), set(ngrams(seq_2, n)))


def nGrams(d,inputfile):
    """Compute n-gram Jaccard similarities for each sentence pair in a file.

    Reads ``d + inputfile + ".txt"``, discards its first line (presumably a
    header -- confirm against the data files) and treats every remaining line
    as a tab-separated record whose first two fields are the sentences to
    compare. For each pair, twelve scores ``1 - jaccard_distance`` are
    computed over:

    * character 2-, 3- and 4-grams of the raw sentence fields,
    * 1-, 2- and 3-grams of the whitespace-split words,
    * 1-, 2- and 3-grams of the lemmatized words,
    * 1-, 2- and 3-grams of the POS tags.

    A word list shorter than three tokens gets a single "." appended before
    the word, lemma and POS n-grams are built.

    The scores are written as one tab-separated row to
    ``d + inputfile + "/ngrams.txt"`` (after a header row), and a row prefixed
    with both sentences is inserted into the module-level ``text`` widget.
    Each sentence pair is also printed to stdout.

    Both files are closed and the widget is switched back to "disabled" even
    when processing a line raises.

    NOTE(review): a line with fewer than two tab-separated fields raises
    IndexError, and ``jaccard_distance`` may raise ZeroDivisionError when both
    n-gram sets of a pair are empty (depends on the NLTK version) -- confirm
    whether such inputs can occur.
    """
    text.configure(state="normal")
    try:
        text.delete('1.0',END)
        with open(d+inputfile+".txt",'r') as fi:
            with open(d+inputfile+"/"+"ngrams.txt",'w') as fn:
                fi.readline()
                fn.write("char2\tchar3\tchar4\tword1\tword2\tword3\tlemma1\tlemma2\tlemma3\tposgm1\tposgm2\tposgm3\n")
                for line in fi:
                    sents = line.split("\t")
                    words_1 = sents[0].split()
                    words_2 = sents[1].split()
                    if len(words_1) < 3:
                        words_1.append(".")
                    if len(words_2) < 3:
                        words_2.append(".")

                    posgm1, lemma1 = _tags_and_lemmas(words_1)
                    posgm2, lemma2 = _tags_and_lemmas(words_2)

                    print("%s %s" % (sents[0], sents[1]))

                    # (sequence 1, sequence 2, n-gram sizes), listed in the
                    # same order as the columns of the header row above.
                    feature_inputs = (
                        (sents[0], sents[1], (2, 3, 4)),
                        (words_1, words_2, (1, 2, 3)),
                        (lemma1, lemma2, (1, 2, 3)),
                        (posgm1, posgm2, (1, 2, 3)),
                    )
                    scores = [
                        _ngram_similarity(seq_1, seq_2, n)
                        for seq_1, seq_2, sizes in feature_inputs
                        for n in sizes
                    ]
                    fields = "\t".join("%.3f" % score for score in scores)

                    # The widget row keeps the trailing tab before the newline.
                    text.insert(INSERT,"%s\t%s\t%s\t\n" % (sents[0],sents[1],fields))
                    fn.write(fields + "\n")
    finally:
        text.configure(state="disabled")

コード例 #35

0

ファイルを表示

ファイル: CompareVietNetOxford.py プロジェクト: anhtukhtn/Similarity

def compareVietNetAndOxford(dict_VietNet, dict_Oxford):
    """Compare VietNet and Oxford "tv" fields per WordNet noun synset.

    For every word of ``dict_Oxford`` that has at least one entry and also
    occurs in ``dict_VietNet``:

    1. Each WordNet noun synset of the word is paired with the first VietNet
       entry whose "d" field is within ``len(definition) / 2`` Levenshtein
       edits of the synset definition (no match -> empty "tv").
    2. The "tv" text of that entry is compared with the "tv" text of every
       Oxford entry using the Jaccard distance of their space-separated
       tokens; a distance below 0.95 is recorded as 1, anything else as 0.
       Oxford entries without "tv" keep the initial 0.
    3. The resulting matrix (one row per synset, prefixed with
       ``"<vietnet tv><><definition>"``) and a header row built from the
       Oxford entries are appended to
       ``Results/parameters/VN_Ox/compare_VN_Ox_2_2.1.csv``.

    Side effects kept from the original implementation: missing "tv"/"d"
    keys are filled in on the passed-in dictionaries ("" on the matched
    VietNet entry, "-" on Oxford entries), and intermediate values are
    printed to stdout.

    ``dict_Oxford[WORD]`` is indexed with the string keys "0", "1", ...
    """
    for WORD in dict_Oxford:

        if len(dict_Oxford[WORD]) == 0:
            continue

        wn_words = wn.synsets(WORD, pos="n")
        if wn_words is None:
            continue

        if WORD not in dict_VietNet:
            continue

        arr_VietNet = dict_VietNet[WORD]
        arr_Oxford = dict_Oxford[WORD]

        matrix_similarity = [[0] * len(arr_Oxford) for _ in wn_words]

        for iWn in range(len(wn_words)):

            definitionWn = wn.synset(wn_words[iWn].name()).definition()

            # First VietNet entry whose definition is close to the synset's.
            vietNet = {}
            for iVietNet in arr_VietNet:
                levenshtein_vn_wn = Util.levenshtein(arr_VietNet[iVietNet]["d"], definitionWn)
                if levenshtein_vn_wn < len(definitionWn) / 2.0:
                    vietNet = arr_VietNet[iVietNet]
                    break

            # Stored on the entry itself, exactly as the original did.
            if "tv" not in vietNet:
                vietNet["tv"] = ""

            viet_net_tv = vietNet["tv"]

            # str.replace returns a new string; the original discarded the
            # result, so semicolons were never removed from the VietNet side.
            arr_tv_vietnet = set(viet_net_tv.replace(";", "").split(" "))

            for iOxford in range(len(arr_Oxford)):
                oxford = arr_Oxford[str(iOxford)]

                if "tv" not in oxford:
                    continue

                oxford_tv = oxford["tv"].encode("utf-8")
                oxford_tv = oxford_tv.replace(";", "")
                oxford_tv = oxford_tv.replace(",", "")
                oxford_tv = oxford_tv.replace("/", " ")
                arr_tv_oxford = set(oxford_tv.split(" "))

                jaccard = jaccard_distance(arr_tv_oxford, arr_tv_vietnet)
                print(arr_tv_vietnet)
                print(arr_tv_oxford)
                print(jaccard)
                matrix_similarity[iWn][iOxford] = 1 if jaccard < 0.95 else 0

            # Row label: raw VietNet "tv" text plus the synset definition.
            matrix_similarity[iWn].insert(0, viet_net_tv + "<>" + definitionWn.encode("utf-8"))

        print(matrix_similarity)

        # Header row: the word followed by "<tv><><d>" for each Oxford entry.
        arrRowDict = [WORD]
        for i in range(len(arr_Oxford)):
            entry = arr_Oxford[str(i)]
            if "tv" not in entry:
                entry["tv"] = "-"
            if entry.get("d") is None:
                entry["d"] = "-"

            arrRowDict.append(
                entry["tv"].encode("utf-8")
                + "<>"
                + entry["d"].encode("utf-8")
            )

        FileProcess.append_to_excel_file(
            "Results/parameters/VN_Ox/" + "compare_VN_Ox_2_2.1.csv", arrRowDict, matrix_similarity
        )

コード例 #36

0

ファイルを表示

    def calculate_jaccard(self, s0, s1):
        """Return the Jaccard similarity (``1 - distance``) of two sequences.

        Falsy items are dropped and the remaining ones lower-cased before the
        two collections are compared as sets.
        """
        def normalise(items):
            return set(item.lower() for item in items if item)

        distance = jaccard_distance(normalise(s0), normalise(s1))
        return 1 - distance

コード例 #37

0

ファイルを表示

ファイル: linguistics.py プロジェクト: interpretation-experiment/analysis

 def unordered_content_distance(self, sentence):
     """Return the Jaccard distance between the content-word sets of `self`
     and `sentence`; word order and repetitions are ignored."""
     own_words = set(self.content_words)
     other_words = set(sentence.content_words)
     return jaccard_distance(own_words, other_words)

コード例 #38

0

ファイルを表示

ファイル: xgboost2.py プロジェクト: WhiteIsClosing/Kaggle-Quora

def jaccard_dist(row):
    """Return the character-level Jaccard distance between two questions.

    ``row['question1']`` and ``row['question2']`` are converted with ``str``
    and compared as sets of characters.
    """
    chars_q1 = set(str(row['question1']))
    chars_q2 = set(str(row['question2']))
    return jaccard_distance(chars_q1, chars_q2)

コード例 #39

0

ファイルを表示

ファイル: utils.py プロジェクト: sigstj/fmap

def getJaccardDistance(a, b):
    """Return the Jaccard distance between the element sets of `a` and `b`.

    A ZeroDivisionError raised during the computation is reported as a
    distance of 0.
    """
    try:
        return jaccard_distance(set(a), set(b))
    except ZeroDivisionError:
        return 0

コード例 #40

0

ファイルを表示

ファイル: SimilaritySample.py プロジェクト: anhtukhtn/Similarity

def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):
  """Score dictionary senses of WORD against its WordNet noun synsets.

  Steps, in the order they run:

  1. Forward pass: for every (WordNet noun synset, dictionary entry) pair,
     `path_similarity` values between the synsets returned by
     `get_nbest_synsets_n_v_with_word(dict_words, WORD)` and by
     `WordnetProcess.get_synsets_n_v(WORD, wn_words)` are averaged (best 8 per
     dictionary synset, then a weighted mean of the best 10 in which the top
     value counts five times) into `matrix_similarity`.
  2. Reverse pass: the same computation with the two roles swapped, stored in
     `matrix_similarity_reverse`.
  3. `matrix_similarity` becomes the element-wise mean of the forward matrix
     and the transposed reverse matrix.
  4. A Jaccard distance is computed between the POS-filtered, lemmatized words
     of each synset definition and each dictionary entry's "d" text, and the
     final score is `(10 * path_score + 2 * (1 - jaccard)) / 12`.
  5. Every intermediate matrix is printed, and the final matrix is written
     with `FileProcess.write_to_excel_file` to
     "Results/<WORD>_synsets_synsets_nbest_withword_average.csv".

  `dict_words` is indexed with the string keys "0", "1", ... and each entry is
  read through its "d" and "tv" keys. Entries whose "tv" is missing or None
  are modified in place ("tv" is set to "--").

  Nothing is returned.
  """


  # NOTE(review): no-op; looks like a leftover debugger breakpoint anchor.
  if WORD == "bank":
    asf = 0;
  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);
  # print "dict-word_synsets"
  # print dict_words_synsets

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  wn_words = wn.synsets(WORD, pos = 'n');
  print "wn_words -------"
  print wn_words;

  wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);

  print wn_words_synsets

  # matrix for similarity dict_words vs wn_words
  # (rows: WordNet synsets, columns: dictionary entries)
  matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];
      #
      for dict_synset in dict_words_synsets[iDictWord]:

        # print "------------ dict noun"
        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:
          #
          p_max = dict_synset.path_similarity(wn_synset);
          if p_max == None:
            continue

          arr_p.append(p_max);

          # print p_max

        arr_p = sorted(arr_p, reverse=True);

        # Average of the nBest highest similarities; count starts at 0.0001 so
        # the division below never divides by zero.
        nBest = 8;
        count = 0.0001;
        # NOTE(review): this bound skips the last element of arr_p, whereas the
        # matching loop of the reverse pass uses range(len(arr_p)) -- confirm
        # which of the two is intended.
        for i in xrange(0, len(arr_p)-1):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count += 1;

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      # Weighted mean of the nBest highest per-synset scores: the top score is
      # weighted 5, every other one 1 (all non-top branches below multiply by
      # 1), and the divisor starts at 5.
      nBest = 10;
      count = 5;
      for i in range(len(arr_p_word)):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      # count starts at 5 and only grows, so the zero branch cannot be taken.
      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ####################################################################################################

  # Print the forward matrix as a tab-separated, column-aligned table.
  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # dictionary data
  #
  # Reverse pass: from here on the names are swapped -- wn_words holds the
  # dictionary entries and dict_words holds the WordNet synsets.

  wn_words = dict_words;
  wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words,WORD);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  # wordnet data

  dict_words = wn.synsets(WORD, pos = 'n');
  # print wn_words;
  dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words);

  print "sysnets -----------------------.----.-----.--.-"

  # matrix for similarity dict_words vs wn_words
  # (with the swapped names: rows are dictionary entries, columns are WordNet
  # synsets, i.e. the transpose of matrix_similarity's layout)
  matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################
  #
  # calculate 2d matrix of p

  for iWnWord in range(len(wn_words)):

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      p_iWnWord_iDictWord = 0.;

      arr_p_word = [];

      for dict_synset in dict_words_synsets[iDictWord]:

        # print dictNoun;
        p_dictNoun_wnNouns = 0;

        # for some nouns don't have synsets
        # NOTE(review): countwnNouns is assigned but never read in this function.
        countwnNouns = 0.00000001;

        arr_p  = [];

        # - - - - - - - - - - - - - - - - - - - - - - - -

        for wn_synset in wn_words_synsets[iWnWord]:

          p_max = dict_synset.path_similarity(wn_synset);
          if p_max != None:
            arr_p.append(p_max);

          # print p_max
          # - - - - - - - - - - - - - - - - - - - - - - - -

        arr_p = sorted(arr_p, reverse=True);

        nBest = 8;
        count = 0.0001
        for i in range(len(arr_p)):
          if i < nBest:
            p_dictNoun_wnNouns += arr_p[i];
            count +=1

        p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
        arr_p_word.append(p_dictNoun_wnNouns);

      arr_p_word = sorted(arr_p_word, reverse=True);
      nBest = 10;
      count = 5;
      # NOTE(review): this bound skips the last element of arr_p_word, whereas
      # the matching loop of the forward pass uses range(len(arr_p_word)) --
      # confirm which of the two is intended.
      for i in xrange(0, len(arr_p_word)-1):
        if i < nBest:
          if nBest > len(arr_p_word):
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5;
            elif i< nBest/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;
          else:
            if i == 0:
              p_iWnWord_iDictWord += arr_p_word[i]*5.;
            elif i< len(arr_p_word)/3:
              p_iWnWord_iDictWord += arr_p_word[i]*1.;
            else:
              p_iWnWord_iDictWord += arr_p_word[i]*1;

          count += 1;

      # count starts at 5 and only grows, so the zero branch cannot be taken.
      if count == 0:
        p_iWnWord_iDictWord = 0;
      else:
        p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
      matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord;
      # - - - - - - - - - - - - - - - - - - - - - - - - -

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  ####################################################################################################

  # Print the reverse matrix.
  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity_reverse]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)

    # word-word
    # - - - - - - - - - - - - - - - - - - - - - - - - - - -

  # Undo the swap: dict_words is the dictionary again, wn_words the synsets.
  dict_words = wn_words;
  wn_words = wn.synsets(WORD, pos = 'n');

  # Average the forward matrix with the transposed reverse matrix.
  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
      matrix_similarity[iWnWord][iDictWord] /= 2;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # Jaccard distance between the words of each WordNet definition and the
  # words of each dictionary entry's "d" text.
  #

  matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

  for iWnWord in range(len(wn_words)):

    # Keep only tokens tagged NN, NNS, JJ, VB, VBN, VBD, RB or with an empty
    # tag, then lemmatize them.
    tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()));
    words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

    # words = nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition());
    # print words
    for i in range(len(words)):
      words[i] = wordnet_lemmatizer.lemmatize(words[i]);
    wn_set = set(words);
    # wn_set = set(wn.synset(wn_words[iWnWord].name()).definition().split())
    # print wn_set

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # word-word
    for iDictWord in range(len(dict_words)):

      # Entries without a definition get the maximum distance (1).
      if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] == None:
        matrix_similarity_jaccard[iWnWord][iDictWord] = 1;
        continue

      tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]));
      words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

      # words = nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]);
      # print words
      for i in range(len(words)):
        words[i] = wordnet_lemmatizer.lemmatize(words[i]);
      dict_set = set(words);
      # print
      # dict_set = set(dict_words[str(iDictWord)]["d"].encode('utf8').split());
      matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set,dict_set);


  # Blend: path-based score weighted 10, Jaccard similarity (1 - distance)
  # weighted 2, normalised by 12.
  for iWnWord in range(len(wn_words)):
    for iDictWord in range(len(dict_words)):
      matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*10 + 2*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);
      matrix_similarity[iWnWord][iDictWord] /= 12;

  ####################################################################################################

  print "----------------------------------------------------"
  s = [[str(e) for e in row] for row in matrix_similarity]
  lens = [max(map(len, col)) for col in zip(*s)]
  fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
  table = [fmt.format(*row) for row in s]
  print '\n'.join(table)


  ####################################################################################################
  #
  # write file

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # col: prefix every row with the definition of its WordNet synset
  # NOTE(review): arrColWn is assigned but never read in this function.
  arrColWn = [];
  for i in range(len(wn_words)):
    matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

  # - - - - - - - - - - - - - - - - - - - - - - - - -
  # row: header made of "--" followed by each dictionary entry's "tv" text
  # (missing or None "tv" values are replaced by "--" on the entry itself)
  arrRowDict = [];
  arrRowDict.append("--");
  for i in range(len(dict_words)):
    if not dict_words[str(i)].has_key('tv'):
      dict_words[str(i)]['tv'] = "--";
    if dict_words[str(i)]['tv'] == None:
      dict_words[str(i)]['tv'] = "--"
    arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

  FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword_average.csv",arrRowDict,matrix_similarity)

コード例 #41

0

ファイルを表示

ファイル: contents.py プロジェクト: interpretation-experiment/analysis

 def uc_distance(self, sentence):
     """Return the Jaccard distance between the content-lemma sets of `self`
     and `sentence`; lemma order and repetitions are ignored."""
     own_lemmas = set(self.content_lemmas)
     other_lemmas = set(sentence.content_lemmas)
     return jaccard_distance(own_lemmas, other_lemmas)

コード例 #42

0

ファイルを表示

ファイル: generate_generic_features.py プロジェクト: shj1987/MEDIQA_WTMED

def _jaccard(sent1, sent2):
    """Return the Jaccard distance between the element sets of two sentences.

    Each argument is converted to a set of its elements before comparison.
    """
    return jaccard_distance(set(sent1), set(sent2))