def ngrams_similarity(s1, s2, filter_stop_words=True):
    """Return the Jaccard similarity of two sentences over word n-grams.

    Both sentences are lower-cased and tokenized with NLTK; when
    ``filter_stop_words`` is true, tokens found in ``stop_words`` are dropped
    before the n-grams are built.

    Returns a ``(sim2, sim3, sim4)`` tuple for bigrams, trigrams and
    4-grams.  An entry is ``0`` when either sentence is too short to yield
    an n-gram of that order.
    """
    tokens_a = nltk.word_tokenize(s1.lower())
    tokens_b = nltk.word_tokenize(s2.lower())
    if filter_stop_words:
        tokens_a = [tok for tok in tokens_a if tok not in stop_words]
        tokens_b = [tok for tok in tokens_b if tok not in stop_words]

    def overlap(order):
        # Similarity for one n-gram order; 0 when either side has no n-grams.
        grams_a = set(nltk.ngrams(tokens_a, order))
        grams_b = set(nltk.ngrams(tokens_b, order))
        if not grams_a or not grams_b:
            return 0
        return 1 - jaccard_distance(grams_a, grams_b)

    return overlap(2), overlap(3), overlap(4)
def fun_1_5_2():
    """Print the NLTK Jaccard distance between two small example sets."""

    def jacc_similarity(query, document):
        # Hand-rolled Jaccard similarity kept for comparison with NLTK.
        # float() avoids truncating integer division under Python 2.
        first = set(query).intersection(set(document))
        second = set(query).union(set(document))
        return float(len(first)) / len(second)

    from nltk.metrics import jaccard_distance

    X = {10, 20, 30, 40}
    Y = {20, 30, 60}
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(jaccard_distance(X, Y))
def compute_feature5(frases1, frases2, X_train_or_test):
    """Append a Jaccard-distance column to a feature matrix.

    Each sentence pair from ``frases1``/``frases2`` is passed through
    ``preprocess`` (with a WordNet lemmatizer and the English stop-word
    set), and the Jaccard distance between the two resulting token sets
    becomes one row of the new column.

    Returns ``X_train_or_test`` with the new column concatenated on axis 1.
    """
    sw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    # The distance is computed exactly once per pair; the original evaluated
    # it a second time and threw the first result away.
    feature = [
        jaccard_distance(set(preprocess(sent1, wnl, sw)),
                         set(preprocess(sent2, wnl, sw)))
        for sent1, sent2 in zip(frases1, frases2)
    ]
    column = np.array(feature).reshape(len(feature), 1)
    return np.concatenate((X_train_or_test, column), axis=1)
def print_absolute_agreement(
        cls,
        dataframe: pd.DataFrame,
        iaa_by_column_dict: Optional[Dict] = None) -> None:
    """Print a pairwise inter-annotator agreement table per closed-class column.

    When ``iaa_by_column_dict`` is not supplied it is computed with
    ``cls.run_closed_class_jaccard_and_masi(dataframe)``.  For every column
    in ``cls.CLOSED_CLASS_COLUMNS`` a tab-separated matrix is printed: one
    row per annotator, one cell per annotator pair (mean of
    ``1 - jaccard_distance`` over the zipped annotations), followed by a
    row summary.
    """
    if iaa_by_column_dict is None:
        iaa_by_column_dict = cls.run_closed_class_jaccard_and_masi(dataframe)

    for column in cls.CLOSED_CLASS_COLUMNS:
        df = iaa_by_column_dict[column]['df']
        print(f"Interannotator agreement for {column}")
        annotator_list = dataframe.source_spreadsheet.unique()
        print(" \t" + "\t".join(map(str, annotator_list)))

        for row_annotator in annotator_list:
            row_vals = list(df[df.source_spreadsheet == row_annotator][column])
            print(f"{row_annotator}", end="\t")
            pairwise_agreements = []
            for col_annotator in annotator_list:
                col_vals = list(
                    df[df.source_spreadsheet == col_annotator][column])
                agreement_sum = sum(
                    1 - jaccard_distance(row_val, col_val)
                    for row_val, col_val in zip(row_vals, col_vals))
                pairwise_agreements.append(
                    agreement_sum / min(len(row_vals), len(col_vals)))
                print(f"{pairwise_agreements[-1]:.2f}", end="\t")
            # NOTE(review): subtracting 1 presumably removes the annotator's
            # agreement with themselves from the mean - confirm.
            row_summary = ((sum(pairwise_agreements) - 1)
                           / (len(pairwise_agreements) - 1))
            print(f"\t{row_summary:.2f}")
        print()
        print()
def jaccard_sim(word1, word2):
    """Return the Jaccard similarity (1 - distance) of the element sets of two iterables."""
    return 1 - jaccard_distance(set(word1), set(word2))
def jaccard_common(nominals):
    """Compare the Brown-corpus contexts of two nominals.

    Punctuation characters are stripped from every Brown token (tokens that
    become empty are dropped), and the flattened corpus is scanned in word
    5-grams.  Words sharing a 5-gram with ``nominals[0]`` are collected for
    the first nominal; otherwise, words sharing a 5-gram with
    ``nominals[1]`` are collected for the second.

    Returns ``([top4_first, top4_second], distance)`` where each ``top4``
    list holds the most common context words and ``distance`` is the
    Jaccard distance between the two full context vocabularies.
    """
    corpus_words = []
    for sent in brown.sents():
        for token in sent:
            cleaned = ''.join(c for c in token if c not in string.punctuation)
            if cleaned:
                corpus_words.append(cleaned)

    e1_words, e2_words = [], []
    for five_gram in ngrams(corpus_words, 5):
        if nominals[0] in five_gram:
            e1_words.extend(w for w in five_gram if w != nominals[0])
        elif nominals[1] in five_gram:
            e2_words.extend(w for w in five_gram if w != nominals[1])

    e1_count = Counter(e1_words)
    e2_count = Counter(e2_words)
    e1_top = [word for word, _ in e1_count.most_common(4)]
    e2_top = [word for word, _ in e2_count.most_common(4)]
    return [e1_top, e2_top], jaccard_distance(set(e1_count), set(e2_count))
def jaccard_POS(sen_1, sen_2):
    """Return the Jaccard distance between the sets built from ``split_and_POS`` of each sentence."""
    tags_1 = set(split_and_POS(sen_1))
    tags_2 = set(split_and_POS(sen_2))
    return jaccard_distance(tags_1, tags_2)
def word_word_is_similarity(phrase_word_1, phrase_word_2):
    """Return True when two phrases share almost all of their words.

    Both phrases are split with ``Util.split_unicode_words``.  If either
    yields no words the result is ``False``; otherwise the phrases count as
    similar when the Jaccard distance of their word sets is below 0.1.
    """
    words_1 = Util.split_unicode_words(phrase_word_1)
    words_2 = Util.split_unicode_words(phrase_word_2)
    if len(words_1) == 0 or len(words_2) == 0:
        return False
    # jaccard_distance works on sets (it calls union/intersection); coerce
    # explicitly so a list-returning splitter cannot break the call.
    distance = jaccard_distance(set(words_1), set(words_2))
    return distance < 0.1
def ngrams_word_for(sen_1, sen_2, n_grams):
    """Return a smoothed Jaccard similarity over the n-grams of two sentences.

    The n-grams come from ``get_ngrams_for_sen``.  The result is
    ``1.00001 - jaccard_distance`` so it is never exactly zero; when either
    sentence yields no n-grams the floor value ``0.00001`` is returned.
    """
    set_1 = set(get_ngrams_for_sen(sen_1, n_grams))
    set_2 = set(get_ngrams_for_sen(sen_2, n_grams))
    if not set_1 or not set_2:
        return 0.00001
    return 1.00001 - jaccard_distance(set_1, set_2)
def score(self, lbl_types, ref_types, stemmed_word):
    """Return the Jaccard distance between ``lbl_types`` and ``ref_types``.

    An empty ``ref_types`` scores the maximum distance ``1``.  When
    ``stemmed_word`` is truthy the reference is first rewritten with
    ``self.replace_stem(stemmed_word, ref_types, lbl_types)``.
    """
    # Hack: ref 23643 is empty after applying the rules (the "A*D" case from
    # the csv file), so an empty reference is scored as fully distant.
    if len(ref_types) == 0:
        return 1
    reference = ref_types
    if stemmed_word:
        reference = self.replace_stem(stemmed_word, ref_types, lbl_types)
    return jaccard_distance(lbl_types, reference)
def jaccard_POS_ngrams(sen_1, sen_2, n_grams):
    """Return the Jaccard distance between n-gram sets derived from ``split_and_POS``.

    Each sentence goes through ``split_and_POS`` and then
    ``get_ngrams_for(..., n_grams)``; the distance is taken over the two
    resulting sets.
    """
    grams_1 = set(get_ngrams_for(split_and_POS(sen_1), n_grams))
    grams_2 = set(get_ngrams_for(split_and_POS(sen_2), n_grams))
    return jaccard_distance(grams_1, grams_2)
def jaccard_distance_chunk(D):
    '''
    Calculates the jaccard distance between lemmatized list pairs.

    For a non-empty ``D`` the ``JACCARD_DISTANCE`` column is filled row by
    row from ``Q_WORD_TOKENIZED``: the first two entries of each row are
    parsed with ``literal_eval`` and compared as sets.  An empty ``D`` is
    returned untouched.
    '''
    if len(D) == 0:
        return D

    def pair_distance(pair):
        left = set(literal_eval(pair[0]))
        right = set(literal_eval(pair[1]))
        return jaccard_distance(left, right)

    D[JACCARD_DISTANCE] = D.loc[:, Q_WORD_TOKENIZED].apply(pair_distance, axis=1)
    return D
def getName(namesList, email):
    """Pick the name from ``namesList`` that best matches the e-mail local parts.

    Names equal (case-insensitively) to ``email[0]`` or ``email[1]`` are
    ignored.  Every remaining name is compared with the part before ``@``
    of each non-``None`` address, using a weighted mix of character
    2-/3-/4-gram Jaccard similarities (weights 0.2 / 0.5 / 0.3), averaged
    over the addresses used.

    Returns the best-scoring name, or ``None`` when there are no
    candidates, no usable address, or the best score is below 0.05.
    """

    def similarity(grams_a, grams_b):
        # jaccard_distance divides by the size of the union, so two empty
        # n-gram sets (very short strings) would raise ZeroDivisionError.
        if not grams_a and not grams_b:
            return 0.0
        return 1.0 - jaccard_distance(grams_a, grams_b)

    own_addresses = (str(email[0]).lower(), str(email[1]).lower())
    candidates = [name for name in namesList
                  if str(name).lower() not in own_addresses]

    sim = 0.0
    person = None
    for name in candidates:
        simn = 0.0
        division = 0
        for mail in email:
            # The None check has to precede split(); originally it came
            # after it and so could never prevent the AttributeError.
            if mail is None:
                continue
            namemail = str(mail.split('@')[0])
            simn += (0.2 * similarity(set(ngrams(namemail, 2)), set(ngrams(name, 2)))
                     + 0.5 * similarity(set(ngrams(namemail, 3)), set(ngrams(name, 3)))
                     + 0.3 * similarity(set(ngrams(namemail, 4)), set(ngrams(name, 4))))
            division += 1
        if division == 0:
            # No usable address at all: nothing can be matched.
            person = None
            break
        simn /= division
        if simn > sim:
            sim = simn
            person = name

    # Reject matches that are too weak to be meaningful.
    if sim < 0.05:
        person = None
    return person
def ne_simmilarity(s1, s2):
    """Return the Jaccard similarity of the ``ner_transform`` output of two sentences.

    Returns ``0`` when either transformed sentence is empty.
    """
    entities_1 = ner_transform(s1)
    entities_2 = ner_transform(s2)
    if len(entities_1) == 0 or len(entities_2) == 0:
        return 0
    return 1 - jaccard_distance(set(entities_1), set(entities_2))
def predict(self, data_frame, maximum=5):
    """Score every row of ``data_frame`` by scaled Jaccard similarity.

    For each row, the element sets of ``row['sentence0']`` and
    ``row['sentence1']`` are compared and the similarity
    (``1 - jaccard_distance``) is multiplied by ``maximum``.

    Returns the list of scores in row order.
    """
    return [
        (1 - jaccard_distance(set(row['sentence0']), set(row['sentence1']))) * maximum
        for _, row in data_frame.iterrows()
    ]
def compute_feature10(frases1, frases2, X_train_or_test):
    """Append a Jaccard-similarity column to a feature matrix.

    Each sentence of a pair is passed through ``method`` and the
    similarity (``1 - jaccard_distance``) of the two result sets becomes
    one row of the new column.

    Returns ``X_train_or_test`` with the new column concatenated on axis 1.
    """
    similarities = []
    for first, second in zip(frases1, frases2):
        result_first = method(first)
        result_second = method(second)
        similarities.append(
            1 - jaccard_distance(set(result_first), set(result_second)))
    new_column = np.array(similarities).reshape(len(similarities), 1)
    return np.concatenate((X_train_or_test, new_column), axis=1)
def validator(mappings_dict, client, index_theta_one, index_theta_two,
              datetime_from_tm_2, datetime_to_tm_1, number_of_topics):
    """Score topic mappings by the document overlap of parent and child topics.

    ``mappings_dict`` maps a threshold to ``{parent_topic: [child_topic, ...]}``.
    For every parent/child pair, the ids of documents returned by ``search``
    for the parent (in ``index_theta_one``) and for the child (in
    ``index_theta_two``) are compared with a Jaccard similarity.

    Returns a dict ``threshold -> [coverage, average_similarity,
    normalized_average]`` where ``coverage`` is
    ``len(map_dict) / number_of_topics`` and the normalized value is the
    average min-max scaled across the thresholds that had at least one
    pair.  Thresholds without any pair get ``0`` for both the average and
    the normalized value.
    """
    from sklearn.preprocessing import MinMaxScaler
    from nltk.metrics import jaccard_distance

    def document_ids(index, topic_id):
        # Ids of documents assigned to `topic_id` within the time window.
        hits = search(client=client,
                      index=index,
                      query={'datetime__gte': datetime_from_tm_2,
                             'datetime__lte': datetime_to_tm_1,
                             'topic_id': topic_id,
                             'topic_weight__gte': 0.05},
                      source=['document_es_id'],
                      start=0,
                      end=1000000,
                      get_scan_obj=True)
        return set(elem.document_es_id for elem in hits)

    scores = {}
    # Kept in lockstep so each scaled value goes back to the threshold it
    # came from.  The original indexed the scaled list by position over ALL
    # thresholds, which misaligned (and eventually raised IndexError) as
    # soon as one threshold had no pairs.
    scored_thresholds = []
    average_scores = []

    for threshhold, map_dict in mappings_dict.items():
        similarity_sum = 0
        cnt_matches_for_threshhold = 0
        for topic_parent, topic_childs_list in map_dict.items():
            scanned_parent = document_ids(index_theta_one, topic_parent)
            for topic_child in topic_childs_list:
                scanned_child = document_ids(index_theta_two, topic_child)
                similarity_sum += 1 - jaccard_distance(scanned_parent, scanned_child)
                cnt_matches_for_threshhold += 1

        coverage = len(map_dict) / number_of_topics
        if cnt_matches_for_threshhold:
            avg_score = similarity_sum / cnt_matches_for_threshhold
            scored_thresholds.append(threshhold)
            average_scores.append(avg_score)
            scores[threshhold] = [coverage, avg_score]
        else:
            scores[threshhold] = [coverage, 0]

    normalized_by_threshold = {}
    if average_scores:
        # MinMaxScaler rejects an empty array, so only scale when there is data.
        scaled = MinMaxScaler().fit_transform(
            np.array(average_scores).reshape(-1, 1))
        normalized_by_threshold = dict(
            zip(scored_thresholds, (row[0] for row in scaled)))

    for threshhold in scores:
        scores[threshhold].append(normalized_by_threshold.get(threshhold, 0))
    return scores
def jaccard(sen_1, sen_2):
    """Return the Jaccard distance between the filtered word sets of two sentences.

    Each sentence is tokenized with ``nltk.wordpunct_tokenize`` and tagged
    with ``POSWrapper.pos_tag``; only words whose tag is one of
    NN, NNS, JJ, the empty string, VB, VBN, VBD or RB are kept.
    """
    kept_tags = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')

    def filtered_words(sentence):
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(sentence))
        return set(word for word, pos in tagged_sent if pos in kept_tags)

    sen_set_1 = filtered_words(sen_1)
    sen_set_2 = filtered_words(sen_2)
    return jaccard_distance(sen_set_1, sen_set_2)
def jaccard_similarity_stemmer(self, config, sentence1, sentence2):
    """
    Computes stem unigram similarity.

    The text at index 1 of each sentence is tokenized, tokens in
    ``self.punctuation_set`` are dropped and the rest are stemmed with
    ``self.stemmer``; the result is ``1 - jaccard_distance`` of the two
    stem sets.  ``config`` is not used here.
    """
    def stem_set(sentence):
        return set(self.stemmer.stem(token)
                   for token in word_tokenize(sentence[1])
                   if token not in self.punctuation_set)

    stems_1 = stem_set(sentence1)
    stems_2 = stem_set(sentence2)
    return 1 - jaccard_distance(stems_1, stems_2)
def cal_ngrams_by_jacc(wn_grams, ox_grams):
    """Return the matrix of Jaccard similarities between two n-gram collections.

    Entry ``[i][j]`` is ``1 - jaccard_distance`` between the set of
    ``wn_grams[i]`` and the set of ``ox_grams[j]``.
    """
    matrix_similarity_jaccard = []
    # Positional indexing is kept on purpose so the inputs are accessed
    # exactly as before (by 0..len-1 subscripts).
    for wn_index in range(len(wn_grams)):
        wn_set = set(wn_grams[wn_index])
        matrix_similarity_jaccard.append([
            1 - jaccard_distance(wn_set, set(ox_grams[ox_index]))
            for ox_index in range(len(ox_grams))
        ])
    return matrix_similarity_jaccard
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the lower-cased token sets of two sentences.

    Tokenization is best-effort: if lower-casing or tokenizing fails (for
    example because an input is not a string) the error is reported on
    stdout and ``0`` is returned.  ``0`` is also returned when either
    sentence has no tokens.
    """
    try:
        tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
        tokenized_sentence_2 = nltk.word_tokenize(s2.lower())
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed by the best-effort fallback.
        print("Error: S1[%s] \n S2[%s]" % (s1, s2))
        return 0

    if len(tokenized_sentence_1) > 0 and len(tokenized_sentence_2) > 0:
        return 1 - jaccard_distance(set(tokenized_sentence_1),
                                    set(tokenized_sentence_2))
    return 0
def computeDistances(self):
    """Flatten ``self.keywords`` and build the pairwise Jaccard distance matrix.

    For every keyword (in sorted key order) each tuple ``tup`` is appended
    to ``self.allItems`` as ``(tup[0], tup[1], keyword)``, and consecutive
    items of the same keyword are linked in ``self.allSeq`` as
    ``[previous_index, index, keyword]``.  ``self.dist[i, j]`` is then the
    Jaccard distance between the sets built from element 1 of items ``i``
    and ``j``; pairs for which the distance is undefined
    (ZeroDivisionError) get ``0``.
    """
    # .items() is available on both Python 2 and 3 (iteritems() is not).
    for k, v in sorted(self.keywords.items()):
        prev = None
        for tup in v:
            self.allItems.append((tup[0], tup[1], k))
            cnt = len(self.allItems) - 1
            if prev is not None:
                self.allSeq.append([prev, cnt, k])
            prev = cnt

    n = len(self.allItems)
    self.dist = numpy.zeros(shape=(n, n))
    # Build each set once instead of once per matrix cell.
    item_sets = [set(item[1]) for item in self.allItems]
    for i in range(n):
        # The distance is symmetric, so fill both halves from one call.
        for j in range(i, n):
            try:
                distance = jaccard_distance(item_sets[i], item_sets[j])
            except ZeroDivisionError:
                distance = 0  # both sets empty
            self.dist[i, j] = distance
            self.dist[j, i] = distance
def get_dists(keyword):
    """Return the three entries of ``words_preprocessed`` closest to ``keyword``.

    For every word, the edit distance, Jaro similarity, Jaro-Winkler
    similarity and character-set Jaccard distance to ``keyword`` are
    collected.  The rows are sorted by edit distance and the first three
    are returned as a DataFrame.
    """
    rows = [
        {
            "edit_dist": edit_distance(word, keyword),
            "jaro_simi": jaro_similarity(word, keyword),
            "jaro_winkler_simi": jaro_winkler_similarity(word, keyword),
            "jaccard_dist": jaccard_distance(set(word), set(keyword)),
            "word": word,
            "keyword": keyword,
        }
        for word in words_preprocessed
    ]
    ranked = pd.DataFrame(rows).sort_values("edit_dist")
    return ranked.iloc[0:3, :]
def similarity_by_jaccard(ox_defis, wn_defis):
    """Return the Jaccard similarity matrix between two lists of definitions.

    Every definition is tokenized with ``nltk.wordpunct_tokenize``, tagged
    with ``POSWrapper.pos_tag``, reduced to words tagged NN, NNS, JJ, the
    empty string, VB, VBN, VBD or RB, and lemmatized with
    ``wordnet_lemmatizer``.  Entry ``[i][j]`` of the result is
    ``1 - jaccard_distance`` between the lemma set of ``wn_defis[i]`` and
    that of ``ox_defis[j]``.
    """
    kept_tags = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')

    def lemma_set(definition):
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(definition))
        return set(wordnet_lemmatizer.lemmatize(word)
                   for word, pos in tagged_sent if pos in kept_tags)

    if len(wn_defis) == 0:
        return []

    # The Oxford-side sets do not depend on the WordNet row, so build them
    # once instead of re-tagging and re-lemmatizing them for every row.
    ox_sets = [lemma_set(ox_defis[i]) for i in range(len(ox_defis))]

    matrix_similarity_jaccard = []
    for wn_index in range(len(wn_defis)):
        wn_set = lemma_set(wn_defis[wn_index])
        matrix_similarity_jaccard.append(
            [1 - jaccard_distance(wn_set, ox_set) for ox_set in ox_sets])
    return matrix_similarity_jaccard
def synsets_similarity(s1, s2):
    """
    Find the jaccard similarity between two sentences synsets using lesk
    algorithm to disambiguate words given their context.

    Stop-word lemmas are skipped.  When lesk finds no sense for a lemma,
    the first WordNet synset for that lemma and part of speech is used, if
    there is one.  Returns 0 when either sentence ends up with no synsets.
    """
    lemmas_1, tagged_1 = lemmatize_sentence(s1.lower())
    lemmas_2, tagged_2 = lemmatize_sentence(s2.lower())

    def disambiguate(lemmas, tagged):
        # One synset per non-stop-word lemma: lesk first, WordNet fallback.
        senses = []
        for lemma, word_tag in zip(lemmas, tagged):
            if lemma in stop_words:
                continue
            sense = lesk(lemmas, lemma, wordnet_pos_code(word_tag[1]))
            if sense is None:
                found = wordnet.synsets(lemma, wordnet_pos_code(word_tag[1]))
                if len(found) == 0:
                    continue
                sense = found[0]
            senses.append(sense)
        return senses

    synsets_1 = disambiguate(lemmas_1, tagged_1)
    synsets_2 = disambiguate(lemmas_2, tagged_2)

    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return 0
    return 1 - jaccard_distance(set(synsets_1), set(synsets_2))
def computeDistances(self):
    """Flatten ``self.keywords`` and build the pairwise Jaccard distance matrix.

    For every keyword (in sorted key order) each tuple ``tup`` is appended
    to ``self.allItems`` as ``(tup[0], tup[1], keyword)``, and consecutive
    items of the same keyword are linked in ``self.allSeq`` as
    ``[previous_index, index, keyword]``.  ``self.dist[i, j]`` is then the
    Jaccard distance between the sets built from element 1 of items ``i``
    and ``j``; pairs for which the distance is undefined
    (ZeroDivisionError) get ``0``.
    """
    # .items() is available on both Python 2 and 3 (iteritems() is not).
    for k, v in sorted(self.keywords.items()):
        prev = None
        for tup in v:
            self.allItems.append((tup[0], tup[1], k))
            cnt = len(self.allItems) - 1
            if prev is not None:
                self.allSeq.append([prev, cnt, k])
            prev = cnt

    n = len(self.allItems)
    self.dist = numpy.zeros(shape=(n, n))
    # Build each set once instead of once per matrix cell.
    item_sets = [set(item[1]) for item in self.allItems]
    for i in range(n):
        # The distance is symmetric, so fill both halves from one call.
        for j in range(i, n):
            try:
                distance = jaccard_distance(item_sets[i], item_sets[j])
            except ZeroDivisionError:
                distance = 0  # both sets empty
            self.dist[i, j] = distance
            self.dist[j, i] = distance
def number_overlap(self, config, sentence1, sentence2):
    """
    Computes the Jaccard similarity (1 - distance) between the sets of the
    cardinal numbers that appear in the two sentences.

    Tokens are read from index 2 of each sentence and their tags from index
    3; only tokens tagged 'CD' are considered.  It tries to parse numbers
    expressed in words (for example 'two', 'three hundreds and forty-nine'),
    numbers using ',' (for example '16,432,970') and decimal numbers (for
    example '45,233.4123').  If the number can't be parsed, it's added as
    the corresponding string to the set.  Returns 0 when the distance is
    undefined (ZeroDivisionError).
    """
    # Parsers are tried in order; the first one not raising ValueError wins.
    parsers = (
        lambda token: w2n.word_to_num(token),
        lambda token: w2n.word_to_num(token.replace(',', '')),
        lambda token: float(token.replace(',', '')),
    )

    def cardinal_values(sentence):
        values = []
        for i, word in enumerate(sentence[2]):
            if sentence[3][i] != 'CD':
                continue
            for parse in parsers:
                try:
                    parsed = parse(word)
                except ValueError:
                    continue
                values.append(parsed)
                break
            else:
                # Nothing could parse it: keep the raw token.
                values.append(word)
        return values

    numbers1 = cardinal_values(sentence1)
    numbers2 = cardinal_values(sentence2)
    try:
        return 1 - jaccard_distance(set(numbers1), set(numbers2))
    except ZeroDivisionError:
        return 0
def dependency_similarity(s1, s2):
    """
    Find the jaccard similarity between the dependency triples of the two
    sentences, taken from the first parse returned by ``parser.raw_parse``.

    Returns 0 when either parse yields no triples.
    """
    first_parse = parser.raw_parse(s1)
    second_parse = parser.raw_parse(s2)
    first_tree = next(first_parse)
    second_tree = next(second_parse)

    triples_1 = list(first_tree.triples())
    triples_2 = list(second_tree.triples())
    if len(triples_1) == 0 or len(triples_2) == 0:
        return 0
    return 1 - jaccard_distance(set(triples_1), set(triples_2))
def compute_feature8(frases1, frases2, X_train_or_test):
    """Append a trigram Jaccard-distance column to a feature matrix.

    Each sentence pair is passed through ``preprocess`` (with a WordNet
    lemmatizer and the English stop-word set) and turned into trigrams.
    The row value is the Jaccard distance between the two trigram sets, or
    ``0`` when either sentence yields no trigram.

    Returns ``X_train_or_test`` with the new column concatenated on axis 1.
    """
    sw = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    feature = []
    for sent1, sent2 in zip(frases1, frases2):
        trigrams1 = list(nltk.trigrams(preprocess(sent1, wnl, sw)))
        trigrams2 = list(nltk.trigrams(preprocess(sent2, wnl, sw)))
        if len(trigrams1) == 0 or len(trigrams2) == 0:
            # NOTE(review): 0 is also the distance of identical sets, so
            # "no trigrams" is indistinguishable from "same trigrams" -
            # confirm this is intended.
            feature.append(0)
        else:
            feature.append(jaccard_distance(set(trigrams1), set(trigrams2)))
    column = np.array(feature).reshape(len(feature), 1)
    return np.concatenate((X_train_or_test, column), axis=1)
def lemmas_similarity(s1, s2, filter_stop_words=True):
    """
    Jaccard lemmatized sentences similarity.

    Both sentences are lower-cased, tokenized, optionally stripped of stop
    words (when ``filter_stop_words`` is true), POS-tagged and lemmatized
    with ``lemmatize(tagged_word, wnl)``.  Returns ``1 - jaccard_distance``
    of the two lemma sets, or 0 when either sentence has no lemmas.
    """
    # Tokenize by sentences into words in lower case
    tokenized_sentence_1 = nltk.word_tokenize(s1.lower())
    tokenized_sentence_2 = nltk.word_tokenize(s2.lower())

    # The flag was inverted: stop words were removed only when the caller
    # asked NOT to filter them.
    if filter_stop_words:
        tokenized_sentence_1 = [
            token for token in tokenized_sentence_1 if token not in stop_words
        ]
        tokenized_sentence_2 = [
            token for token in tokenized_sentence_2 if token not in stop_words
        ]

    tagged_sentence_1 = pos_tag(tokenized_sentence_1)  # [(word, POS_TAG), ...]
    tagged_sentence_2 = pos_tag(tokenized_sentence_2)  # [(word, POS_TAG), ...]

    lemmas_sentence_1 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_1
    ]
    lemmas_sentence_2 = [
        lemmatize(tagged_word, wnl) for tagged_word in tagged_sentence_2
    ]

    if len(lemmas_sentence_1) > 0 and len(lemmas_sentence_2) > 0:
        return 1 - jaccard_distance(set(lemmas_sentence_1),
                                    set(lemmas_sentence_2))
    return 0
def create_graph():
    """Build, draw and export a document-similarity graph.

    One node is created per document returned by
    ``Collection().get_docs(author="Wau Holland")``, with ids starting at
    1.  Each pair of documents is weighted by the Jaccard similarity of
    their stem sets multiplied by the ratio of the longer to the shorter
    stem set; pairs weighing more than 0.3 get an edge and have their
    shared stems recorded in the ``cluster_stems`` node attribute.  The
    graph is then drawn with matplotlib, written as YAML to
    ``get_graph_file()`` and exported through ``d3_js``.
    """
    g = nx.Graph()
    xml_docs = Collection()
    xml_docs_subset = xml_docs.get_docs(author="Wau Holland")
    docs_no = len(xml_docs_subset)
    id_dict = dict()
    stems_dict = dict()

    print("Put stems into a dict for each document (with an uniq id) ...")
    print("Create nodes with all the documents' relevant information ...")
    pb = ProgressBar(maxval=docs_no).start()
    for doc_id, xml_doc in enumerate(xml_docs_subset, 1):
        pb.update(doc_id)
        id_dict[xml_doc.get_xml_filename()] = doc_id
        g.add_node(doc_id,
                   id=xml_doc.get_id(),
                   rawlen=xml_doc.get_rawlen(),
                   subj=xml_doc.get_subj(),
                   author=xml_doc.get_author(),
                   date=xml_doc.get_date(),
                   words=xml_doc.get_words(),
                   uniq_stems=list(xml_doc.get_stems(uniq=True, relev=True)),
                   rawcontent=xml_doc.get_rawcontent())
        # The stems must be stored under the SAME id as the node.  The id
        # used to be incremented first, which shifted every stem set onto
        # the next document.  get_stems() sometimes returns a list, hence
        # the set() conversion.
        stems_dict[doc_id] = set(xml_doc.get_stems(uniq=True, relev=True))

    print("Create undirected, weighted graph based on Jaccard similarity ...")
    no_of_edges = docs_no * (docs_no - 1) // 2
    pb = ProgressBar(maxval=no_of_edges).start()
    count = 1
    # Explicit id ranges instead of relying on dict key order.
    for doc_idx1 in range(1, docs_no):
        for doc_idx2 in range(doc_idx1 + 1, docs_no + 1):
            stems1 = stems_dict[doc_idx1]
            stems2 = stems_dict[doc_idx2]
            long_doc_len = max(len(stems1), len(stems2))
            short_doc_len = min(len(stems1), len(stems2))

            # In case a document has no useful stems to classify
            edge_weight = 0
            alias_coeff = 0
            if long_doc_len != 0 and short_doc_len != 0:
                alias_coeff = float(long_doc_len) / short_doc_len
                edge_weight = (1 - jaccard_distance(stems1, stems2)) \
                    * alias_coeff
            print("%s %s" % (alias_coeff, edge_weight))

            # To be made more flexible
            if edge_weight > 0.3:
                cluster_stems = stems1.intersection(stems2)
                for node_id in (doc_idx1, doc_idx2):
                    # Each node gets its own set; previously both nodes
                    # could end up sharing one mutable set object.
                    g.node[node_id].setdefault(
                        'cluster_stems', set()).update(cluster_stems)
                g.add_edge(doc_idx1, doc_idx2, weight=edge_weight)

            pb.update(count)
            count += 1

    print("Draw graph showing possible clusters ...")
    elarge = [(u, v) for (u, v, d) in g.edges(data=True)
              if d['weight'] > 0.4]
    emedium = [(u, v) for (u, v, d) in g.edges(data=True)
               if d['weight'] > 0.2 and d['weight'] < 0.4]
    esmall = [(u, v) for (u, v, d) in g.edges(data=True)
              if d['weight'] <= 0.2]
    print("elarge:  %s" % len(elarge))
    print("emedium:  %s" % len(emedium))
    print("esmall:  %s" % len(esmall))

    pos = nx.spring_layout(g, scale=20)

    dlarge = [n for n, d in g.degree_iter() if d >= 20]
    dmedium = [n for n, d in g.degree_iter() if d > 1 and d < 20]
    dsmall = [n for n, d in g.degree_iter() if d == 1]
    dnone = [n for n, d in g.degree_iter() if d == 0]
    print("dlarge:  %s" % len(dlarge))
    print("dmedium:  %s" % len(dmedium))
    print("dsmall:  %s" % len(dsmall))
    print("dnone:  %s" % len(dnone))

    # Draw nodes
    nx.draw_networkx_nodes(g, pos, nodelist=dlarge, node_size=20,
                           node_color='b', linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dmedium, node_size=10,
                           node_color='g', alpha=0.8, linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dsmall, node_size=5,
                           node_color='b', alpha=0.2, linewidths=0)
    nx.draw_networkx_nodes(g, pos, nodelist=dnone, node_size=5,
                           node_color='b', alpha=0.2, linewidths=0)

    # Draw edges
    nx.draw_networkx_edges(g, pos, edgelist=elarge, width=0.4)
    nx.draw_networkx_edges(g, pos, edgelist=emedium, edge_color='g',
                           alpha=0.8, width=0.2)
    nx.draw_networkx_edges(g, pos, edgelist=esmall, width=0.1, alpha=0.1,
                           edge_color='b')

    plt.axis('off')
    plt.figure(1, figsize=(20, 20))
    # PNG export is currently disabled:
    #   plt.savefig("graph.png", dpi=600)

    nx.write_yaml(g, get_graph_file())
    d3_js.export_d3_js(g)
def jaccard_distance_char(s1, s2):
    """Return the Jaccard distance between the element (character) sets of ``s1`` and ``s2``."""
    return jaccard_distance(set(s1), set(s2))
def jaccard_distance_word(s1, s2):
    """Return the Jaccard distance between the sets of space-separated words of two strings."""
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    return jaccard_distance(words_1, words_2)
def nGrams(d, inputfile):
    """Compute n-gram Jaccard similarities for every sentence pair in a file.

    Reads ``d + inputfile + ".txt"`` (first line skipped, then one
    tab-separated sentence pair per line) and writes
    ``d + inputfile + "/ngrams.txt"`` with twelve similarity columns per
    pair: character 2/3/4-grams, word 1/2/3-grams, lemma 1/2/3-grams and
    POS-tag 1/2/3-grams.  Each row is also inserted into the ``text``
    widget, which is cleared first and disabled again at the end.
    """
    # Penn Treebank tag -> WordNet POS letter.  Any tag not listed here is
    # lemmatized as a verb ('v'), as before.
    wordnet_pos = {}
    for tags, letter in ((('NN', 'NNS', 'NNP', 'NNPS'), 'n'),
                         (('JJ', 'JJR', 'JJS'), 'a'),
                         (('RB', 'RBR', 'RBS'), 'r'),
                         (('VB', 'VBG', 'VBN', 'VBZ', 'VBP', 'VBD'), 'v')):
        for tag in tags:
            wordnet_pos[tag] = letter

    def lemmas_of(tagged):
        return [lemmatizer.lemmatize(word, wordnet_pos.get(tag, 'v'))
                for word, tag in tagged]

    def similarity(seq_1, seq_2, n):
        # NOTE(review): jaccard_distance raises ZeroDivisionError when both
        # n-gram sets are empty (two very short sentences) - unchanged here.
        return 1.0 - jaccard_distance(set(ngrams(seq_1, n)),
                                      set(ngrams(seq_2, n)))

    text.configure(state="normal")
    text.delete('1.0', END)

    out_dir = d + inputfile + "/"
    # `with` guarantees both files are closed even if a line fails.
    with open(d + inputfile + ".txt", 'r') as fi, \
            open(out_dir + "ngrams.txt", 'w') as fn:
        fi.readline()  # skip the header line of the input
        fn.write("char2\tchar3\tchar4\tword1\tword2\tword3\tlemma1\tlemma2\tlemma3\tposgm1\tposgm2\tposgm3\n")
        for line in fi:
            sents = line.split("\t")
            words_1 = sents[0].split()
            words_2 = sents[1].split()
            if len(words_1) < 3:
                words_1.append(".")
            if len(words_2) < 3:
                words_2.append(".")

            tagged_1 = nltk.pos_tag(words_1)
            tagged_2 = nltk.pos_tag(words_2)
            posgm1 = [tag for _, tag in tagged_1]
            posgm2 = [tag for _, tag in tagged_2]
            lemma1 = lemmas_of(tagged_1)
            lemma2 = lemmas_of(tagged_2)
            print("%s %s" % (sents[0], sents[1]))

            scores = (
                [similarity(sents[0], sents[1], n) for n in (2, 3, 4)]
                + [similarity(words_1, words_2, n) for n in (1, 2, 3)]
                + [similarity(lemma1, lemma2, n) for n in (1, 2, 3)]
                + [similarity(posgm1, posgm2, n) for n in (1, 2, 3)]
            )
            formatted = "\t".join("%.3f" % value for value in scores)
            text.insert(INSERT,
                        "%s\t%s\t%s\t\n" % (sents[0], sents[1], formatted))
            fn.write(formatted + "\n")

    text.configure(state="disabled")
def compareVietNetAndOxford(dict_VietNet, dict_Oxford):
    """Compare VietNet and Oxford translations for every shared word.

    For each word of ``dict_Oxford`` that has entries and is also present
    in ``dict_VietNet``, a 0/1 matrix is built with one row per WordNet
    noun synset and one column per Oxford sense.  The VietNet entry used
    for a row is the first one whose definition is within half the
    WordNet definition's length in Levenshtein distance.  A cell is 1 when
    the Jaccard distance between the word sets of the two "tv" fields is
    below 0.95.  Each row is prefixed with a label, and the matrix is
    appended to a CSV through ``FileProcess.append_to_excel_file``.

    Oxford entries missing "tv" or "d" (or with a ``None`` "d") are filled
    with "-" in place.
    """
    for WORD in dict_Oxford:
        if len(dict_Oxford[WORD]) == 0:
            continue

        wn_words = wn.synsets(WORD, pos="n")
        if wn_words is None:
            continue

        if WORD not in dict_VietNet:
            continue

        arr_VietNet = dict_VietNet[WORD]
        arr_Oxford = dict_Oxford[WORD]

        matrix_similarity = [[0 for x in range(len(arr_Oxford))]
                             for x in range(len(wn_words))]

        for iWn in range(len(wn_words)):
            definitionWn = wn.synset(wn_words[iWn].name()).definition()

            # First VietNet entry whose definition is close enough to the
            # WordNet definition; an empty entry when none qualifies.
            vietNet = {}
            for iVietNet in arr_VietNet:
                levenshtein_vn_wn = Util.levenshtein(
                    arr_VietNet[iVietNet]["d"], definitionWn)
                if levenshtein_vn_wn < len(definitionWn) / 2.0:
                    vietNet = arr_VietNet[iVietNet]
                    break

            if "tv" not in vietNet:
                vietNet["tv"] = ""
            viet_net_tv = vietNet["tv"]

            for iOxford in range(len(arr_Oxford)):
                oxford = arr_Oxford[str(iOxford)]
                if "tv" not in oxford:
                    continue

                oxford_tv = oxford["tv"].encode("utf-8")
                # str.replace returns a new string; the result used to be
                # discarded, so ';' was never stripped on the VietNet side.
                vietNet_tv = viet_net_tv.replace(";", "")
                oxford_tv = oxford_tv.replace(";", "")
                oxford_tv = oxford_tv.replace(",", "")
                oxford_tv = oxford_tv.replace("/", " ")

                arr_tv_oxford = set(oxford_tv.split(" "))
                arr_tv_vietnet = set(vietNet_tv.split(" "))

                jaccard = jaccard_distance(arr_tv_oxford, arr_tv_vietnet)
                print(arr_tv_vietnet)
                print(arr_tv_oxford)
                print(jaccard)

                matrix_similarity[iWn][iOxford] = 1 if jaccard < 0.95 else 0

            # Row label: VietNet translation and WordNet definition.
            matrix_similarity[iWn].insert(
                0, viet_net_tv + "<>" + definitionWn.encode("utf-8"))

        print(matrix_similarity)

        # Header row: the word followed by one label per Oxford sense.
        arrRowDict = [WORD]
        for i in range(len(dict_Oxford[WORD])):
            entry = dict_Oxford[WORD][str(i)]
            if "tv" not in entry:
                entry["tv"] = "-"
            if entry.get("d") is None:
                entry["d"] = "-"
            arrRowDict.append(entry["tv"].encode("utf-8") + "<>"
                              + entry["d"].encode("utf-8"))

        FileProcess.append_to_excel_file(
            "Results/parameters/VN_Ox/" + "compare_VN_Ox_2_2.1.csv",
            arrRowDict, matrix_similarity)
def calculate_jaccard(self, s0, s1):
    """Return the Jaccard similarity of the lower-cased, non-empty items of two sequences."""
    lowered_0 = {item.lower() for item in s0 if item}
    lowered_1 = {item.lower() for item in s1 if item}
    return 1 - jaccard_distance(lowered_0, lowered_1)
def unordered_content_distance(self, sentence):
    """Return the Jaccard distance between the content-word sets of ``self`` and ``sentence``.

    Word order and repetitions are ignored: both ``content_words``
    attributes are reduced to sets before the comparison.
    """
    own_words = set(self.content_words)
    other_words = set(sentence.content_words)
    return jaccard_distance(own_words, other_words)
def jaccard_dist(row):
    """Return the character-level Jaccard distance between ``row['question1']`` and ``row['question2']``.

    Both values are converted with ``str()`` first, so non-string values
    are compared by their string form.
    """
    chars_1 = set(str(row['question1']))
    chars_2 = set(str(row['question2']))
    return jaccard_distance(chars_1, chars_2)
def getJaccardDistance(a, b):
    """Return the Jaccard distance between the element sets of ``a`` and ``b``.

    Returns 0 when the distance computation raises ZeroDivisionError.
    """
    try:
        return jaccard_distance(set(a), set(b))
    except ZeroDivisionError:
        return 0
def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):
    """Score every WordNet noun synset of WORD against every dictionary sense.

    Steps, as performed by the code below:

    1. Forward pass: for each (WordNet synset, dictionary sense) pair, average
       the best ``path_similarity`` values between their associated synsets
       and store the result in ``matrix_similarity``.
    2. Reverse pass: the same computation with the two sides swapped, stored
       in ``matrix_similarity_reverse``.
    3. The two matrices are averaged cell by cell (the reverse one is read
       transposed).
    4. A Jaccard distance between the lemmatised, POS-filtered words of the
       WordNet definition and of the dictionary definition (``"d"``) is
       computed; the final score is
       ``(10 * path_score + 2 * (1 - jaccard)) / 12``.
    5. The matrix, with the WordNet definition inserted at the head of every
       row, and a header row built from the senses' ``"tv"`` values are passed
       to ``FileProcess.write_to_excel_file``.

    Side effects: prints intermediate tables, replaces missing/None ``"tv"``
    values in ``dict_words`` with ``"--"``, and writes a file under
    ``Results/``.

    Args:
        WORD: the word whose senses are compared.
        dict_words: senses of the word, indexed by ``str(i)`` for
            ``i in range(len(dict_words))``; each sense is a dict that may
            hold ``"d"`` and ``"tv"`` entries.

    NOTE(review): ``wn`` is presumably NLTK's WordNet corpus reader and the
    helpers ``get_nbest_synsets_n_v_with_word`` /
    ``WordnetProcess.get_synsets_n_v`` presumably return one list of synsets
    per sense - confirm against their definitions.
    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-collapsed source; confirm the placement of the ``count += 1``
    statements in the weighting loops.
    """

    # Debug hook: ``asf`` is not read anywhere else in this function.
    if WORD == "bank":
        asf = 0;

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # dictionary data
    dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words,WORD);

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # wordnet data
    wn_words = wn.synsets(WORD, pos = 'n');
    print "wn_words -------"
    print wn_words;
    wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words);
    print wn_words_synsets

    # matrix for similarity dict_words vs wn_words
    # (rows: WordNet synsets, columns: dictionary senses)
    matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

    ####################################################################################################
    # Forward pass: calculate 2d matrix of p
    for iWnWord in range(len(wn_words)):

        # word-word
        for iDictWord in range(len(dict_words)):

            p_iWnWord_iDictWord = 0.;

            arr_p_word = [];

            for dict_synset in dict_words_synsets[iDictWord]:

                p_dictNoun_wnNouns = 0;

                # some nouns don't have synsets, so arr_p may stay empty
                arr_p = [];

                for wn_synset in wn_words_synsets[iWnWord]:
                    p_max = dict_synset.path_similarity(wn_synset);
                    if p_max == None:
                        continue
                    arr_p.append(p_max);

                arr_p = sorted(arr_p, reverse=True);

                nBest = 8;
                # Starting slightly above zero keeps the division below
                # defined when no score was collected.
                count = 0.0001;
                # NOTE(review): this range stops one short of the end, so the
                # smallest score is never summed; the reverse pass below uses
                # range(len(arr_p)) instead - confirm which is intended.
                for i in xrange(0, len(arr_p)-1):
                    if i < nBest:
                        p_dictNoun_wnNouns += arr_p[i];
                        count += 1;

                p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
                arr_p_word.append(p_dictNoun_wnNouns);

            arr_p_word = sorted(arr_p_word, reverse=True);
            nBest = 10;
            # The divisor starts at 5 and grows by one per summed score; the
            # best score is weighted 5 and every other score 1.
            count = 5;
            for i in range(len(arr_p_word)):
                if i < nBest:
                    # Both branches apply the same weights (5 for the first
                    # score, 1 for all others).
                    if nBest > len(arr_p_word):
                        if i == 0:
                            p_iWnWord_iDictWord += arr_p_word[i]*5.;
                        elif i< nBest/3:
                            p_iWnWord_iDictWord += arr_p_word[i]*1;
                        else:
                            p_iWnWord_iDictWord += arr_p_word[i]*1;
                    else:
                        if i == 0:
                            p_iWnWord_iDictWord += arr_p_word[i]*5.;
                        elif i< len(arr_p_word)/3:
                            p_iWnWord_iDictWord += arr_p_word[i]*1;
                        else:
                            p_iWnWord_iDictWord += arr_p_word[i]*1;
                    count += 1;

            # count starts at 5 and only increases, so the zero branch is
            # never taken.
            if count == 0:
                p_iWnWord_iDictWord = 0;
            else:
                p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
            matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord;

    ####################################################################################################

    # Print the forward matrix as a tab-separated, column-aligned table.
    print "----------------------------------------------------"
    s = [[str(e) for e in row] for row in matrix_similarity]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    table = [fmt.format(*row) for row in s]
    print '\n'.join(table)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Reverse pass setup: the two sides swap roles, so from here until they
    # are restored ``wn_words`` holds the dictionary senses and
    # ``dict_words`` holds the WordNet synsets.
    wn_words = dict_words;
    wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words,WORD);

    dict_words = wn.synsets(WORD, pos = 'n');
    dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words);

    print "sysnets -----------------------.----.-----.--.-"

    # rows: dictionary senses, columns: WordNet synsets (transposed layout
    # compared with matrix_similarity)
    matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

    ####################################################################################################
    # Reverse pass: calculate 2d matrix of p
    for iWnWord in range(len(wn_words)):

        # word-word
        for iDictWord in range(len(dict_words)):

            p_iWnWord_iDictWord = 0.;

            arr_p_word = [];
            for dict_synset in dict_words_synsets[iDictWord]:

                p_dictNoun_wnNouns = 0;

                # some nouns don't have synsets
                # (countwnNouns is assigned but never read)
                countwnNouns = 0.00000001;

                arr_p = [];

                for wn_synset in wn_words_synsets[iWnWord]:
                    p_max = dict_synset.path_similarity(wn_synset);
                    if p_max != None:
                        arr_p.append(p_max);

                arr_p = sorted(arr_p, reverse=True);

                nBest = 8;
                # Non-zero start keeps the division defined for empty arr_p.
                count = 0.0001
                for i in range(len(arr_p)):
                    if i < nBest:
                        p_dictNoun_wnNouns += arr_p[i];
                        count +=1

                p_dictNoun_wnNouns = p_dictNoun_wnNouns/count;
                arr_p_word.append(p_dictNoun_wnNouns);

            arr_p_word = sorted(arr_p_word, reverse=True);
            nBest = 10;
            count = 5;
            # NOTE(review): unlike the forward pass, this loop stops one short
            # of the end of arr_p_word, so the smallest score is skipped -
            # confirm which is intended.
            for i in xrange(0, len(arr_p_word)-1):
                if i < nBest:
                    if nBest > len(arr_p_word):
                        if i == 0:
                            p_iWnWord_iDictWord += arr_p_word[i]*5;
                        elif i< nBest/3:
                            p_iWnWord_iDictWord += arr_p_word[i]*1.;
                        else:
                            p_iWnWord_iDictWord += arr_p_word[i]*1;
                    else:
                        if i == 0:
                            p_iWnWord_iDictWord += arr_p_word[i]*5.;
                        elif i< len(arr_p_word)/3:
                            p_iWnWord_iDictWord += arr_p_word[i]*1.;
                        else:
                            p_iWnWord_iDictWord += arr_p_word[i]*1;
                    count += 1;

            # count starts at 5 and only increases, so the zero branch is
            # never taken.
            if count == 0:
                p_iWnWord_iDictWord = 0;
            else:
                p_iWnWord_iDictWord = p_iWnWord_iDictWord/count
            matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord;

    ####################################################################################################

    print "----------------------------------------------------"
    s = [[str(e) for e in row] for row in matrix_similarity_reverse]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    table = [fmt.format(*row) for row in s]
    print '\n'.join(table)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Restore the original roles: dict_words is the dictionary again and
    # wn_words the WordNet noun synsets.
    dict_words = wn_words;
    wn_words = wn.synsets(WORD, pos = 'n');

    # Average the forward matrix with the (transposed) reverse matrix.
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord];
            matrix_similarity[iWnWord][iDictWord] /= 2;

    ####################################################################################################

    print "----------------------------------------------------"
    s = [[str(e) for e in row] for row in matrix_similarity]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    table = [fmt.format(*row) for row in s]
    print '\n'.join(table)

    ####################################################################################################
    # Jaccard distance between the definitions' filtered, lemmatised words.
    matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))];

    for iWnWord in range(len(wn_words)):

        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()));
        # Keep only tokens tagged NN, NNS, JJ, VB, VBN, VBD, RB or with an
        # empty tag.
        words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

        for i in range(len(words)):
            words[i] = wordnet_lemmatizer.lemmatize(words[i]);
        wn_set = set(words);

        # word-word
        for iDictWord in range(len(dict_words)):

            # A sense without a definition gets the maximal distance, so it
            # contributes nothing to the blended score below.
            if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] == None:
                matrix_similarity_jaccard[iWnWord][iDictWord] = 1;
                continue

            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]));
            words = [word for word,pos in tagged_sent if (pos == 'NN' or pos == 'NNS' or pos == 'JJ' or pos == '' or pos == 'VB' or pos == 'VBN' or pos == 'VBD' or pos == 'RB')];

            for i in range(len(words)):
                words[i] = wordnet_lemmatizer.lemmatize(words[i]);
            dict_set = set(words);

            matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set,dict_set);

    # Blend: path-based score weighted 10, Jaccard similarity weighted 2.
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = matrix_similarity[iWnWord][iDictWord]*10 + 2*(1-matrix_similarity_jaccard[iWnWord][iDictWord]);
            matrix_similarity[iWnWord][iDictWord] /= 12;

    ####################################################################################################

    print "----------------------------------------------------"
    s = [[str(e) for e in row] for row in matrix_similarity]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    table = [fmt.format(*row) for row in s]
    print '\n'.join(table)

    ####################################################################################################
    # write file

    # - - - - - - - - - - - - - - - - - - - - - - - - -
    # col: prefix each row with the WordNet definition it stands for
    # (arrColWn is created but never used)
    arrColWn = [];
    for i in range(len(wn_words)):
        matrix_similarity[i].insert(0,wn.synset(wn_words[i].name()).definition());

    # - - - - - - - - - - - - - - - - - - - - - - - - -
    # row: header made of the senses' "tv" values; missing or None values
    # are replaced by "--" in dict_words itself
    arrRowDict = [];
    arrRowDict.append("--");
    for i in range(len(dict_words)):
        if not dict_words[str(i)].has_key('tv'):
            dict_words[str(i)]['tv'] = "--";
        if dict_words[str(i)]['tv'] == None:
            dict_words[str(i)]['tv'] = "--"
        arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'));

    FileProcess.write_to_excel_file("Results/"+WORD+"_synsets_synsets_nbest_withword_average.csv",arrRowDict,matrix_similarity)
def uc_distance(self, sentence):
    """Return the Jaccard distance between two sentences' content lemmas.

    ``self.content_lemmas`` and ``sentence.content_lemmas`` are converted to
    sets before the comparison, so order and duplicates are ignored.
    """
    own_lemmas = set(self.content_lemmas)
    other_lemmas = set(sentence.content_lemmas)
    return jaccard_distance(own_lemmas, other_lemmas)
def _jaccard(sent1, sent2):
    """Return the Jaccard distance between the item sets of two sequences."""
    return jaccard_distance(set(sent1), set(sent2))