def jaccard(self, inputA, inputB):
    # Dispatch on input type: raw strings, nltk Text objects, or token lists.
    if isinstance(inputA, str):
        inputA_tokens = inputA.lower().split()
        inputB_tokens = inputB.lower().split()
        return distance.jaccard_distance(set(inputA_tokens), set(inputB_tokens))
    elif isinstance(inputA, nltk.text.Text):
        inputA_tokens = [x.lower() for x in inputA.vocab().keys()]
        inputB_tokens = [x.lower() for x in inputB.vocab().keys()]
        return distance.jaccard_distance(set(inputA_tokens), set(inputB_tokens))
    elif isinstance(inputA, list):
        inputA_tokens = [x.lower() for x in inputA]
        inputB_tokens = [x.lower() for x in inputB]
        return distance.jaccard_distance(set(inputA_tokens), set(inputB_tokens))
def calculate(item, manifesto_bow):
    # `item` is an (article_id, article_dict) pair.
    article_id, article_dict = item
    if data_io.DATA_FIELD in article_dict and len(article_dict[data_io.DATA_FIELD]) > 0:
        printv('\tProcessing ' + str(article_id) + ' with '
               + str(len(article_dict[data_io.DATA_FIELD])) + ' words.')
        if SKIP_BOW_CREATION:
            distance = jaccard_distance(set(manifesto_bow),
                                        set(article_dict[data_io.DATA_FIELD]))
        else:
            article_bow = create_clean_bow(article_dict[data_io.DATA_FIELD])
            distance = jaccard_distance(set(manifesto_bow), set(article_bow))
        return (article_dict[data_io.DATE_FIELD] + '-' + article_id, distance)
def main():
    assert len(sys.argv) == 4
    reader = Seq2SeqDatasetReader(source_tokenizer=NoOpTokenizer(),
                                  target_tokenizer=NoOpTokenizer())
    train = reader.read(sys.argv[1])
    val = reader.read(sys.argv[2])
    test = reader.read(sys.argv[3])

    generator = Generator()
    rules, rules_anon, rules_ground, semantics, entities = load_all_2018(generator, GRAMMAR_DIR)
    anonymizer = Anonymizer(*entities)

    neighbors = []
    for x in itertools.chain(train, val):
        command = str(x["source_tokens"][1:-1][0])
        form = str(x["target_tokens"][1:-1][0])
        anon_command = anonymizer(command)
        neighbors.append((anon_command, form))

    test_pairs = []
    for x in test:
        test_pairs.append((str(x["source_tokens"][1:-1][0]),
                           str(x["target_tokens"][1:-1][0])))

    print("Check grammar membership")
    naive_parser = GrammarBasedParser(rules_anon)
    anon_parser = AnonymizingParser(naive_parser, anonymizer)
    correct, parsed = bench_parser(anon_parser, test_pairs)
    print("Got {} of {} ({:.2f})".format(parsed, len(test_pairs),
                                         100.0 * parsed / len(test_pairs)))

    print("Jaccard distance")
    sweep_thresh(neighbors, test_pairs, anonymizer,
                 lambda x, y: jaccard_distance(set(x.split()), set(y.split())),
                 [0.1 * i for i in range(11)])

    print("Edit distance")
    sweep_thresh(neighbors, test_pairs, anonymizer, editdistance.eval)
def combineClusters(clusters, **twitter_stream_settings):
    def getHashtagSet(vector):
        return set([word for dimension in vector
                    for word in dimension.split() if word.startswith('#')])

    def getClusterInt(id):
        return int(id.split('_')[1])

    mergedClustersMap = {}
    for cluster in [clusters[v] for v in sorted(clusters, key=getClusterInt)]:
        mergedClusterId = None
        for mergedCluster in mergedClustersMap.values():
            clusterHashtags = getHashtagSet(cluster)
            mergedClusterHashtags = getHashtagSet(mergedCluster)
            # Merge when the hashtag sets are non-empty and close enough.
            if len(clusterHashtags.union(mergedClusterHashtags)) and \
                    jaccard_distance(clusterHashtags, mergedClusterHashtags) <= \
                    1 - twitter_stream_settings['cluster_merging_jaccard_distance_threshold']:
                mergedCluster.mergeCluster(cluster)
                mergedCluster.mergedClustersList.append(cluster.clusterId)
                mergedClusterId = mergedCluster.clusterId
                break
        if mergedClusterId is None:
            mergedCluster = StreamCluster.getClusterObjectToMergeFrom(cluster)
            mergedCluster.mergedClustersList = [cluster.clusterId]
            mergedClustersMap[mergedCluster.clusterId] = mergedCluster
    return mergedClustersMap
def jaccard(self, entry, gram_number):
    # Candidate words sharing the entry's first letter.
    spellings = self.words[self.words.str.startswith(entry[0])]
    distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                   set(ngrams(word, gram_number))), word)
                 for word in spellings)
    closest = min(distances)
    return closest[1]
def mainFunction(listaTagsTreino, listaFrasesTreino, listaFrasesDesenvolvimento):
    results = []
    bestSentences = []
    for frase in listaFrasesDesenvolvimento:
        best = 1000
        tagId = "VOID"
        bestSentence = ""
        for j, fraseTreino in enumerate(listaFrasesTreino):
            # It is really a distance and not a similarity measure (1 - similarity).
            result = jaccard_distance(set(fraseTreino.split()), set(frase.split()))
            # result = edit_distance(fraseTreino.split(), frase.split())
            if result < best:
                tagId = listaTagsTreino[j]
                bestSentence = fraseTreino
                best = result
        results.append(tagId)
        bestSentences.append(bestSentence)
    return results, bestSentences
def jaccard_team(input_team, all_teams):
    """
    Find the team matching the one entered by the user.

    :param input_team: str
    :param all_teams: list of str
    :return jac_team: str
    """
    dist = 10
    tri_guess = set(ngrams(input_team[:3].upper(), 2))
    jac_team = ''
    for tm in all_teams:
        p = tm.replace(' ', '')
        trit = set(ngrams(p, 2))
        jd = jaccard_distance(tri_guess, trit)
        if not jd:
            # Distance 0: exact bigram match.
            return tm
        elif jd < dist:
            dist = jd
            jac_team = tm
    return jac_team
def compute_lcs(self, doc1, doc2):
    LCS, MCLCS1, MCLCSN = self.lcs(doc1, doc2)
    jaccard_score = 1 - jaccard_distance(set(doc1), set(doc2))
    # Weighted combination of the LCS features and the Jaccard similarity.
    # score = (LCS + MCLCSN + jaccard_score) / 3.0
    score = 0.1 * LCS + 0.3 * MCLCSN + 0.6 * jaccard_score
    # score = jaccard_score
    return score
def testLSH(self):
    strings = [
        "abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvw",
        "defghijklmnopqrstuvw",
        "zyxwvutsrqponmlkjihgfedcba",
        "1abcdefghijklmnopuvw1",
        "123456789",
        "012345678",
        "234567890",
    ]
    # Print pairwise Jaccard similarities over character sets.
    for i, a in enumerate(strings):
        for j, b in enumerate(strings[i + 1:]):
            print("'%s' (%d) <=> (%d) '%s': %f"
                  % (a, i, j + i + 1, b, 1 - jaccard_distance(set(a), set(b))))
    random.seed(12345)
    lsh = LSHCache(shingler=Shingler(1))
    self.assertListEqual(
        [set(), set([0]), set([0, 1]), set([0, 1, 2]), set([0, 1, 2, 3]),
         set(), set([5]), set([5, 6])],
        lsh.insert_batch(strings))
def jaccard_player(input_player, all_players):
    """
    Find the player matching the one entered by the user.

    :param input_player: str
    :param all_players: list of str
    :return jac_player: str
    """
    dist = 10
    tri_guess = set(ngrams(input_player.upper(), 3))
    jac_player = ''
    for pl in all_players:
        p = pl.replace(' ', '')
        trit = set(ngrams(p, 3))
        jd = jaccard_distance(tri_guess, trit)
        if not jd:
            # Distance 0: exact trigram match.
            return pl
        elif jd < dist:
            dist = jd
            jac_player = pl
    return jac_player
def jaccard(string1, string2):
    '''Jaccard distance between the word sets of two strings.'''
    return jaccard_distance(set(string1.split()), set(string2.split()))
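# A minimal, self-contained check of the helper above, relying only on
# NLTK: jaccard_distance(A, B) is 1 - |A & B| / |A | B| over the two sets.
from nltk.metrics.distance import jaccard_distance

a = set("the quick brown fox".split())
b = set("the quick red fox".split())
print(jaccard_distance(a, b))  # 3 shared tokens over a union of 5: 0.4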
def get_jaccard_sim(text1, text2):
    # countvect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), binary=True)
    # countvect.fit([text1, text2])
    t1 = _remove_tags(clean_tweet(text1))
    t2 = _remove_tags(clean_tweet(text2))
    tokens1 = t1.split()
    tokens2 = t2.split()
    return 1 - jaccard_distance(set(tokens1), set(tokens2))
def extract_misc_attribute_businesses(misc_attributes_from_question,
                                      extracted_business_dictionary, question):
    candidate_businesses = {}
    distances_from_attributes = {}
    for attribute in misc_attributes_from_question:
        # Character-level Jaccard distance between the attribute and the question.
        distances_from_attributes[attribute] = jaccard_distance(
            set(attribute), set(question))
    sorted_distances = sorted(distances_from_attributes.items(),
                              key=operator.itemgetter(1))
    # Keep only the closest attribute.
    for i in range(len(sorted_distances[:1])):
        best_attribute = sorted_distances[i]
        for business in extracted_business_dictionary:
            if business['attributes']:
                for attribute in business['attributes']:
                    if str(attribute.split(':')[0]) == ''.join(best_attribute[0].split()):
                        eliminated = ['No', 'None', 'False']
                        if attribute.split(':')[1] not in eliminated:
                            candidate_businesses[business['business_id']] = business['stars']
    # Businesses sorted by star rating, best first.
    sorted_businesses = sorted(candidate_businesses.items(),
                               key=operator.itemgetter(1))[::-1]
    sorted_business_ids = [business_id for business_id, ratings in sorted_businesses]
    ranked_businesses = []
    for business_id in sorted_business_ids:
        ranked_businesses += [dictionary for dictionary in extracted_business_dictionary
                              if dictionary['business_id'] == business_id]
    if len(ranked_businesses) == 0:
        return extracted_business_dictionary, 'No'
    else:
        return ranked_businesses, 'Yes'
def jaccard(entries, gram_number):
    outcomes = []
    for entry in entries:
        spellings = spellings_series[spellings_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                       set(ngrams(word, gram_number))), word)
                     for word in spellings)
        closest = min(distances)
        outcomes.append(closest[1])
    return outcomes
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    gram_num = 3
    recommendations = []
    for entry in entries:
        words = correct_series[correct_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, gram_num)),
                                       set(ngrams(word, gram_num))), word)
                     for word in words)
        closest = min(distances)
        recommendations.append(closest[1])
    return recommendations
def getSuggestedWords(search_tokens):
    suggestion = []
    for word in search_tokens:
        if not d.check(word):
            poss_suggest = d.suggest(word)[0:4]
            # Fine-tune: pick the best suggestion using jaccard_distance.
            dists = [jaccard_distance(set(w), set(word)) for w in poss_suggest]
            suggestion.append(poss_suggest[dists.index(min(dists))])
        else:
            suggestion.append(word)
    return suggestion
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    results = []
    for entry in entries:
        temp = [(jaccard_distance(set(ngrams(entry, 3)), set(ngrams(w, 3))), w)
                for w in correct_spellings if w[0] == entry[0]]
        recommended = sorted(temp, key=lambda val: val[0])[0][1]
        results.append(recommended)
    return results
def jaccard(misspelled_words, gram_number):
    correct_spellings = pd.Series(words.words())
    outcomes = []
    for entry in misspelled_words:
        words_starting_with = correct_spellings[correct_spellings.str.startswith(entry[0])]
        scoreWordPairs = [(word, jaccard_distance(set(ngrams(word, gram_number)),
                                                  set(ngrams(entry, gram_number))))
                          for word in words_starting_with]
        closest = min(scoreWordPairs, key=lambda x: x[1])
        outcomes.append(closest[0])
    return outcomes
def jaccard_result(name_to_fix: str, all_options: list, ngrams_length: int):
    name_to_correct = name_to_fix.lower().replace(' ', '')
    n_in = set(ngrams(name_to_correct, ngrams_length))
    out_opts = [pl.lower().replace(' ', '') for pl in all_options]
    n_outs = [set(ngrams(pl, ngrams_length)) for pl in out_opts]
    distances = [jaccard_distance(n_in, n_out) for n_out in n_outs]
    if len(set(distances)) == 1 and distances[0] == 1:
        # No n-gram overlap with any option: retry with shorter n-grams.
        return jaccard_result(name_to_correct, all_options, ngrams_length - 1)
    else:
        # The three closest options.
        return np.array(all_options)[np.argsort(distances)][:3]
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    spellings_series = pd.Series(correct_spellings)
    correct = []
    for entry in entries:
        spellings = spellings_series[spellings_series.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, 4)),
                                       set(ngrams(word, 4))), word)
                     for word in spellings)
        closest = min(distances)
        correct.append(closest[1])
    return correct
def jaccard_result(in_opt: str, all_opt: list, ngrm: int) -> str:
    in_opt = in_opt.lower().replace(' ', '')
    n_in = set(ngrams(in_opt, ngrm))
    out_opts = [pl.lower().replace(' ', '') for pl in all_opt]
    n_outs = [set(ngrams(pl, ngrm)) for pl in out_opts]
    distances = [jaccard_distance(n_in, n_out) for n_out in n_outs]
    if len(set(distances)) == 1:
        # All candidates tie: retry with shorter n-grams, or give up.
        return jaccard_result(in_opt, all_opt, ngrm - 1) if ngrm > 2 else ''
    else:
        idx = int(np.argmin(distances))
        return all_opt[idx]
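# A hedged example call of the matcher above, assuming numpy and
# nltk.util.ngrams are in scope as in the snippet itself; the n-gram
# order shrinks recursively until the candidates stop tying.
teams = ['Juventus', 'Inter', 'Milan']
print(jaccard_result('juve', teams, 3))  # expected to resolve to 'Juventus'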
def doYouMean(keyword):
    from nltk.corpus import words
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    correct_spellings = words.words()
    result = ''
    for key in keyword.split():
        if len(key) > 1:
            temp = [(jaccard_distance(set(ngrams(key, 2)), set(ngrams(w, 2))), w)
                    for w in correct_spellings if w[0] == key[0]]
            result += sorted(temp, key=lambda val: val[0])[0][1] + ' '
        else:
            result += key + ' '
    return result
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    df = pd.Series(data=correct_spellings)
    res = []
    for entry in entries:
        words = [word for word in df if word.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry, 3)),
                                       set(ngrams(word, 3))), word)
                     for word in words)
        closest = min(distances)
        res.append(closest[1])
    return res
def JaccardSimAndMasiDis(text1, text2, stop_words=False):
    word1list = word_tokenize(text1)
    word2list = word_tokenize(text2)
    if stop_words:
        word1list = [word.lower() for word in word1list if word not in StopWords]
        word2list = [word.lower() for word in word2list if word not in StopWords]
    word1set = set(word1list)
    word2set = set(word2list)
    return 1 - jaccard_distance(word1set, word2set)  # , 1 - masi_distance(word1set, word2set)
def predict_labels(test_questions, known_questions, coarseness):
    labels = []
    n_known_questions = len(known_questions)
    for test_question in test_questions:
        smallest_dist = 5000
        closest_question = None
        for i in range(n_known_questions):
            dist = jaccard_distance(set(test_question),
                                    set(known_questions[i].question))
            if dist < smallest_dist:
                smallest_dist = dist
                closest_question = known_questions[i]
        if closest_question is not None:
            labels.append(get_label(closest_question, coarseness))
        else:
            labels.append(None)
    return labels
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    correct_words = []
    for e in entries:
        min_dist = 1
        closest = None
        for c in correct_spellings:
            if c[0] == e[0]:
                d = jaccard_distance(set(ngrams(e, n=4)), set(ngrams(c, n=4)))
                if d < min_dist:
                    min_dist = d
                    closest = c
        correct_words.append(closest)
    return correct_words
def answer_nine(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    # Gram number.
    N_g = 3
    spelling_df = pd.Series(correct_spellings)
    vals_returned = []
    for word in entries:
        spell = spelling_df[spelling_df.str.startswith(word[0])]
        dist_calced = ((jaccard_distance(set(ngrams(word, N_g)),
                                         set(ngrams(var, N_g))), var)
                       for var in spell)
        vals_returned.append(min(dist_calced)[1])
    # Returns a list of length three:
    # ['cormulent_recommendation', 'incendenece_recommendation', 'validrate_recommendation'].
    return vals_returned
def get_relation_info(relation_candidate, remain_sentence):
    # Each candidate: [name, relation, target_entity, target_entity_keyid].
    # `question`, `model` and `serviceWord2vec` are module-level globals here.
    relation_info = []
    for candidate in relation_candidate:
        # Segment the remaining sentence with LTP.
        segmentor = Segmentor()
        segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
        temp = list(segmentor.segment(remain_sentence))
        segmentor.release()
        # Drop sentence words that also occur in the relation name.
        for word in jieba.cut(candidate[0]):
            if word in model and word in temp:
                temp.remove(word)
        # Jaccard distance between the remaining tokens and the relation name.
        temp2 = [candidate[1]]
        jaccard = jaccard_distance(set(temp), set(temp2))
        # Edit-style similarity ratio between the question and the relation name.
        edit = difflib.SequenceMatcher(None, question, candidate[1]).ratio()
        # Word-vector similarity.
        w2v = serviceWord2vec.get_similarity(temp, list(jieba.cut(candidate[1])))
        # Entity, relation name, target entity, target keyid,
        # Jaccard distance, edit distance, vector similarity.
        relation_info.append([candidate[0], candidate[1], candidate[2],
                              candidate[3], jaccard, edit, w2v])
    return relation_info
def jaccard(entries, gram_number):
    """Find the closest word to each entry.

    Args:
        entries: collection of words to match
        gram_number: number of n-grams to use

    Returns:
        list: words with the closest jaccard distance to entries
    """
    outcomes = []
    for entry in entries:
        spellings = spellings_series[spellings_series.str.startswith(entry[0])]
        # `distances` is a generator of (distance, word) pairs.
        distances = ((jaccard_distance(set(ngrams(entry, gram_number)),
                                       set(ngrams(word, gram_number))), word)
                     for word in spellings)
        closest = min(distances)
        outcomes.append(closest[1])
    return outcomes
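# A possible driver for the corrector above: a sketch assuming
# `spellings_series` is built from NLTK's `words` corpus the way the
# neighboring snippets build it (requires nltk.download('words')).
import pandas as pd
from nltk.corpus import words
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

spellings_series = pd.Series(words.words())
# Each misspelling resolves to the dictionary word (sharing its first
# letter) whose character trigram set is closest.
print(jaccard(['cormulent', 'incendenece', 'validrate'], 3))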
def answer_ten(entries=['cormulent', 'incendenece', 'validrate']):
    from nltk.metrics.distance import jaccard_distance
    from nltk.util import ngrams
    # Compare character 4-grams of the entry and each candidate word,
    # e.g. set(ngrams('cormulent', 4)).
    best = []
    for entry in entries:
        spellings_check = [w for w in correct_spellings if w[0] == entry[0]]
        distances = [(entry, word, jaccard_distance(set(ngrams(entry, 4)),
                                                    set(ngrams(word, 4))))
                     for word in spellings_check]
        distances.sort(key=lambda tup: tup[2])
        best.append(distances[0])
    recommended = [word for _, word, _ in best]
    return recommended
def jaccard_result(in_opt: str, all_opt: list, ngrm: int) -> str:
    """Fix user input."""
    in_opt = in_opt.lower().replace(' ', '')
    n_in = set(ngrams(in_opt, ngrm))
    out_opts = [pl.lower().replace(' ', '').replace('+', '') for pl in all_opt]
    n_outs = [set(ngrams(pl, ngrm)) for pl in out_opts]
    if in_opt in out_opts:
        # Exact match after normalization.
        return all_opt[out_opts.index(in_opt)]
    distances = [jaccard_distance(n_in, n_out) for n_out in n_outs]
    if len(set(distances)) == 1:
        # All candidates tie: retry with shorter n-grams, or give up.
        return jaccard_result(in_opt, all_opt, ngrm - 1) if ngrm > 2 else ''
    else:
        return all_opt[np.argmin(distances)]
def jaccard_distance_similarity(lhs, entities):
    # Note: jaccard_distance is a distance, so the minimum value is the
    # best match; ties are all kept.
    min_similarity = [('', '', 1000000.0)]
    for entity in entities:
        similarity = distance.jaccard_distance(set(ngrams(lhs, n=2)),
                                               set(ngrams(entity['author'], n=2)))
        if similarity < min_similarity[0][2]:
            min_similarity = [(entity['author'], entity['url'], similarity)]
        elif min_similarity[0][2] == similarity:
            min_similarity.append((entity['author'], entity['url'], similarity))
    print('Jaccard distance with {0} and {1} entities results in {2} minimum '
          'similarity of {3}'.format(lhs, len(entities), len(min_similarity),
                                     min_similarity[0][2]))
    return min_similarity
a = wordnet.synsets('tone')[4]
b = wordnet.synsets('color')[0]
wordnet.similarity(a, b)

a = ['this', 'is', 'a', 'test']
b = ['this', 'was', 'a', 'test']
edit_distance(a, b)
jaccard_distance(set(a), set(b))
masi_distance(set(a), set(b))

from pattern.web import DBPedia

sparql = '\n'.join((
    'prefix dbo: <http://dbpedia.org/ontology/>',
    'select ?person ?place where {',
@author: space
'''
import argparse
import logging
import random
import itertools as it
import functools as ft

from lsh import LSHCache, XORHashFamily, MultiplyHashFamily, Shingler
from nltk.metrics.distance import jaccard_distance, masi_distance, edit_distance

minhash_choices = {
    'xor': XORHashFamily,
    'multiply': MultiplyHashFamily,
}

similarity_choices = {
    'jaccard': lambda a, b, s: 1 - jaccard_distance(set(s.shingle(a)), set(s.shingle(b))),
    'masi': lambda a, b, s: 1 - masi_distance(set(s.shingle(a)), set(s.shingle(b))),
    'edit': lambda a, b, s: 1 - float(edit_distance(a, b)) / max(len(a), len(b)),
    'edit_transposition': lambda a, b, s: 1 - float(edit_distance(a, b, True)) / max(len(a), len(b)),
}

generator_choices = {
    'combinations': it.combinations,
    'combinations_replacement': it.combinations_with_replacement,
    'permutations': it.permutations,
}

def parse_args(argv=None):
    parser = argparse.ArgumentParser(
        description="Analyze performance of LSH over a mock generated data set")
    lsh_group = parser.add_argument_group('LSH Cache parameters')
    lsh_group.add_argument("-b", "--num-bands", type=int,
                           help="""number of bands in LSH cache""")
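# Hypothetical usage of the similarity lambdas above; `Shingler` and its
# `shingle` method come from the local `lsh` module and are assumed here,
# not verified.
s = Shingler(2)
sim = similarity_choices['jaccard']("abcdef", "abcdeg", s)
print(sim)  # 1 minus the Jaccard distance over the two shingle sets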
def jacquard_sim(text1, text2):
    set1 = set(tokenizer(text1))
    set2 = set(tokenizer(text2))
    sim = jaccard_distance(set1, set2)  # , normalize=True)
    return sim
def search_misawa(meigens, targetSentence, retR=False, method='masi',
                  model=None, dictionary=None):
    """
    Find the best-matching Misawa quote by MASI distance.
    - IN : list of quotes, sentence to analyze
    - OUT: URL of the image
    """
    targetWords = mecab_func.breakdown_into_validwords(targetSentence)
    if len(targetWords) <= 2 or len(targetWords) >= 30:
        logger.warning("bad tweet for misawa-recommend")
        if retR:
            return 1., None
        else:
            return (1.)
    # The input sentence can be analyzed.
    hit = False
    minr = 1.0
    matched_inf = {}
    cnt = 0
    for meigen in meigens:
        words = meigen['words']
        if method == 'jaccard':
            # Jaccard distance; smaller means more similar.
            r = jaccard_distance(set(targetWords), set(words))
        elif method == 'masi':
            # MASI distance; smaller means more similar.
            r = masi_distance(set(targetWords), set(words))
        elif method[0:3] in ['lsi', 'lda', 'LSI', 'LDA']:
            # Cosine similarity, negated so that smaller means more similar.
            vec = model[dictionary.doc2bow(targetWords)]
            r = -1. * matutils.cossim(meigen[method], vec)
        elif method[0:3] in ['d2v', 'doc']:
            # Cosine similarity, negated so that smaller means more similar.
            r = -1. * d2v_similarity(targetWords, words, model)
        if r < minr:
            hit = True
            minr = r
            matched_inf = meigen
        cnt = cnt + 1
    # Edge case: the distance to every quote is 1.0.
    if not hit:
        logger.info("no best match")
        if retR:
            return 1., None
        else:
            return (1.)
    logger.info("========calculation report========")
    logger.info("method: %s [r = %f]" % (method, minr))
    logger.info("input : %s %s" % (targetSentence.replace('\n', ' '), targetWords))
    logger.info('meigen: %s %s' % (matched_inf['body'].replace('\n', ' '),
                                   matched_inf['words']))
    if retR:
        # Return the distance and the full quote record.
        return minr, matched_inf
    else:
        # Return the URL of the image.
        return (matched_inf)
def __init__(self, combo):
    self.f1, self.f2 = combo
    self.f1_set = set(self.clean_name(self.f1.name))
    self.f2_set = set(self.clean_name(self.f2.name))
    self.distance = jaccard_distance(self.f1_set, self.f2_set)
# -*- coding: utf-8 -*-
from nltk.metrics.distance import jaccard_distance, masi_distance
from prettytable import PrettyTable

fields = ['X', 'Y', 'Jaccard(X,Y)', 'MASI(X,Y)']
pt = PrettyTable(fields)
[pt.set_field_align(f, 'l') for f in fields]

for z in range(4):
    X = set()
    for x in range(z, 4):
        Y = set()
        for y in range(1, 3):
            X.add(x)
            Y.add(y)
            pt.add_row([list(X), list(Y),
                        round(jaccard_distance(X, Y), 2),
                        round(masi_distance(X, Y), 2)])
print(pt)
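# A short, self-contained companion check for the table above, relying
# only on NLTK: for a proper subset, MASI scales the Jaccard overlap down
# by a monotonicity factor, so its distance is larger than plain Jaccard.
from nltk.metrics.distance import jaccard_distance, masi_distance

X, Y = {1, 2}, {1, 2, 3}
print(jaccard_distance(X, Y))  # 1 - 2/3 = 0.33...
print(masi_distance(X, Y))     # 1 - (2/3) * 0.67 = 0.55..., since X is a subset of Y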
# Features
top_entry = json_response[0]
true_matches = [bool(song['Match']) for song in json_response[1:]]

FEATURE = 'SongName'
NGRAMS = 2

top_entry_value = preproc(top_entry[FEATURE])
print('Comparing song name to top match reference:', top_entry[FEATURE])
top_entry_word_bigrams = set(ngrams(word_tokenize(top_entry_value), NGRAMS))

matches = []
for song in json_response[1:]:
    this_value = preproc(song[FEATURE])
    print('\t%s' % song[FEATURE])
    this_word_bigrams = set(ngrams(word_tokenize(this_value), NGRAMS))
    wbg_distance = jaccard_distance(top_entry_word_bigrams, this_word_bigrams)
    print('\t\tWord bigrams + Jaccard:\t' + str(wbg_distance))
    is_this_match = is_match(wbg_distance)
    print('\t\tMatch?', is_this_match)
    matches.append(is_this_match)

cm = ConfusionMatrix(true_matches, matches)
print('Confusion matrix')
print(cm)
print('Accuracy:', accuracy(true_matches, matches))
def get_values(entities, domain):
    _random, bayes_random = {}, {}
    bayes_no_variation, bayes_variation = {}, {}
    siddharthan, deemter = {}, {}
    for _id in entities:
        evaluation = p.load(open(os.path.join(properties.evaluation_dir, _id)))
        for fold in evaluation:
            if fold not in bayes_random:
                _random[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
                bayes_random[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
                bayes_no_variation[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
                bayes_variation[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
                siddharthan[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
                deemter[fold] = {'y_real': [], 'y_pred': [], 'string': [], 'jaccard': []}
            for item in evaluation[fold]:
                item_domain = get_domain(item['features']['fname'])
                if domain == item_domain or domain == '':
                    string_real = item['real']['reference']
                    string_random = item['random']['reference']
                    string_bayes_random = item['bayes_random']['reference'][0][0]
                    string_bayes_no_variation = item['bayes_no_variation']['reference'][0][0]
                    string_bayes_variation = item['bayes_variation']['reference'][0][0]
                    string_siddharthan = item['siddharthan']['reference']
                    string_deemter = item['deemter']['reference']

                    # Edit distances to the real reference.
                    dist_random = edit_distance(string_random, string_real)
                    dist_bayes_random = edit_distance(string_bayes_random, string_real)
                    dist_bayes_no_variation = edit_distance(string_bayes_no_variation, string_real)
                    dist_bayes_variation = edit_distance(string_bayes_variation, string_real)
                    dist_siddharthan = edit_distance(string_siddharthan, string_real)
                    dist_deemter = edit_distance(string_deemter, string_real)

                    tokens_real = set(nltk.word_tokenize(string_real))
                    tokens_random = set(nltk.word_tokenize(string_random))
                    tokens_bayes_random = set(nltk.word_tokenize(string_bayes_random))
                    tokens_bayes_no_variation = set(nltk.word_tokenize(string_bayes_no_variation))
                    tokens_bayes_variation = set(nltk.word_tokenize(string_bayes_variation))
                    tokens_siddharthan = set(nltk.word_tokenize(string_siddharthan))
                    tokens_deemter = set(nltk.word_tokenize(string_deemter))

                    # Jaccard distances over token sets.
                    jaccard_random = jaccard_distance(tokens_random, tokens_real)
                    jaccard_bayes_random = jaccard_distance(tokens_bayes_random, tokens_real)
                    jaccard_bayes_no_variation = jaccard_distance(tokens_bayes_no_variation, tokens_real)
                    jaccard_bayes_variation = jaccard_distance(tokens_bayes_variation, tokens_real)
                    jaccard_siddharthan = jaccard_distance(tokens_siddharthan, tokens_real)
                    jaccard_deemter = jaccard_distance(tokens_deemter, tokens_real)

                    bayes_random[fold]['y_real'].append(item['real']['label'])
                    bayes_random[fold]['y_pred'].append(item['bayes_random']['label'][0])
                    bayes_random[fold]['string'].append(dist_bayes_random)
                    bayes_random[fold]['jaccard'].append(jaccard_bayes_random)

                    bayes_no_variation[fold]['y_real'].append(item['real']['label'])
                    bayes_no_variation[fold]['y_pred'].append(item['bayes_no_variation']['label'][0])
                    bayes_no_variation[fold]['string'].append(dist_bayes_no_variation)
                    bayes_no_variation[fold]['jaccard'].append(jaccard_bayes_no_variation)

                    bayes_variation[fold]['y_real'].append(item['real']['label'])
                    bayes_variation[fold]['y_pred'].append(item['bayes_variation']['label'][0])
                    bayes_variation[fold]['string'].append(dist_bayes_variation)
                    bayes_variation[fold]['jaccard'].append(jaccard_bayes_variation)

                    _random[fold]['y_real'].append(item['real']['label'])
                    _random[fold]['y_pred'].append(item['random']['label'])
                    _random[fold]['string'].append(dist_random)
                    _random[fold]['jaccard'].append(jaccard_random)

                    siddharthan[fold]['y_real'].append(item['real']['label'])
                    siddharthan[fold]['y_pred'].append(item['siddharthan']['label'])
                    siddharthan[fold]['string'].append(dist_siddharthan)
                    siddharthan[fold]['jaccard'].append(jaccard_siddharthan)

                    deemter[fold]['y_real'].append(item['real']['label'])
                    deemter[fold]['y_pred'].append(item['deemter']['label'])
                    deemter[fold]['string'].append(dist_deemter)
                    deemter[fold]['jaccard'].append(jaccard_deemter)
    return _random, bayes_random, bayes_no_variation, bayes_variation, siddharthan, deemter
def jaccard_unigram_distance(affil_1, affil_2):
    """Unigram Jaccard distance between two affiliation names."""
    affil_set_1 = set(w_tokenizer.tokenize(affil_1['Name']))
    affil_set_2 = set(w_tokenizer.tokenize(affil_2['Name']))
    return jaccard_distance(affil_set_1, affil_set_2)
def jacquard_sim(set1, set2):
    sim = jaccard_distance(set1, set2)  # , normalize=True)
    return sim
def jaccard(self, inputA, inputB):
    # Returns the Jaccard distance; smaller means more similar.
    a = inputA.lower()
    b = inputB.lower()
    return distance.jaccard_distance(set(a.split()), set(b.split()))
def ner_jaccard(ne1, ne2):
    # If either named-entity list is empty, treat the pair as fully similar.
    if len(ne1) == 0 or len(ne2) == 0:
        return 1
    return 1 - jaccard_distance(set(ne1), set(ne2))
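# Usage sketch for ner_jaccard above: it returns a similarity in [0, 1],
# and by the guard clause an empty entity list counts as fully similar.
print(ner_jaccard(['Paris', 'France'], ['Paris']))  # 1 - 1/2 = 0.5
print(ner_jaccard([], ['Paris']))                   # 1, by the empty-list guard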