'''
@author: space
'''
import argparse
import logging
import random
import itertools as it
import functools as ft

from lsh import LSHCache, XORHashFamily, MultiplyHashFamily, Shingler
from nltk.metrics.distance import jaccard_distance, masi_distance, edit_distance

minhash_choices = {
    'xor': XORHashFamily,
    'multiply': MultiplyHashFamily,
}

# each similarity takes the two raw documents plus a shingler `s`
similarity_choices = {
    'jaccard': lambda a, b, s: 1 - jaccard_distance(set(s.shingle(a)), set(s.shingle(b))),
    'masi': lambda a, b, s: 1 - masi_distance(set(s.shingle(a)), set(s.shingle(b))),
    'edit': lambda a, b, s: 1 - float(edit_distance(a, b)) / max(len(a), len(b)),
    'edit_transposition': lambda a, b, s: 1 - float(edit_distance(a, b, transpositions=True)) / max(len(a), len(b)),
}

generator_choices = {
    'combinations': it.combinations,
    'combinations_replacement': it.combinations_with_replacement,
    'permutations': it.permutations,
}

def parse_args(argv=None):
    parser = argparse.ArgumentParser(
        description="Analyze performance of LSH over a mock generated data set")
    lsh_group = parser.add_argument_group('LSH Cache parameters')
    lsh_group.add_argument("-b", "--num-bands", type=int,
                           help="""number of bands in LSH cache""")
    lsh_group.add_argument("-r", "--num-rows", type=int,
                           # original snippet was cut off mid-call; help text assumed
                           help="""number of rows per band in LSH cache""")
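# A minimal standalone sketch of how the similarity_choices entries above are
# invoked: each lambda takes the two raw documents plus a shingler exposing
# .shingle(). The real Shingler comes from the local `lsh` module, so this
# whitespace word-shingler is an assumption for illustration only.
from nltk.metrics.distance import masi_distance

class WordShingler(object):
    def shingle(self, text):
        return text.lower().split()

masi_sim = lambda a, b, s: 1 - masi_distance(set(s.shingle(a)), set(s.shingle(b)))
print(masi_sim("the quick brown fox", "the quick red fox", WordShingler()))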
def masi(self, a, b):
    # `distance` here is nltk.metrics.distance; compare the lowercased word sets
    a = a.lower()
    b = b.lower()
    return distance.masi_distance(set(a.split()), set(b.split()))
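# A standalone sketch of what the masi() method above computes (the owning
# class is not shown, so it is restated here as a plain function):
from nltk.metrics import distance

def masi_of_strings(a, b):
    return distance.masi_distance(set(a.lower().split()), set(b.lower().split()))

print(masi_of_strings("A Quick Fox", "a quick fox"))  # 0.0: identical word sets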
# -*- coding: utf-8 -*-
from nltk.metrics.distance import jaccard_distance, masi_distance
from prettytable import PrettyTable

fields = ['X', 'Y', 'Jaccard(X,Y)', 'MASI(X,Y)']
pt = PrettyTable(fields)
for f in fields:
    pt.align[f] = 'l'  # left-align every column (current prettytable API)

for z in range(4):
    X = set()
    for x in range(z, 4):
        Y = set()
        for y in range(1, 3):
            X.add(x)
            Y.add(y)
            pt.add_row([list(X), list(Y),
                        round(jaccard_distance(X, Y), 2),
                        round(masi_distance(X, Y), 2)])

print(pt)
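# A quick standalone check of why the two columns above differ: for a subset
# pair, MASI scales the Jaccard overlap term by 0.67, so it penalizes partial
# agreement harder than plain Jaccard does.
from nltk.metrics.distance import jaccard_distance, masi_distance

X, Y = {1, 2, 3}, {1, 2}
print(round(jaccard_distance(X, Y), 2))  # 0.33 = 1 - 2/3
print(round(masi_distance(X, Y), 2))     # 0.55 = 1 - (2/3) * 0.67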
import re

import nltk
import pylab
from nltk.metrics import distance as Distance
from scipy.cluster.hierarchy import dendrogram, linkage

from ReutersReader import RR  # local Reuters SGML reader helper; import path assumed

def main():
    reuterList = []
    reuterNumber = 0
    for i in range(0, 21):
        filename = "reut2-%s.sgm" % ("%03d" % i)
        print(filename)
        sgm = RR(filename)
        for j in range(0, sgm.NumberOfReuters() - 1):
            reuterNumber = reuterNumber + 1
            print("Reuter Number: " + str(reuterNumber))
            title = sgm.ExtractTagData(j, "TITLE").lower()
            topics = sgm.ExtractTagData(j, "TOPICS").lower()
            topics = re.sub("<d>", "", topics)
            topics = re.sub("</d>", " ", topics)
            topicsTokens = nltk.word_tokenize(topics)
            # body = sgm.ExtractTagData(j, "BODY")
            # bodyTokens = nltk.word_tokenize(body)
            # bodyTokens = cleanTokens(bodyTokens)
            # TODO: if there are no topics, predict them
            # TODO: get rid of topics with 'earn'
            # (earlier experiments also filtered out 'earn'/'acq' topics here)
            if len(topicsTokens) >= 2:
                reuterList.append([title, topics])

    print()
    print("Reuter List: ")
    for entry in reuterList:
        print(entry)
    print()
    print("Number of Elements: " + str(len(reuterList)))

    # build the pairwise MASI distance matrix over the topic strings
    size = len(reuterList)
    distanceMatrix = [[0 for _ in range(size)] for _ in range(size)]
    for i in range(size):
        for j in range(size):
            distanceMatrix[i][j] = Distance.masi_distance(
                set(reuterList[i][1]), set(reuterList[j][1]))
            # distanceMatrix[i][j] = Distance.jaccard_distance(
            #     set(reuterList[i][1]), set(reuterList[j][1]))

    print()
    print("Distance Matrix: ")
    for row in distanceMatrix:
        print(row)
    print("done")

    # hierarchical clustering over the distance matrix, plotted as a dendrogram
    print("Creating Plot: ")
    fig = pylab.figure()
    # y = linkage(distanceMatrix, method='single')
    y = linkage(distanceMatrix, method='complete')
    z = dendrogram(y)
    fig.show()
    print("Saving as dendrogramLab4.png")
    fig.savefig('dendrogramLab4.png')
    # print("Entropy: " + str(nltk.probability.entropy(distanceMatrix)))
    print("done")
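# Entry point so the lab script above can be run directly (an addition; the
# original snippet defined main() but never called it):
if __name__ == '__main__':
    main()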
from collections import Counter

from nltk.metrics import distance
from nltk.util import ngrams

def get_ngram_stats(row, n, qcolumns, char=False):
    # tokenize to characters or words, then build n-gram lists and sets
    if char:
        q1 = ''.join(row[qcolumns[0]].split())
        q2 = ''.join(row[qcolumns[1]].split())
    else:
        q1 = row[qcolumns[0]].split()
        q2 = row[qcolumns[1]].split()
    q1_ngram_list = list(ngrams(q1, n))
    q2_ngram_list = list(ngrams(q2, n))
    q1_ngram_set = set(q1_ngram_list)
    q2_ngram_set = set(q2_ngram_list)

    # length statistics
    q1_sum = len(q1_ngram_list)
    q2_sum = len(q2_ngram_list)
    diff = abs(q1_sum - q2_sum)
    if q1_sum + q2_sum != 0:
        diff_norm = diff / (q1_sum + q2_sum) * 2
    else:
        diff_norm = -1
    maximum = max([q1_sum, q2_sum])
    minimum = min([q1_sum, q2_sum])
    q1_unique = len(q1_ngram_set)
    q2_unique = len(q2_ngram_set)
    diff_unique = abs(q1_unique - q2_unique)

    # overlap statistics (multiset and set intersections); -1 when undefined
    intersect_r = Counter(q1_ngram_list) & Counter(q2_ngram_list)
    if q1_sum + q2_sum != 0:
        intersect_r = sum(intersect_r.values()) / (q1_sum + q2_sum) * 2
        intersect_unique_r = len(
            q1_ngram_set.intersection(q2_ngram_set)) / (q1_unique + q2_unique) * 2
        masi_dist = distance.masi_distance(q1_ngram_set, q2_ngram_set)
    else:
        intersect_r = -1
        intersect_unique_r = -1
        masi_dist = -1
    if 0 != len(q1_ngram_set.union(q2_ngram_set)):
        jaccard_dist = (len(q1_ngram_set.union(q2_ngram_set)) - len(
            q1_ngram_set.intersection(q2_ngram_set))) / len(
                q1_ngram_set.union(q2_ngram_set))
    else:
        jaccard_dist = 1
    bin_dist = distance.binary_distance(q1_ngram_set, q2_ngram_set)

    listout = [
        q1_sum, q2_sum, diff, diff_norm, maximum, minimum, q1_unique,
        q2_unique, diff_unique, intersect_r, intersect_unique_r, jaccard_dist,
        bin_dist, masi_dist
    ]
    return listout
from collections import Counter

import pandas as pd
from nltk.metrics import distance
from nltk.util import ngrams

def get_ngram_stats(row, n, qcolumns, char=False, append=''):
    # tokenize to characters or words, then build n-gram lists and sets
    if char:
        q1 = ''.join(row[qcolumns[0]].split())
        q2 = ''.join(row[qcolumns[1]].split())
    else:
        q1 = row[qcolumns[0]].split()
        q2 = row[qcolumns[1]].split()
    q1_ngram_list = list(ngrams(q1, n))
    q2_ngram_list = list(ngrams(q2, n))
    q1_ngram_set = set(q1_ngram_list)
    q2_ngram_set = set(q2_ngram_list)

    # length statistics
    q1_sum = len(q1_ngram_list)
    q2_sum = len(q2_ngram_list)
    diff = abs(q1_sum - q2_sum)
    if q1_sum + q2_sum != 0:
        diff_norm = diff / (q1_sum + q2_sum) * 2
    else:
        diff_norm = -1
    maximum = max([q1_sum, q2_sum])
    minimum = min([q1_sum, q2_sum])
    q1_unique = len(q1_ngram_set)
    q2_unique = len(q2_ngram_set)
    diff_unique = abs(q1_unique - q2_unique)

    # overlap statistics; -1 when both n-gram lists are empty (the MASI call is
    # guarded here too, since masi_distance divides by the union size)
    intersect_r = Counter(q1_ngram_list) & Counter(q2_ngram_list)
    if q1_sum + q2_sum != 0:
        intersect_r = sum(intersect_r.values()) / (q1_sum + q2_sum) * 2
        intersect_unique_r = len(
            q1_ngram_set.intersection(q2_ngram_set)) / (q1_unique + q2_unique) * 2
        masi_dist = distance.masi_distance(q1_ngram_set, q2_ngram_set)
    else:
        intersect_r = -1
        intersect_unique_r = -1
        masi_dist = -1
    if 0 != len(q1_ngram_set.union(q2_ngram_set)):
        jaccard_dist = (len(q1_ngram_set.union(q2_ngram_set)) - len(
            q1_ngram_set.intersection(q2_ngram_set))) / len(
                q1_ngram_set.union(q2_ngram_set))
    else:
        jaccard_dist = 1
    bin_dist = distance.binary_distance(q1_ngram_set, q2_ngram_set)

    listout = [
        q1_sum, q2_sum, diff, diff_norm, maximum, minimum, q1_unique,
        q2_unique, diff_unique, intersect_r, intersect_unique_r, jaccard_dist,
        bin_dist, masi_dist
    ]
    keys = [
        'q1_sum', 'q2_sum', 'diff', 'diff_norm', 'max', 'min', 'q1_uni',
        'q2_uni', 'diff_uni', 'intersect_r', 'inter_uni_r', 'jaccard_dist',
        'bin_dist', 'masi_dist'
    ]
    # suffix each key with the n-gram order plus an optional tag so features
    # for different n / char settings can coexist in one DataFrame
    keys = [x + str(n) + append for x in keys]
    dictout = dict(zip(keys, listout))
    return pd.Series(dictout)
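# Usage sketch for the get_ngram_stats variant above, applied row-wise to a
# pandas DataFrame. The column names 'question1'/'question2' and the rows are
# made-up sample data (the signature matches a duplicate-question-pairs setup):
import pandas as pd

df = pd.DataFrame({
    'question1': ['what is machine learning', 'how do planes fly'],
    'question2': ['what is deep learning', 'why do planes fly'],
})
feats = df.apply(get_ngram_stats, axis=1, n=2,
                 qcolumns=['question1', 'question2'], char=False, append='w')
print(feats.filter(like='masi_dist').head())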
import logging

from gensim import matutils
from nltk.metrics.distance import jaccard_distance, masi_distance

# mecab_func (Japanese tokenizer helper) and d2v_similarity are local modules
# defined elsewhere in this project; only mecab_func's import is shown
import mecab_func

logger = logging.getLogger(__name__)

def search_misawa(meigens, targetSentence, retR=False, method='masi',
                  model=None, dictionary=None):
    """
    Find the best-matching Misawa quote by MASI distance.
    - IN : list of quotes, sentence to analyze
    - OUT: URL of the image
    """
    targetWords = mecab_func.breakdown_into_validwords(targetSentence)
    if len(targetWords) <= 2 or len(targetWords) >= 30:
        logger.warning("bad tweet for misawa-recommend")
        if retR:
            return 1., None
        else:
            return 1.

    # the input sentence can be analyzed
    hit = False
    minr = 1.0
    matched_inf = {}
    cnt = 0
    for meigen in meigens:
        words = meigen['words']
        if method == 'jaccard':
            # Jaccard distance: smaller means more similar
            r = jaccard_distance(set(targetWords), set(words))
        elif method == 'masi':
            # MASI distance: smaller means more similar
            r = masi_distance(set(targetWords), set(words))
        elif method[0:3] in ['lsi', 'lda', 'LSI', 'LDA']:
            # cosine similarity, negated so that smaller means more similar
            vec = model[dictionary.doc2bow(targetWords)]
            r = -1. * matutils.cossim(meigen[method], vec)
        elif method[0:3] in ['d2v', 'doc']:
            # doc2vec cosine similarity, negated so that smaller means more similar
            r = -1. * d2v_similarity(targetWords, words, model)
        if r < minr:
            hit = True
            minr = r
            matched_inf = meigen
        cnt = cnt + 1

    # edge case: every quote is at distance 1.0
    if not hit:
        logger.info("no best match")
        if retR:
            return 1., None
        else:
            return 1.

    logger.info("========calculation report========")
    logger.info("method: %s [r = %f]" % (method, minr))
    logger.info("input : %s %s" % (targetSentence.replace('\n', ' '), targetWords))
    logger.info('meigen: %s %s' % (matched_inf['body'].replace('\n', ' '),
                                   matched_inf['words']))
    if retR:
        # return value: MASI distance, full Misawa record
        return minr, matched_inf
    else:
        # return value: URL of the image
        return matched_inf
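# Usage sketch for search_misawa above. The records only need the 'words' and
# 'body' keys that the function reads; all sample data here is made up, and
# running this requires the project's mecab_func tokenizer to be importable.
meigens = [
    {'body': 'quote one', 'words': ['quote', 'one']},
    {'body': 'quote two', 'words': ['quote', 'two']},
]
r, best = search_misawa(meigens, 'some tweet text to match', retR=True, method='masi')
print(r, best)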
from pattern.en import wordnet  # assumed source of the `wordnet` wrapper used below
from nltk.metrics.distance import edit_distance, jaccard_distance, masi_distance

# the definition of `a` was truncated above; an example synset is assumed here
a = wordnet.synsets('paint')[0]
b = wordnet.synsets('color')[0]
wordnet.similarity(a, b)

a = ['this', 'is', 'a', 'test']
b = ['this', 'was', 'a', 'test']
edit_distance(a, b)               # 1: one substitution
jaccard_distance(set(a), set(b))  # 0.4: 3 shared of 5 total tokens
masi_distance(set(a), set(b))

from pattern.web import DBPedia

sparql = '\n'.join((
    'prefix dbo: <http://dbpedia.org/ontology/>',
    'select ?person ?place where {',
    ' ?person a dbo:President.',
    ' ?person dbo:birthPlace ?place.',