Example #1
'''
@author: space
'''
import argparse
import logging
import random
import itertools as it
import functools as ft
from lsh import LSHCache, XORHashFamily, MultiplyHashFamily, Shingler
from nltk.metrics.distance import jaccard_distance, masi_distance, edit_distance

minhash_choices = { 'xor': XORHashFamily,
                    'multiply': MultiplyHashFamily,
                  }

similarity_choices = { 'jaccard': lambda a,b,s: 1 - jaccard_distance(set(s.shingle(a)), set(s.shingle(b))),
                       'masi': lambda a,b,s: 1 - masi_distance(set(s.shingle(a)), set(s.shingle(b))),
                       'edit': lambda a,b,s: 1 - float(edit_distance(a,b))/max(len(a),len(b)),
                       'edit_transposition': lambda a,b,s: 1 - float(edit_distance(a, b, transpositions=True))/max(len(a),len(b)) }
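
As a quick, illustrative sanity check (not part of the original script), the 'edit' entry needs no shingler, so it can be called with s=None:

similarity_choices['edit']("kitten", "sitting", None)  # 1 - 3/7 ≈ 0.571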

generator_choices = { 'combinations': it.combinations,
                      'combinations_replacement': it.combinations_with_replacement,
                      'permutations': it.permutations }

def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="Analyze performance of LSH over a mock generated data set")

    lsh_group = parser.add_argument_group('LSH Cache parameters')
    
    lsh_group.add_argument("-b", "--num-bands", type=int,
                        help="""number of bands in LSH cache""")
    lsh_group.add_argument("-r", "--num-rows", type=int, 
Example #2
def masi(self, a, b):
    # assumes: from nltk.metrics import distance
    a = a.lower()
    b = b.lower()
    return distance.masi_distance(set(a.split()), set(b.split()))
Example #3
# -*- coding: utf-8 -*-

from nltk.metrics.distance import jaccard_distance, masi_distance
from prettytable import PrettyTable

fields = ['X', 'Y', 'Jaccard(X,Y)', 'MASI(X,Y)']
pt = PrettyTable(fields)
for f in fields:
    pt.align[f] = 'l'  # left-align columns (set_field_align was removed from prettytable)

for z in range(4):
    X = set()
    for x in range(z, 4):
        Y = set()
        for y in range(1, 3):
            X.add(x)
            Y.add(y)
            pt.add_row([list(X), list(Y), round(jaccard_distance(X, Y), 2),
                       round(masi_distance(X, Y), 2)])
print(pt)
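
What the table highlights: jaccard_distance is pure overlap (1 - |X∩Y| / |X∪Y|), while NLTK's masi_distance multiplies the overlap term by 1, 0.67, 0.33, or 0 depending on whether the sets are equal, one is a subset of the other, they partially overlap, or they are disjoint. A quick check of that behavior:

jaccard_distance({1, 2}, {1, 2, 3})  # 1 - 2/3 ≈ 0.33
masi_distance({1, 2}, {1, 2, 3})     # 1 - (2/3) * 0.67 ≈ 0.55 (subset case)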
Example #4
# assumed imports for this fragment: re, nltk, pylab,
# linkage and dendrogram from scipy.cluster.hierarchy,
# nltk.metrics.distance as Distance, and a local SGML reader class RR
def main():
    reuterList = []
    reuterNumber = 0
    for i in range(0, 21):
        filename = "reut2-%s.sgm" % ("%03d" % i)
        print(filename)
        sgm = RR(filename)
        for i in range(0, sgm.NumberOfReuters() - 1):
        #for i in range(0, 20):
        
            reuterNumber = reuterNumber + 1
            print "Reuter Number: " + str(reuterNumber)

            title = sgm.ExtractTagData(i, "TITLE")
            title = title.lower()
            #print "title: " + str(title)
            
            topics = sgm.ExtractTagData(i, "TOPICS")
            topics = topics.lower()
            topics = re.sub("<d>", "", topics)
            topics = re.sub("</d>", " ", topics)
            topicsTokens = nltk.word_tokenize(topics)

            #body = sgm.ExtractTagData(i, "BODY")
            #bodyTokens = nltk.word_tokenize(body)
            #bodyTokens = cleanTokens(bodyTokens)
        
            # TODO: if there are no topics, predict them
            
            #TODO: get rid of topics with 'earn '
            #if(len(topicsTokens) != 0):
            #    if(topicsTokens[0] is not 'earn ' and len(topicsTokens) > 1):
            #        if(topicsTokens[0] is not 'acq ' and len(topicsTokens) > 1):
                    #if(len(topicsTokens) != 0):
            
            if len(topicsTokens) >= 2:
            #if(len(topicsTokens) != 0):
                newElement = [title, topics]
                reuterList.append(newElement)
            
    print()
    print("Reuter List: ")
    for i in range(0, len(reuterList)):
        print(reuterList[i])

    print()
    print("Number of Elements: " + str(len(reuterList)))

    # create the pairwise distance matrix
    distanceMatrix = [[0 for x in range(len(reuterList))] for x in range(len(reuterList))]
    for i in range(0, len(reuterList)):
        for j in range(0, len(reuterList)):
            # note: reuterList[i][1] is the topics string, so set() yields its character set
            distanceMatrix[i][j] = Distance.masi_distance(set(reuterList[i][1]), set(reuterList[j][1]))
            #distanceMatrix[i][j] = Distance.jaccard_distance(set(reuterList[i][1]), set(reuterList[j][1]))

    print()
    print("Distance Matrix: ")
    for i in range(0, len(reuterList)):
        print(distanceMatrix[i])

    print("done")

    print("Creating Plot: ")
    fig = pylab.figure()
    #y = linkage(distanceMatrix, method='single')
    y = linkage(distanceMatrix, method='complete')
    z = dendrogram(y)

    fig.show()
    print("Saving as dendrogramLab4.png")
    fig.savefig('dendrogramLab4.png')

    #print "Entropy: " + str(nltk.probability.entropy(distanceMatrix))

    print "done"
Example #5
File: Utils.py Project: lwheng/fyp
def masi(self, a, b):
    a = a.lower()
    b = b.lower()
    return distance.masi_distance(set(a.split()), set(b.split()))
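
A self-contained call with the same lowercase-and-split preprocessing (an illustrative string pair, with distance being nltk.metrics.distance):

from nltk.metrics import distance
a, b = "Citation Network Analysis", "citation network ANALYSIS"
distance.masi_distance(set(a.lower().split()), set(b.lower().split()))  # 0.0: identical token sets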
Example #6
    def get_ngram_stats(row, n, qcolumns, char=False):
        # n-gram overlap statistics for a pair of text columns in `row`;
        # char=True switches from word n-grams to character n-grams

        if char:
            q1 = ''.join(row[qcolumns[0]].split())
            q2 = ''.join(row[qcolumns[1]].split())
        else:
            q1 = row[qcolumns[0]].split()
            q2 = row[qcolumns[1]].split()

        q1_ngram_list = list(ngrams(q1, n))
        q2_ngram_list = list(ngrams(q2, n))

        q1_ngram_set = set(q1_ngram_list)
        q2_ngram_set = set(q2_ngram_list)

        q1_sum = len(q1_ngram_list)
        q2_sum = len(q2_ngram_list)

        diff = abs(q1_sum - q2_sum)

        if q1_sum + q2_sum != 0:
            diff_norm = diff / (q1_sum + q2_sum) * 2
        else:
            diff_norm = -1
        maximum = max([q1_sum, q2_sum])
        minimum = min([q1_sum, q2_sum])

        q1_unique = len(q1_ngram_set)
        q2_unique = len(q2_ngram_set)

        diff_unique = abs(q1_unique - q2_unique)

        intersect_r = Counter(q1_ngram_list) & Counter(q2_ngram_list)

        if q1_sum + q2_sum != 0:
            intersect_r = sum(intersect_r.values()) / (q1_sum + q2_sum) * 2
            intersect_unique_r = len(
                q1_ngram_set.intersection(q2_ngram_set)) / (q1_unique +
                                                            q2_unique) * 2
            masi_dist = distance.masi_distance(q1_ngram_set, q2_ngram_set)
        else:
            intersect_r = -1
            intersect_unique_r = -1
            masi_dist = -1

        if 0 != len(q1_ngram_set.union(q2_ngram_set)):
            jaccard_dist = (len(q1_ngram_set.union(q2_ngram_set)) - len(
                q1_ngram_set.intersection(q2_ngram_set))) / len(
                    q1_ngram_set.union(q2_ngram_set))
        else:
            jaccard_dist = 1

        bin_dist = distance.binary_distance(q1_ngram_set, q2_ngram_set)

        listout = [
            q1_sum, q2_sum, diff, diff_norm, maximum, minimum, q1_unique,
            q2_unique, diff_unique, intersect_r, intersect_unique_r,
            jaccard_dist, bin_dist, masi_dist
        ]

        return listout
Example #7
    def get_ngram_stats(row, n, qcolumns, char=False, append=''):
        # same statistics as above, returned as a labeled pandas Series;
        # char=True switches from word n-grams to character n-grams

        if char:
            q1 = ''.join(row[qcolumns[0]].split())
            q2 = ''.join(row[qcolumns[1]].split())
        else:
            q1 = row[qcolumns[0]].split()
            q2 = row[qcolumns[1]].split()

        q1_ngram_list = list(ngrams(q1, n))
        q2_ngram_list = list(ngrams(q2, n))

        q1_ngram_set = set(q1_ngram_list)
        q2_ngram_set = set(q2_ngram_list)

        q1_sum = len(q1_ngram_list)
        q2_sum = len(q2_ngram_list)

        diff = abs(q1_sum - q2_sum)

        if q1_sum + q2_sum != 0:
            diff_norm = diff / (q1_sum + q2_sum) * 2
        else:
            diff_norm = -1
        maximum = max([q1_sum, q2_sum])
        minimum = min([q1_sum, q2_sum])

        q1_unique = len(q1_ngram_set)
        q2_unique = len(q2_ngram_set)

        diff_unique = abs(q1_unique - q2_unique)

        intersect_r = Counter(q1_ngram_list) & Counter(q2_ngram_list)

        if q1_sum + q2_sum != 0:
            intersect_r = sum(intersect_r.values()) / (q1_sum + q2_sum) * 2
            intersect_unique_r = len(
                q1_ngram_set.intersection(q2_ngram_set)) / (q1_unique +
                                                            q2_unique) * 2
        else:
            intersect_r = -1
            intersect_unique_r = -1

        if 0 != len(q1_ngram_set.union(q2_ngram_set)):
            jaccard_dist = (len(q1_ngram_set.union(q2_ngram_set)) - len(
                q1_ngram_set.intersection(q2_ngram_set))) / len(
                    q1_ngram_set.union(q2_ngram_set))
        else:
            jaccard_dist = 1

        bin_dist = distance.binary_distance(q1_ngram_set, q2_ngram_set)
        masi_dist = distance.masi_distance(q1_ngram_set, q2_ngram_set)

        listout = [
            q1_sum, q2_sum, diff, diff_norm, maximum, minimum, q1_unique,
            q2_unique, diff_unique, intersect_r, intersect_unique_r,
            jaccard_dist, bin_dist, masi_dist
        ]

        keys = [
            'q1_sum', 'q2_sum', 'diff', 'diff_norm', 'max', 'min', 'q1_uni',
            'q2_uni', 'diff_uni', 'intersect_r', 'inter_uni_r', 'jaccard_dist',
            'bin_dist', 'masi_dist'
        ]
        keys = [x + str(n) + append for x in keys]
        dictout = dict(zip(keys, listout))

        return pd.Series(dictout)
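
A minimal row-wise application sketch, assuming get_ngram_stats is reachable at module level along with the imports the snippet relies on (Counter from collections, ngrams from nltk.util, distance from nltk.metrics, pandas as pd); the column names are hypothetical:

import pandas as pd

df = pd.DataFrame({'question1': ['what is machine learning'],
                   'question2': ['what is deep learning']})
features = df.apply(lambda row: get_ngram_stats(row, n=1,
                                                qcolumns=['question1', 'question2'],
                                                append='_w'),
                    axis=1)  # one named column per statistic, e.g. 'jaccard_dist1_w'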
Example #8
def search_misawa(meigens, targetSentence, retR=False,
        method='masi', model=None, dictionary=None):
    """
    MASI距離によりベストなミサワを探す関数
    - IN  : 名言リスト、解析対象文章
    - OUT : 画像のURL
    """
    targetWords = mecab_func.breakdown_into_validwords(targetSentence)
    
    if len(targetWords) <= 2 or len(targetWords) >= 30:
        logger.warning("bad tweet for misawa-recommend")
        if retR:
            return 1., None
        else:
            return 1.

    # the input sentence is analyzable; scan every quote
    hit = False
    minr = 1.0
    matched_inf = {}
    cnt = 0

    for meigen in meigens:

        words = meigen['words']

        if method == 'jaccard':
            # similarity via Jaccard distance; smaller is more similar
            r = jaccard_distance(set(targetWords), set(words))
        elif method == 'masi':
            # similarity via MASI distance; smaller is more similar
            r = masi_distance(set(targetWords), set(words))
        elif method[0:3] in ['lsi', 'lda', 'LSI', 'LDA']:
            # cosine similarity, negated so that smaller is more similar
            vec = model[dictionary.doc2bow(targetWords)]
            r = -1.*matutils.cossim(meigen[method], vec)
        elif method[0:3] in ['d2v', 'doc']:
            # cosine similarity, negated so that smaller is more similar
            r = -1.*d2v_similarity(targetWords, words, model)
        else:
            # an unrecognized method would otherwise leave r unbound
            raise ValueError("unsupported method: %s" % method)

        if r < minr:
            hit = True
            minr = r
            matched_inf = meigen
        cnt += 1

    # edge case: every quote is at distance 1.0
    if not hit:
        logger.info("no best match")
        if retR:
            return 1., None
        else:
            return 1.

    logger.info("========calculation report========")
    logger.info("method: %s [r = %f]" % (method, minr))
    logger.info("input : %s %s" % (targetSentence.replace('\n', ' '), targetWords))
    logger.info('meigen: %s %s' % (matched_inf['body'].replace('\n', ' '), matched_inf['words']))

    if retR:
        # returns: score (MASI distance or other metric), full matched record
        return minr, matched_inf
    else:
        # returns: the matched record (carries the image URL)
        return matched_inf
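
The heart of the selection loop, reduced to a self-contained sketch (hypothetical token sets; a smaller MASI distance means a closer match):

from nltk.metrics.distance import masi_distance

target = {'experience', 'life', 'important'}
quotes = [{'work', 'luck'}, {'experience', 'life', 'effort'}]
best = min(quotes, key=lambda words: masi_distance(target, words))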
Example #9
# -*- coding: utf-8 -*-

from nltk.metrics.distance import jaccard_distance, masi_distance
from prettytable import PrettyTable

fields = ['X', 'Y', 'Jaccard(X,Y)', 'MASI(X,Y)']
pt = PrettyTable(fields)
for f in fields:
    pt.align[f] = 'l'  # left-align columns (set_field_align was removed from prettytable)

for z in range(4):
    X = set()
    for x in range(z, 4):
        Y = set()
        for y in range(1, 3):
            X.add(x)
            Y.add(y)
            pt.add_row([
                list(X),
                list(Y),
                round(jaccard_distance(X, Y), 2),
                round(masi_distance(X, Y), 2)
            ])
print(pt)
Example #10
b = wordnet.synsets('color')[0]

wordnet.similarity(a, b)

a = ['this', 'is', 'a', 'test']
b = ['this', 'was', 'a', 'test']

edit_distance(a, b)               # 1 (one substitution)

jaccard_distance(set(a), set(b))  # 1 - 3/5 = 0.4

masi_distance(set(a), set(b))     # 1 - (3/5)*0.33 ≈ 0.8 (partial overlap)

from pattern.web import DBPedia

sparql = '\n'.join((
    'prefix dbo: <http://dbpedia.org/ontology/>',
    'select ?person ?place where {',
    '    ?person a dbo:President.',
    '    ?person dbo:birthPlace ?place.',