Beispiel #1
0
 def testClear(self):
     """Check that ``clear()`` makes the cache behave like a new one.

     The same four documents are inserted twice: once into a freshly
     constructed cache and once more after ``clear()``. Both rounds must
     return identical sets from ``insert``.
     """
     # Fixed seed keeps the run repeatable (presumably LSHCache draws on the
     # ``random`` module for its hashing -- confirm against LSHCache).
     random.seed(12345)
     cache = LSHCache()

     # (document, set expected back from insert), in insertion order.
     cases = (
         ("123456789", set()),
         ("34567890", {0}),
         ("0123456", {0}),
         ("123456789", {0, 1, 2}),
     )

     def insert_all():
         # Insert every document and compare the returned set.
         for text, expected in cases:
             self.assertSetEqual(expected, cache.insert(text))

     insert_all()
     cache.clear()
     # After clearing, the exact same results are expected again.
     insert_all()
Beispiel #2
0
def lsh():
    """Report near-duplicate lines of the ``clean_data`` file.

    Each line of ``clean_data`` is decoded as UTF-8, segmented with
    ``jieba.cut`` and kept as a space-joined token string. The tokens of
    every document are then inserted into a fresh ``LSHCache`` under the
    document's line index, and the value ``insert`` returns for it is
    printed as that document's list of duplicates.

    Returns:
        None. The report is written to standard output.
    """
    # Read bytes and decode explicitly: this works the same on Python 2 and
    # Python 3, and the ``with`` block guarantees the file handle is closed.
    with open('clean_data', 'rb') as fh:
        lns = [ln.decode('utf-8') for ln in fh]

    cache = LSHCache()

    # Every token jieba yields is kept; no minimum-length filter is applied.
    docs = [' '.join(jieba.cut(ln)) for ln in lns]

    # Insert all documents first, remembering what insert() returned for
    # each index, and only then print the report.
    dups = {}
    for i, doc in enumerate(docs):
        dups[i] = cache.insert(doc.split(), i)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    for i, duplist in dups.items():
        if duplist:
            print('orig [%d]: %s' % (i, docs[i]))
            for dup in duplist:
                print('\tdup : [%d] %s' % (dup, docs[dup]))
        else:
            print('no dups found for doc [%d] : %s' % (i, docs[i]))
Beispiel #3
0
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f"
        ]

    dups = {}
    for i, doc in enumerate(docs):
        dups[i] = cache.insert(doc.split(), i)

    for i, duplist in dups.items():
        if duplist:
            print 'orig [%d]: %s' % (i, docs[i])
            for dup in duplist:
                print'\tdup : [%d] %s' % (dup, docs[dup])
        else:
            print 'no dups found for doc [%d] : %s' % (i, docs[i])

    cache.save('test.pkl')

    # Make a new cache to try loading from file
    new_cache = LSHCache()

    new_cache.from_file('test.pkl')
Beispiel #4
0

if __name__ == '__main__':
    # Demo: feed a handful of sample sentences to an LSHCache and print,
    # for every sentence, the entries insert() returned as its duplicates.
    cache = LSHCache()

    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f"
        ]

    # Documents are inserted as whitespace-separated token lists, using the
    # list index as the document id; keep what insert() returned per index.
    dups = {}
    for i, doc in enumerate(docs):
        dups[i] = cache.insert(doc.split(), i)

    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    for i, duplist in dups.items():
        if duplist:
            print('orig [%d]: %s' % (i, docs[i]))
            for dup in duplist:
                print('\tdup : [%d] %s' % (dup, docs[dup]))
        else:
            print('no dups found for doc [%d] : %s' % (i, docs[i]))