def testClear(self):
    """Replay one insert sequence before and after ``clear()``.

    The test expects ``insert`` to return exactly the same sets on the
    second pass as on the first, i.e. as if the cache had just been built.
    """
    random.seed(12345)
    lsh = LSHCache()

    # (document, set expected back from insert), in insertion order.
    script = [
        ("123456789", set()),
        ("34567890", set([0])),
        ("0123456", set([0])),
        ("123456789", set([0, 1, 2])),
    ]

    # First pass: populate a fresh cache.
    for text, expected in script:
        self.assertSetEqual(expected, lsh.insert(text))

    lsh.clear()

    # Second pass: the very same expectations must hold after clearing.
    for text, expected in script:
        self.assertSetEqual(expected, lsh.insert(text))
def lsh():
    """Segment every line of ``clean_data`` with jieba and print LSHCache matches.

    Each line of the file is decoded as UTF-8, cut into tokens with
    ``jieba.cut`` and re-joined with single spaces to form a document.  The
    whitespace-split tokens of every document are passed to
    ``LSHCache.insert`` together with the document's index, and the value
    returned by ``insert`` is reported on stdout as that document's
    duplicate list.

    Returns nothing.
    """
    # Use a context manager so the input file is closed deterministically
    # instead of leaking the handle until garbage collection.
    with open('clean_data') as src:
        lns = [ln.decode('utf-8') for ln in src]
    cache = LSHCache()

    # Every token jieba yields is kept; no length filtering is applied.
    docs = [' '.join(jieba.cut(ln)) for ln in lns]

    dups = {}
    for i, doc in enumerate(docs):
        dups[i] = cache.insert(doc.split(), i)

    for i, duplist in dups.items():
        if duplist:
            print('orig [%d]: %s' % (i, docs[i]))
            for dup in duplist:
                # Each reported entry is used as an index into docs.
                print('\tdup : [%d] %s' % (dup, docs[dup]))
        else:
            print('no dups found for doc [%d] : %s' % (i, docs[i]))
"lipstick on a pig", "you can put lipstick on a pig", "you can put lipstick on a pig but it's still a pig", "you can put lipstick on a pig it's still a pig", "i think they put some lipstick on a pig but it's still a pig", "putting lipstick on a pig", "you know you can put lipstick on a pig", "they were going to send us binders full of women", "they were going to send us binders of women", "a b c d e f", "a b c d f" ] dups = {} for i, doc in enumerate(docs): dups[i] = cache.insert(doc.split(), i) for i, duplist in dups.items(): if duplist: print 'orig [%d]: %s' % (i, docs[i]) for dup in duplist: print'\tdup : [%d] %s' % (dup, docs[dup]) else: print 'no dups found for doc [%d] : %s' % (i, docs[i]) cache.save('test.pkl') # Make a new cache to try loading from file new_cache = LSHCache() new_cache.from_file('test.pkl')
if __name__ == '__main__':
    # Demo: feed a handful of short phrases through an LSHCache and print,
    # for every phrase, whatever insert() reported for it.
    cache = LSHCache()

    docs = [
        "lipstick on a pig",
        "you can put lipstick on a pig",
        "you can put lipstick on a pig but it's still a pig",
        "you can put lipstick on a pig it's still a pig",
        "i think they put some lipstick on a pig but it's still a pig",
        "putting lipstick on a pig",
        "you know you can put lipstick on a pig",
        "they were going to send us binders full of women",
        "they were going to send us binders of women",
        "a b c d e f",
        "a b c d f"
    ]

    # Insert the whitespace-split tokens of each phrase under its index and
    # remember what insert() returned for it.
    dups = {}
    for index, phrase in enumerate(docs):
        tokens = phrase.split()
        dups[index] = cache.insert(tokens, index)

    for index, matches in dups.items():
        if not matches:
            print('no dups found for doc [%d] : %s' % (index, docs[index]))
            continue
        print('orig [%d]: %s' % (index, docs[index]))
        for match in matches:
            # Each reported entry is used as an index into docs.
            print('\tdup : [%d] %s' % (match, docs[match]))