def testLSHArgs(self):
    # Verifies the band/row/total bookkeeping, shingle length and minimum
    # support reported by LSHCache for several constructor argument mixes.

    def check(cache, bands, rows_per_band, total_rows, shingle_len, support):
        # Same five assertions, in the same order, for every configuration.
        self.assertEqual(bands, cache.num_bands())
        self.assertEqual(rows_per_band, cache.num_rows_per_band())
        self.assertEqual(total_rows, cache.num_total_rows())
        self.assertEqual(shingle_len, cache.shingler().shingle_len())
        self.assertEqual(support, cache.min_support())

    # No arguments: the defaults asserted here are 20 bands x 5 rows.
    check(LSHCache(), 20, 5, 100, 2, 1)

    # Any two of (n, b, r) are expected to determine the third.
    check(LSHCache(b=10, r=7), 10, 7, 70, 2, 1)
    check(LSHCache(n=70, r=7), 10, 7, 70, 2, 1)
    check(LSHCache(n=70, b=10, m=3), 10, 7, 70, 2, 3)

    # All three given, and mutually consistent.
    check(LSHCache(n=70, b=10, r=7), 10, 7, 70, 2, 1)

    # A custom shingler changes only the reported shingle length.
    check(LSHCache(shingler=Shingler(5)), 20, 5, 100, 5, 1)
    check(LSHCache(shingler=Shingler(2, 3)), 20, 5, 100, (2, 3), 1)
def testLSH(self):
    # Inserts a fixed batch of strings into an LSHCache with a seeded RNG
    # and compares the per-string result sets against known values.
    strings = [
        "abcdefghijklmnopqrstuvwxyz",
        "abcdefghijklmnopqrstuvw",
        "defghijklmnopqrstuvw",
        "zyxwvutsrqponmlkjihgfedcba",
        "1abcdefghijklmnopuvw1",
        "123456789",
        "012345678",
        "234567890",
    ]

    # Diagnostic output only: character-set Jaccard similarity for every
    # unordered pair of inputs.
    for first_idx, first in enumerate(strings):
        later = strings[first_idx + 1:]
        for second_idx, second in enumerate(later, first_idx + 1):
            similarity = 1 - jaccard_distance(set(first), set(second))
            line = "'%s' (%d) <=> (%d)'%s': %f" % (
                first, first_idx, second_idx, second, similarity)
            # Single parenthesised argument: prints the same text whether
            # `print` is the statement or the function.
            print(line)

    # Seed right before building the cache so the result is reproducible.
    random.seed(12345)
    lsh = LSHCache(shingler=Shingler(1))

    # One set per input string; each expected set only holds positions of
    # strings that appear earlier in the batch.
    expected = [
        set(),
        {0},
        {0, 1},
        {0, 1, 2},
        {0, 1, 2, 3},
        set(),
        {5},
        {5, 6},
    ]
    self.assertListEqual(expected, lsh.insert_batch(strings))
def testMultiLen(self):
    # A Shingler built with lengths (2, 3) is expected to produce both the
    # 2-item and 3-item shingles; results are compared as sets so order is
    # irrelevant.
    shingler = Shingler(2, 3)

    produced = set(shingler.shingle("abcdef"))
    wanted = {
        tuple(text)
        for text in ("ab", "bc", "cd", "de", "ef", "abc", "bcd", "cde", "def")
    }
    self.assertSetEqual(wanted, produced)

    # Input exactly as long as the longest shingle.
    self.assertSetEqual(
        {('a', 'b'), ('b', 'c'), ('a', 'b', 'c')},
        set(shingler.shingle("abc")))

    # Inputs shorter than a shingle length: the expected tuples are padded
    # with None at the front.
    self.assertSetEqual(
        {('a', 'b'), (None, 'a', 'b')},
        set(shingler.shingle("ab")))
    self.assertSetEqual(
        {(None, 'a'), (None, None, 'a')},
        set(shingler.shingle("a")))
def lsh_cache_from_args(args):
    """Build an LSHCache from a parsed-arguments object.

    Seeds via ``seed_from_args(args)``, creates a ``Shingler`` from
    ``args.shingle_len`` and forwards the optional settings found on
    ``args`` as keyword arguments to ``LSHCache``.

    Reads ``args.shingle_len``, ``args.minhash``, ``args.num_total``,
    ``args.num_bands``, ``args.num_rows``, ``args.min_support`` and
    ``args.universe_size``.

    Returns the constructed ``LSHCache``.
    """
    # (attribute on ``args``, keyword accepted by LSHCache)
    option_map = (
        ('num_total', 'n'),
        ('num_bands', 'b'),
        ('num_rows', 'r'),
        ('min_support', 'm'),
        ('universe_size', 'universe_size'),
    )

    seed_from_args(args)

    cache_kwargs = {"shingler": Shingler(*args.shingle_len)}

    if args.minhash:
        # The argument holds a key into the table of minhash choices.
        cache_kwargs['minhash'] = minhash_choices[args.minhash]

    for attr_name, kwarg_name in option_map:
        setting = getattr(args, attr_name)
        if not setting:
            # Falsy settings (None, 0, ...) are left out of the call.
            continue
        cache_kwargs[kwarg_name] = setting

    return LSHCache(**cache_kwargs)
def testBadArgs(self):
    # Shingler must reject a zero length and the length pair (2, 1) by
    # raising AssertionError.
    invalid_arg_sets = (
        (0,),
        (2, 1),
    )
    for bad_args in invalid_arg_sets:
        self.assertRaises(AssertionError, Shingler, *bad_args)
def testLenTwo(self):
    # Length-2 shingling: consecutive pairs in input order.
    shingler = Shingler(2)

    produced = list(shingler.shingle("abcdef"))
    wanted = [tuple(pair) for pair in ("ab", "bc", "cd", "de", "ef")]
    self.assertListEqual(wanted, produced)

    # A single character is expected to come back padded with a leading
    # None.
    self.assertListEqual([(None, 'a')], list(shingler.shingle("a")))

    # Input exactly one shingle long.
    self.assertListEqual([('a', 'b')], list(shingler.shingle("ab")))
def testLenOne(self):
    # Length-1 shingling: one 1-tuple per character, in input order.
    shingler = Shingler(1)

    produced = list(shingler.shingle("abcdef"))
    wanted = [tuple(char) for char in "abcdef"]
    self.assertListEqual(wanted, produced)

    self.assertListEqual([('a',)], list(shingler.shingle("a")))