def test_c_cuckoo_filter_load(self):
    """Round-trip a counting cuckoo filter created from an error rate.

    The filter is filled with the keys "0".."999" (odd keys twice), exported,
    checked against a known md5 digest, and reloaded with
    ``load_error_rate``; per-key counts and the filter's settings are then
    verified on the reloaded instance.
    """
    expected_md5 = "88bc3a08bfc967f9ba60e9d57c21207f"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as tmp:
        source = CountingCuckooFilter.init_error_rate(0.00001)
        for num in range(1000):
            # even keys are inserted once, odd keys twice
            for _ in range((num % 2) + 1):
                source.add(str(num))
        source.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)

        restored = CountingCuckooFilter.load_error_rate(error_rate=0.00001, filepath=tmp.name)
        for num in range(1000):
            expected_count = (num % 2) + 1
            self.assertEqual(restored.check(str(num)), expected_count)

        # settings of the reloaded filter
        self.assertEqual(10000, restored.capacity)
        self.assertEqual(4, restored.bucket_size)
        self.assertEqual(500, restored.max_swaps)
        self.assertEqual(2, restored.expansion_rate)
        self.assertEqual(True, restored.auto_expand)
        self.assertEqual(20, restored.fingerprint_size_bits)
        self.assertEqual(3, restored.fingerprint_size)
        self.assertEqual(0.00001, restored.error_rate)
        self.assertEqual(0.025, restored.load_factor())
def test_cuckoo_filter_load(self):
    """Round-trip a cuckoo filter created from an error rate.

    Exports a filter holding the keys "0".."999", compares the file's md5
    digest to a known value, reloads it with ``load_error_rate`` and checks
    membership plus the filter's settings.
    """
    expected_md5 = "3c693508d1a3acd819310fd0c11dc906"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as tmp:
        source = CuckooFilter.init_error_rate(0.00001)
        for num in range(1000):
            source.add(str(num))
        source.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)

        restored = CuckooFilter.load_error_rate(error_rate=0.00001, filepath=tmp.name)
        for num in range(1000):
            self.assertTrue(restored.check(str(num)))

        # settings of the reloaded filter
        self.assertEqual(10000, restored.capacity)
        self.assertEqual(4, restored.bucket_size)
        self.assertEqual(500, restored.max_swaps)
        self.assertEqual(2, restored.expansion_rate)
        self.assertEqual(True, restored.auto_expand)
        self.assertEqual(3, restored.fingerprint_size)
        self.assertEqual(20, restored.fingerprint_size_bits)
        self.assertEqual(0.00001, restored.error_rate)
        self.assertEqual(0.025, restored.load_factor())
def test_bfod_export(self):
    """Export an on-disk Bloom filter to a second file and compare the two.

    A ``BloomFilterOnDisk`` backed by one temporary file is exported to a
    second temporary file; the md5 digests of both files must match.
    """
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj:
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as fobj1:
            blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
            try:
                blm.add("this is a test")
                blm.export(fobj1.name)
            finally:
                # always release the on-disk filter, even when add/export
                # raises, so the backing file is not left open
                blm.close()

            md5_1 = calc_file_md5(fobj.name)
            md5_2 = calc_file_md5(fobj1.name)
            self.assertEqual(md5_1, md5_2)
def test_hh_export(self):
    """Export a heavy hitters sketch and compare the file's md5 to a known digest."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        sketch = HeavyHitters(num_hitters=1000, width=1000, depth=5)
        sketch.add("this is a test", 100)
        sketch.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)
def test_streamthreshold_export(self):
    """Export a stream threshold sketch and compare the file's md5 to a known digest."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        sketch = StreamThreshold(threshold=10, width=1000, depth=5)
        sketch.add("this is a test", 100)
        sketch.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)
def test_cms_export(self):
    """Export a count-min sketch and compare the file's md5 to a known digest."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        sketch = CountMinSketch(width=1000, depth=5)
        sketch.add("this is a test", 100)
        sketch.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)
def test_rbf_import_empty(self):
    """Export an empty rotating Bloom filter and verify it imports as empty.

    The exported file is checked against a known md5 digest, then loaded
    back into a ``RotatingBloomFilter``; every underlying Bloom filter of
    the reloaded instance must report zero added elements.
    """
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
        blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
        blm.export(fobj.name)
        self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")

        # reload with the class under test (previously this reloaded the
        # file as an ExpandingBloomFilter, so the rotating filter's import
        # path was never exercised)
        blm2 = RotatingBloomFilter(filepath=fobj.name)
        for bloom in blm2._blooms:
            self.assertEqual(bloom.elements_added, 0)
def test_c_cuckoo_filter_er_export(self):
    """Export a counting cuckoo filter created from an error rate and check its md5."""
    expected_md5 = "f68767bd97b21426f5d2315fb38961ad"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as tmp:
        counting_filter = CountingCuckooFilter.init_error_rate(0.00001)
        for num in range(1000):
            counting_filter.add(str(num))
        counting_filter.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)
def test_c_cuckoo_filter_export(self):
    """Export a fixed-capacity counting cuckoo filter and check the file's md5."""
    expected_md5 = "6a98c2df1ec9fbb4f75f8e6392696b9b"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as tmp:
        counting_filter = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False)
        for num in range(100):
            counting_filter.add(str(num))
        counting_filter.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)
def test_bf_export_file(self):
    """Export a Bloom filter holding one element and check the file's md5."""
    expected_md5 = "8d27e30e1c5875b0edcf7413c7bdb221"
    bloom = BloomFilter(est_elements=10, false_positive_rate=0.05)
    bloom.add("this is a test")
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as tmp:
        bloom.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)
def test_cuckoo_filter_export(self):
    """Export a default-constructed cuckoo filter with 1000 keys and check its md5."""
    expected_md5 = "1371760d4ee9ccbe83e0144919750140"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as tmp:
        cuckoo = CuckooFilter()
        for num in range(1000):
            cuckoo.add(str(num))
        cuckoo.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)
def test_cuckoo_filter_er_export(self):
    """Export a cuckoo filter created from an error rate and check the file's md5."""
    expected_md5 = "3c693508d1a3acd819310fd0c11dc906"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as tmp:
        cuckoo = CuckooFilter.init_error_rate(0.00001)
        for num in range(1000):
            cuckoo.add(str(num))
        cuckoo.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)
def test_cms_load(self):
    """Export a count-min sketch, reload it from the file and verify its counts."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        original = CountMinSketch(width=1000, depth=5)
        self.assertEqual(original.add("this is a test", 100), 100)
        original.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)

        # build a second sketch directly from the exported file
        reloaded = CountMinSketch(filepath=tmp.name)
        self.assertEqual(reloaded.elements_added, 100)
        self.assertEqual(reloaded.check("this is a test"), 100)
def test_cms_load_diff_hash(self):
    """Load an exported count-min sketch using a different hash function.

    The sketch is written with the default hash and reloaded with
    ``different_hash``. The element total is still read from the file, but
    the reloaded sketch hashes the key differently, so it must not report
    the stored count for it.
    """
    md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
        cms = CountMinSketch(width=1000, depth=5)
        self.assertEqual(cms.add("this is a test", 100), 100)
        cms.export(fobj.name)
        md5_out = calc_file_md5(fobj.name)
        self.assertEqual(md5_out, md5_val)

        cms2 = CountMinSketch(filepath=fobj.name, hash_function=different_hash)
        self.assertEqual(cms2.elements_added, 100)
        # Should not work since it is a different hash: query the RELOADED
        # sketch and compare against the stored count. (The previous check
        # compared the original sketch's integer count with True, which
        # could never fail.)
        self.assertNotEqual(cms2.check("this is a test"), 100)
        self.assertNotEqual(cms.hashes("this is a test"), cms2.hashes("this is a test"))
def test_another_hashing_algo(self):
    """Use a Bloom filter with a fully custom, depth-aware hashing strategy.

    Checks that the export differs from the one produced with the default
    hash, that added elements are found, and that ``hashes`` returns the
    expected values for the custom function.
    """
    default_hash_md5 = "7f590086f9b962387e145899dd001256"  # for default hash used
    expected_hashes = [
        14409285476674975580,
        1383622036369840193,
        10825905054403519891,
        3456253732347153957,
        1494124715262089992,
    ]

    def my_hash(key, depth, encoding="utf-8"):
        """Return one SHA-512 based value per rotation of ``key``, for ``depth`` rotations."""
        modulus = UINT64_T_MAX + 1
        values = []
        for shift in range(depth):
            rotated = key[shift:] + key[:shift]
            digest = hashlib.sha512(rotated.encode(encoding)).hexdigest()
            values.append(int(digest, 16) % modulus)
        return values

    bloom = BloomFilter(est_elements=10, false_positive_rate=0.05, hash_function=my_hash)
    self.assertEqual(bloom.elements_added, 0)
    bloom.add("this is a test")
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as tmp:
        bloom.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertNotEqual(actual_md5, default_hash_md5)

    for num in range(10):
        bloom.add("this is a test {0}".format(num))
    self.assertEqual(bloom.elements_added, 11)

    for num in range(10):
        self.assertTrue(bloom.check("this is a test {0}".format(num)))

    self.assertEqual(bloom.hashes("this is a test", 5), expected_hashes)
    single = bloom.hashes("this is a test", 1)
    self.assertEqual(len(single), 1)
    self.assertEqual(single[0], expected_hashes[0])
def test_bf_use_different_hash(self):
    """Use a Bloom filter with a custom hash wrapped by ``hash_with_depth_int``.

    Checks that the export differs from the one produced with the default
    hash, that added elements are found, and that ``hashes`` returns the
    expected values for the custom function.
    """
    default_hash_md5 = "7f590086f9b962387e145899dd001256"  # for default hash used
    expected_hashes = [
        14409285476674975580,
        6203976290780191624,
        5074829385518853901,
        3953072760750514173,
        11782747630324011555,
    ]

    @hash_with_depth_int
    def my_hash(key, depth=1, encoding="utf-8"):
        """Return the SHA-512 digest of ``key`` as an integer modulo UINT64_T_MAX + 1."""
        modulus = UINT64_T_MAX + 1
        digest = hashlib.sha512(key.encode(encoding)).hexdigest()
        return int(digest, 16) % modulus

    bloom = BloomFilter(est_elements=10, false_positive_rate=0.05, hash_function=my_hash)
    self.assertEqual(bloom.elements_added, 0)
    bloom.add("this is a test")
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".blm", delete=DELETE_TEMP_FILES) as tmp:
        bloom.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertNotEqual(actual_md5, default_hash_md5)

    for num in range(10):
        bloom.add("this is a test {0}".format(num))
    self.assertEqual(bloom.elements_added, 11)

    for num in range(10):
        self.assertTrue(bloom.check("this is a test {0}".format(num)))

    self.assertEqual(bloom.hashes("this is a test", 5), expected_hashes)
    single = bloom.hashes("this is a test", 1)
    self.assertEqual(len(single), 1)
    self.assertEqual(single[0], expected_hashes[0])
def test_c_cuckoo_filter_load(self):
    """Round-trip a fixed-capacity counting cuckoo filter through a file.

    Exports a filter holding the keys "0".."99", checks the file's md5,
    reloads it via ``filepath`` and verifies per-key counts and settings.
    """
    expected_md5 = "6a98c2df1ec9fbb4f75f8e6392696b9b"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as tmp:
        source = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False)
        for num in range(100):
            source.add(str(num))
        source.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)

        restored = CountingCuckooFilter(filepath=tmp.name)
        for num in range(100):
            self.assertEqual(restored.check(str(num)), 1)

        # settings of the reloaded filter
        self.assertEqual(1000, restored.capacity)
        self.assertEqual(2, restored.bucket_size)
        self.assertEqual(500, restored.max_swaps)
        self.assertEqual(0.05, restored.load_factor())
def test_cuckoo_filter_load(self):
    """Round-trip a default-constructed cuckoo filter through a file.

    Exports a filter holding the keys "0".."999", checks the file's md5,
    reloads it via ``filepath`` and verifies membership and settings.
    """
    expected_md5 = "1371760d4ee9ccbe83e0144919750140"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as tmp:
        source = CuckooFilter()
        for num in range(1000):
            source.add(str(num))
        source.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(expected_md5, actual_md5)

        restored = CuckooFilter(filepath=tmp.name)
        for num in range(1000):
            self.assertTrue(restored.check(str(num)))

        # settings of the reloaded filter
        self.assertEqual(10000, restored.capacity)
        self.assertEqual(4, restored.bucket_size)
        self.assertEqual(500, restored.max_swaps)
        self.assertEqual(0.025, restored.load_factor())
def test_streamthreshold_load(self):
    """Export a stream threshold sketch, reload it and verify its state.

    Counts and dimensions come back from the file, while the
    ``meets_threshold`` tracking starts out empty on the reloaded sketch
    and is repopulated by the next ``add``.
    """
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        original = StreamThreshold(threshold=10, width=1000, depth=5)
        self.assertEqual(original.add("this is a test", 100), 100)
        self.assertEqual(original.elements_added, 100)
        self.assertEqual(original.meets_threshold, {"this is a test": 100})
        original.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)

        # build a second sketch directly from the exported file
        reloaded = StreamThreshold(threshold=10, filepath=tmp.name)
        self.assertEqual(reloaded.width, 1000)
        self.assertEqual(reloaded.depth, 5)
        self.assertEqual(reloaded.elements_added, 100)
        self.assertEqual(reloaded.check("this is a test"), 100)
        # the threshold tracking is not restored on load ...
        self.assertEqual(reloaded.meets_threshold, {})
        # ... but adding the key again records it with the full count
        self.assertEqual(reloaded.add("this is a test", 1), 101)
        self.assertEqual(reloaded.meets_threshold, {"this is a test": 101})
def test_hh_load(self):
    """Export a heavy hitters sketch, reload it and verify its state.

    Counts and dimensions come back from the file, while the
    ``heavy_hitters`` tracking starts out empty on the reloaded sketch and
    is repopulated by the next ``add``.
    """
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        original = HeavyHitters(num_hitters=1000, width=1000, depth=5)
        self.assertEqual(original.add("this is a test", 100), 100)
        self.assertEqual(original.elements_added, 100)
        self.assertEqual(original.heavy_hitters, {"this is a test": 100})
        original.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)

        # build a second sketch directly from the exported file
        reloaded = HeavyHitters(num_hitters=1000, filepath=tmp.name)
        self.assertEqual(reloaded.width, 1000)
        self.assertEqual(reloaded.depth, 5)
        self.assertEqual(reloaded.elements_added, 100)
        self.assertEqual(reloaded.check("this is a test"), 100)
        # the heavy hitters tracking is not restored on load ...
        self.assertEqual(reloaded.heavy_hitters, {})
        # ... but adding the key again records it with the full count
        self.assertEqual(reloaded.add("this is a test", 1), 101)
        self.assertEqual(reloaded.heavy_hitters, {"this is a test": 101})
def test_cbf_export_file(self):
    """Export a counting Bloom filter with repeated keys and check the file's md5."""
    expected_md5 = "0b83c837da30e25f768f0527c039d341"
    # insertion order and repetitions are kept exactly as listed
    words = (
        "test",
        "out",
        "the",
        "counting",
        "bloom",
        "filter",
        "test",
        "Test",
        "out",
        "test",
    )
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cbm", delete=DELETE_TEMP_FILES) as tmp:
        counting_bloom = CountingBloomFilter(est_elements=10, false_positive_rate=0.01)
        for word in words:
            counting_bloom.add(word)

        counting_bloom.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)
def test_ebf_export(self):
    """Export an empty expanding Bloom filter and check the file's md5."""
    expected_md5 = "eb5769ae9babdf7b37d6ce64d58812bc"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as tmp:
        expanding = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05)
        expanding.export(tmp.name)
        actual_md5 = calc_file_md5(tmp.name)
        self.assertEqual(actual_md5, expected_md5)