Esempio n. 1
0
    def test_c_cuckoo_filter_load(self):
        """test loading a saved cuckoo filter"""
        md5sum = "88bc3a08bfc967f9ba60e9d57c21207f"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as fobj:
            cko = CountingCuckooFilter.init_error_rate(0.00001)
            for i in range(1000):
                cko.add(str(i))
                if i % 2 == 1:
                    cko.add(str(i))
            cko.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5sum, md5_out)

            ckf = CountingCuckooFilter.load_error_rate(error_rate=0.00001, filepath=fobj.name)
            for i in range(1000):
                self.assertEqual(ckf.check(str(i)), (i % 2) + 1)

            self.assertEqual(10000, ckf.capacity)
            self.assertEqual(4, ckf.bucket_size)
            self.assertEqual(500, ckf.max_swaps)
            self.assertEqual(2, ckf.expansion_rate)
            self.assertEqual(True, ckf.auto_expand)
            self.assertEqual(20, ckf.fingerprint_size_bits)
            self.assertEqual(3, ckf.fingerprint_size)
            self.assertEqual(0.00001, ckf.error_rate)
            self.assertEqual(0.025, ckf.load_factor())
Esempio n. 2
0
    def test_cuckoo_filter_load(self):
        """test loading a saved cuckoo filter"""
        md5sum = "3c693508d1a3acd819310fd0c11dc906"
        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".cko",
                                delete=DELETE_TEMP_FILES) as fobj:
            cko = CuckooFilter.init_error_rate(0.00001)
            for i in range(1000):
                cko.add(str(i))
            cko.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5sum, md5_out)

            ckf = CuckooFilter.load_error_rate(error_rate=0.00001,
                                               filepath=fobj.name)
            for i in range(1000):
                self.assertTrue(ckf.check(str(i)))

            self.assertEqual(10000, ckf.capacity)
            self.assertEqual(4, ckf.bucket_size)
            self.assertEqual(500, ckf.max_swaps)
            self.assertEqual(2, ckf.expansion_rate)
            self.assertEqual(True, ckf.auto_expand)
            self.assertEqual(3, ckf.fingerprint_size)
            self.assertEqual(20, ckf.fingerprint_size_bits)
            self.assertEqual(0.00001, ckf.error_rate)
            self.assertEqual(0.025, ckf.load_factor())
Esempio n. 3
0
    def test_bfod_export(self):
        """export to on disk to new file"""
        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".blm",
                                delete=DELETE_TEMP_FILES) as fobj:
            with NamedTemporaryFile(dir=os.getcwd(),
                                    suffix=".blm",
                                    delete=DELETE_TEMP_FILES) as fobj1:
                blm = BloomFilterOnDisk(fobj.name, 10, 0.05)
                blm.add("this is a test")

                blm.export(fobj1.name)
                blm.close()

                md5_1 = calc_file_md5(fobj.name)
                md5_2 = calc_file_md5(fobj1.name)
                self.assertEqual(md5_1, md5_2)
Esempio n. 4
0
 def test_hh_export(self):
     """test exporting a heavy hitters sketch"""
     md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
     with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
         hh1 = HeavyHitters(num_hitters=1000, width=1000, depth=5)
         hh1.add("this is a test", 100)
         hh1.export(fobj.name)
         md5_out = calc_file_md5(fobj.name)
     self.assertEqual(md5_out, md5_val)
Esempio n. 5
0
 def test_streamthreshold_export(self):
     """test exporting a stream threshold sketch"""
     md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
     with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
         st1 = StreamThreshold(threshold=10, width=1000, depth=5)
         st1.add("this is a test", 100)
         st1.export(fobj.name)
         md5_out = calc_file_md5(fobj.name)
     self.assertEqual(md5_out, md5_val)
Esempio n. 6
0
 def test_cms_export(self):
     """test exporting a count-min sketch"""
     md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
     with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
         cms = CountMinSketch(width=1000, depth=5)
         cms.add("this is a test", 100)
         cms.export(fobj.name)
         md5_out = calc_file_md5(fobj.name)
     self.assertEqual(md5_out, md5_val)
Esempio n. 7
0
    def test_rbf_import_empty(self):
        """test that rotating Bloom Filter is correct on import"""
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".rbf", delete=DELETE_TEMP_FILES) as fobj:
            blm = RotatingBloomFilter(est_elements=25, false_positive_rate=0.05)
            blm.export(fobj.name)
            self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")

            blm2 = ExpandingBloomFilter(filepath=fobj.name)
            for bloom in blm2._blooms:
                self.assertEqual(bloom.elements_added, 0)
Esempio n. 8
0
 def test_c_cuckoo_filter_er_export(self):
     """test exporting a cuckoo filter"""
     md5sum = "f68767bd97b21426f5d2315fb38961ad"
     with NamedTemporaryFile(dir=os.getcwd(), suffix=".cko", delete=DELETE_TEMP_FILES) as fobj:
         cko = CountingCuckooFilter.init_error_rate(0.00001)
         for i in range(1000):
             cko.add(str(i))
         cko.export(fobj.name)
         md5_out = calc_file_md5(fobj.name)
         self.assertEqual(md5sum, md5_out)
Esempio n. 9
0
    def test_c_cuckoo_filter_export(self):
        """test exporting a counting cuckoo filter"""
        md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as fobj:
            cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False)
            for i in range(100):
                cko.add(str(i))

            cko.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5sum, md5_out)
Esempio n. 10
0
    def test_bf_export_file(self):
        """test exporting bloom filter to file"""
        md5_val = "8d27e30e1c5875b0edcf7413c7bdb221"
        blm = BloomFilter(est_elements=10, false_positive_rate=0.05)
        blm.add("this is a test")

        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".blm",
                                delete=DELETE_TEMP_FILES) as fobj:
            blm.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
        self.assertEqual(md5_out, md5_val)
Esempio n. 11
0
 def test_cuckoo_filter_export(self):
     """test exporting a cuckoo filter"""
     md5sum = "1371760d4ee9ccbe83e0144919750140"
     with NamedTemporaryFile(dir=os.getcwd(),
                             suffix=".cko",
                             delete=DELETE_TEMP_FILES) as fobj:
         cko = CuckooFilter()
         for i in range(1000):
             cko.add(str(i))
         cko.export(fobj.name)
         md5_out = calc_file_md5(fobj.name)
         self.assertEqual(md5sum, md5_out)
Esempio n. 12
0
 def test_cuckoo_filter_er_export(self):
     """test exporting a cuckoo filter"""
     md5sum = "3c693508d1a3acd819310fd0c11dc906"
     with NamedTemporaryFile(dir=os.getcwd(),
                             suffix=".cko",
                             delete=DELETE_TEMP_FILES) as fobj:
         cko = CuckooFilter.init_error_rate(0.00001)
         for i in range(1000):
             cko.add(str(i))
         cko.export(fobj.name)
         md5_out = calc_file_md5(fobj.name)
         self.assertEqual(md5sum, md5_out)
Esempio n. 13
0
    def test_cms_load(self):
        """test loading a count-min sketch from file"""
        md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
            cms = CountMinSketch(width=1000, depth=5)
            self.assertEqual(cms.add("this is a test", 100), 100)
            cms.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5_out, md5_val)

            # try loading directly to file!
            cms2 = CountMinSketch(filepath=fobj.name)
            self.assertEqual(cms2.elements_added, 100)
            self.assertEqual(cms2.check("this is a test"), 100)
Esempio n. 14
0
    def test_cms_load_diff_hash(self):
        """test loading a count-min sketch from file"""
        md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
            cms = CountMinSketch(width=1000, depth=5)
            self.assertEqual(cms.add("this is a test", 100), 100)
            cms.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5_out, md5_val)

            cms2 = CountMinSketch(filepath=fobj.name, hash_function=different_hash)
            self.assertEqual(cms2.elements_added, 100)
            # should not work since it is a different hash
            self.assertNotEqual(cms.check("this is a test"), True)
            self.assertNotEqual(cms.hashes("this is a test"), cms2.hashes("this is a test"))
Esempio n. 15
0
    def test_another_hashing_algo(self):
        """test defining a completely different hashing strategy"""
        md5_val = "7f590086f9b962387e145899dd001256"  # for default hash used
        results = [
            14409285476674975580,
            1383622036369840193,
            10825905054403519891,
            3456253732347153957,
            1494124715262089992,
        ]

        def my_hash(key, depth, encoding="utf-8"):
            """my hashing strategy"""
            max64mod = UINT64_T_MAX + 1
            results = list()
            for i in range(0, depth):
                tmp = key[i:] + key[:i]
                val = int(hashlib.sha512(tmp.encode(encoding)).hexdigest(), 16)
                results.append(val % max64mod)
            return results

        blm = BloomFilter(est_elements=10,
                          false_positive_rate=0.05,
                          hash_function=my_hash)

        self.assertEqual(blm.elements_added, 0)
        blm.add("this is a test")
        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".blm",
                                delete=DELETE_TEMP_FILES) as fobj:
            blm.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
        self.assertNotEqual(md5_out, md5_val)

        for i in range(0, 10):
            tmp = "this is a test {0}".format(i)
            blm.add(tmp)

        self.assertEqual(blm.elements_added, 11)

        for i in range(0, 10):
            tmp = "this is a test {0}".format(i)
            self.assertTrue(blm.check(tmp))

        self.assertEqual(blm.hashes("this is a test", 5), results)
        res = blm.hashes("this is a test", 1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[0])
Esempio n. 16
0
    def test_bf_use_different_hash(self):
        """test that the different hash works as intended"""
        md5_val = "7f590086f9b962387e145899dd001256"  # for default hash used
        results = [
            14409285476674975580,
            6203976290780191624,
            5074829385518853901,
            3953072760750514173,
            11782747630324011555,
        ]

        @hash_with_depth_int
        def my_hash(key, depth=1, encoding="utf-8"):
            """my hash function"""
            max64mod = UINT64_T_MAX + 1
            val = int(hashlib.sha512(key.encode(encoding)).hexdigest(), 16)
            return val % max64mod

        blm = BloomFilter(est_elements=10,
                          false_positive_rate=0.05,
                          hash_function=my_hash)
        self.assertEqual(blm.elements_added, 0)
        blm.add("this is a test")
        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".blm",
                                delete=DELETE_TEMP_FILES) as fobj:
            blm.export(fobj.name)

            md5_out = calc_file_md5(fobj.name)
        self.assertNotEqual(md5_out, md5_val)

        for i in range(0, 10):
            tmp = "this is a test {0}".format(i)
            blm.add(tmp)

        self.assertEqual(blm.elements_added, 11)

        for i in range(0, 10):
            tmp = "this is a test {0}".format(i)
            self.assertTrue(blm.check(tmp))

        self.assertEqual(blm.hashes("this is a test", 5), results)
        res = blm.hashes("this is a test", 1)
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0], results[0])
Esempio n. 17
0
    def test_c_cuckoo_filter_load(self):
        """test loading a saved counting cuckoo filter"""
        md5sum = "6a98c2df1ec9fbb4f75f8e6392696b9b"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cck", delete=DELETE_TEMP_FILES) as fobj:
            cko = CountingCuckooFilter(capacity=1000, bucket_size=2, auto_expand=False)
            for i in range(100):
                cko.add(str(i))

            cko.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5sum, md5_out)

            ckf = CountingCuckooFilter(filepath=fobj.name)
            for i in range(100):
                self.assertEqual(ckf.check(str(i)), 1)

            self.assertEqual(1000, ckf.capacity)
            self.assertEqual(2, ckf.bucket_size)
            self.assertEqual(500, ckf.max_swaps)
            self.assertEqual(0.05, ckf.load_factor())
Esempio n. 18
0
    def test_cuckoo_filter_load(self):
        """test loading a saved cuckoo filter"""
        md5sum = "1371760d4ee9ccbe83e0144919750140"
        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".cko",
                                delete=DELETE_TEMP_FILES) as fobj:
            cko = CuckooFilter()
            for i in range(1000):
                cko.add(str(i))
            cko.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5sum, md5_out)

            ckf = CuckooFilter(filepath=fobj.name)
            for i in range(1000):
                self.assertTrue(ckf.check(str(i)))

            self.assertEqual(10000, ckf.capacity)
            self.assertEqual(4, ckf.bucket_size)
            self.assertEqual(500, ckf.max_swaps)
            self.assertEqual(0.025, ckf.load_factor())
Esempio n. 19
0
    def test_streamthreshold_load(self):
        """test loading a stream threshold sketch from file"""
        md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
            st1 = StreamThreshold(threshold=10, width=1000, depth=5)
            self.assertEqual(st1.add("this is a test", 100), 100)
            self.assertEqual(st1.elements_added, 100)
            self.assertEqual(st1.meets_threshold, {"this is a test": 100})
            st1.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5_out, md5_val)

            # try loading directly to file!
            st2 = StreamThreshold(threshold=10, filepath=fobj.name)
            self.assertEqual(st2.width, 1000)
            self.assertEqual(st2.depth, 5)
            self.assertEqual(st2.elements_added, 100)
            self.assertEqual(st2.check("this is a test"), 100)
            # show on load that the tracking of stream threshold is gone
            self.assertEqual(st2.meets_threshold, dict())
            self.assertEqual(st2.add("this is a test", 1), 101)
            self.assertEqual(st2.meets_threshold, {"this is a test": 101})
Esempio n. 20
0
    def test_hh_load(self):
        """test loading a heavy hitters from file"""
        md5_val = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as fobj:
            hh1 = HeavyHitters(num_hitters=1000, width=1000, depth=5)
            self.assertEqual(hh1.add("this is a test", 100), 100)
            self.assertEqual(hh1.elements_added, 100)
            self.assertEqual(hh1.heavy_hitters, {"this is a test": 100})
            hh1.export(fobj.name)
            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5_out, md5_val)

            # try loading directly to file!
            hh2 = HeavyHitters(num_hitters=1000, filepath=fobj.name)
            self.assertEqual(hh2.width, 1000)
            self.assertEqual(hh2.depth, 5)
            self.assertEqual(hh2.elements_added, 100)
            self.assertEqual(hh2.check("this is a test"), 100)
            # show on load that the tracking of heavy hitters is gone
            self.assertEqual(hh2.heavy_hitters, dict())
            self.assertEqual(hh2.add("this is a test", 1), 101)
            self.assertEqual(hh2.heavy_hitters, {"this is a test": 101})
Esempio n. 21
0
    def test_cbf_export_file(self):
        """test exporting bloom filter to file"""
        md5_val = "0b83c837da30e25f768f0527c039d341"
        with NamedTemporaryFile(dir=os.getcwd(),
                                suffix=".cbm",
                                delete=DELETE_TEMP_FILES) as fobj:
            blm = CountingBloomFilter(est_elements=10,
                                      false_positive_rate=0.01)
            blm.add("test")
            blm.add("out")
            blm.add("the")
            blm.add("counting")
            blm.add("bloom")
            blm.add("filter")

            blm.add("test")
            blm.add("Test")
            blm.add("out")
            blm.add("test")
            blm.export(fobj.name)

            md5_out = calc_file_md5(fobj.name)
            self.assertEqual(md5_out, md5_val)
Esempio n. 22
0
 def test_ebf_export(self):
     """basic expanding Bloom Filter export test"""
     with NamedTemporaryFile(dir=os.getcwd(), suffix=".ebf", delete=DELETE_TEMP_FILES) as fobj:
         blm = ExpandingBloomFilter(est_elements=25, false_positive_rate=0.05)
         blm.export(fobj.name)
         self.assertEqual(calc_file_md5(fobj.name), "eb5769ae9babdf7b37d6ce64d58812bc")