def test_fc_inverted_logic_automatic(self):
    """A cascade with no explicit invertedLogic setting must flip to
    inverted mode on its own when the include set is larger than the
    exclude set, and that mode must survive a serialization round-trip.
    """
    fc = filtercascade.FilterCascade(min_filter_length=1024)
    # invertedLogic starts unset (None), not False.
    self.assertEqual(None, fc.invertedLogic)
    iterator, huge_set = get_serial_iterator_and_set(num_iterator=100, num_set=50_000)
    # Should automatically invert the logic
    fc.initialize(include=huge_set, exclude=set(iterator))
    self.assertTrue(fc.invertedLogic)
    # The iterator above was consumed by set(); regenerate identical data
    # before verifying.
    iterator, huge_set = get_serial_iterator_and_set(num_iterator=100, num_set=50_000)
    fc.verify(include=huge_set, exclude=iterator)
    h = MockFile()
    fc.tofile(h)
    # Pin the exact serialized size for this workload.
    self.assertEqual(len(h.data), 1055)
    fc2 = filtercascade.FilterCascade.from_buf(h)
    self.assertFilterCascadeEqual(fc, fc2)
    # Fresh data once more: the deserialized copy must verify identically.
    iterator, huge_set = get_serial_iterator_and_set(num_iterator=100, num_set=50_000)
    fc2.verify(include=huge_set, exclude=iterator)
def test_fc_version_1_with_salt(self):
    """Salts are only supported by format version 2 and later, so asking
    for one together with version=1 must be rejected at construction."""
    with self.assertRaises(ValueError):
        filtercascade.FilterCascade(
            version=1,
            salt=b"happiness",
            defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
        )
def test_fc_inverted_logic_iterators(self):
    """Automatic logic inversion needs to size both inputs; a bare
    iterator on the exclude side cannot be sized, so initialize() raises."""
    cascade = filtercascade.FilterCascade()
    self.assertFalse(cascade.invertedLogic)
    serial_iter, big_set = get_serial_iterator_and_set(num_iterator=100, num_set=50_000)
    with self.assertRaises(filtercascade.InvertedLogicException):
        cascade.initialize(include=big_set, exclude=serial_iter)
def test_fc_input_formats(self):
    """str, bytes, and objects exposing a to-bytes conversion must all
    produce identical cascades for the same logical data."""
    from_str = filtercascade.FilterCascade([])
    from_str.initialize(include=["A"], exclude=["D"])

    from_bytes = filtercascade.FilterCascade([])
    from_bytes.initialize(include=[b"A"], exclude=[b"D"])

    inc_obj = SimpleToByteClass(ord("A"))
    exc_obj = SimpleToByteClass(ord("D"))
    from_obj = filtercascade.FilterCascade([])
    from_obj.initialize(include=[inc_obj], exclude=[exc_obj])
    # The conversion hook must actually have been invoked on both objects.
    self.assertTrue(inc_obj.method_called)
    self.assertTrue(exc_obj.method_called)

    self.assertFilterCascadeEqual(from_str, from_bytes)
    self.assertFilterCascadeEqual(from_str, from_obj)
def test_expected_error_rates(self):
    """set_crlite_error_rates derives per-layer false-positive rates from
    the two set sizes, stores them on the cascade, and rejects inputs
    that would need inverted logic (include larger than exclude)."""
    fc = filtercascade.FilterCascade()
    rates = fc.set_crlite_error_rates(include_len=50, exclude_len=1_000)
    self.assertAlmostEqual(rates[0], 0.0353, places=3)
    self.assertEqual(rates[1], 0.5)
    # The returned rates are also recorded on the instance.
    self.assertEqual(rates, fc.error_rates)
    with self.assertRaises(filtercascade.InvertedLogicException):
        fc.set_crlite_error_rates(include_len=1_000, exclude_len=50)
def test_fc_serial_deserial(self):
    """Round-trip a small cascade through the on-disk format and compare
    the whole cascade."""
    original = filtercascade.FilterCascade()
    original.initialize(exclude=["A", "B", "C"], include=["D"])
    buf = MockFile()
    original.tofile(buf)
    restored = filtercascade.FilterCascade.from_buf(buf)
    self.assertFilterCascadeEqual(original, restored)
def test_inverted_logic_erroneous_error_rate(self):
    """With 1000 blocked items and a single non-blocked one, the computed
    first-layer rate is far above 1.0 — an invalid error rate that must be
    rejected when constructing the cascade."""
    not_blocked = ["one_not_blocked_item"]
    blocked = [str(i) for i in range(1000)]
    rates = [len(blocked) / (math.sqrt(2) * len(not_blocked)), 0.5]
    with self.assertRaises(filtercascade.InvalidErrorRateException):
        filtercascade.FilterCascade(
            error_rates=rates,
            salt=b"VERY_PREDICTABLE",
            defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
        )
def test_fc_load_version_2(self):
    """Serialize a version-2 cascade and read it back unchanged."""
    original = filtercascade.FilterCascade([], version=2)
    valid, revoked = get_serial_sets(num_valid=10, num_revoked=1)
    original.initialize(include=revoked, exclude=valid)
    buf = MockFile()
    original.tofile(buf)
    restored = filtercascade.FilterCascade.from_buf(buf)
    self.assertFilterCascadeEqual(original, restored)
def test_fc_iterable(self):
    """A 500k-valid / 3k-revoked workload must settle into exactly three
    layers with a known size profile."""
    cascade = filtercascade.FilterCascade([])
    valid, revoked = get_serial_sets(num_valid=500_000, num_revoked=3_000)
    cascade.initialize(include=revoked, exclude=valid)
    self.assertEqual(len(cascade.filters), 3)
    for layer, expected_size in zip(cascade.filters, (81272, 14400, 14400)):
        self.assertEqual(layer.size, expected_size)
def test_fc_serial_deserial(self):
    """Round-trip a small cascade and compare it layer by layer."""
    original = filtercascade.FilterCascade([])
    original.initialize(include=["A", "B", "C"], exclude=["D"])
    buf = MockFile()
    original.tofile(buf)
    restored = filtercascade.FilterCascade.from_buf(buf)
    for index, layer in enumerate(original.filters):
        self.assertBloomerEqual(layer, restored.filters[index])
def test_fc_load_version_2(self):
    """Version-2 round-trip for a cascade built from iterator input."""
    original = filtercascade.FilterCascade(version=2)
    serial_iter, tiny_set = get_serial_iterator_and_set(num_iterator=10, num_set=1)
    original.initialize(include=tiny_set, exclude=serial_iter)
    buf = MockFile()
    original.tofile(buf)
    restored = filtercascade.FilterCascade.from_buf(buf)
    self.assertFilterCascadeEqual(original, restored)
def test_fc_small_filter_length(self):
    """A tiny min_filter_length yields a compact (280-byte) serialization
    that must still round-trip cleanly."""
    cascade = filtercascade.FilterCascade(min_filter_length=8)
    serial_iter, tiny_set = get_serial_iterator_and_set(num_iterator=5_000, num_set=100)
    cascade.initialize(include=tiny_set, exclude=serial_iter)
    buf = MockFile()
    cascade.tofile(buf)
    self.assertEqual(len(buf.data), 280)
    restored = filtercascade.FilterCascade.from_buf(buf)
    self.assertFilterCascadeEqual(cascade, restored)
def test_sha256_with_salt(self):
    """Salted SHA-256 cascade: one layer, known bit count, and a known
    serialized size (the salt is stored in the file)."""
    cascade = filtercascade.FilterCascade(
        [],
        hashAlg=filtercascade.HashAlgorithm.SHA256,
        salt=b"happiness",
    )
    valid, revoked = get_serial_sets(num_valid=10, num_revoked=1)
    cascade.initialize(include=revoked, exclude=valid)
    self.assertEqual(len(cascade.filters), 1)
    self.assertEqual(cascade.bitCount(), 81272)
    buf = MockFile()
    cascade.tofile(buf)
    self.assertEqual(len(buf.data), 10183)
def test_fc_iterable(self):
    """Build a cascade from a single generator: the first 3_000 serials
    are materialized as the revocation (include) set, and the remainder of
    the partially-consumed generator becomes the exclude side — the two
    are disjoint by construction.
    """
    f = filtercascade.FilterCascade([])
    serials = predictable_serial_gen(500_000)
    # revocations must be disjoint from the main set, so
    # slice off a set and re-use the remainder
    revocations = set(islice(serials, 3_000))
    f.initialize(include=revocations, exclude=serials)
    # Expected geometry for this workload: three layers with fixed sizes.
    self.assertEqual(len(f.filters), 3)
    self.assertEqual(f.filters[0].size, 81272)
    self.assertEqual(f.filters[1].size, 14400)
    self.assertEqual(f.filters[2].size, 14400)
def test_verify_failure(self):
    """
    This test cheats, changing the corpus of data out from under the
    Bloom filter. Not every such change would raise an AssertionError,
    particularly on these small data-sets.
    """
    cascade = filtercascade.FilterCascade([])
    valid, revoked = get_serial_sets(num_valid=10, num_revoked=1)
    cascade.initialize(include=revoked, exclude=valid)
    with self.assertRaises(AssertionError):
        # One extra revocation the filter never saw.
        altered_valid, altered_revoked = get_serial_sets(num_valid=10, num_revoked=2)
        cascade.verify(include=altered_revoked, exclude=altered_valid)
def test_fc_load_version_2_with_salt(self):
    """Version-2 round-trip with a salt and SHA-256 hashing."""
    original = filtercascade.FilterCascade(
        [],
        version=2,
        salt=b"nacl",
        hashAlg=filtercascade.HashAlgorithm.SHA256,
    )
    valid, revoked = get_serial_sets(num_valid=10, num_revoked=1)
    original.initialize(include=revoked, exclude=valid)
    buf = MockFile()
    original.tofile(buf)
    restored = filtercascade.FilterCascade.from_buf(buf)
    self.assertFilterCascadeEqual(original, restored)
def test_fc_inverted_logic_explicit(self):
    """invertedLogic=True can be forced at construction even for tiny
    inputs, and the flag must survive initialize, verify, and a
    serialization round-trip.
    """
    fc = filtercascade.FilterCascade(invertedLogic=True)
    iterator, small_set = get_serial_iterator_and_set(num_iterator=2, num_set=2)
    fc.initialize(include=small_set, exclude=set(iterator))
    self.assertTrue(fc.invertedLogic)
    # The iterator was consumed by set(); regenerate identical data to verify.
    iterator, small_set = get_serial_iterator_and_set(num_iterator=2, num_set=2)
    fc.verify(include=small_set, exclude=iterator)
    h = MockFile()
    fc.tofile(h)
    fc2 = filtercascade.FilterCascade.from_buf(h)
    # The inverted flag is part of the serialized form and must round-trip.
    self.assertTrue(fc2.invertedLogic)
    self.assertFilterCascadeEqual(fc, fc2)
def test_verify_failure(self):
    """
    This test cheats, changing the corpus of data out from under the
    Bloom filter. Not every such change would raise an AssertionError,
    particularly on these small data-sets.
    """
    cascade = filtercascade.FilterCascade()
    serial_iter, tiny_set = get_serial_iterator_and_set(num_iterator=10, num_set=1)
    cascade.initialize(include=tiny_set, exclude=serial_iter)
    with self.assertRaises(AssertionError):
        # A second include entry the filter was never built with.
        altered_iter, altered_set = get_serial_iterator_and_set(
            num_iterator=10, num_set=2)
        cascade.verify(include=altered_set, exclude=altered_iter)
def test_fc_load_version_2_with_salt(self):
    """Version-2 round-trip with a salt; logic stays non-inverted for a
    small include set."""
    original = filtercascade.FilterCascade(
        version=2,
        salt=b"nacl",
        defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
    )
    serial_iter, tiny_set = get_serial_iterator_and_set(num_iterator=10, num_set=1)
    original.initialize(include=tiny_set, exclude=serial_iter)
    self.assertFalse(original.invertedLogic)
    buf = MockFile()
    original.tofile(buf)
    restored = filtercascade.FilterCascade.from_buf(buf)
    self.assertFilterCascadeEqual(original, restored)
def test_sha256_with_salt(self):
    """Salted SHA-256 cascade built from iterator input: single layer,
    known bit count, known serialized size."""
    cascade = filtercascade.FilterCascade(
        salt=b"happiness",
        defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
    )
    serial_iter, tiny_set = get_serial_iterator_and_set(num_iterator=10, num_set=1)
    cascade.initialize(include=tiny_set, exclude=serial_iter)
    self.assertEqual(len(cascade.filters), 1)
    self.assertEqual(cascade.bitCount(), 8128)
    buf = MockFile()
    cascade.tofile(buf)
    self.assertEqual(len(buf.data), 1039)
def verify_minimum_sets(self, *, hashAlg):
    """Shared helper: build a minimal cascade with *hashAlg*, check its
    geometry and serialized size, then round-trip and re-verify."""
    cascade = filtercascade.FilterCascade([], hashAlg=hashAlg)
    valid, revoked = get_serial_sets(num_valid=10, num_revoked=1)
    cascade.initialize(include=revoked, exclude=valid)
    self.assertEqual(len(cascade.filters), 1)
    self.assertEqual(cascade.bitCount(), 81272)
    buf = MockFile()
    cascade.tofile(buf)
    self.assertEqual(len(buf.data), 10174)
    restored = filtercascade.FilterCascade.from_buf(buf)
    # Regenerate identical data for the deserialized copy.
    valid_again, revoked_again = get_serial_sets(num_valid=10, num_revoked=1)
    restored.verify(include=revoked_again, exclude=valid_again)
def test_fc_iterable(self):
    """Large iterator-built cascade: with 500k excluded and 3k included
    serials, expect a deep ten-layer cascade with a fixed per-layer size
    profile and a known total serialized size.
    """
    f = filtercascade.FilterCascade(filters=[])
    iterator, small_set = get_serial_iterator_and_set(num_iterator=500_000,
                                                      num_set=3_000)
    f.initialize(include=small_set, exclude=iterator)
    # include is the smaller side, so logic must remain non-inverted.
    self.assertFalse(f.invertedLogic)
    self.assertEqual(len(f.filters), 10)
    self.assertEqual(
        list(map(lambda x: x.size, f.filters)),
        [26824, 10624, 2184, 5208, 1440, 1872, 1440, 1440, 1440, 1440],
    )
    h = MockFile()
    f.tofile(h)
    # NOTE(review): relies on MockFile implementing __len__ — len(h), not
    # len(h.data) as the sibling tests use.
    self.assertEqual(len(h), 6843)
def verify_minimum_sets(self, *, hashAlg):
    """Shared helper: build a minimal cascade using *hashAlg* as the
    default hash, check its geometry and serialized size, then round-trip
    and re-verify with regenerated data."""
    cascade = filtercascade.FilterCascade(defaultHashAlg=hashAlg)
    serial_iter, tiny_set = get_serial_iterator_and_set(num_iterator=10, num_set=1)
    cascade.initialize(include=tiny_set, exclude=serial_iter)
    self.assertEqual(len(cascade.filters), 1)
    self.assertEqual(cascade.bitCount(), 8128)
    buf = MockFile()
    cascade.tofile(buf)
    self.assertEqual(len(buf.data), 1030)
    restored = filtercascade.FilterCascade.from_buf(buf)
    # The first iterator is exhausted; regenerate identical data.
    fresh_iter, fresh_set = get_serial_iterator_and_set(num_iterator=10, num_set=1)
    restored.verify(include=fresh_set, exclude=fresh_iter)
def test_increased_false_positive_rate_in_deeper_layer(self):
    """An aggressive first-layer error rate pushes work into deeper
    layers: expect a six-layer cascade with a small total bit count."""
    salt = b"VERY_PREDICTABLE"
    blocked = [str(i) for i in range(1, 1000)]
    not_blocked = [str(-i) for i in range(1, 1000)]
    fprs = [len(blocked) / (math.sqrt(2) * len(not_blocked)), 0.5]
    fc = filtercascade.FilterCascade(
        defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
        salt=salt,
        error_rates=fprs,
    )
    fc.initialize(include=blocked, exclude=not_blocked)
    fc.verify(include=blocked, exclude=not_blocked)
    self.assertEqual(len(fc.filters), 6)
    self.assertEqual(fc.bitCount(), 7992)
def test_set_error_rates(self):
    """set_error_rates rejects an empty list (ValueError) and any rate
    outside the open interval (0, 1), but accepts a well-formed pair."""
    fc = filtercascade.FilterCascade()
    with self.assertRaises(ValueError):
        fc.set_error_rates([])
    invalid_rate_lists = (
        [-1],
        [1.1],
        [0],
        [0, 0.25, 0.9],
        [0.25, 0.9, 1.0],
        [0.25, 0.9, 940],
    )
    for rates in invalid_rate_lists:
        with self.assertRaises(filtercascade.InvalidErrorRateException):
            fc.set_error_rates(rates)
    # A valid configuration must be accepted without raising.
    fc.set_error_rates([0.99, 0.01])
def test_fc_standard_logic_disk_layout(self):
    """Pin the leading header bytes of the serialized form for a
    non-inverted, salted, SHA-256 cascade.
    """
    fc = filtercascade.FilterCascade(
        defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
        salt=b"a")
    iterator, small_set = get_serial_iterator_and_set(num_iterator=50_000,
                                                      num_set=100)
    # include is the smaller side, so the logic should stay non-inverted.
    fc.initialize(include=small_set, exclude=iterator)
    self.assertFalse(fc.invertedLogic)
    h = MockFile()
    fc.tofile(h)
    # Header bytes checked below: version marker, then inverted flag at
    # offset 2, salt length, the salt itself, and the hash algorithm id.
    # (Offset 1 is skipped — presumably the high byte of a 2-byte version
    # field; TODO confirm against the fileformats definition.)
    self.assertEqual(h.data[0:1], b"\x02")
    self.assertEqual(h.data[2], 0)  # inverted
    self.assertEqual(h.data[3], 1)  # salt_len
    self.assertEqual(h.data[4], ord("a"))  # salt
    self.assertEqual(h.data[5], filtercascade.fileformats.HashAlgorithm.SHA256)
def test_fc_heterogenous_hash_algorithms(self):
    """A cascade whose layers use different hash algorithms can be
    serialized, but deserialization must reject the inconsistent file."""
    layers = [
        filtercascade.Bloomer(
            size=32,
            nHashFuncs=6,
            level=1,
            hashAlg=filtercascade.fileformats.HashAlgorithm.SHA256,
        ),
        filtercascade.Bloomer(
            size=32,
            nHashFuncs=1,
            level=2,
            hashAlg=filtercascade.fileformats.HashAlgorithm.MURMUR3,
        ),
    ]
    fc = filtercascade.FilterCascade(filters=layers)
    buf = MockFile()
    fc.tofile(buf)
    with self.assertRaises(ValueError):
        filtercascade.FilterCascade.from_buf(buf)
m = hashlib.sha256() m.update(counter.to_bytes(4, byteorder="big")) yield m.hexdigest() def store(fc, path): if path.exists(): path.unlink() with open(path, "wb") as f: fc.tofile(f) large_set = set(predictable_serial_gen(100_000)) v2_sha256_with_salt = filtercascade.FilterCascade( [], defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256, salt=b"nacl") v2_sha256_with_salt.initialize(include=[b"this", b"that"], exclude=large_set | set([b"other"])) store(v2_sha256_with_salt, Path("test_v2_sha256_salt_mlbf")) v2_sha256 = filtercascade.FilterCascade( [], defaultHashAlg=filtercascade.fileformats.HashAlgorithm.SHA256) v2_sha256.initialize(include=[b"this", b"that"], exclude=large_set | set([b"other"])) store(v2_sha256, Path("test_v2_sha256_mlbf")) v2_murmur = filtercascade.FilterCascade( [], defaultHashAlg=filtercascade.fileformats.HashAlgorithm.MURMUR3) v2_murmur.initialize(include=[b"this", b"that"], exclude=large_set | set([b"other"]))
def test_fc_include_not_list(self):
    """initialize() needs a sized include collection; a bare generator
    must be rejected with TypeError."""
    cascade = filtercascade.FilterCascade([])
    gen_include = predictable_serial_gen(1)
    gen_exclude = predictable_serial_gen(1)
    with self.assertRaises(TypeError):
        cascade.initialize(include=gen_include, exclude=gen_exclude)
def test_fc_exclude_must_be_iterable(self):
    """initialize() must raise TypeError when exclude is not iterable.

    NOTE(review): the previous version passed ``exclude=list(1)``, but
    ``list(1)`` itself raises TypeError while the call's arguments are
    being evaluated, so initialize() was never reached and the library's
    own validation went untested. Passing the bare non-iterable exercises
    the intended code path.
    """
    f = filtercascade.FilterCascade([])
    with self.assertRaises(TypeError):
        f.initialize(include=[], exclude=1)