def test_words(self):
    '''Check collision and error counts of the filter on word lists.

    Words loaded from "words" are fed into the filter, then every word
    loaded from "testwords" is queried.  A queried word that is also in
    the vocabulary must always test positive; a queried word that is
    not in the vocabulary may test positive only a few times.
    '''
    known_words = self.load_words('words')
    probe_words = self.load_words('testwords')
    word_filter = BloomFilter(100000, 1e-4)
    shared_words = set(known_words) & set(probe_words)

    # A word that already tests positive before being added counts as
    # a setup collision and is not added.
    collisions_during_setup = 0
    for entry in known_words:
        if not word_filter.test_by_hash(entry):
            word_filter.add_by_hash(entry)
            continue
        collisions_during_setup += 1
    self.assertLess(collisions_during_setup, 5)

    missed = 0
    spurious = 0
    for entry in probe_words:
        reported = word_filter.test_by_hash(entry)
        if entry in shared_words:
            # Word is in the vocabulary: a negative answer is an error.
            if not reported:
                missed += 1
        elif reported:
            # Word is not in the vocabulary but the filter claims it is.
            spurious += 1
    self.assertEqual(missed, 0)
    self.assertLessEqual(spurious, 6)
def test_returns_positive_when_hashes_collide(self):
    '''test_by_hash() is True for u"abc" once "abc" has been added'''
    filter_under_test = BloomFilter(1000000, 1e-3)
    filter_under_test.add_by_hash('abc')
    found = filter_under_test.test_by_hash(u'abc')
    self.assertEqual(found, True)
def test_all_test_positive_when_hashes_collide(self):
    '''BloomFilter.test_by_hash() returns False for a value never added'''
    filter_under_test = BloomFilter(1000000, 1e-3)
    # The filter holds only "abc" when "def" is queried.
    filter_under_test.add_by_hash('abc')
    found = filter_under_test.test_by_hash('def')
    self.assertEqual(found, False)
def test_all_test_positive_when_hashes_collide(self):
    """BloomFilter.test_by_hash() returns False for a value never added."""
    filter_under_test = BloomFilter(1000000, 1e-3)
    # The filter holds only 'abc' when 'def' is queried.
    filter_under_test.add_by_hash("abc")
    found = filter_under_test.test_by_hash("def")
    self.assertEqual(found, False)
def test_returns_true_positive_when_value_had_been_added(self):
    '''A value tests positive via test_by_hash() after add_by_hash()'''
    filter_under_test = BloomFilter(1000000, 1e-3)
    filter_under_test.add_by_hash('abc')
    found = filter_under_test.test_by_hash('abc')
    self.assertEqual(found, True)
def test_returns_positive_when_hashes_collide(self):
    """test_by_hash() is True for u'abc' once 'abc' has been added."""
    filter_under_test = BloomFilter(1000000, 1e-3)
    filter_under_test.add_by_hash("abc")
    found = filter_under_test.test_by_hash(u"abc")
    self.assertEqual(found, True)
def test_serializes_filter_serialize_without_line_feeds(self):
    '''BloomFilter serializes with base64 shield without line feeds

    The serialized value may be text or bytes, so the line-feed needle
    is chosen to match its type: searching a bytes value for a text
    needle raises TypeError on Python 3 instead of reporting a result.
    '''
    bloom_filter = BloomFilter(100, 0.1)
    bloom_filter.add_by_hash('abcdef')
    serialized_filter = bloom_filter.serialize()
    # Pick a needle of the same type as the serialized value.
    line_feed = b'\n' if isinstance(serialized_filter, bytes) else '\n'
    self.assertNotIn(line_feed, serialized_filter)
def test_serializes_filter_serialize(self):
    '''serialize() output fed to deserialize() yields equal raw data'''
    original = BloomFilter(100, 0.1)
    original.add_by_hash('abcdef')
    restored = BloomFilter.deserialize(original.serialize())
    self.assertEqual(original.raw_data(), restored.raw_data())
def test_serializes_filter_serialize_without_line_feeds(self):
    """Serializes with base64 shield without line feeds."""
    bloom_filter = BloomFilter(100, 0.1)
    bloom_filter.add_by_hash("abcdef")
    serialized_filter = bloom_filter.serialize()
    # The needle is a bytes literal, matching a bytes serialized value;
    # assertNotIn reports both operands when the check fails.
    self.assertNotIn(b"\n", serialized_filter)
def test_serializes_filter_serialize(self):
    """Output of serialize() fed to deserialize() yields equal raw data."""
    original = BloomFilter(100, 0.1)
    original.add_by_hash("abcdef")
    restored = BloomFilter.deserialize(original.serialize())
    self.assertEqual(original.raw_data(), restored.raw_data())
def test_non_randoms_at_all(self):
    '''Consecutive integers must not collide with each other.

    Each integer in range(1000000) is queried before it is added; any
    positive answer at that point is counted as a collision, and none
    are tolerated.
    '''
    int_filter = BloomFilter(1000000, 1e-5)
    collisions = 0
    for number in range(1000000):
        if not int_filter.test_by_hash(number):
            int_filter.add_by_hash(number)
            continue
        collisions += 1
    self.assertEqual(collisions, 0)
def test_objects(self):
    '''Plain object() instances must not collide with each other.'''
    # An object without its own __hash__ is hashed from its address,
    # which is far from random.
    #
    # Beware: memory gets reused, so creating objects one at a time in
    # a loop and dropping each before the next can yield only a couple
    # of distinct hashes.  All instances are therefore created up
    # front and kept alive in a list for the whole test.
    object_filter = BloomFilter(1000000, 1e-5)
    collisions = 0
    instances = [object() for _ in range(1000000)]
    for instance in instances:
        if not object_filter.test_by_hash(instance):
            object_filter.add_by_hash(instance)
            continue
        collisions += 1
    self.assertEqual(collisions, 0)
def test_returns_false_when_readding_hash(self):
    '''A second add_by_hash() of the same value returns a false value'''
    filter_under_test = BloomFilter(1000000, 1e-3)
    filter_under_test.add_by_hash('abc')
    second_add_result = filter_under_test.add_by_hash('abc')
    self.assertFalse(second_add_result)
def test_returns_true_when_first_adding_hash(self):
    '''The first add_by_hash() of a value returns a true value'''
    filter_under_test = BloomFilter(1000000, 1e-3)
    first_add_result = filter_under_test.add_by_hash('abc')
    self.assertTrue(first_add_result)
def test_returns_false_when_readding_hash(self):
    """A second add_by_hash() of the same value returns a false value."""
    filter_under_test = BloomFilter(1000000, 1e-3)
    filter_under_test.add_by_hash("abc")
    second_add_result = filter_under_test.add_by_hash("abc")
    self.assertFalse(second_add_result)
def test_returns_true_when_first_adding_hash(self):
    """The first add_by_hash() of a value returns a true value."""
    filter_under_test = BloomFilter(1000000, 1e-3)
    first_add_result = filter_under_test.add_by_hash("abc")
    self.assertTrue(first_add_result)