def test_words(self):
        """Exercise the filter with string keys taken from two word lists.

        The lists come from self.load_words('words') and
        self.load_words('testwords'). Each word of the first list is added
        unless the filter already reports it; each word of the second list
        is then probed. Words found in both lists must never be missed, and
        at most six of the remaining words may be reported by mistake.
        """
        known_words = self.load_words('words')
        probe_words = self.load_words('testwords')
        bloom = BloomFilter(100000, 1e-4)

        # Words appearing in both lists: the filter is expected to report
        # every one of them.
        shared_words = set(known_words) & set(probe_words)

        # Fill the filter, counting words it already reports before they
        # have been added.
        premature_hits = 0
        for entry in known_words:
            if not bloom.test_by_hash(entry):
                bloom.add_by_hash(entry)
                continue
            premature_hits += 1
        self.assertLess(premature_hits, 5)

        missed = 0
        spurious = 0
        for entry in probe_words:
            expected = entry in shared_words
            reported = bloom.test_by_hash(entry)
            if expected and not reported:
                missed += 1
            elif reported and not expected:
                spurious += 1
        self.assertEqual(missed, 0)
        self.assertLessEqual(spurious, 6)
Esempio n. 2
0
    def test_returns_positive_when_hashes_collide(self):
        """A key added as 'abc' is reported when probed with u'abc'."""
        bloom = BloomFilter(1000000, 1e-3)
        bloom.add_by_hash('abc')

        lookup_result = bloom.test_by_hash(u'abc')

        self.assertEqual(lookup_result, True)
Esempio n. 3
0
    def test_all_test_positive_when_hashes_collide(self):
        """A key that was never added is not reported.

        Only 'abc' is inserted; probing 'def' must compare equal to False.
        """
        bloom = BloomFilter(1000000, 1e-3)
        bloom.add_by_hash('abc')

        lookup_result = bloom.test_by_hash('def')

        self.assertEqual(lookup_result, False)
    def test_all_test_positive_when_hashes_collide(self):
        """Probing a key other than the one inserted compares equal to False.

        The filter holds only "abc" when "def" is looked up.
        """
        populated = BloomFilter(1000000, 1e-3)
        populated.add_by_hash("abc")

        was_found = populated.test_by_hash("def")

        self.assertEqual(was_found, False)
Esempio n. 5
0
    def test_returns_true_positive_when_value_had_been_added(self):
        """Probing a key right after adding it compares equal to True."""
        bloom = BloomFilter(1000000, 1e-3)
        bloom.add_by_hash('abc')

        lookup_result = bloom.test_by_hash('abc')

        self.assertEqual(lookup_result, True)
    def test_returns_positive_when_hashes_collide(self):
        """The literal u"abc" is reported once "abc" has been added."""
        populated = BloomFilter(1000000, 1e-3)
        populated.add_by_hash("abc")

        was_found = populated.test_by_hash(u"abc")

        self.assertEqual(was_found, True)
Esempio n. 7
0
    def test_serializes_filter_serialize_without_line_feeds(self):
        """The serialized form of a filter contains no line feed.

        The needle is chosen to match the type serialize() returned:
        searching a bytes value for a str needle raises TypeError on
        Python 3, which would fail the test for the wrong reason.
        """
        bloom_filter = BloomFilter(100, 0.1)
        bloom_filter.add_by_hash('abcdef')

        serialized_filter = bloom_filter.serialize()

        # Use a bytes newline for bytes output and a text newline otherwise.
        line_feed = b'\n' if isinstance(serialized_filter, bytes) else '\n'
        self.assertEqual(serialized_filter.find(line_feed), -1)
Esempio n. 8
0
    def test_serializes_filter_serialize(self):
        """serialize() followed by deserialize() preserves raw_data()."""
        original = BloomFilter(100, 0.1)
        original.add_by_hash('abcdef')

        restored = BloomFilter.deserialize(original.serialize())

        self.assertEqual(original.raw_data(), restored.raw_data())
    def test_serializes_filter_serialize_without_line_feeds(self):
        """The serialized form of a filter contains no line feed byte."""
        bloom_filter = BloomFilter(100, 0.1)
        bloom_filter.add_by_hash("abcdef")

        serialized_filter = bloom_filter.serialize()

        # assertNotIn reports both the needle and the container on failure,
        # unlike assertTrue on a precomputed boolean.
        self.assertNotIn(b"\n", serialized_filter)
Esempio n. 10
0
    def test_serializes_filter_serialize(self):
        """A filter rebuilt from its serialized form has equal raw_data()."""
        source_filter = BloomFilter(100, 0.1)
        source_filter.add_by_hash("abcdef")
        payload = source_filter.serialize()

        rebuilt_filter = BloomFilter.deserialize(payload)

        self.assertEqual(source_filter.raw_data(), rebuilt_filter.raw_data())
 def test_non_randoms_at_all(self):
     """Consecutive integer keys must not be confused with one another.

     Each integer in range(1000000) is probed and then added if the
     filter did not report it; no integer may be reported before it has
     been added.
     """
     bloom = BloomFilter(1000000, 1e-5)
     premature_hits = 0
     for number in range(1000000):
         if not bloom.test_by_hash(number):
             bloom.add_by_hash(number)
             continue
         premature_hits += 1
     self.assertEqual(premature_hits, 0)
 def test_objects(self):
     """Plain object() instances used as keys must not collide.

     One million instances are probed and then added if the filter did
     not report them; none may be reported before it has been added.
     """
     # An object without its own __hash__ hashes by its address, so these
     # keys are not very random.
     #
     # Note: because memory is reused, hash collisions between objects
     # are a real possibility. For example:
     #     for ix in xrange(1000000):
     #       obj = object()
     # produces objects with exactly two hashes.
     #
     # Building the full list first keeps every instance referenced
     # while the loop runs (presumably to rule out that reuse).
     bloom = BloomFilter(1000000, 1e-5)
     premature_hits = 0
     instances = [object() for _ in range(1000000)]
     for instance in instances:
         if not bloom.test_by_hash(instance):
             bloom.add_by_hash(instance)
             continue
         premature_hits += 1
     self.assertEqual(premature_hits, 0)
Esempio n. 13
0
    def test_returns_false_when_readding_hash(self):
        """Adding the same key a second time yields a falsy result."""
        bloom = BloomFilter(1000000, 1e-3)
        bloom.add_by_hash('abc')

        second_add = bloom.add_by_hash('abc')

        self.assertFalse(second_add)
Esempio n. 14
0
    def test_returns_true_when_first_adding_hash(self):
        """Adding a key to a fresh filter yields a truthy result."""
        bloom = BloomFilter(1000000, 1e-3)

        first_add = bloom.add_by_hash('abc')

        self.assertTrue(first_add)
Esempio n. 15
0
    def test_returns_false_when_readding_hash(self):
        """The second add_by_hash() call for one key gives a falsy value."""
        populated = BloomFilter(1000000, 1e-3)
        populated.add_by_hash("abc")

        repeat_outcome = populated.add_by_hash("abc")

        self.assertFalse(repeat_outcome)
Esempio n. 16
0
    def test_returns_true_when_first_adding_hash(self):
        """The first add_by_hash() call for a key gives a truthy value."""
        fresh_filter = BloomFilter(1000000, 1e-3)

        initial_outcome = fresh_filter.add_by_hash("abc")

        self.assertTrue(initial_outcome)