Beispiel #1
0
    def test_intersection_capacity_fail(self):
        """Intersecting two filters built with different sizes raises ValueError."""
        large_filter = BloomFilter(1000, 0.001)
        small_filter = BloomFilter(100, 0.001)

        # Same second argument (0.001), different first argument (1000 vs 100).
        with self.assertRaises(ValueError):
            large_filter.intersection(small_filter)
Beispiel #2
0
    def test_union_k_fail(self):
        """Union of two filters built with different error rates raises ValueError."""
        loose_filter = BloomFilter(100, 0.01)
        strict_filter = BloomFilter(100, 0.001)

        # Same first argument (100), different second argument (0.01 vs 0.001).
        # NOTE(review): the "k" in the test name presumably refers to the
        # number of hash functions derived from the error rate -- confirm.
        with self.assertRaises(ValueError):
            loose_filter.union(strict_filter)
Beispiel #3
0
    def test_union_size(self):
        """Compare filter counts, and the count of their union, with exact set sizes."""
        fpr = 0.001
        # The false positive rate is high for small inputs, so use bigger sets.
        first_filter = BloomFilter(100000, fpr)
        second_filter = BloomFilter(100000, fpr)
        items_a = [str(random.getrandbits(8)) for _ in range(10000)]
        items_b = [str(random.getrandbits(8)) for _ in range(10000)]

        for item in items_a:
            first_filter.add(item)
        for item in items_b:
            second_filter.add(item)

        union_filter = first_filter.union(second_filter)

        # Read the counts in the same order as before: inputs first, union last.
        first_count = first_filter.count
        second_count = second_filter.count

        unique_a = set(items_a)
        unique_b = set(items_b)

        union_count = union_filter.count
        expected_union = len(unique_a | unique_b)

        # Each input filter must report exactly its number of distinct items.
        assert first_count == len(unique_a)
        assert second_count == len(unique_b)
        # The union's count may deviate by a relative tolerance of fpr.
        lower_bound = expected_union * (1 - fpr)
        upper_bound = expected_union * (1 + fpr)
        assert lower_bound <= union_count <= upper_bound
Beispiel #4
0
    def test_intersection_size(self):
        """Compare filter counts, and the count of their intersection, with exact set sizes."""
        fpr = 0.001
        # The false positive rate is high for small inputs, so use bigger sets.
        first_filter = BloomFilter(100000, fpr)
        second_filter = BloomFilter(100000, fpr)
        items_a = [str(random.getrandbits(14)) for _ in range(71000)]
        items_b = [str(random.getrandbits(12)) for _ in range(69000)]

        for item in items_a:
            first_filter.add(item)
        for item in items_b:
            second_filter.add(item)

        common_filter = first_filter.intersection(second_filter)

        # Read the counts in the same order as before: inputs first, result last.
        first_count = first_filter.count
        second_count = second_filter.count

        unique_a = set(items_a)
        unique_b = set(items_b)

        common_count = common_filter.count
        expected_common = len(unique_a & unique_b)

        # Each input filter must report exactly its number of distinct items.
        assert first_count == len(unique_a)
        assert second_count == len(unique_b)
        # The intersected filter is guaranteed to contain every common element,
        # but its false positive rate may be slightly higher than that of a
        # filter built from the exact intersection, hence the 2 * fpr tolerance.
        lower_bound = expected_common * (1 - 2 * fpr)
        upper_bound = expected_common * (1 + 2 * fpr)
        assert lower_bound <= common_count <= upper_bound
Beispiel #5
0
def initBloomFilter(stringUnique):
    '''
    Build a Bloom filter over ``stringUnique`` and collect every key's hashes.

    The filter is created with a capacity of 1.2 times the number of keys
    and an error rate of 0.30; each key is added and its ``BF.keyhash``
    values are recorded as integers.

    :param stringUnique: sized collection of keys; ``len()`` is used to size
        the filter and every element is inserted.
    :return: tuple ``(BF, BF.num_bits, BF.bitarray, keyHashMat)`` where
        ``keyHashMat`` is a matrix of shape (hashLength x keyNum), i.e. one
        column of hash values per key.
    '''
    print('---initialize Bloom Filter---')
    BF = BloomFilter(capacity=int(len(stringUnique) * 1.2), error_rate=0.30)
    keyHashMat = []
    for key in stringUnique:
        BF.add(key)
        keyHashMat.append([int(x) for x in BF.keyhash(key)])
    # np.mat was removed in NumPy 2.0; np.asmatrix is the function it aliased,
    # so the result is the same matrix on old and new NumPy versions alike.
    keyHashMat = np.asmatrix(keyHashMat).T
    return BF, BF.num_bits, BF.bitarray, keyHashMat
Beispiel #6
0
 def test_union(self):
     """The union of two half-alphabet filters contains every letter a-z."""
     upper_half_filter = BloomFilter(100, 0.001)
     lower_half_filter = BloomFilter(100, 0.001)
     letters = [chr(code) for code in range(97, 123)]
     midpoint = int(len(letters) / 2)
     # The second half of the alphabet goes into one filter ...
     for letter in letters[midpoint:]:
         upper_half_filter.add(letter)
     # ... and the first half into the other.
     for letter in letters[:midpoint]:
         lower_half_filter.add(letter)
     combined = upper_half_filter.union(lower_half_filter)
     for letter in letters:
         self.assertTrue(letter in combined)
 def test_union(self):
     """Every character added to either filter is found in their union."""
     tail_filter = BloomFilter(100, 0.001)
     head_filter = BloomFilter(100, 0.001)
     symbols = [chr(code) for code in range_fn(97, 123)]
     split_at = int(len(symbols) / 2)
     head, tail = symbols[:split_at], symbols[split_at:]
     # Each filter receives one half of the characters.
     for symbol in tail:
         tail_filter.add(symbol)
     for symbol in head:
         head_filter.add(symbol)
     merged = tail_filter.union(head_filter)
     for symbol in symbols:
         self.assertTrue(symbol in merged)
Beispiel #8
0
from pybloom.pybloom import BloomFilter

import bitarray
import numpy as np
from sklearn import linear_model
from scipy.linalg import solve

f = BloomFilter(capacity=60, error_rate=0.30)

# Insert the integers 0..49.  A plain loop is used instead of a list
# comprehension so no throwaway list of add() results is built.
for x in range(50):
    f.add(x)

# print(9 in f)
# print('num of key: '+str(len(f)))
print(f.bitarray)  # the final bit array after all insertions
# f.add('10')
# print(f.bitarray)
# print(len(f.bitarray))
print(f.num_bits)  # the Bloom filter's size (length)
#
# a = bitarray.bitarray('0'*10)
# print(a)

# Show the hash values the filter derives for the key 5.
print(f.keyhash(5))
#
#
#
# a = np.mat(np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1, ]]).T)
# y = np.mat(np.array([1, 1, 2, 2])).T
#
# num = np.mat(np.array([1, 1, 2])).T
#
Beispiel #9
0
            filter = klass(*args)
            for item in self.EXPECTED:
                filter.add(item)

            f = tempfile.TemporaryFile()
            filter.tofile(f)
            stringio = StringIO()
            filter.tofile(stringio)
            streams_to_test = [f, stringio]
            if not running_python_3:
                cstringio = cStringIO.StringIO()
                filter.tofile(cstringio)
                streams_to_test.append(cstringio)

            del filter

            for stream in streams_to_test:
                stream.seek(0)
                filter = klass.fromfile(stream)
                for item in self.EXPECTED:
                    self.assertTrue(item in filter)
                del(filter)
                stream.close()

if __name__ == '__main__':
    # unittest.main()
    # Fill a filter up to its declared capacity with consecutive integers,
    # then print whether 0 is reported as a member.
    bloom = BloomFilter(capacity=10000, error_rate=0.001)
    for value in range_fn(0, bloom.capacity):
        bloom.add(value)
    print(0 in bloom)
import tempfile
import time
from pybloom.pybloom import BloomFilter

# Micro-benchmark: for several error rates and filter sizes, print one row
#   <X> <p> <ns per add> <ns per lookup of added key> <ns per lookup of absent key>
# The original used Python 2 print statements and xrange, which are a
# SyntaxError / NameError on Python 3; this version runs on both and emits
# the same space-separated row.
NS = 10**9  # nanoseconds per second

# xrange exists only on Python 2; fall back to range on Python 3.
try:
    _range = xrange
except NameError:
    _range = range

for _p in _range(1, 3):
    p = 10 ** _p  # error rate is 1/p, i.e. 0.1 and 0.01
    for e in _range(9):
        X = int(1000 * 10 ** (e / 2.0))  # number of keys, 10**3 .. 10**7
        bloomfilter = BloomFilter(X + 1, 1.0 / p)

        # Time X insertions.
        t = time.time()
        for x in _range(X):
            bloomfilter.add(x)
        add_ns = (time.time() - t) / X * NS

        # Time X lookups of keys that were inserted.
        t = time.time()
        for x in _range(X):
            x in bloomfilter
        hit_ns = (time.time() - t) / X * NS

        # Time X lookups of keys that were never inserted.
        t = time.time()
        for x in _range(X, 2 * X):
            x in bloomfilter
        miss_ns = (time.time() - t) / X * NS

        # %s formats each value with str(), matching the print statement.
        print("%s %s %s %s %s" % (X, p, add_ns, hit_ns, miss_ns))

import redis
import csv
from pybloom.pybloom import BloomFilter

redis_client = redis.StrictRedis(host='192.168.192.12', port=6381, db=2)
filter_key = 'test'
# Remove whatever is currently stored under the filter's key before use.
redis_client.delete(filter_key)


# NOTE(review): this BloomFilter is constructed from a redis client and a key
# in addition to two numeric arguments (presumably capacity and error rate)
# -- confirm against the imported class.
bf = BloomFilter(redis_client, filter_key, 100000000, 0.0000001)

# Named item_id rather than id so the builtin id() is not shadowed.
item_id = '1000'
if not bf.contain(item_id):
    bf.add(item_id)
else:
    print("already in")
Beispiel #12
0
import tempfile
import time
from pybloom.pybloom import BloomFilter

# Micro-benchmark: for several error rates and filter sizes, print one row
#   <X> <p> <ns per add> <ns per lookup of added key> <ns per lookup of absent key>
# The original relied on Python 2 print statements (written as
# `print(...) / X * NS`, which only looks like a function call) and on
# xrange; neither works on Python 3.  This version runs on both and emits
# the same space-separated row.
NS = 10**9  # nanoseconds per second

# xrange exists only on Python 2; fall back to range on Python 3.
try:
    _range = xrange
except NameError:
    _range = range

for _p in _range(1, 3):
    p = 10**_p  # error rate is 1/p, i.e. 0.1 and 0.01
    for e in _range(9):
        X = int(1000 * 10**(e / 2.0))  # number of keys, 10**3 .. 10**7
        bloomfilter = BloomFilter(X + 1, 1.0 / p)

        # Time X insertions.
        t = time.time()
        for x in _range(X):
            bloomfilter.add(x)
        add_ns = (time.time() - t) / X * NS

        # Time X lookups of keys that were inserted.
        t = time.time()
        for x in _range(X):
            x in bloomfilter
        hit_ns = (time.time() - t) / X * NS

        # Time X lookups of keys that were never inserted.
        t = time.time()
        for x in _range(X, 2 * X):
            x in bloomfilter
        miss_ns = (time.time() - t) / X * NS

        # %s formats each value with str(), matching the print statement.
        print("%s %s %s %s %s" % (X, p, add_ns, hit_ns, miss_ns))
def get_filter(shingles, capacity=10000, error_rate=0.001):
    """Build a Bloom filter holding every shingle as a space-joined string.

    :param shingles: iterable of shingles; each shingle is an iterable of
        strings that is joined with single spaces before being added.
    :param capacity: capacity passed to ``BloomFilter``; defaults to the
        previously hard-coded 10000.
    :param error_rate: error rate passed to ``BloomFilter``; defaults to the
        previously hard-coded 0.001.
    :return: the populated ``BloomFilter``.
    """
    bloom = BloomFilter(capacity=capacity, error_rate=error_rate)
    for shingle in shingles:
        bloom.add(" ".join(shingle))
    return bloom
Beispiel #14
0
import redis
import csv
from pybloom.pybloom import BloomFilter

redis_client = redis.StrictRedis(host='192.168.192.12', port=6381, db=2)
filter_key = 'test'
# Remove whatever is currently stored under the filter's key before use.
redis_client.delete(filter_key)

# NOTE(review): this BloomFilter is constructed from a redis client and a key
# in addition to two numeric arguments (presumably capacity and error rate)
# -- confirm against the imported class.
bf = BloomFilter(redis_client, filter_key, 100000000, 0.0000001)

# Named item_id rather than id so the builtin id() is not shadowed.
item_id = '1000'
if not bf.contain(item_id):
    bf.add(item_id)
else:
    print("already in")
Beispiel #15
0
 def test_nstar(self):
     """nstar() lies strictly within 10 of the number of characters added."""
     bloom = BloomFilter(1000, 0.001)
     symbols = [chr(code) for code in range_fn(0, 200)]
     for symbol in symbols:
         bloom.add(symbol)
     # Two separate assertions keep the same call sequence as the combined
     # `lower < nstar() and nstar() < upper` check.
     self.assertTrue(bloom.nstar() > len(symbols) - 10)
     self.assertTrue(bloom.nstar() < len(symbols) + 10)
Beispiel #16
0
    def test_nstar_intersection_2(self):
        """With disjoint halves in two filters, nstar_intersection() is below 10."""
        first = BloomFilter(200, 0.001)
        second = BloomFilter(200, 0.001)
        symbols = [chr(code) for code in range_fn(0, 200)]
        split_at = int(len(symbols) / 2)
        # The filters receive disjoint halves of the characters.
        for symbol in symbols[split_at:]:
            first.add(symbol)
        for symbol in symbols[:split_at]:
            second.add(symbol)
        intersected = first.intersection(second)

        # Each filter's estimate must lie strictly within 10 of half the input.
        half = len(symbols) / 2
        self.assertTrue(first.nstar() > half - 10)
        self.assertTrue(first.nstar() < half + 10)
        self.assertTrue(second.nstar() > half - 10)
        self.assertTrue(second.nstar() < half + 10)

        # nstar() is expected to give a wrong (too large) estimate on the
        # intersected filter itself ...
        self.assertTrue(intersected.nstar() > 10)

        # ... whereas nstar_intersection() on the two originals stays small.
        self.assertTrue(first.nstar_intersection(second) < 10)
Beispiel #17
0
    def test_nstar_union(self):
        """nstar() of a union lies within 10 of the total number of characters added."""
        first = BloomFilter(200, 0.001)
        second = BloomFilter(200, 0.001)
        symbols = [chr(code) for code in range_fn(0, 200)]
        split_at = int(len(symbols) / 2)
        # The filters receive disjoint halves of the characters.
        for symbol in symbols[:split_at]:
            first.add(symbol)
        for symbol in symbols[split_at:]:
            second.add(symbol)
        merged = first.union(second)

        # Each input filter's estimate must lie strictly within 10 of half the input.
        half = len(symbols) / 2
        self.assertTrue(first.nstar() > half - 10)
        self.assertTrue(first.nstar() < half + 10)
        self.assertTrue(second.nstar() > half - 10)
        self.assertTrue(second.nstar() < half + 10)
        # The union's estimate must lie strictly within 10 of the full input.
        total = len(symbols)
        self.assertTrue(merged.nstar() > total - 10)
        self.assertTrue(merged.nstar() < total + 10)