def test_intersection_size(self):
    """Compare the ``count`` of an intersected filter with the exact set intersection.

    Two filters are filled from random numeric strings; each filter's
    ``count`` must equal the number of distinct strings it received, and
    the intersected filter's ``count`` must lie within ``2 * fpr`` of the
    size of the true intersection.
    """
    fpr = 0.001
    # With small inputs the false-positive noise is large, so use big sets.
    first_filter = BloomFilter(100000, fpr)
    second_filter = BloomFilter(100000, fpr)

    listA = [str(random.getrandbits(14)) for _ in range(71000)]
    listB = [str(random.getrandbits(12)) for _ in range(69000)]
    for item in listA:
        first_filter.add(item)
    for item in listB:
        second_filter.add(item)

    intersected = first_filter.intersection(second_filter)

    distinct_a = set(listA)
    distinct_b = set(listB)
    assert first_filter.count == len(distinct_a)
    assert second_filter.count == len(distinct_b)

    # The intersected filter holds every common element, but its
    # false-positive rate may be slightly higher than that of a pure
    # intersection, hence the doubled tolerance.
    expected = len(distinct_a.intersection(distinct_b))
    lower_bound = expected * (1 - 2 * fpr)
    upper_bound = expected * (1 + 2 * fpr)
    assert lower_bound <= intersected.count <= upper_bound
def test_union_size(self):
    """Compare the ``count`` of a merged filter with the exact set union.

    Two filters are filled from random numeric strings; each filter's
    ``count`` must equal the number of distinct strings it received, and
    the merged filter's ``count`` must lie within ``fpr`` of the size of
    the true union.
    """
    fpr = 0.001
    # With small inputs the false-positive noise is large, so use big sets.
    first_filter = BloomFilter(100000, fpr)
    second_filter = BloomFilter(100000, fpr)

    listA = [str(random.getrandbits(8)) for _ in range(10000)]
    listB = [str(random.getrandbits(8)) for _ in range(10000)]
    for item in listA:
        first_filter.add(item)
    for item in listB:
        second_filter.add(item)

    merged = first_filter.union(second_filter)

    distinct_a = set(listA)
    distinct_b = set(listB)
    assert first_filter.count == len(distinct_a)
    assert second_filter.count == len(distinct_b)

    expected = len(distinct_a.union(distinct_b))
    lower_bound = expected * (1 - fpr)
    upper_bound = expected * (1 + fpr)
    assert lower_bound <= merged.count <= upper_bound
def test_union(self):
    """The union of two filters must report every key added to either one."""
    upper_half_filter = BloomFilter(100, 0.001)
    lower_half_filter = BloomFilter(100, 0.001)

    letters = [chr(code) for code in range_fn(97, 123)]  # 'a' through 'z'
    midpoint = len(letters) // 2

    # Each filter receives a disjoint half of the alphabet.
    for letter in letters[midpoint:]:
        upper_half_filter.add(letter)
    for letter in letters[:midpoint]:
        lower_half_filter.add(letter)

    combined = upper_half_filter.union(lower_half_filter)
    for letter in letters:
        self.assertTrue(letter in combined)
def test_union(self):
    """Merging two filters must keep every key that was in either input."""
    filters = (BloomFilter(100, 0.001), BloomFilter(100, 0.001))
    first, second = filters

    alphabet = [chr(code) for code in range(97, 123)]  # 'a' through 'z'
    split_at = len(alphabet) // 2
    tail, head = alphabet[split_at:], alphabet[:split_at]

    # The first filter gets the second half of the alphabet, the second
    # filter gets the first half, so the inputs do not overlap.
    for letter in tail:
        first.add(letter)
    for letter in head:
        second.add(letter)

    merged = first.union(second)
    for letter in alphabet:
        self.assertTrue(letter in merged)
def test_nstar_union(self):
    """``nstar()`` must stay within 10 of the true key count, before and after a union."""
    lower_filter = BloomFilter(200, 0.001)
    upper_filter = BloomFilter(200, 0.001)

    keys = [chr(code) for code in range_fn(0, 200)]
    total = len(keys)
    half = total // 2

    # Disjoint halves: each filter holds `half` keys, the union holds all.
    for key in keys[:half]:
        lower_filter.add(key)
    for key in keys[half:]:
        upper_filter.add(key)
    merged = lower_filter.union(upper_filter)

    self.assertTrue(half - 10 < lower_filter.nstar() < half + 10)
    self.assertTrue(half - 10 < upper_filter.nstar() < half + 10)
    self.assertTrue(total - 10 < merged.nstar() < total + 10)
def initBloomFilter(stringUnique):
    """Build a Bloom filter over ``stringUnique`` and record each key's hashes.

    The filter is created with capacity ``int(len(stringUnique) * 1.2)``
    and an error rate of 0.30. Every key is added to the filter and the
    values returned by ``BF.keyhash(key)`` are converted to ``int`` and
    stored as one row per key; the rows are then transposed.

    :param stringUnique: sized iterable of keys to insert (presumably
        already de-duplicated, going by the name -- confirm with callers).
    :return: tuple ``(BF, BF.num_bits, BF.bitarray, keyHashMat)`` where
        ``keyHashMat`` is a ``numpy.matrix`` of shape
        (hash length x number of keys).
    """
    print('---initialize Bloom Filter---')
    BF = BloomFilter(capacity=int(len(stringUnique) * 1.2), error_rate=0.30)

    hash_rows = []
    for key in stringUnique:
        BF.add(key)
        hash_rows.append([int(h) for h in BF.keyhash(key)])

    # np.mat was removed in NumPy 2.0; np.asmatrix is the function it
    # aliased and is available in older NumPy releases as well.
    keyHashMat = np.asmatrix(hash_rows).T
    return BF, BF.num_bits, BF.bitarray, keyHashMat
def test_nstar_intersection_2(self):
    """Intersect two filters built from disjoint key sets and check the estimates.

    Each input filter's ``nstar()`` must be within 10 of its true key
    count. ``nstar()`` on the intersected filter is expected to exceed 10,
    while ``nstar_intersection`` on the two inputs must stay below 10.
    """
    upper_filter = BloomFilter(200, 0.001)
    lower_filter = BloomFilter(200, 0.001)

    keys = [chr(code) for code in range_fn(0, 200)]
    half = len(keys) // 2

    for key in keys[half:]:
        upper_filter.add(key)
    for key in keys[:half]:
        lower_filter.add(key)
    intersected = upper_filter.intersection(lower_filter)

    self.assertTrue(half - 10 < upper_filter.nstar() < half + 10)
    self.assertTrue(half - 10 < lower_filter.nstar() < half + 10)

    # The nstar operator will fail on the intersection of the filters.
    self.assertTrue(intersected.nstar() > 10)
    self.assertTrue(upper_filter.nstar_intersection(lower_filter) < 10)
filter = klass(*args) for item in self.EXPECTED: filter.add(item) f = tempfile.TemporaryFile() filter.tofile(f) stringio = StringIO() filter.tofile(stringio) streams_to_test = [f, stringio] if not running_python_3: cstringio = cStringIO.StringIO() filter.tofile(cstringio) streams_to_test.append(cstringio) del filter for stream in streams_to_test: stream.seek(0) filter = klass.fromfile(stream) for item in self.EXPECTED: self.assertTrue(item in filter) del(filter) stream.close() if __name__ == '__main__': # unittest.main() f = BloomFilter(capacity=10000, error_rate=0.001) for i in range_fn(0, f.capacity): f.add(i) print (0 in f)
# Micro-benchmark for BloomFilter: for several sizes X and error rates 1/p,
# print one row "X p add_ns hit_ns miss_ns", where the last three columns are
# the average nanoseconds per operation for adding X integers, looking up the
# X integers that were added, and looking up X integers that were not added.
import sys
import tempfile
import time

from pybloom.pybloom import BloomFilter

try:
    xrange  # Python 2
except NameError:
    xrange = range  # Python 3: range is already lazy

NS = 10 ** 9  # nanoseconds per second

# Use the high-resolution monotonic clock where the interpreter provides
# one; fall back to wall-clock time on older interpreters.
_clock = getattr(time, "perf_counter", time.time)


def _emit(value, end=" "):
    """Write one column of the current result row to stdout."""
    sys.stdout.write(str(value) + end)
    sys.stdout.flush()


for _p in xrange(1, 3):
    p = 10 ** _p
    for e in xrange(9):
        # Sizes grow by a factor of sqrt(10) per step, starting at 1000.
        X = int(1000 * 10 ** (e / 2.0))
        _emit(X)
        _emit(p)
        bloomfilter = BloomFilter(X + 1, 1.0 / p)

        # Insert X keys.
        t = _clock()
        for x in xrange(X):
            bloomfilter.add(x)
        _emit((_clock() - t) / X * NS)

        # Look up the keys that were inserted; the result is discarded on
        # purpose, only the lookup cost matters.
        t = _clock()
        for x in xrange(X):
            x in bloomfilter
        _emit((_clock() - t) / X * NS)

        # Look up keys that were never inserted.
        t = _clock()
        for x in xrange(X, 2 * X):
            x in bloomfilter
        _emit((_clock() - t) / X * NS, end="\n")
# Demo script: add one id to a Redis-backed BloomFilter unless the filter
# already reports it as present.
import csv

import redis

from pybloom.pybloom import BloomFilter

redis_client = redis.StrictRedis(host='192.168.192.12', port=6381, db=2)
filter_key = 'test'

# Remove any existing value under the key before building the filter
# (presumably so the run starts from an empty filter -- confirm).
redis_client.delete(filter_key)

# NOTE(review): the last two arguments look like capacity and error rate;
# confirm against this BloomFilter's constructor.
bf = BloomFilter(redis_client, filter_key, 100000000, 0.0000001)

# Named item_id rather than id so the builtin id() is not shadowed.
item_id = '1000'
if bf.contain(item_id):
    print("already in")
else:
    bf.add(item_id)
# Micro-benchmark for BloomFilter. For each error rate 1/p and size X it
# prints one row: X, p, then the average nanoseconds per operation for
# (1) adding X integers, (2) looking up those X integers, and (3) looking up
# X integers that were never added.
import sys
import tempfile
import time

from pybloom.pybloom import BloomFilter

try:
    xrange  # Python 2
except NameError:
    xrange = range  # Python 3: range is already lazy

NS = 10**9  # nanoseconds per second

# High-resolution monotonic clock when available, wall clock otherwise.
_now = getattr(time, "perf_counter", time.time)


def _write_column(value, last=False):
    """Write one space-separated column; end the row when ``last`` is true."""
    sys.stdout.write(str(value) + ("\n" if last else " "))
    sys.stdout.flush()


def _ns_per_op(started, count):
    """Return the average nanoseconds per operation since ``started``."""
    # The whole expression is the reported value. The original spelled it
    # "print(time.time() - t) / X * NS", which reads like a call divided by
    # X but was a Python 2 print statement of the full expression.
    return (_now() - started) / count * NS


for _p in xrange(1, 3):
    p = 10**_p
    for e in xrange(9):
        X = int(1000 * 10**(e / 2.0))
        _write_column(X)
        _write_column(p)
        bloomfilter = BloomFilter(X + 1, 1.0 / p)

        t = _now()
        for x in xrange(X):
            bloomfilter.add(x)
        _write_column(_ns_per_op(t, X))

        # Membership results are discarded on purpose; only timing matters.
        t = _now()
        for x in xrange(X):
            x in bloomfilter
        _write_column(_ns_per_op(t, X))

        t = _now()
        for x in xrange(X, 2 * X):
            x in bloomfilter
        _write_column(_ns_per_op(t, X), last=True)
def get_filter(shingles):
    """Return a BloomFilter containing every shingle as a space-joined string.

    :param shingles: iterable of shingles, each an iterable of strings.
    :return: a BloomFilter (capacity 10000, error rate 0.001) holding
        ``" ".join(shingle)`` for each shingle.
    """
    bloom = BloomFilter(capacity=10000, error_rate=0.001)
    joined_shingles = (" ".join(tokens) for tokens in shingles)
    for key in joined_shingles:
        bloom.add(key)
    return bloom
def test_nstar(self):
    """``nstar()`` must be within 10 of the number of keys added."""
    bloom_filter = BloomFilter(1000, 0.001)
    keys = [chr(code) for code in range_fn(0, 200)]
    for key in keys:
        bloom_filter.add(key)

    expected = len(keys)
    self.assertTrue(expected - 10 < bloom_filter.nstar() < expected + 10)
# Scratch script: fill a small BloomFilter with the integers 0..49 and print
# its bit array, its size in bits, and the hashes reported for one key.
from pybloom.pybloom import BloomFilter
import bitarray
import numpy as np
from sklearn import linear_model
from scipy.linalg import solve

f = BloomFilter(capacity=60, error_rate=0.30)
# add() is called only for its side effect, so use a plain loop instead of
# building a throwaway list with a comprehension.
for x in range(50):
    f.add(x)

# print(9 in f)
# print('num of key: '+str(len(f)))
print(f.bitarray)  # the final array
# f.add('10')
# print(f.bitarray)
# print(len(f.bitarray))
print(f.num_bits)  # the Bloom filter's size (length)

# a = bitarray.bitarray('0'*10)
# print(a)
print(f.keyhash(5))

# Disabled experiments, kept for reference:
# a = np.mat(np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 1, ]]).T)
# y = np.mat(np.array([1, 1, 2, 2])).T
#
# num = np.mat(np.array([1, 1, 2])).T