import bloom


def test_bloom_filter():
    bf = bloom.BloomFilter(bits=20, hashes=3)
    bf.add('hello')
    assert bf.contains('hello'), 'BloomFilter failed to add item "hello"'
    assert not bf.contains('hi'), 'BloomFilter failed to deny item "hi"'
    bf.add('hi')
    assert bf.contains('hello'), 'BloomFilter failed to add item "hello"'
    assert bf.contains('hi'), 'BloomFilter failed to add item "hi"'
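# The test above assumes `bloom.BloomFilter(bits=..., hashes=...)` exposes
# add() and contains(). A minimal sketch of such a class, assuming
# Kirsch-Mitzenmacher double hashing to derive the bit positions (the real
# `bloom` module may do this differently):
import hashlib

class BloomFilter:
    def __init__(self, bits, hashes):
        self.bits = bits
        self.hashes = hashes
        self.array = [False] * bits

    def _positions(self, item):
        # Derive `hashes` bit positions from two 64-bit halves of a SHA-256
        # digest: index_i = (h1 + i * h2) mod bits.
        digest = hashlib.sha256(item.encode()).digest()
        h1 = int.from_bytes(digest[:8], 'big')
        h2 = int.from_bytes(digest[8:16], 'big')
        return [(h1 + i * h2) % self.bits for i in range(self.hashes)]

    def add(self, item):
        for pos in self._positions(item):
            self.array[pos] = True

    def contains(self, item):
        # May return a false positive, never a false negative.
        return all(self.array[pos] for pos in self._positions(item))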
import datetime

from sklearn import svm
from sklearn.metrics import confusion_matrix

import bloom

# X, Y (keys and membership labels) and fpr_b (target false-positive rate)
# are defined earlier in the script.
h = .02  # step size in the mesh

# We create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors.
C = 0.2  # SVM regularization parameter
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, Y)
print('RBF: ')
start = datetime.datetime.now()
y_pred = rbf_svc.predict(X)
end = datetime.datetime.now()

print('========== Learned Bloom filter result =============')
print("Learned Bloom average predict time: ", (end - start))
y_label = [int(i) for i in Y]
conf_matrix = confusion_matrix(y_label, y_pred)
print(conf_matrix)
# print(classification_report(y_label, y_pred))

print('========== Traditional Bloom filter result =========')
bf = bloom.BloomFilter(len(X), fpr_b)  # avoid shadowing the `bloom` module
for i in range(len(X)):
    if Y[i] == 1:
        bf.add(X[i][0])
start = datetime.datetime.now()
y_bloom = [bf.check(x[0]) for x in X]
end = datetime.datetime.now()
print(bf.size)
print("Traditional Bloom average predict time: ", (end - start))
print(confusion_matrix(y_label, y_bloom))
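# The comparison above relies on `X`, `Y`, and `fpr_b` defined earlier in the
# script. A hypothetical stand-in (names taken from the snippet, values
# invented) that would let it run end to end if placed before the SVM fit:
import numpy as np

rng = np.random.default_rng(0)
n = 1000
X = rng.uniform(0, 1, size=(n, 1))   # one feature per key
Y = (X[:, 0] > 0.5).astype(int)      # 1 = key belongs to the set
fpr_b = 0.01                         # target false-positive rate for the traditional filter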
#!/usr/bin/env python
# Author Dario Clavijo 2017
# GPLv3
# Used for checking Have I Been Pwned passwords against a bloom filter
import bloom
import sys
import hashlib

# bf = bloom.BloomFilter(array_size=(1024**3) * 8, do_hashing=True)
bf = bloom.BloomFilter(filename=sys.argv[1],
                       array_size=(1024**2) * 512,
                       do_hashing=False,
                       slice_bits=120,
                       slices=7,
                       ishex=True)
# hashlib needs bytes, so encode the password argument before hashing
print(bf.check(hashlib.sha1(sys.argv[2].encode()).hexdigest()))
#!/usr/bin/env python
# Author Dario Clavijo 2017
# GPLv3
import bloom
import sys

SIZEMB = int(sys.argv[1])
bf = bloom.BloomFilter(array_size=(1024**2) * SIZEMB,
                       do_hashing=False,
                       slice_bits=120,
                       slices=7,
                       ishex=True)
new = 0
seen = 0
with open(sys.argv[2], 'r') as fp:
    for line in fp:
        try:
            # h = str(int(line.rstrip(), 16)).encode('utf8')
            h = line.rstrip()
        except ValueError:
            h = None
        if h is not None:
            if not bf.update(h):
                new += 1
            else:
                seen += 1
print("new:%d seen:%d" % (new, seen))
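# Usage sketch (script name hypothetical):
#   python dedup_hashes.py 512 digests.txt
# allocates a 512 MB filter, then reports how many lines of digests.txt were
# new versus already seen in the filter.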
import json
import random
import ssl

import bloom
import jsonpath
import pymysql.cursors
import redis

# ===== Connect to the database
conn = pymysql.connect(host="127.0.0.1",
                       user="******",
                       passwd="root",
                       db="zhihu",
                       charset='utf8',
                       use_unicode=True)
cursor = conn.cursor()
bf = bloom.BloomFilter(0.001, 100000000)
ssl._create_default_https_context = ssl._create_unverified_context

# ===== Set up request headers
ua = [
    'User-Agent:Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TheWorld)',
    'User-Agent:Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
]
thisua = random.choice(ua)
headers = {"User-Agent": thisua}
headers1 = {
    'Cache-Control': 'max-age=0',
    'User-Agent': random.choice(ua),
import sys

import bloom

filename = sys.argv[1]
try:
    Gigs = int(sys.argv[2])
except ValueError:
    print("Please input the correct number of Gigabytes of RAM to be used.")
    sys.exit(1)
if Gigs > 0:
    bf = bloom.BloomFilter(array_size=Gigs * (1024**3),
                           do_bkp=False,
                           do_hashing=False,
                           fast=False)
    bf.save(filename)
#!/usr/bin/env python
# Author Dario Clavijo 2017
# GPLv3
import bloom
import sys

bf = bloom.BloomFilter()
with open(sys.argv[1], 'r') as fp:
    for line in fp:
        bf.add(line.rstrip())
bf.save(sys.argv[2])
import sys

import bloom

try:
    array_size = int(sys.argv[2])
except (IndexError, ValueError):
    array_size = (1024**3) * 5
bf = bloom.BloomFilter(array_size=array_size,
                       do_bkp=False,
                       do_hashing=False,
                       bitshuffle=False)
bf.filename = sys.argv[1]
bf.save()
# `sample`, `filters`, `pages`, `error_rate`, the stemmer `p`, and
# remove_common_words() are defined elsewhere in the original script.
from lxml import html
import re
import struct

import bloom

with open(sample, 'r') as sample_fh:
    content = sample_fh.read()

# Get text from HTML content
words = html.fromstring(content).text_content().replace("\n", "")
words = re.findall(r"[\w]+", words)

# Remove all punctuation etc., convert words to lower case, and delete
# duplicates
words = list(set([word.lower() for word in words]))

# Remove common words
words = remove_common_words(words)

# Stemming to reduce the number of words
words = list(set([p.stem(word, 0, len(word) - 1) for word in words]))

tmp_filter = bloom.BloomFilter(capacity=len(words), error_rate=error_rate)
for word in words:
    tmp_filter.add(word)
filters.append(tmp_filter.buckets)
pages.append({"title": re.search(r"@title=(.*)\n", content).group(1),
              "url": sample[3:]})

# First Int32 is the length
filters_to_write = struct.pack("<i", len(filters))
# Then comes the length of each filter
for i in filters:
    filters_to_write += struct.pack("<i", len(i))
# Finally come the filters themselves
for i in filters:
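# The layout above is length-prefixed: one little-endian Int32 for the filter
# count, one Int32 per filter length, then the filter bodies. The write loop
# is cut off in the snippet, so assuming each body is written as `length` raw
# bytes, a matching reader might look like:
import struct

def read_filters(path):
    with open(path, 'rb') as fh:
        (count,) = struct.unpack("<i", fh.read(4))
        lengths = [struct.unpack("<i", fh.read(4))[0] for _ in range(count)]
        return [fh.read(length) for length in lengths]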