def test_functionality(self):
    bf = inbloom.Filter(20, 0.01)
    keys = [
        "foo",
        "bar",
        "foosdfsdfs",
        "fossdfsdfo",
        "foasdfasdfasdfasdfo",
        "foasdfasdfasdasdfasdfasdfasdfasdfo",
    ]
    faux = ["goo", "gar", "gaz"]

    for k in keys:
        bf.add(k)
    for k in keys:
        assert bf.contains(k)
    for k in faux:
        assert not bf.contains(k)

    expected = '02000C0300C2246913049E040002002000017614002B0002'
    actual = hexlify(bf.buffer()).upper()
    assert expected == actual
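# ---
# Aside (not part of the test above): the 24-byte expected buffer is what the
# textbook bloom filter sizing formula m = -n * ln(p) / (ln 2)^2 predicts for
# n=20 entries and p=0.01. The formula is standard; whether inbloom uses it
# internally is an assumption here, but the numbers line up.
import math

n, p = 20, 0.01
bits = -n * math.log(p) / (math.log(2) ** 2)   # ~191.7
print(int(math.ceil(bits)))                    # 192 bits
print(int(math.ceil(bits)) // 8)               # 24 bytes == len(expected) / 2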
def test_dump_load(self):
    bf = inbloom.Filter(20, 0.01)
    bf.add('abc')
    expected = '620d006400000014000000000020001000080000000000002000100008000400'
    actual = hexlify(inbloom.dump(bf))
    assert expected == actual

    bf = inbloom.load(inbloom.dump(bf))
    actual = hexlify(inbloom.dump(bf))
    assert expected == actual

    data = inbloom.dump(bf)
    data = str([0xff, 0xff]) + data[2:]
    with self.assertRaisesRegexp(inbloom.error, "checksum mismatch"):
        inbloom.load(data)

    data = data[:4]
    with self.assertRaisesRegexp(inbloom.error, "incomplete payload"):
        inbloom.load(data)
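# ---
# Sketch (an assumption, not taken from inbloom's documented API): the expected
# dump above looks like a fixed header followed by the raw filter buffer --
# a 2-byte checksum, a 2-byte inverse error rate, and a 4-byte entry count,
# all big-endian. Unpacking the hex string from the test supports that reading.
import struct
from binascii import unhexlify

payload = unhexlify('620d006400000014000000000020001000080000000000002000100008000400')
checksum, inv_error, entries = struct.unpack('>HHI', payload[:8])
print(hex(checksum))     # stored checksum of the filter data
print(inv_error)         # 100 -> error rate 1/100 = 0.01
print(entries)           # 20, matching Filter(20, 0.01)
print(len(payload[8:]))  # remaining bytes are the filter buffer itself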
def build_filters():
    logging.info("set up our temp DB")
    set_up_temp_db()

    logging.info("Figuring out all of the phrases we have in our corpus")
    deals_count = 0
    for deal in load_recent_deals():
        deals_count += 1
        try:
            phrases = get_all_phrases_for(deal)
            for phrase in phrases:
                if len(phrase) < MIN_PHRASE_LENGTH:
                    continue
                phrase_rowid = update_count_for(phrase)
                save_deal_to_phrase_link(deal.deal_id, phrase_rowid)
        except Exception as e:
            logging.exception(e)
        if deals_count % 50 == 0:
            logging.info("Processed %d deals so far" % deals_count)

    logging.info("There were %d deals" % deals_count)
    total_phrase_count = load_total_phrase_count()
    logging.info("There were %d phrases - K ceiling is %d" % (total_phrase_count, UP_TO_K_MOST_FREQUENT_PHRASES))

    logging.info("Building bloom filter")
    bloom_filter = inbloom.Filter(entries=UP_TO_K_MOST_FREQUENT_PHRASES, error=0.0001)
    for phrase, frequency in load_up_to_k_phrases_with_frequencies(UP_TO_K_MOST_FREQUENT_PHRASES):
        logging.debug("Loaded phrase: %s, which had frequency %d" % (phrase, frequency))
        bloom_filter.add(phrase)

    logging.info("Bloom filter and sketch built OK")
    return bloom_filter, None
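# ---
# Hypothetical downstream use of the filter returned by build_filters() (the
# phrase below is made up). Membership tests are probabilistic: false positives
# occur at roughly the configured error rate (0.0001 here), false negatives
# never do.
bloom_filter, _ = build_filters()
if bloom_filter.contains("example phrase"):
    print("phrase is probably among the K most frequent phrases")
else:
    print("phrase is definitely not among them")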
def __init__(self, bloom_capacity, error_rate):
    self.bloom_capacity = bloom_capacity
    self.error_rate = error_rate
    self.bf = inbloom.Filter(entries=bloom_capacity, error=error_rate)
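# ---
# Hypothetical instantiation of the wrapper this constructor belongs to;
# 'BloomHolder' is a placeholder name, not the real class name.
holder = BloomHolder(bloom_capacity=100000, error_rate=0.001)
holder.bf.add('some-key')
print(holder.bf.contains('some-key'))   # True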
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# ******************************************************
# DESC    :
# AUTHOR  : Alex Stocks
# VERSION : 1.0
# LICENCE : Apache License 2.0
# EMAIL   : [email protected]
# MOD     : 2017-03-03 17:22
# FILE    : bloom.py
# ******************************************************

import inbloom
import base64
import requests

# Basic usage
uids = [
    'u0667601477730140020006032', 'u0824011478254848030001156',
    'u1768771480923934030001199', 'u2413521474991179020001113',
    'u3686801478240154030005408', 'u3757851481174023020001166',
    'u3795351482317225020001244', 'u4555881480912962010001166',
    'u4710551480917824010001244', 'u4881041468996697020001184',
    'u4881041474530826010001095', 'u4905391484641165010001141',
    'u5425051481012754020001255', 'u6450591484584368010001166',
    'u6937721484632783020001218', 'u7414411480402599030001267',
    'u7675311485144153020001211', 'u9587291477909978030001277',
    'u9805521470914539020001149',
]

# build a filter sized for the uid list, then serialize it for transport
bf = inbloom.Filter(entries=len(uids), error=0.001)
for uid in uids:
    bf.add(uid)

res = base64.b64encode(inbloom.dump(bf))
# yoID6AAAABMNJALf42lALcpcu2WH9sZcLPWh/g+ynjcSVaWfDxVuudTRAA==
print len(bf.buffer())
print res, len(res)

# round-trip: decode the dump and query the reconstructed filter
bf = inbloom.load(base64.b64decode(res))
print bf.contains(uids[0])
print bf.contains('u0')
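# ---
# The otherwise unused 'requests' import suggests the encoded filter is shipped
# to a service over HTTP. A minimal, hypothetical transport step -- the URL and
# form field name below are placeholders, not part of the original script:
resp = requests.post('http://example.com/bloom/upload', data={'filter': res})
print(resp.status_code)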