def testNumberOfSetBitsNeverDecreases(self):
    """Check that ``count(True)`` never goes down as values are inserted."""
    size = 10
    bloom = BloomFilter(size, size)
    previous = 0
    for value in range(size):
        bloom.insert(str(value))
        current = bloom.count(True)
        # Inserting may leave the count unchanged, but must not lower it.
        self.assertTrue(current >= previous)
        previous = current
def testFalseNegativeNeverHappens(self):
    """Check that every inserted value is reported as present by query()."""
    size = 10
    bloom = BloomFilter(size, size)
    seen = []
    for value in range(size):
        bloom.insert(str(value))
        seen.append(value)
    # NOTE(review): values are inserted as str but queried as int; this
    # presumably relies on query() normalising its argument -- confirm.
    for value in seen:
        self.assertTrue(bloom.query(value))
class WeatherSpider(scrapy.Spider):
    """Spider that scrapes weather details from weather.sina.com.cn pages.

    Every parsed page yields one ``WeatherItem`` followed by a request for
    each same-site link that the Bloom filter does not report as present.
    """

    name = "myweather"
    allowed_domains = ["sina.com.cn"]
    start_urls = ['http://weather.sina.com.cn/beijing']

    # Only links starting with this prefix are followed.
    _URL_PREFIX = 'http://weather.sina.com.cn/'

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and create the URL Bloom filter."""
        # Forward to scrapy.Spider so spider arguments and the base-class
        # attribute setup keep working.
        super(WeatherSpider, self).__init__(*args, **kwargs)
        # NOTE(review): presumably (error rate, capacity) -- confirm against
        # the BloomFilter constructor.
        self._bf = BloomFilter(0.0001, 100000)

    def parse(self, response):
        """Extract one weather item from ``response`` and follow its links.

        Fields that cannot be located on the page are left as empty strings
        instead of raising.
        """
        self._bf.insert_element(response.url)
        soup = BeautifulSoup(response.body, 'html.parser', from_encoding='utf-8')

        item = WeatherItem()
        item['ourl'] = response.url
        for field in ('city', 'temp', 'qihou', 'wind', 'shidu', 'wuran'):
            item[field] = ''

        # attrs must be a dict; a set literal {'class', 'x'} would be treated
        # as "class is either 'class' or 'x'".
        soup_city = soup.find(id='slider_ct_name')
        soup_temp = soup.find('div', {'class': 'slider_degree'})
        soup_xijie = soup.find('p', {'class': 'slider_detail'})
        soup_wuran = soup.find('div', {'class': 'slider_warn_i_tt'})
        if soup_city and soup_temp and soup_xijie and soup_wuran:
            item['city'] = soup_city.get_text()
            item['temp'] = soup_temp.get_text()

            wuran_text = soup_wuran.find('p')
            if wuran_text is not None:
                item['wuran'] = wuran_text.get_text()

            # The detail line is expected to hold three '|'-separated parts.
            details = [part.strip() for part in soup_xijie.get_text().split('|')]
            if len(details) >= 3:
                item['qihou'], item['wind'], shidu = details[:3]
                shidu_parts = shidu.split(u':')
                # Keep the text after the first colon when there is one.
                item['shidu'] = shidu_parts[1] if len(shidu_parts) > 1 else shidu

        urls = [anchor.get('href') for anchor in soup.find_all('a')]
        yield item
        # NOTE(review): URLs are only added to the filter once parsed, so a
        # link seen on several pages before being crawled is requested again.
        for url in self._cut_urls(urls):
            yield self.make_requests_from_url(url)

    def _cut_urls(self, urls):
        """Return the URLs under the weather site not yet in the Bloom filter.

        Entries that are ``None`` (anchors without an ``href``) are skipped.
        """
        # A plain prefix test replaces the regex, whose unescaped dots also
        # matched look-alike hosts.
        return [
            url for url in urls
            if url
            and url.startswith(self._URL_PREFIX)
            and not self._bf.is_element_exist(url)
        ]
def __init__(self, addrs, params=MAINNET, user_agent="/pyBitcoin:0.1/", max_connections=10):
    """Store the connection settings, create empty state and start connecting."""
    # Caller-supplied configuration.
    self.addrs, self.params = addrs, params
    self.user_agent, self.max_connections = user_agent, max_connections

    # Bookkeeping containers, all empty at start (three distinct dicts).
    self.peers = []
    self.inventory, self.pending_txs, self.subscriptions = {}, {}, {}

    rand32 = random.getrandbits(32)
    self.bloom_filter = BloomFilter(3, 0.01, rand32, BloomFilter.UPDATE_NONE)

    # Must come last: it reads the attributes assigned above.
    self.connect_to_peers()
def testInputIsFalsePositiveUntilInserted(self):
    """Check that a false positive stays positive until the value is inserted.

    Values 0..n-1 are inserted in order. After each insertion every value
    not yet inserted is queried: one that was already reported present must
    still be reported present, and newly reported ones are recorded.
    """
    n = 100
    # Floor division keeps the first argument an int on Python 3 as well.
    bf = BloomFilter(n // 2, n)
    fp = [False] * n
    fp_cnt = 0
    for i in range(n):
        bf.insert(str(i))
        # Values are inserted in order, so exactly i+1..n-1 are still
        # missing; only those can be false positives.
        for j in range(i + 1, n):
            # It was false positive before, it must continue
            # to be (since it was not inserted).
            if fp[j]:
                self.assertTrue(bf.query(j))
            # Update false positives list
            elif bf.query(j):
                fp[j] = True
                fp_cnt += 1
    # We're inserting more elements than the size of the array, so
    # there must be false positives.
    self.assertGreater(fp_cnt, 0)
def bloom_filter_run(n, m, k=None):
    """Insert ``n`` keys in random order and record false-positive rates.

    After each insertion, every key in ``range(n)`` that has not been
    inserted yet is queried; the fraction reported as present is appended
    to the result. Nothing is appended once no un-inserted keys remain.

    Args:
        n: number of keys; also passed as the second BloomFilter argument.
        m: passed as the first BloomFilter argument.
        k: passed as the third BloomFilter argument.

    Returns:
        List of floats, one per insertion that left keys un-inserted.
    """
    # list() is required: random.shuffle cannot shuffle a Python 3 range.
    keys = list(range(n))
    # Random sampling without replacement
    random.shuffle(keys)

    probs = []
    bloom = BloomFilter(m, n, k)
    inserted = [False] * n
    for entry in keys:
        bloom.insert(str(entry))
        inserted[entry] = True

        # Compute false positives among the keys still missing.
        pending = [probe for probe in range(n) if not inserted[probe]]
        if pending:
            false_positives = sum(1 for probe in pending if bloom.query(probe))
            probs.append(false_positives * 1.0 / len(pending))
    return probs
def test_repr(self):
    """repr() of a keyed filter shows the class name and the key."""
    expected = '<BloomFilter key=dilberts>'
    dilberts = BloomFilter(key='dilberts')
    assert repr(dilberts) == expected
def server_handler(self):
    """Accept one connection and process messages until the peer closes.

    Each message starts with a one-byte type tag that is compared against
    ``NetworkMsg`` values. The payload is either fixed-size or preceded by
    a 3-byte big-endian length. Payloads are stored on ``self`` and the
    byte counters are updated. The connection is always closed on exit.
    """
    self.sock.bind((self.ip, self.port))
    self.sock.listen(1)
    print("Starting server on %s : %d" % (self.ip, self.port))
    try:
        conn, addr = self.sock.accept()
    except OSError:
        # Socket failures, including timeouts, are OSError subclasses.
        print("Server never received a connection...closing")
        return
    print("Connection from %s" % (addr[0]))

    def recv_exact(size):
        """Read exactly ``size`` bytes; a single recv() may return fewer."""
        chunks = []
        remaining = size
        while remaining > 0:
            chunk = conn.recv(remaining)
            if not chunk:
                raise EOFError("expected %d more bytes" % remaining)
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks)

    def recv_sized():
        """Read a 3-byte big-endian length, then that many payload bytes."""
        length = int.from_bytes(recv_exact(3), 'big')
        return length, recv_exact(length)

    try:
        while True:
            data = conn.recv(1)
            if not data:
                break
            # First we're always going to get the NodeServerMsg data
            typ = int.from_bytes(data, 'big')
            if typ == NetworkMsg.INV.value:
                # Receive INV
                print('Receiving INV...')
                self.merkle_root = recv_exact(32)
                print("Received INV of size 32 bytes")
                self.total_received += 32
            elif typ == NetworkMsg.GET_GLBLK.value:
                # Receive GET_GLBLK
                print('Receiving GET_GLBLK...')
                self.other_txpool_size = int.from_bytes(recv_exact(3), 'big')
                print("Received GET_GRBLK of size 3 bytes")
                self.total_received += 3
            elif typ == NetworkMsg.GLBLK.value:
                # Receive TX Bloom
                print('Receiving GLBLK...')
                tx_bloom_len, data = recv_sized()
                tx_bloom = BloomFilter.deserialise(data)
                print("Received TX Bloom of size %d bytes" % tx_bloom_len)
                # Receive TX IBLT
                tx_iblt_len, data = recv_sized()
                tx_iblt = SIBLT.deserialise(data)
                print("Received TX IBLT of size %d bytes" % tx_iblt_len)
                self.total_received += tx_bloom_len + tx_iblt_len
                # Construct GLBLK
                self.tx_filters = [tx_bloom, tx_iblt]
            elif typ == NetworkMsg.GET_GLBLKDAT.value:
                # Receive GET_GLBLKDAT
                print('Receiving GET_GLBLKDAT...')
                pair_filter_len, data = recv_sized()
                # SECURITY: unpickling bytes received from the network can
                # execute arbitrary code; only safe with trusted peers.
                self.tx_missing_ids = cpickle.loads(data)
                print("Received GET_GLBLKDAT of size %d bytes" % pair_filter_len)
                self.total_received += pair_filter_len
            elif typ == NetworkMsg.GLBLKTX.value:
                # Receive GLBLKTX
                print('Receiving GLBLKTX...')
                tx_missing_length, data = recv_sized()
                # SECURITY: see the unpickling note above.
                self.tx_missing = cpickle.loads(data)
                print("Received GLBLKTX of size %d bytes" % tx_missing_length)
                self.total_received += tx_missing_length
            elif typ == NetworkMsg.GLBLKORD.value:
                # Receive TX
                print('Receiving GLBLKORD...')
                bloom_len, data = recv_sized()
                pair_bloom = BloomFilter.deserialise(data)
                print("Received Bloom of size %d bytes" % bloom_len)
                # Receive IBLT
                iblt_len, data = recv_sized()
                pair_iblt = SIBLT.deserialise(data)
                print("Received IBLT of size %d bytes" % iblt_len)
                self.total_received += bloom_len + iblt_len
                self.total_ord_received += bloom_len + iblt_len
                # Construct GLBLK
                self.pair_filters.append([pair_bloom, pair_iblt])
            elif typ == NetworkMsg.COMPLETE.value:
                # Receive reconciliation analytics
                print('Receiving reconciliation analytics...')
                # TODO: Implement
                self.complete = recv_exact(1)
                print('Received analytics of size %d bytes' % 1)
                self.total_received += 1
    except EOFError as exc:
        # The peer closed the connection in the middle of a message.
        print("Connection closed mid-message: %s" % exc)
    finally:
        conn.close()
def test_repr(self):
    """repr() echoes the keyword arguments the filter was built with."""
    bloom = BloomFilter(num_bits=10, num_hashers=5)
    rendered = repr(bloom)
    self.assertEqual(rendered, "BloomFilter(num_bits=10, num_hashers=5)")
def test():
    ''' basic testing functions

    Exercises an in-memory BloomFilter (add/check, export/load,
    intersection, union, hex export/import) and then a BloomFilterOnDisk,
    printing each result. Writes files under ``./dist``.
    '''
    blm = BloomFilter()
    blm.init(10, 0.05)
    blm.add("this is a test")
    print(blm.check("this is a test"))
    print(blm.check("blah"))
    print(blm)
    print(blm.bloom_array)
    blm.export('./dist/py_bloom.blm')

    print('\n\ncheck imported BloomFilter!')
    blm2 = BloomFilter()
    blm2.load('./dist/py_bloom.blm')
    print(blm2.check("this is a test"))
    print(blm2.check("blah"))
    print(blm2)
    print(blm2.bloom_array)

    blm2.add('yet another test')

    print("\n\ncheck intersection")
    blm3 = blm.intersection(blm2)
    print(blm3)
    print(blm3.check("this is a test"))
    print(blm3.check("yet another test"))

    print("\n\ncheck union")
    blm3 = blm.union(blm2)
    print(blm3)
    print(blm3.check("this is a test"))

    print('\n\ntest using `in`')
    print("this is a test" in blm3)
    print(blm3.check("yet another test"))
    print(blm3.estimate_elements())

    print(blm.jaccard_index(blm2))

    print ('\n\nexport to hex')
    hex_out = blm.export_hex()
    print(hex_out)
    print('import hex')
    blm4 = BloomFilter()
    blm4.load_hex(hex_out)
    print(blm4)

    # on disk code check
    print('\n\nbloom filter on disk')
    blmd = BloomFilterOnDisk()
    # try/finally so the on-disk filter is closed even if a step raises.
    try:
        blmd.initialize('./dist/py_ondisk.blm', 10, 0.05)
        blmd.add("this is a test")
        print(blmd.check('this is a test'))
        print('Check use of in keyword ("this is a test" in blmd): ', 'this is a test' in blmd)
        print(blmd.check('yet another test'))
        # blmd.union(blm4)
        # blmd.intersection(blm)
        # print(blmd.jaccard_index(blm2))
        print(blmd)

        # print ('\n\nexport to hex')
        # hex_out = blmd.export_hex()
        # print(hex_out)
    finally:
        blmd.close()
def test_update_gets_stored(self):
    """When we update() with elements, ensure that we Memcache the bit array."""
    new_members = {'dan', 'eric'}
    self.dilberts.update(new_members)
    # A second filter built on the same key must see the same bit array.
    reloaded = BloomFilter(key='dilberts')
    assert reloaded._bit_array == self.dilberts._bit_array
def test_init_gets_stored(self):
    """When we __init__() on an iterable, ensure we Memcache the bit array."""
    # A second filter built on the same key must see the same bit array.
    reloaded = BloomFilter(key='dilberts')
    assert reloaded._bit_array == self.dilberts._bit_array
class BitcoinClient(object):
    """Keeps connections to a set of peers, broadcasts transactions and
    watches addresses through a shared Bloom filter."""

    def __init__(self, addrs, params=MAINNET, user_agent="/pyBitcoin:0.1/", max_connections=10):
        """Store the settings, create empty state and start connecting.

        Args:
            addrs: list of address entries; item 0 and item 1 of each are
                passed to ``reactor.connectTCP``. The list is shuffled and
                consumed in place.
            params: passed to every PeerFactory and outgoing message.
            user_agent: passed to every PeerFactory.
            max_connections: upper bound on simultaneous peers.
        """
        self.addrs = addrs
        self.params = params
        self.user_agent = user_agent
        self.max_connections = max_connections
        self.peers = []
        self.inventory = {}
        self.pending_txs = {}
        self.subscriptions = {}
        self.bloom_filter = BloomFilter(3, 0.01, random.getrandbits(32), BloomFilter.UPDATE_NONE)
        self.connect_to_peers()

    def connect_to_peers(self):
        """Connect to new peers until ``max_connections`` is reached or the
        address list is exhausted."""
        needed = self.max_connections - len(self.peers)
        if needed <= 0:
            return
        shuffle(self.addrs)
        for _ in range(needed):
            if not self.addrs:
                break
            addr = self.addrs.pop(0)
            peer = PeerFactory(self.params, self.user_agent, self.inventory,
                               self.bloom_filter, self.on_peer_disconnected)
            reactor.connectTCP(addr[0], addr[1], peer)
            self.peers.append(peer)

    def on_peer_disconnected(self, peer):
        """Forget ``peer`` and try to replace it with a new connection."""
        self.peers.remove(peer)
        self.connect_to_peers()

    def broadcast_tx(self, tx):
        """
        Send the tx to half our peers, wait for half of the remainder to
        announce the tx before calling back True.

        Returns a Deferred that fires with True once enough peers have
        announced the tx, or with False after 10 seconds.
        """
        txhash = bitcoin.txhash(tx)
        # Floor division: the value is used as a slice index below, which
        # must be an int on Python 3.
        half = len(self.peers) // 2

        def on_peer_anncounce(txid):
            pending = self.pending_txs[txid]
            pending[0] += 1
            if pending[0] >= pending[1] // 2:
                if pending[3].active():
                    pending[3].cancel()
                # The deferred may already have fired (timeout, or an
                # earlier announcement); firing it twice would raise.
                if not pending[2].called:
                    pending[2].callback(True)

        d = defer.Deferred()
        self.inventory[txhash] = tx
        inv_packet = inv("TX", txhash)
        self.bloom_filter.insert(bitcoin.bin_txhash(tx))
        # Entry layout: [announce count, half the peer count, deferred, timeout call]
        self.pending_txs[txhash] = [0, half, d, reactor.callLater(10, d.callback, False)]
        for peer in self.peers[half:]:
            peer.protocol.update_filter()
            peer.protocol.add_inv_callback(txhash, on_peer_anncounce)
        for peer in self.peers[:half]:
            peer.protocol.send_message(message(inv_packet, self.params))
        return d

    def subscribe_address(self, address, callback):
        """
        Listen for transactions on an address. Since we can't validate the
        transaction, we will only callback if a majority of our peers relay
        it. If less than a majority relay it, we will have to wait for block
        inclusion to callback.
        """
        def on_peer_announce(tx):
            txhash = bitcoin.txhash(bitcoin.serialize(tx["tx"]))
            seen = self.subscriptions[address][0]
            if txhash not in seen:
                # Entry layout: [announce count or "complete", threshold]
                seen[txhash] = [1, len(self.peers) // 2]
            elif seen[txhash][0] != "complete":
                seen[txhash][0] += 1
                if seen[txhash][0] >= seen[txhash][1]:
                    seen[txhash][0] = "complete"
                    self.subscriptions[address][1](tx["tx"])

        self.subscriptions[address] = [{}, callback]
        self.bloom_filter.insert(unhexlify(bitcoin.b58check_to_hex(address)))
        for peer in self.peers:
            peer.protocol.add_inv_callback(bitcoin.b58check_to_hex(address), on_peer_announce)
            peer.protocol.update_filter()
def __init__(self):
    """Create the Bloom filter kept on ``self._bf``."""
    # NOTE(review): presumably (error rate, capacity) -- confirm against
    # the BloomFilter constructor.
    bloom_args = (0.0001, 100000)
    self._bf = BloomFilter(*bloom_args)
class RecentlyConsumedSimulationTests(unittest.TestCase):
    "Simulate reddit's recently consumed problem to test our Bloom filter."

    def setUp(self):
        """Build disjoint seen/unseen link sets and load the seen set into
        a freshly cleared Bloom filter."""
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this test case is subclassed.
        super(RecentlyConsumedSimulationTests, self).setUp()

        # Construct a set of links that the user has seen.
        self.seen_links = set()
        while len(self.seen_links) < 100:
            self.seen_links.add(self.random_fullname())

        # Construct a set of links that the user hasn't seen.  Ensure that
        # there's no intersection between the seen set and the unseen set.
        self.unseen_links = set()
        while len(self.unseen_links) < 100:
            fullname = self.random_fullname()
            if fullname not in self.seen_links:
                self.unseen_links.add(fullname)

        # Initialize the recently consumed Bloom filter on the seen set.
        self.recently_consumed = BloomFilter(
            num_values=1000,
            false_positives=0.001,
            key='recently-consumed',
        )
        self.recently_consumed.clear()
        self.recently_consumed.update(self.seen_links)

    def tearDown(self):
        """Delete the filter's key from its memcache client."""
        self.recently_consumed.memcache.delete(self.recently_consumed.key)
        super(RecentlyConsumedSimulationTests, self).tearDown()

    @staticmethod
    def random_fullname(prefix='t3_', size=6):
        """Return ``prefix`` followed by ``size`` random base-36 characters."""
        alphabet36 = string.digits + string.ascii_lowercase
        # range() works on both Python 2 and 3; xrange() is Python 2 only.
        return prefix + ''.join(random.choice(alphabet36) for _ in range(size))

    @staticmethod
    def round(number, sig_digits=1):
        '''Round a float to the specified number of significant digits.

        Reference implementation:
            https://github.com/ActiveState/code/blob/3b27230f418b714bc9a0f897cb8ea189c3515e99/recipes/Python/578114_Round_number_specified_number_significant/recipe-578114.py
        '''
        try:
            ndigits = sig_digits - 1 - int(math.floor(math.log10(abs(number))))
        except ValueError:
            # math.log10(number) raised a ValueError, so number must be 0.0.
            # No need to round 0.0.
            return number
        else:
            # Inside the method body this is the builtin round().
            return round(number, ndigits)

    def test_zero_false_negatives(self):
        'Ensure that we produce zero false negatives'
        for seen_link in self.seen_links:
            assert seen_link in self.recently_consumed

    def test_acceptable_false_positives(self):
        'Ensure that we produce false positives at an acceptable rate'
        acceptable = self.recently_consumed.false_positives
        hits = sum(
            unseen_link in self.recently_consumed
            for unseen_link in self.unseen_links
        )
        actual = hits / float(len(self.unseen_links))
        actual = self.round(actual, sig_digits=1)
        message = 'acceptable: {}; actual: {}'.format(acceptable, actual)
        assert actual <= acceptable, message
def test_item_in_filter(self):
    """A word that was added is reported as contained in the filter."""
    bloom = BloomFilter(10)
    word = "dog"
    bloom.add(word)
    self.assertIn(word, bloom)
def setUp(self):
    """Run the parent setUp, then build the 'dilberts' filter for the tests."""
    # NOTE(review): super(self.__class__, self) recurses forever if this
    # test case is ever subclassed; naming the class explicitly avoids it.
    super(self.__class__, self).setUp()
    initial_members = {'rajiv', 'raj'}
    self.dilberts = BloomFilter(initial_members, key='dilberts')
for x, c in counts.items(): cm.update(x, c) plot_stats(counts, cm) plt.show() def test_bloom(): # Generate N distinct values in the range [0, 100000] N = 50_000 s = set(np.random.choice(1_000_000, N, replace=False)) print(f"Num inserted values: {N}, min: {min(s)}, max: {max(s)}") desired_error_prob = [0.05, 0.1, 0.2] for p in desired_error_prob: M, d = BloomFilter.optimal_size(p, N) print(f"Desired FP rate: {100*p:.2f}%, size of Bloom filter: " f"{M} bits (with {d} hash functions)") bf = BloomFilter(M, d, s) test_set = np.random.choice(1_000_000, 100_000) fp, tn = 0, 0 for x in test_set: if x not in s: if x in bf: fp += 1 else: tn += 1 print( f"Estimated FP rate: {100*fp/(fp + tn):.2f}% (from {fp + tn} negative samples)"
def test_add_gets_stored(self):
    """When we add() an element, ensure that we Memcache the bit array."""
    self.dilberts.add('dan')
    # A second filter built on the same key must see the same bit array.
    reloaded = BloomFilter(key='dilberts')
    assert reloaded._bit_array == self.dilberts._bit_array
class BitcoinClient(object):
    """Keeps connections to a set of peers, broadcasts transactions and
    watches addresses through a shared Bloom filter."""

    def __init__(self, addrs, params=MAINNET, user_agent="/pyBitcoin:0.1/", max_connections=10):
        """Store the settings, create empty state and start connecting.

        Args:
            addrs: list of address entries; item 0 and item 1 of each are
                passed to ``reactor.connectTCP``. The list is shuffled and
                consumed in place.
            params: passed to every PeerFactory and outgoing message.
            user_agent: passed to every PeerFactory.
            max_connections: upper bound on simultaneous peers.
        """
        self.addrs = addrs
        self.params = params
        self.user_agent = user_agent
        self.max_connections = max_connections
        self.peers = []
        self.inventory = {}
        self.pending_txs = {}
        self.subscriptions = {}
        self.bloom_filter = BloomFilter(3, 0.01, random.getrandbits(32),
                                        BloomFilter.UPDATE_NONE)
        self.connect_to_peers()

    def connect_to_peers(self):
        """Connect to new peers until ``max_connections`` is reached or the
        address list is exhausted."""
        needed = self.max_connections - len(self.peers)
        if needed <= 0:
            return
        shuffle(self.addrs)
        for _ in range(needed):
            if not self.addrs:
                break
            addr = self.addrs.pop(0)
            peer = PeerFactory(self.params, self.user_agent, self.inventory,
                               self.bloom_filter, self.on_peer_disconnected)
            reactor.connectTCP(addr[0], addr[1], peer)
            self.peers.append(peer)

    def on_peer_disconnected(self, peer):
        """Forget ``peer`` and try to replace it with a new connection."""
        self.peers.remove(peer)
        self.connect_to_peers()

    def broadcast_tx(self, tx):
        """
        Send the tx to half our peers, wait for half of the remainder to
        announce the tx before calling back True.

        Returns a Deferred that fires with True once enough peers have
        announced the tx, or with False after 10 seconds.
        """
        txhash = bitcoin.txhash(tx)
        # Floor division: the value is used as a slice index below, which
        # must be an int on Python 3.
        half = len(self.peers) // 2

        def on_peer_anncounce(txid):
            pending = self.pending_txs[txid]
            pending[0] += 1
            if pending[0] >= pending[1] // 2:
                if pending[3].active():
                    pending[3].cancel()
                # The deferred may already have fired (timeout, or an
                # earlier announcement); firing it twice would raise.
                if not pending[2].called:
                    pending[2].callback(True)

        d = defer.Deferred()
        self.inventory[txhash] = tx
        inv_packet = inv("TX", txhash)
        self.bloom_filter.insert(bitcoin.bin_txhash(tx))
        # Entry layout: [announce count, half the peer count, deferred, timeout call]
        self.pending_txs[txhash] = [
            0, half, d, reactor.callLater(10, d.callback, False)
        ]
        for peer in self.peers[half:]:
            peer.protocol.update_filter()
            peer.protocol.add_inv_callback(txhash, on_peer_anncounce)
        for peer in self.peers[:half]:
            peer.protocol.send_message(message(inv_packet, self.params))
        return d

    def subscribe_address(self, address, callback):
        """
        Listen for transactions on an address. Since we can't validate the
        transaction, we will only callback if a majority of our peers relay
        it. If less than a majority relay it, we will have to wait for block
        inclusion to callback.
        """
        def on_peer_announce(tx):
            txhash = bitcoin.txhash(bitcoin.serialize(tx["tx"]))
            seen = self.subscriptions[address][0]
            if txhash not in seen:
                # Entry layout: [announce count or "complete", threshold]
                seen[txhash] = [1, len(self.peers) // 2]
            elif seen[txhash][0] != "complete":
                seen[txhash][0] += 1
                if seen[txhash][0] >= seen[txhash][1]:
                    seen[txhash][0] = "complete"
                    self.subscriptions[address][1](tx["tx"])

        self.subscriptions[address] = [{}, callback]
        self.bloom_filter.insert(unhexlify(bitcoin.b58check_to_hex(address)))
        for peer in self.peers:
            peer.protocol.add_inv_callback(bitcoin.b58check_to_hex(address),
                                           on_peer_announce)
            peer.protocol.update_filter()
def test_clear_gets_stored(self):
    """When we clear() all elements, ensure that we Memcache the bit array."""
    self.dilberts.clear()
    # A second filter built on the same key must see the same bit array.
    reloaded = BloomFilter(key='dilberts')
    assert reloaded._bit_array == self.dilberts._bit_array
return ''.join(random.choice(chars) for x in range(size)) def generateOther(): return generateValue(10, string.ascii_lowercase + string.digits) size = -(MAX_SIZE * log(ERROR_PROBABILITY)) / (log(2) * log(2)) count = (size / MAX_SIZE) * log(2) optimalSize = int(complex(size).real) optimalCount = int(complex(count).real) print optimalSize, optimalCount bloom = BloomFilter(optimalSize, optimalCount) falsePositive = False inserted = 0; falseValue = ""; while not falsePositive: for i in range(MAX_SIZE / 100): bloom.add(generateValue()) inserted += 1 print(bloom) for i in range(MAX_SIZE / 100): falseValue = generateOther() falsePositive = (falseValue in bloom)