def test_raises_error_on_out_of_range_error_rate(self):
    '''BloomFilter() raises on out-of-range error rate'''
    # Error rate must lie in (0, 1); both sides of the range are rejected.
    for bad_rate in (-1, 2):
        with self.assertRaises(ValueError):
            BloomFilter(5, bad_rate)
def test_creates_filter_with_non_integral_capacity(self):
    '''BloomFilter() creates filter with non-integral capacity'''
    fractional = BloomFilter(capacity=1000.2, error_rate=1e-3)
    integral = BloomFilter(capacity=1000, error_rate=1e-3)
    baseline_bits = integral.bit_count
    # A slightly larger fractional capacity may only grow the bit array
    # marginally, and must not change the number of hash functions.
    self.assertGreaterEqual(fractional.bit_count, baseline_bits)
    self.assertLess(fractional.bit_count, baseline_bits + 10)
    self.assertEqual(integral.hash_count, fractional.hash_count)
def _build_guided_bloom(prefixes, fpp, k, num_bits, root, fib, protocol='v4'):
    '''Return a Bloom filter optimized for the `root` bin search tree.

    Walks every (prefix, length) pair in ``prefixes['prefixes']`` through
    the binary search tree ``root``, inserting encoded prefix/length pairs
    into the filter; on right turns it additionally inserts "pointer"
    hashes derived from the best-matching-prefix index ``bmp``.

    Returns the tuple ``(bf, root)``.
    '''
    max_shift = NUMBITS[protocol]
    if not (k or num_bits):
        bf = BloomFilter(fpp, len(prefixes['prefixes']))
    else:
        bf = BloomFilter(fpp, len(prefixes['prefixes']), k=k, num_bits=num_bits)
    count = 0  # report progress
    for pair in prefixes['prefixes']:
        if count % 10000 == 0:
            # Fixed typo in the progress message ("processsed").
            print('build processed %.3f of all prefixes'
                  % (count / len(prefixes['prefixes'])))
        count += 1
        prefix, preflen = pair
        # BMP is an index; the prefix length is recoverable via prefixes['ix2len']
        bmp, fib_val = _find_bmp(prefix, bf, root, fib, preflen - 1,
                                 prefixes['minn'], prefixes['len2ix'],
                                 prefixes['ix2len'], protocol=protocol)
        current = root
        count_hit = 0
        while current:
            if preflen < current.val:
                current = current.left
            elif preflen == current.val:
                # Exact-length node: insert using hash_1..hash_k and stop.
                pref_encoded = encode_ip_prefix_pair(prefix, preflen, protocol)
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(0, end=bf.k))
                break
            else:  # preflen > current.val
                # Mask the prefix down to the node's length before encoding.
                masked = (((1 << max_shift) - 1) << (max_shift - current.val)) & prefix
                pref_encoded = encode_ip_prefix_pair(masked, current.val, protocol)
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(0, end=1))
                count_hit += 1
                # Insert pointer hashes encoding the BMP pattern.
                bf.insert(pref_encoded,
                          hashes=_choose_hash_funcs(count_hit, pattern=bmp))
                current = current.right
    return bf, root
def test_returns_positive_when_hashes_collide(self):
    '''BloomFilter.test_by_hash() returns True when hashes collide'''
    filt = BloomFilter(1000000, 1e-3)
    filt.add_by_hash('abc')
    # The unicode literal hashes identically to the str it was added as.
    self.assertEqual(filt.test_by_hash(u'abc'), True)
def test_returns_true_positive_when_value_had_been_added(self):
    '''BloomFilter.test_by_hash() returns True after the item added'''
    filt = BloomFilter(1000000, 1e-3)
    filt.add_by_hash('abc')
    # Membership query for an item that was just inserted must succeed.
    self.assertEqual(filt.test_by_hash('abc'), True)
def rappor(n, f, p, q, m):
    '''Apply RAPPOR-style randomization to the bloom encoding of `n`.

    f: permanent-response flip probability (each direction f/2);
    p, q: instantaneous-response report probabilities; m: encoding size.
    NOTE(review): relies on a module-level `totalnum` for the random
    draws — confirm it is defined where this function is used.
    '''
    encoded = BloomFilter().add_data(str(n), m)
    # Permanent randomized response: each bit flips with probability f/2.
    for idx in range(len(encoded)):
        draw = np.random.randint(0, totalnum)
        if encoded[idx] == 1:
            if draw / totalnum <= f / 2:
                encoded[idx] = 0
        elif draw / totalnum <= f / 2:
            encoded[idx] = 1
    # Instantaneous randomized response: a set bit survives with
    # probability q; an unset bit is reported as 1 with probability p.
    for idx in range(len(encoded)):
        draw = np.random.randint(0, totalnum)
        if encoded[idx] == 1:
            if draw / totalnum <= 1 - q:
                encoded[idx] = 0
        elif draw / totalnum <= p:
            encoded[idx] = 1
    return encoded
def build_bf(n, p, ref_fasta):
    '''Build a Bloom filter over the reads in FASTA-like file `ref_fasta`.

    n: expected number of items; p: target false-positive probability.
    The file is consumed in id/read line pairs; reads containing an
    uncalled base ('N') are counted but not added.

    Returns the populated BloomFilter; prints filter stats and counts.
    '''
    # call bloom filter class and output stats
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))
    N_count = 0
    read_count = 0
    # `with` guarantees the handle is closed even if parsing raises;
    # the original leaked the file on any mid-loop exception.
    with open(ref_fasta, 'r') as mycoplasma_fasta:
        while True:
            name = mycoplasma_fasta.readline()  # read id
            if len(name) == 0:
                break  # end of file
            read = mycoplasma_fasta.readline().strip()
            if 'N' not in read:  # do not add any uncalled bases
                bloomf.add(read)
                read_count += 1
            else:
                N_count += 1
    print('N_count = %s' % N_count)
    print('read_count = %s' % read_count)
    return bloomf
def test_dumps(self):
    '''dumps()/loads() round-trips parameters, data, and serialization.'''
    original = BloomFilter(300, 0.0001, MURMUR128_MITZ_32)
    for value in range(100):
        original.put(value)
    payload = original.dumps()
    restored = BloomFilter.loads(payload)
    self.assertEqual(
        restored.num_hash_functions,
        original.num_hash_functions,
        "New filter's num of hash functions is expected to be the same as old filter's",
    )
    self.assertEqual(
        restored.strategy,
        original.strategy,
        "New filter's strategy is expected to be the same as old filter's",
    )
    self.assertEqual(
        restored.data,
        original.data,
        "New filter's data is expected to be the same as old filter's",
    )
    self.assertEqual(
        restored.dumps(),
        payload,
        "New filter's dump is expected to be the same as old filter's",
    )
def test_basic_functionality(self):
    '''put()/might_contain()/__contains__ work for ints and strings.'''
    bloom_filter = BloomFilter(10000000, 0.001)
    for i in range(200):
        bloom_filter.put(i)
    for i in range(200):
        self.assertTrue(
            bloom_filter.might_contain(i),
            f"Number {i} is expected to be in bloomfilter",
        )
    # With capacity 10M and fpp 0.001, false positives here are unlikely.
    for i in range(200, 500):
        self.assertFalse(
            bloom_filter.might_contain(i),
            f"Number {i} is NOT expected to be in bloomfilter",
        )
    words = ["hello", "world", "bloom", "filter"]
    for word in words:
        bloom_filter.put(word)
    for word in words:
        self.assertTrue(word in bloom_filter, f"Word '{word}' is expected to be in bloomfilter")
    # Fixed assertion message: this asserts ABSENCE, but the old text
    # claimed the word was expected to be present.
    self.assertFalse(
        "not_exist" in bloom_filter,
        "Word 'not_exist' is NOT expected to be in bloomfilter",
    )
def test_all_test_positive_when_hashes_collide(self):
    """BloomFilter.test_by_hash() returns False for an item never added."""
    filt = BloomFilter(1000000, 1e-3)
    filt.add_by_hash("abc")
    # "def" was never inserted, so the lookup must come back negative.
    self.assertEqual(filt.test_by_hash("def"), False)
def test_returns_positive_when_hashes_collide(self):
    """BloomFilter.test_by_hash() returns True when hashes collide."""
    filt = BloomFilter(1000000, 1e-3)
    filt.add_by_hash("abc")
    # The unicode spelling hashes to the same slots as the added str.
    self.assertEqual(filt.test_by_hash(u"abc"), True)
def test_all_test_positive_when_hashes_collide(self):
    '''BloomFilter.test_by_hash() returns False for an item never added'''
    filt = BloomFilter(1000000, 1e-3)
    filt.add_by_hash('abc')
    # 'def' was never inserted, so the lookup must come back negative.
    self.assertEqual(filt.test_by_hash('def'), False)
def _add_bloom(self):
    '''Append a fresh filter with a tightened error rate and make it current.'''
    # Each successive filter tightens the error budget geometrically.
    tightened_error = (self.base_error
                       * self.error_tightening_ratio ** len(self.bloom_filters))
    fresh_bloom = BloomFilter(self.capacity, tightened_error)
    self.bloom_filters.append(fresh_bloom)
    self.current_bloom = fresh_bloom
    return fresh_bloom
def make_checker(word_file='notes/words.txt', force_lower=True):
    '''Return a membership checker for correctly spelled words.

    Loads a cached BloomFilter from 'words.pickle' when available,
    otherwise builds one from `word_file` and writes the cache.

    >>> 'army' in make_checker()
    True
    >>> 'ahmee' in make_checker()
    False
    '''
    # Pickle data is binary: the cache must be opened in 'rb'/'wb' mode.
    # The original opened in text mode, which breaks under Python 3.
    try:
        with open('words.pickle', 'rb') as cache_file:
            return pickle.load(cache_file)
    except IOError:
        pass  # no cache yet — fall through and build the filter
    with open(word_file) as f:
        s = f.read()
    if force_lower:
        s = s.lower()
    bf = BloomFilter(s.split(), population=4000000, probes=12)
    with open('words.pickle', 'wb') as cache_file:
        pickle.dump(bf, cache_file)
    return bf
def test_words(self):
    '''Ensure that strings work well'''
    vocabulary = self.load_words('words')
    test_words = self.load_words('testwords')
    filt = BloomFilter(100000, 1e-4)
    known_both = set(vocabulary) & set(test_words)
    # Populate the filter, counting collisions seen during insertion.
    setup_collisions = 0
    for word in vocabulary:
        if filt.test_by_hash(word):
            setup_collisions += 1
        else:
            filt.add_by_hash(word)
    self.assertLess(setup_collisions, 5)
    # Query the test set and tally both error kinds.
    false_positives = 0
    false_negatives = 0
    for word in test_words:
        hit = filt.test_by_hash(word)
        if word in known_both:
            if not hit:
                false_negatives += 1
        elif hit:
            false_positives += 1
    # A bloom filter never yields false negatives; positives stay bounded.
    self.assertEqual(false_negatives, 0)
    self.assertLessEqual(false_positives, 6)
class newsSpider(scrapy.Spider):
    '''Crawl BBC News, yielding paragraph text and following a few links.

    A Bloom filter de-duplicates already-followed URLs; a shared class
    counter bounds how many pages the spider visits in total.
    '''
    name = "news"
    start_urls = [
        'http://www.bbc.com/news',
    ]
    count = 0
    n = 2000  # Number of bits
    p = 0.15  # falseProbabilityRate
    # Derive the filter from the declared constants instead of repeating
    # the literals 2000/0.15, so changes to n/p cannot drift out of sync.
    bloomf = BloomFilter(n, p)

    def parse(self, response, count=1):
        mydiv = response.xpath('//div')
        for p in mydiv.xpath('.//p/text()').extract():
            # Strip mis-encoded characters left over from the page encoding.
            p = p.replace(u"Â", u"").replace(u"â", u"")
            # Skip boilerplate/navigation paragraphs.
            if 'Email' in p or 'MMS' in p or 'Follow' in p or 'stories' in p or 'news' in p or 'world' in p:
                continue
            yield {'text': p}
        newsSpider.count = newsSpider.count + 1
        if newsSpider.count <= 5:
            URLlist = response.css('div a::attr("href")').extract()
            for next_page in URLlist:
                if self.bloomf.check(next_page):
                    continue  # already visited this URL
                self.bloomf.add(next_page)
                newsSpider.count = newsSpider.count + 1
                if newsSpider.count >= 5:
                    break
                yield response.follow(next_page, self.parse)
def test_exercise_2(self):
    # Integration test: load a bloom filter onto a testnet node, then
    # request a merkle-filtered block and check the matching transaction
    # serializes to the expected bytes. Requires network access.
    block_hash = bytes.fromhex(
        '0000000053787814ed9dd8c029d0a0a9af4ab8ec0591dc31bdc4ab31fae88ce9')
    passphrase = b'Jimmy Song Programming Blockchain'  # FILL THIS IN
    # Derive a deterministic testnet keypair/address from the passphrase.
    secret = little_endian_to_int(hash256(passphrase))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    filter_size = 30
    filter_num_functions = 5
    filter_tweak = 90210  # FILL THIS IN
    # Watch for the address's hash160 via the bloom filter.
    h160 = decode_base58(addr)
    bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
    bf.add(h160)
    node = SimpleNode('tbtc.programmingblockchain.com',
                      testnet=True, logging=False)
    node.handshake()
    node.send(bf.filterload())
    # Ask for the filtered form of the known block.
    getdata = GetDataMessage()
    getdata.add_data(FILTERED_BLOCK_DATA_TYPE, block_hash)
    node.send(getdata)
    mb = node.wait_for(MerkleBlock)
    tx = node.wait_for(Tx)
    self.assertEqual(
        tx.serialize().hex(),
        '0100000002a663815ab2b2ba5f53e442f9a2ea6cc11bbcd98fb1585e48a134bd870dbfbd6a000000006a47304402202151107dc2367cf5a9e2429cde0641c252374501214ce52069fbca1320180aa602201a43b5d4f91e48514c00c01521dc04f02c57f15305adc4eaad01c418f6e7a1180121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff618b00a343488bd62751cf21f368ce3be76e3a0323fdc594a0d24f27a1155cd2000000006a473044022024c4dd043ab8637c019528b549e0b10333b2dfa83e7ca66776e401ad3fc31b6702207d4d1d73ac8940c59c57c0b7daf084953324154811c10d06d0563947a88f99b20121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff0280969800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788aca0ce6594000000001976a9146e13971913b9aa89659a9f53d327baa8826f2d7588ac00000000'
    )
def _sync(self):
    '''Rebuild the bloom filter by scanning every key in the on-disk file.'''
    self.bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
    with kv_reader(self.path) as reader:
        while reader.has_next():
            self.bf.add(reader.read_key())
            reader.skip_value()  # values are not needed for the key filter
def main():
    '''Demonstrate bloom-filter membership tests on sample word lists.'''
    expected_items = 20
    target_fp_rate = 0.1
    bloom = BloomFilter(expected_items, target_fp_rate)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]
    word_absent = [
        'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism', 'hurt',
        'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter'
    ]
    print('bloomfilter size: ', bloom.bit_size)
    print('false_positive_probability', bloom.false_positive_probability)
    print('hash_count: ', bloom.hash_count)
    for item in word_present:
        bloom.add(item)
    # Mix five present and five absent words in random order.
    shuffle(word_present)
    shuffle(word_absent)
    random_list = word_present[:5] + word_absent[:5]
    shuffle(random_list)
    for word in random_list:
        print('word: ', word)
        if not bloom.check(word):
            print('word not present')
        elif word in word_absent:
            print('false positive')
        else:
            print('word most likely member')
def create(cls, path, memtable):
    '''Persist `memtable` to `path`, building a bloom filter over its keys.'''
    bloom = BloomFilter(BF_SIZE, BF_HASH_COUNT)
    with kv_writer(path) as writer:
        # Write each entry and record its key in the filter as we go.
        for key, value in memtable.entries():
            writer.write_entry(key, value)
            bloom.add(key)
    return cls(path, bloom)
def test_exercise_4(self):
    # Integration test: sync headers from a known block, fetch filtered
    # blocks matching our address, find a previous UTXO, and build/sign
    # a spending transaction. Requires network access.
    last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
    last_block = bytes.fromhex(last_block_hex)
    # Deterministic testnet keypair/address from the passphrase.
    secret = little_endian_to_int(
        hash256(b'Jimmy Song Programming Blockchain'))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    h160 = decode_base58(addr)
    target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    self.assertEqual(addr, target_address)
    filter_size = 30
    filter_num_functions = 5
    filter_tweak = 90210  # FILL THIS IN
    target_h160 = decode_base58(target_address)
    target_script = p2pkh_script(target_h160)
    fee = 5000  # fee in satoshis
    node = SimpleNode('tbtc.programmingblockchain.com',
                      testnet=True, logging=False)
    # Load the bloom filter watching our hash160, then request headers.
    bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
    bf.add(h160)
    node.handshake()
    node.send(b'filterload', bf.filterload())
    getheaders_message = GetHeadersMessage(start_block=last_block)
    node.send(getheaders_message.command, getheaders_message.serialize())
    headers_envelope = node.wait_for_commands([HeadersMessage.command])
    stream = headers_envelope.stream()
    headers = HeadersMessage.parse(stream)
    # Validate the chain of headers and queue each as a filtered block.
    get_data_message = GetDataMessage()
    for block in headers.blocks:
        self.assertTrue(block.check_pow())
        if last_block is not None:
            self.assertEqual(block.prev_block, last_block)
        last_block = block.hash()
        get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, last_block)
    node.send(get_data_message.command, get_data_message.serialize())
    # Scan matched transactions for an output paying our address.
    prev_tx = None
    while prev_tx is None:
        envelope = node.wait_for_commands([b'merkleblock', b'tx'])
        stream = envelope.stream()
        if envelope.command == b'merkleblock':
            mb = MerkleBlock.parse(stream)
            self.assertTrue(mb.is_valid())
        else:
            prev = Tx.parse(stream, testnet=True)
            for i, tx_out in enumerate(prev.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == addr:
                    prev_tx = prev.hash()
                    prev_index = i
                    prev_amount = tx_out.amount
                    break
    # Spend the found UTXO back to the target address, minus the fee.
    tx_in = TxIn(prev_tx, prev_index)
    output_amount = prev_amount - fee
    tx_out = TxOut(output_amount, target_script)
    tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
    tx_obj.sign_input(0, private_key)
    self.assertEqual(
        tx_obj.serialize().hex(),
        '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
    )
def __init__(self):
    '''Fetch ids from Cassandra and index their dates into a bloom filter.'''
    self.cc = ConnectToCassandra()
    self.n, self.word_present = self.cc.get_id()  # no of items to add
    self.p = 0.05  # false positive probability
    self.bloomf = BloomFilter(self.n, self.p)
    # Store each item's date as an integer-derived bytes key.
    for entry in self.word_present:
        self.bloomf.add(bytes(to_integer(entry.date())))
def test_byte_size_is_in_expected_range(self):
    '''BloomFilter.byte_size returns expected value'''
    filt = BloomFilter(1000000, 1e-3)
    observed = filt.byte_size
    # 14377640 bits with 10 hashes -> roughly 1.8 MB of storage.
    self.assertLess(1797208, observed)
    self.assertGreater(1800000, observed)
def rabinKarp(self, patterns, txt):
    '''Find all occurrences of the fixed-length `patterns` in `txt`.

    Uses Rabin-Karp rolling hashes, with a bloom filter over the
    pattern hashes as a fast pre-check before exact comparison.
    Returns a dict mapping each pattern to a list of match offsets.
    Raises ValueError on empty input, mixed pattern lengths, or a
    pattern longer than the text.
    '''
    if (not txt or not patterns):
        raise ValueError('Search requires text and a pattern')
    q = 101  # a prime number (hash modulus)
    d = 256  # alphabet size / hash base
    h = 1
    matches = dict()
    for p in patterns:
        matches[p] = []
    patternHashes = []
    patternLen = len(next(iter(patterns)))  # length of first pattern
    txtLen = len(txt)
    if (txtLen < patternLen):
        raise ValueError(
            'A pattern longer than text to search cannot exist in the text.'
        )
    # The value of h would be "pow(d, M-1)%q" — the weight of the
    # leading character, used when rolling the window forward.
    for i in range(patternLen - 1):
        h = (h * d) % q
    numPat = len(patterns)
    if (numPat < 1):
        raise ValueError('Search requires a pattern')
    # Hash each distinct pattern; all must share the same length.
    for pat in set(patterns):
        if (patternLen != len(pat)):
            raise ValueError(
                'Search only supports a fixed length pattern match.')
        patternHash = 0
        for i in range(patternLen):
            patternHash = (d * patternHash + ord(pat[i])) % q
        patternHashes.append(patternHash)
    bloomf = BloomFilter(patternHashes)
    # setup the first comparison based on length of pattern
    left = 0
    right = patternLen
    txtHash = 0
    for j in range(patternLen):
        txtHash = (d * txtHash + ord(txt[j])) % q
    # scoot through txt 1 char at a time
    while (right <= txtLen):
        # Bloom check first; exact substring comparison only on a hit
        # (guards against both hash and bloom false positives).
        if (bloomf.contains(txtHash)):
            if (txt[left:right] in patterns):
                matches[txt[left:right]].append(left)
        if (left + patternLen < txtLen):
            # Roll the hash: drop the leftmost char, append the next one.
            txtHash = (d * (txtHash - ord(txt[left]) * h) +
                       ord(txt[left + patternLen])) % q
        left += 1
        right += 1
    return matches
def test_exercise_6(self):
    # Integration test: sync headers, fetch filtered blocks matching our
    # address, locate the funding UTXO, and build/sign a spending
    # transaction. Requires network access.
    last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
    # Deterministic testnet keypair/address from the passphrase.
    secret = little_endian_to_int(
        hash256(b'Jimmy Song Programming Blockchain'))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    h160 = decode_base58(addr)
    target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    self.assertEqual(addr, target_address)
    target_h160 = decode_base58(target_address)
    target_script = p2pkh_script(target_h160)
    fee = 5000
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
    # Load a bloom filter watching our hash160, then request headers.
    bf = BloomFilter(30, 5, 90210)
    bf.add(h160)
    node.handshake()
    node.send(bf.filterload())
    start_block = bytes.fromhex(last_block_hex)
    getheaders = GetHeadersMessage(start_block=start_block)
    node.send(getheaders)
    headers = node.wait_for(HeadersMessage)
    last_block = None
    getdata = GetDataMessage()
    # Validate proof-of-work and linkage, queueing filtered blocks.
    for b in headers.blocks:
        if not b.check_pow():
            raise RuntimeError('proof of work is invalid')
        if last_block is not None and b.prev_block != last_block:
            raise RuntimeError('chain broken')
        getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
        last_block = b.hash()
    node.send(getdata)
    # Scan matched transactions for an output paying our address.
    prev_tx, prev_index, prev_tx_obj = None, None, None
    while prev_tx is None:
        message = node.wait_for(MerkleBlock, Tx)
        if message.command == b'merkleblock':
            if not message.is_valid():
                raise RuntimeError('invalid merkle proof')
        else:
            message.testnet = True
            for i, tx_out in enumerate(message.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == addr:
                    prev_tx = message.hash()
                    prev_index = i
                    prev_amount = tx_out.amount
                    self.assertEqual(
                        message.id(),
                        '6ec96c9eafc62c0e42a4a6965be614c61101aaa316162ac79e07e1b9ab31e694'
                    )
                    self.assertEqual(i, 0)
                    break
    # Spend the found UTXO back to the target address, minus the fee.
    tx_in = TxIn(prev_tx, prev_index)
    output_amount = prev_amount - fee
    tx_out = TxOut(output_amount, target_script)
    tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
    tx_obj.sign_input(0, private_key)
    self.assertEqual(
        tx_obj.serialize().hex(),
        '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
    )
def test_serializes_filter_serialize_without_line_feeds(self):
    '''BloomFilter serializes with base64 shield without line feeds'''
    filt = BloomFilter(100, 0.1)
    filt.add_by_hash('abcdef')
    payload = filt.serialize()
    # find() == -1 means no '\n' occurs anywhere in the payload.
    self.assertEqual(payload.find('\n'), -1)
def test_bloomfilter(self):
    '''Added items report present; an item never added reports absent.'''
    bloom = BloomFilter(100)
    # range(), not xrange(): xrange does not exist on Python 3, and the
    # rest of this file uses Python-3-only features (f-strings).
    for i in range(50):
        bloom.add(str(i))
    assert "20" in bloom
    assert "25" in bloom
    assert "49" in bloom
    assert "50" not in bloom
def test_serializes_filter_serialize(self):
    '''BloomFilter can round trip serialize() -> deserialize()'''
    original = BloomFilter(100, 0.1)
    original.add_by_hash('abcdef')
    # Deserializing the serialized form must reproduce the raw bit data.
    restored = BloomFilter.deserialize(original.serialize())
    self.assertEqual(original.raw_data(), restored.raw_data())
def __init__(self, cnt, word_present):
    '''Build a bloom filter over dates parsed from `word_present` strings.'''
    self.n = cnt  # no of items to add
    self.word_present = word_present
    self.p = 0.05  # false positive probability
    self.bloomf = BloomFilter(self.n, self.p)
    for item in self.word_present:
        print(item)
        # Items are 'YYYYMMDD' strings; store as integer-derived bytes.
        parsed = datetime.datetime.strptime(item, '%Y%m%d')
        self.bloomf.add(bytes(to_integer(parsed)))
def test():
    '''BloomFilter survives a pickle round trip with identical behavior.'''
    original = BloomFilter(num_hashes=10, size_bytes=100)
    original.add('hello')
    clone = pickle.loads(pickle.dumps(original))
    # The clone must answer membership queries exactly like the original
    # and carry over the same hash seeds.
    assert 'hi' not in clone
    assert 'hello' in clone
    assert (original.seeds == clone.seeds).all()