class TestBloomFilter(unittest.TestCase):
    """Unit tests for BloomFilter membership lookups."""

    def setUp(self):
        # A 500k-bit array with 7 hash functions keeps the false-positive
        # rate negligible for the four seeded items.
        self.size = 500000
        self.hash_count = 7
        self.bf = BloomFilter(self.size, self.hash_count)
        for item in ('abc', 'xyz', 'foo', 'bar'):
            self.bf.add(item)

    def _initialize(self):
        # Kept for backward compatibility; setUp() does all initialization.
        pass

    def _cleanup(self):
        # Rebinding to None is sufficient; the old `del` + reassign dance
        # had the same effect with extra steps.
        self.bf = None

    def test_lookup_yes(self):
        # An item that was added must always be reported present.
        self.assertTrue(self.bf.lookup('foo'))

    def test_lookup_no(self):
        # 'hello' was never added; with this sizing a false positive is
        # effectively impossible.
        self.assertFalse(self.bf.lookup('hello'))

    def tearDown(self):
        self._cleanup()
def build_bf(n, p, ref_fasta):
    """Build a Bloom filter from the reads in a FASTA-style file.

    Args:
        n: expected number of items (passed to BloomFilter).
        p: target false-positive probability (passed to BloomFilter).
        ref_fasta: path to the reference file; records are assumed to be
            pairs of lines (id line, then sequence line).

    Returns:
        The populated BloomFilter. Reads containing any uncalled base
        ('N') are counted but not added.
    """
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))
    N_count = 0
    read_count = 0
    # `with` guarantees the file is closed even if parsing raises,
    # unlike the original open()/close() pair.
    with open(ref_fasta, 'r') as mycoplasma_fasta:
        while True:
            name = mycoplasma_fasta.readline()  # read id
            if len(name) == 0:
                break  # end of file
            read = mycoplasma_fasta.readline().strip()
            if 'N' not in read:  # do not add any uncalled bases
                bloomf.add(read)
                read_count += 1
            else:
                N_count += 1
    print('N_count = %s' % N_count)
    print('read_count = %s' % read_count)
    return bloomf
class BloomFilterMR(MRJob):
    """Map-only MRJob that keeps temperature records matching a hot list."""

    def __init__(self, *args, **kwargs):
        super(BloomFilterMR, self).__init__(*args, **kwargs)
        self.n = 20           # filter capacity
        self.p = 0.05         # false-positive probability
        self.hot_list = [1, 8, 14, 12, 23, 31, 55]

    def steps(self):
        # Single mapper-only step; the filter is rebuilt once per task.
        return [MRStep(mapper_init=self.mapper_init, mapper=self.mapper)]

    def mapper_init(self):
        # Seed a per-task Bloom filter with the stringified hot values.
        self.bloomf = BloomFilter(self.n, self.p)
        for elem in self.hot_list:
            self.bloomf.add(str(elem))

    def mapper(self, _, line):
        # Phase 1: emit (city, (temp, timestamp)) only for temperatures
        # that may be in the hot list (false positives possible).
        city, temp, timestamp = line.split('|')
        if self.bloomf.check(temp):
            yield city, (temp, timestamp)
def test_exercise_2(self):
    """Exercise: fetch a filtered transaction from a known testnet block."""
    # The block known to contain a transaction paying our address.
    block_hash = bytes.fromhex(
        '0000000053787814ed9dd8c029d0a0a9af4ab8ec0591dc31bdc4ab31fae88ce9')
    passphrase = b'Jimmy Song Programming Blockchain'  # FILL THIS IN
    # Derive a deterministic private key from the passphrase.
    secret = little_endian_to_int(hash256(passphrase))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    filter_size = 30
    filter_num_functions = 5
    filter_tweak = 90210  # FILL THIS IN
    # Watch the hash160 of our own address via a bloom filter.
    h160 = decode_base58(addr)
    bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
    bf.add(h160)
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True, logging=False)
    node.handshake()
    # Must load the filter before requesting filtered blocks.
    node.send(bf.filterload())
    getdata = GetDataMessage()
    getdata.add_data(FILTERED_BLOCK_DATA_TYPE, block_hash)
    node.send(getdata)
    # The node answers with a merkleblock followed by the matching tx.
    mb = node.wait_for(MerkleBlock)
    tx = node.wait_for(Tx)
    self.assertEqual(
        tx.serialize().hex(),
        '0100000002a663815ab2b2ba5f53e442f9a2ea6cc11bbcd98fb1585e48a134bd870dbfbd6a000000006a47304402202151107dc2367cf5a9e2429cde0641c252374501214ce52069fbca1320180aa602201a43b5d4f91e48514c00c01521dc04f02c57f15305adc4eaad01c418f6e7a1180121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff618b00a343488bd62751cf21f368ce3be76e3a0323fdc594a0d24f27a1155cd2000000006a473044022024c4dd043ab8637c019528b549e0b10333b2dfa83e7ca66776e401ad3fc31b6702207d4d1d73ac8940c59c57c0b7daf084953324154811c10d06d0563947a88f99b20121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff0280969800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788aca0ce6594000000001976a9146e13971913b9aa89659a9f53d327baa8826f2d7588ac00000000'
    )
def main():
    """Demo: populate a Bloom filter, then probe a mix of present/absent words."""
    number_of_items = 20
    false_positive_probability = 0.1
    bloom = BloomFilter(number_of_items, false_positive_probability)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]
    word_absent = [
        'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism', 'hurt',
        'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter'
    ]
    print('bloomfilter size: ', bloom.bit_size)
    print('false_positive_probability', bloom.false_positive_probability)
    print('hash_count: ', bloom.hash_count)
    for entry in word_present:
        bloom.add(entry)
    # Probe 5 known-present and 5 known-absent words in random order.
    shuffle(word_present)
    shuffle(word_absent)
    random_list = word_present[:5] + word_absent[:5]
    shuffle(random_list)
    for word in random_list:
        print('word: ', word)
        if not bloom.check(word):
            print('word not present')
        elif word in word_absent:
            # Filter says present but we know it is not: false positive.
            print('false positive')
        else:
            print('word most likely member')
def test_exercise_4(self):
    """Exercise: locate a UTXO via a bloom filter, then build and sign a spend."""
    last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
    last_block = bytes.fromhex(last_block_hex)
    # Deterministic key from the passphrase; its testnet address is known.
    secret = little_endian_to_int(
        hash256(b'Jimmy Song Programming Blockchain'))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    h160 = decode_base58(addr)
    target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    self.assertEqual(addr, target_address)
    filter_size = 30
    filter_num_functions = 5
    filter_tweak = 90210  # FILL THIS IN
    target_h160 = decode_base58(target_address)
    target_script = p2pkh_script(target_h160)
    fee = 5000  # fee in satoshis
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True, logging=False)
    # Register interest in our own hash160 before asking for blocks.
    bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
    bf.add(h160)
    node.handshake()
    node.send(b'filterload', bf.filterload())
    # Walk headers forward from last_block, verifying PoW and linkage.
    getheaders_message = GetHeadersMessage(start_block=last_block)
    node.send(getheaders_message.command, getheaders_message.serialize())
    headers_envelope = node.wait_for_commands([HeadersMessage.command])
    stream = headers_envelope.stream()
    headers = HeadersMessage.parse(stream)
    get_data_message = GetDataMessage()
    for block in headers.blocks:
        self.assertTrue(block.check_pow())
        if last_block is not None:
            self.assertEqual(block.prev_block, last_block)
        last_block = block.hash()
        # Request each block as a filtered (merkle) block.
        get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, last_block)
    node.send(get_data_message.command, get_data_message.serialize())
    # Scan incoming merkleblock/tx messages until we find an output to us.
    prev_tx = None
    while prev_tx is None:
        envelope = node.wait_for_commands([b'merkleblock', b'tx'])
        stream = envelope.stream()
        if envelope.command == b'merkleblock':
            mb = MerkleBlock.parse(stream)
            self.assertTrue(mb.is_valid())
        else:
            prev = Tx.parse(stream, testnet=True)
            for i, tx_out in enumerate(prev.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == addr:
                    # Found our UTXO: remember its outpoint and amount.
                    prev_tx = prev.hash()
                    prev_index = i
                    prev_amount = tx_out.amount
                    break
    # Spend the found UTXO minus the fee to the target address.
    tx_in = TxIn(prev_tx, prev_index)
    output_amount = prev_amount - fee
    tx_out = TxOut(output_amount, target_script)
    tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
    tx_obj.sign_input(0, private_key)
    self.assertEqual(
        tx_obj.serialize().hex(),
        '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
    )
def test_bloom():
    """Insert 100k unique UUID strings and report the filter's memory footprint.

    Python 2 style module; the print is written in single-argument call
    form so it behaves identically on Python 2 and 3.
    """
    data = (str(uuid.uuid1()) for i in range(100000))
    # Renamed from `filter`, which shadowed the builtin.
    bf = BloomFilter(100000, 0.0001)
    for item in data:
        # `item not in bf` is the idiomatic negated membership test.
        if item not in bf:
            bf.add(item)
    print("{name} costs {bytes} bytes.".format(
        name=sys._getframe().f_code.co_name, bytes=bf.container_size()))
def create(cls, path, memtable):
    """Flush *memtable* to disk at *path* and return an instance backed by it.

    Every written key is also inserted into a fresh Bloom filter so later
    lookups can skip the file cheaply.
    """
    bloom = BloomFilter(BF_SIZE, BF_HASH_COUNT)
    with kv_writer(path) as out:
        for key, value in memtable.entries():
            out.write_entry(key, value)
            bloom.add(key)
    return cls(path, bloom)
def test_exercise_6(self):
    """Exercise: full SPV flow — headers, filtered blocks, find UTXO, spend it."""
    last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
    # Deterministic key from the passphrase; its testnet address is known.
    secret = little_endian_to_int(
        hash256(b'Jimmy Song Programming Blockchain'))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    h160 = decode_base58(addr)
    target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    self.assertEqual(addr, target_address)
    target_h160 = decode_base58(target_address)
    target_script = p2pkh_script(target_h160)
    fee = 5000
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
    # Watch our hash160; the filter must be loaded before getdata.
    bf = BloomFilter(30, 5, 90210)
    bf.add(h160)
    node.handshake()
    node.send(bf.filterload())
    start_block = bytes.fromhex(last_block_hex)
    getheaders = GetHeadersMessage(start_block=start_block)
    node.send(getheaders)
    headers = node.wait_for(HeadersMessage)
    # Validate the header chain and queue each block as a filtered block.
    last_block = None
    getdata = GetDataMessage()
    for b in headers.blocks:
        if not b.check_pow():
            raise RuntimeError('proof of work is invalid')
        if last_block is not None and b.prev_block != last_block:
            raise RuntimeError('chain broken')
        getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
        last_block = b.hash()
    node.send(getdata)
    # Scan merkleblock/tx responses until an output paying us shows up.
    prev_tx, prev_index, prev_tx_obj = None, None, None
    while prev_tx is None:
        message = node.wait_for(MerkleBlock, Tx)
        if message.command == b'merkleblock':
            if not message.is_valid():
                raise RuntimeError('invalid merkle proof')
        else:
            message.testnet = True
            for i, tx_out in enumerate(message.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == addr:
                    # Found our UTXO: record outpoint and amount.
                    prev_tx = message.hash()
                    prev_index = i
                    prev_amount = tx_out.amount
                    self.assertEqual(
                        message.id(),
                        '6ec96c9eafc62c0e42a4a6965be614c61101aaa316162ac79e07e1b9ab31e694'
                    )
                    self.assertEqual(i, 0)
                    break
    # Build, sign, and check the spending transaction.
    tx_in = TxIn(prev_tx, prev_index)
    output_amount = prev_amount - fee
    tx_out = TxOut(output_amount, target_script)
    tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
    tx_obj.sign_input(0, private_key)
    self.assertEqual(
        tx_obj.serialize().hex(),
        '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
    )
def test_bloomfilter(self):
    """Added keys must report present; an absent key must not.

    Python 2 code (`xrange`); keys "0".."49" are inserted.
    """
    bloom = BloomFilter(100)
    for n in xrange(50):
        bloom.add(str(n))
    # Spot-check a few inserted keys via __contains__.
    for probe in ("20", "25", "49"):
        assert probe in bloom
    # "50" was never added.
    assert "50" not in bloom
def test():
    """Round-trip a BloomFilter through pickle and verify its state survives."""
    original = BloomFilter(num_hashes=10, size_bytes=100)
    original.add('hello')
    restored = pickle.loads(pickle.dumps(original))
    # Membership and hash seeds must be preserved by serialization.
    assert 'hello' in restored
    assert 'hi' not in restored
    assert (original.seeds == restored.seeds).all()
def test_excluded(self):
    """Keys that were never added should (almost always) test negative."""
    bf = BloomFilter()
    bf.add('t1')
    bf.add('t2')
    # Probe three absent keys; requiring all three to hit before failing
    # guards the assertion against an occasional single false positive.
    results = [bf.test(probe) for probe in ("t3", "t4", "t5")]
    self.assertFalse(all(results))
def write_bloom_filter():
    """Build a Bloom filter from the module-level word list and persist it.

    Relies on module globals: n, p, word_present, filename.
    """
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))
    for word in word_present:
        bloomf.add(word)
    # Binary mode: the serialized filter is raw bytes.
    with open(filename, "wb") as outfile:
        outfile.write(bloomf.prepare_bloom_filter_to_write())
def create_bloomfilter_file(self):
    """Populate a Bloom filter with the unique words and write it to disk.

    Failures are logged (traceback printed) rather than propagated —
    this preserves the original best-effort behavior.
    """
    bloomf = BloomFilter(self.unique_word_count, self.false_positive_probability)
    try:
        for word in self.ta_words_unique:
            bloomf.add(word)
        bloomf.writetofile(self.bloom_file_path)
    except Exception:
        print(traceback.format_exc())
def create_csv_bloomfilter_files(self):
    """Write a word,count CSV and a Bloom filter of the words.

    Reads self.dict_tamil_word and writes self.csv_file_path and
    self.bloomfilter_file_path.
    """
    items_count = len(self.dict_tamil_word)
    falsepositive_probability = 0.001
    bloomf = BloomFilter(items_count, falsepositive_probability)
    with open(self.csv_file_path, "w") as f:
        for word, count in self.dict_tamil_word.items():
            # Use '\n': text mode already translates it to the platform
            # line ending. The original used os.linesep, which produces
            # "\r\r\n" lines on Windows when written in text mode.
            f.write(word + "," + str(count) + "\n")
            bloomf.add(word)
    bloomf.writetofile(self.bloomfilter_file_path)
def test_get_filtered_txs(self):
    """get_filtered_txs should return the txs from a block that match our filter."""
    from bloomfilter import BloomFilter
    # Watch the hash160 of a known testnet address.
    bf = BloomFilter(30, 5, 90210)
    h160 = decode_base58('mseRGXB89UTFVkWJhTRTzzZ9Ujj4ZPbGK5')
    bf.add(h160)
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
    node.handshake()
    # The filter must be loaded before requesting filtered blocks.
    node.send(bf.filterload())
    block_hash = bytes.fromhex('00000000000377db7fde98411876c53e318a395af7304de298fd47b7c549d125')
    txs = node.get_filtered_txs([block_hash])
    # Three known matching transactions in this block, in order.
    self.assertEqual(txs[0].id(), '0c024b9d3aa2ae8faae96603b8d40c88df2fc6bf50b3f446295206f70f3cf6ad')
    self.assertEqual(txs[1].id(), '0886537e27969a12478e0d33707bf6b9fe4fdaec8d5d471b5304453b04135e7e')
    self.assertEqual(txs[2].id(), '23d4effc88b80fb7dbcc2e6a0b0af9821c6fe3bb4c8dc3b61bcab7c45f0f6888')
def test_bloom_filter(num_of_items, fp_prob):
    """Populate a Bloom filter and print a membership verdict per probe word.

    Args:
        num_of_items: expected item count used to size the filter.
        fp_prob: target false-positive probability.
    """
    bloomf = BloomFilter(num_of_items, fp_prob)
    # words to be added
    word_present = ['abound', 'abounds', 'abundance', 'abundant', 'accessable',
                    'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
                    'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
                    'gems', 'generosity', 'generous', 'generously', 'genial']
    # words not added
    word_absent = ['bluff', 'cheater', 'hate', 'war', 'humanity',
                   'racism', 'hurt', 'nuke', 'gloomy', 'facebook',
                   'geeksforgeeks', 'twitter']
    # NOTE: a large top_passwords_last_years list in the original was never
    # referenced anywhere in this function; it was dead data and is removed.
    for item in word_present:
        bloomf.add(item)
    shuffle(word_present)
    shuffle(word_absent)
    # Probe 10 present words plus all absent ones, in random order.
    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if bloomf.check(word):
            if word in word_absent:
                print("'{}' is a false positive!".format(word))
            else:
                print("'{}' is probably present!".format(word))
        else:
            print("'{}' is definitely not present!".format(word))
def build_index(self, document_identifier, kpriv, list_of_words):
    """Build a searchable-encryption index: a Bloom filter of per-document
    codewords derived from the word list, padded with random noise bits.

    Returns (document_identifier, bloom_filter).
    """
    # Create an empty list to hold the trapdoors for the word (x1, x2, ..., xr)
    trapdoor = []
    # Create an empty list to hold the codewords for the word (y1, y2, ..., yr)
    codewords = []
    for word in list_of_words:
        ''' Create a trapdoor for each unique word '''
        # Takes the word and creates a trapdoor: r HMAC-SHA1 digests,
        # one per private key in kpriv.
        for i in range(0,self.r):
            # Converts kpriv[i] from hex to a bytes object - Necessary to use HMAC
            key = bytes.fromhex(kpriv[i])
            w = bytes(word, 'utf-8')
            trapdoor_digest = hmac.new(key, msg=w, digestmod=hashlib.sha1)
            trapdoor_digest = trapdoor_digest.hexdigest()
            trapdoor.append(trapdoor_digest)
    # Take each word and hash it again with the document_identifier as the
    # key to generate y1, y2, ..., yr (binds codewords to this document).
    for i in range(0, len(trapdoor)):
        # encode the document identifier and the trapdoor[i]
        d_id = bytes(document_identifier, 'utf-8')
        message = bytes(trapdoor[i], 'utf-8')
        codeword_digest = hmac.new(message, msg=d_id, digestmod=hashlib.sha1)
        codeword_digest = codeword_digest.hexdigest()
        codewords.append(codeword_digest)
    # Create a bloom filter and insert the codewords into the bloom filter
    bf = BloomFilter()
    # For each value in the list of codewords, add the codeword to the bloom filter
    for codeword in codewords:
        bf.add(codeword)
    # Adding noise: (total unique words - this document's words) * r random
    # bits, so every document's filter has a similar population.
    for i in range (0, (self.unique_word_count - len(list_of_words)) * self.r):
        # generate a random number from 0 - bf.size
        # NOTE(review): randrange(0, bf.size-1) never yields bf.size-1, so
        # the top bit is never set by noise — looks like an off-by-one;
        # confirm whether randrange(bf.size) was intended.
        index = random.randrange(0, bf.size-1)
        bf.set_index(index)
    return(document_identifier, bf)
class CreateBloomFilter():
    """Builds a Bloom filter over date values fetched from Cassandra."""

    def __init__(self):
        self.cc = ConnectToCassandra()
        self.n, self.word_present = self.cc.get_id() #no of items to add
        self.p = 0.05 #false positive probability
        self.bloomf = BloomFilter(self.n, self.p)
        for item in self.word_present:
            # NOTE(review): bytes(int) has very different semantics on
            # Python 2 (str of the number) vs Python 3 (zero-filled buffer
            # of that length) — confirm which interpreter this targets.
            # Presumably item is a datetime, so .date() strips the time.
            self.bloomf.add(bytes(to_integer(item.date())))

    def createfilter(self):
        # Re-adds the raw items (no .date() here, unlike __init__) —
        # NOTE(review): confirm the asymmetry is intentional.
        for item in self.word_present:
            self.bloomf.add(bytes(to_integer(item)))

    def testdate(self, todate):
        # Returns 1 if the date may be present, 0 if definitely absent.
        todate = to_integer(todate)
        if self.bloomf.check(bytes(todate)):
            return 1
        else:
            return 0
def encryptData(data, size, fp=0.01, bigrams=2, bpower=8, p=None):
    """Encode a string into a Bloom filter of its character n-grams.

    (Docstring translated from Portuguese: "Encrypts a string".)

    Args:
        data: value to encode (converted to str).
        size: size of the Bloom filter.
        fp: target false-positive rate.
        bigrams: n-gram order (2 = bigrams).
        bpower: forwarded to BloomFilter as bfpower.
        p: optional override for the number of hash functions.

    Returns:
        The populated BloomFilter.
    """
    bloomfilter = BloomFilter(size, fp, bfpower=bpower)
    # Identity check against None is the idiomatic form (was `p != None`).
    if p is not None:
        bloomfilter.set_hashfunction_by_p(p)
    index = ngram.NGram(N=bigrams)
    # No need to materialize the n-gram generator into a list first.
    for gram in index.ngrams(index.pad(str(data))):
        bloomfilter.add(str(gram))
    return bloomfilter
class Document:
    """A document with tokenized terms and a Bloom-filter signature."""

    def __init__(self, terms, doc_id):
        self.id = doc_id
        # Tokenize once; the original assigned self.terms twice in a row.
        self.terms = tokenize_terms(terms)
        self.signature = BloomFilter()
        self.signature.add(self.terms)

    def verify(self, query):
        """Exact-check a candidate match against the stored terms.

        Accepts a single term (str) or a list of terms; any other type is
        rejected. Used to weed out Bloom-filter false positives.
        """
        if isinstance(query, str):
            if query not in self.terms:
                return False
        elif isinstance(query, list):
            # Every queried term must be present.
            for term in query:
                if term not in self.terms:
                    return False
        else:
            return False
        return True
def test_example_5(self):
    """Example: scan filtered blocks after a start block for a tx paying an address."""
    last_block_hex = '00000000000538d5c2246336644f9a4956551afb44ba47278759ec55ea912e19'
    address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    h160 = decode_base58(address)
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True, logging=False)
    # Watch the address's hash160; the filter must be loaded first.
    bf = BloomFilter(30, 5, 90210)
    bf.add(h160)
    node.handshake()
    node.send(b'filterload', bf.filterload())
    # Fetch the headers after the start block.
    start_block = bytes.fromhex(last_block_hex)
    getheaders_message = GetHeadersMessage(start_block=start_block)
    node.send(b'getheaders', getheaders_message.serialize())
    headers_envelope = node.wait_for_commands({b'headers'})
    stream = headers_envelope.stream()
    headers = HeadersMessage.parse(stream)
    # Request every validated block as a filtered (merkle) block.
    get_data_message = GetDataMessage()
    for b in headers.blocks:
        if not b.check_pow():
            raise RuntimeError('proof of work is invalid')
        get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
    node.send(b'getdata', get_data_message.serialize())
    # Scan merkleblock/tx responses until an output paying the address appears.
    found = False
    while not found:
        envelope = node.wait_for_commands({b'merkleblock', b'tx'})
        stream = envelope.stream()
        if envelope.command == b'merkleblock':
            mb = MerkleBlock.parse(stream)
            if not mb.is_valid():
                raise RuntimeError('invalid merkle proof')
        else:
            prev_tx_obj = Tx.parse(stream, testnet=True)
            for i, tx_out in enumerate(prev_tx_obj.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == address:
                    # Known tx and output index for this fixture.
                    self.assertEqual(
                        prev_tx_obj.id(),
                        'e3930e1e566ca9b75d53b0eb9acb7607f547e1182d1d22bd4b661cfe18dcddf1'
                    )
                    self.assertEqual(i, 0)
                    found = True
                    break
def bloom(word_present):
    """Build a Bloom filter over *word_present* and print membership verdicts.

    Originally Python 2 (print statements); all prints are rewritten in
    single-argument call form, which behaves identically on Python 2 and 3.
    """
    n = 20  # no of items to add
    p = 0.05  # false positive probability
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))
    role = [
        'Financial analyst', 'Jr. Maintenance Engineer', 'Marketing manager',
        'Quantitative analyst', 'Sales Consultant', 'Sales Executive',
        'Sales Representative', 'Sr. Software engineer', 'Sr. Software tester',
        'Technical support', 'Web developer', 'Jr. Software engineer',
        'Jr. Software tester', 'Sr. Maintenance Engineer'
    ]
    # Roles missing from word_present act as the known-absent control group.
    word_absent = [r for r in role if r not in word_present]
    for item in word_present:
        bloomf.add(item)
    shuffle(word_present)
    shuffle(word_absent)
    print(word_present)
    for word in word_present:
        print(word)
        if bloomf.check(word):
            if word in word_absent:
                print("'{}' is a false positive!".format(word))
            else:
                print("'{}' is probably present!".format(word))
        else:
            print("'{}' is definitely not present!".format(word))
class DuplicatesPipeline(object):
    # Scrapy pipeline (Python 2): drops items whose query-stripped URL was
    # already seen in a persistent Bloom filter; otherwise records and
    # indexes them.
    def __init__(self):
        # self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
    def process_item(self, item, spider):
        # print '************%d pages visited!*****************' %len(self.bf)
        # Strip the query string: keep everything before the first '?'.
        # NOTE(review): when the URL has no '?', find() returns -1 and the
        # slice drops the URL's last character — confirm this is intended.
        temp='?'
        str1=item['url']
        str2=str1[:str1.find(temp)]
        # if self.bf.add(item['url']):#True if item in the BF
        # if self.bf.lookup(item['url']):
        if self.bf.lookup(str2):
            raise DropItem("Duplicate item found: %s" % item)
        else:
            # print '%d pages visited!'% len(self.url_seen)
            self.count_num+=1
            # self.bf.add(item['url'])
            # self.save_to_file(item['url'],item['title'])
            self.bf.add(str2)
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            print self.count_num
        return item
    def save_to_file(self,url,utitle):
        # Tab-separated url/title pairs, one per line, utf-8 encoded title.
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')
    def __del__(self):
        """Close the log file and finalize the search index on teardown."""
        self.f_write.close()
        self.si.IndexDone()
def dblookuptimetest():
    """Compare word-lookup latency: Bloom filter vs linear list scan.

    Loads the system dictionary, then times a hit ("google", "apple")
    through the filter and through a naive scan, printing microseconds.
    """
    import datetime

    print("Testing DB lookup time using bloom filter\n")
    bf = BloomFilter(500000, 7)
    huge = []
    # `with` closes the dictionary file; the original leaked the handle.
    with open("/usr/share/dict/american-english") as words:
        for line in words.read().splitlines():
            bf.add(line)
            huge.append(line)
    start = datetime.datetime.now()
    bf.contains("google")
    finish = datetime.datetime.now()
    print('Checking "google" using bloom filter in dictionary\n')
    print((finish - start).microseconds)
    start = datetime.datetime.now()
    for word in huge:
        if word == "google":
            break
    finish = datetime.datetime.now()
    print('Checking "google" without using bloom filter in dictionary\n')
    print((finish - start).microseconds)
    print(bf.contains("Max"))
    print(bf.contains("mice"))
    print(bf.contains("3"))
    start = datetime.datetime.now()
    bf.contains("apple")
    finish = datetime.datetime.now()
    print((finish - start).microseconds)
    start = datetime.datetime.now()
    for word in huge:
        if word == "apple":
            break
    finish = datetime.datetime.now()
    print((finish - start).microseconds)
class SSTable:
    """Represents a Sorted-String-Table (SSTable) on disk"""

    def __init__(self, path, bf=None):
        # If no Bloom filter is supplied, rebuild one by scanning the file.
        self.path = path
        self.bf = bf
        if not self.bf:
            self._sync()

    def _sync(self):
        # Rebuild the Bloom filter from the on-disk keys; values are skipped.
        self.bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
        with kv_reader(self.path) as r:
            while r.has_next():
                key = r.read_key()
                self.bf.add(key)
                r.skip_value()

    @classmethod
    def create(cls, path, memtable):
        # Flush a memtable to disk, populating the Bloom filter as we write.
        bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
        with kv_writer(path) as writer:
            for key, value in memtable.entries():
                writer.write_entry(key, value)
                bf.add(key)
        return cls(path, bf)

    def search(self, search_key):
        # Fast negative path: the Bloom filter has no false negatives.
        if not self.bf.exists(search_key):
            return None
        with kv_reader(self.path) as r:
            while r.has_next():
                key = r.read_key()
                # stop if the key is too big — keys are stored sorted,
                # so we can bail out early.
                if key > search_key:
                    return None
                if key == search_key:
                    return r.read_value()
                r.skip_value()
        return None
def merge(cls, sstables: List[SSTable]) -> SSTable:
    """K-way merge of sorted SSTables into one compacted table.

    Duplicate keys are resolved in favor of the newest table (highest
    index); tombstoned entries are dropped. Returns the new SSTable.
    """
    new_path = sstables[0].path.replace(".dat", "-compacted.dat")
    new_index = sstables[0].index
    new_bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
    # One cursor per non-empty input table.
    readers = [cls.Entries(sstable) for sstable in sstables if sstable.size > 0]
    with kv_writer(new_path) as writer:
        while readers:
            # Smallest key wins; ties break toward the larger table index
            # (index * -1 sorts newest first), i.e. the freshest value.
            min_reader = min(
                readers,
                key=lambda r: (r.current_pair[0], r.sstable.index * -1),
            )
            # Advance every other reader sitting on the same (stale) key.
            for reader in readers:
                if reader is min_reader:
                    continue
                if reader.current_pair[0] == min_reader.current_pair[0]:
                    reader.advance()
            # Tombstones are dropped during compaction.
            if min_reader.current_pair[1] is not TOMBSTONE:
                writer.write_entry(*min_reader.current_pair)
                new_bf.add(min_reader.current_pair[0])
            min_reader.advance()
            # Drop exhausted readers.
            readers = [reader for reader in readers if reader.has_next]
    return cls(new_path, new_index, new_bf)
def main(): m = 1000000 # max hash value h = 2000 # number of hash functions jaccard = 0.8 N = np.linspace(10, 10**3, num=10).astype('int') jaccard_minhash = [] jaccard_bloom = [] jaccard_true = [] for n in N: d1 = set([str(x) for x in range(n)]) min_d2 = int(n*(1.-jaccard)/(1. + jaccard)) d2 = set([str(x) for x in range(min_d2, min_d2 + n)]) b1 = BloomFilter(m, h) b2 = BloomFilter(m, h) mh1 = MinHash(h) mh2 = MinHash(h) for s1, s2 in izip(d1, d2): b1.add(s1) b2.add(s2) mh1.hash(d1) mh2.hash(d2) jaccard_minhash.append(1.-hamming(mh1.vec, mh2.vec)) jaccard_bloom.append(1-2*float(sum(np.not_equal(b1.bit_array, b2.bit_array)))/(sum(b1.bit_array) + sum(b2.bit_array))) jaccard_true.append(float(len(d1.intersection(d2)))/len(d1.union(d2))) plt.plot(N, np.array([jaccard_bloom, jaccard_minhash, jaccard_true]).T) plt.legend(['Bloom Filter', 'MinHash', 'True'], loc='upper left') plt.xlabel('Number of strings') plt.ylabel('Jaccard Coefficient') plt.title('Jaccard Approximation Through Hashing') plt.show()
def test_bloom_filter():
    """Populate a Bloom filter and print a membership verdict per probe word."""
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]
    word_absent = ['facebook', 'twitter']
    for word in word_present:
        bloomfilter.add(word)
    # Probe 10 present words plus the absent ones, in random order.
    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if not bloomfilter.is_member(word):
            print(f"'{word}' is definitely not present!")
        elif word in word_absent:
            # Filter says yes but the word was never added.
            print(f"'{word}' is a false positive!")
        else:
            print(f"'{word}' is probably present!")
class CreateBloomFilter():
    """Builds a Bloom filter over YYYYMMDD date strings."""

    def __init__(self, cnt, word_present):
        self.n = cnt
        self.word_present = word_present #no of items to add
        self.p = 0.05 #false positive probability
        self.bloomf = BloomFilter(self.n, self.p)
        for item in self.word_present:
            print(item)
            # NOTE(review): bytes(int) builds a zero-filled buffer of that
            # length on Python 3 (vs the digit string on Python 2) —
            # confirm the intended key encoding.
            self.bloomf.add(
                bytes(to_integer(datetime.datetime.strptime(item, '%Y%m%d'))))

    def createfilter(self, cnt, word_present):
        # Rebuilds the filter from scratch; items here are added without
        # strptime parsing, unlike __init__ — presumably already datetimes.
        self.p = 0.05 #false positive probability
        self.bloomf = BloomFilter(cnt, self.p)
        for item in word_present:
            self.bloomf.add(bytes(to_integer(item)))

    def testdate(self, todate):
        # Returns 1 if the YYYYMMDD date may be present, 0 if definitely not.
        todate = datetime.datetime.strptime(todate, '%Y%m%d')
        todate = to_integer(todate)
        if self.bloomf.check(bytes(todate)):
            return 1
        else:
            return 0
# Load the 30-day username list. NOTE: lines are added with their trailing
# newline intact (no strip), matching the original behavior.
word_present = []
with open("/Users/siddhartharoynandi/Desktop/listed_username_30.txt") as inFile:
    for line in inFile:
        word_present.append(line)

n = len(word_present)  # no of items to add
p = 0.05  # false positive probability
bloomf = BloomFilter(n, p)
for item in word_present:
    bloomf.add(item)

# Load the 365-day list to test against the filter. `with` closes both
# files; the original leaked both handles.
word_tobe_tested = []
with open("/Users/siddhartharoynandi/Desktop/listed_username_365.txt") as inFile:
    for line in inFile:
        word_tobe_tested.append(line)

shuffle(word_present)
shuffle(word_tobe_tested)

count = 0   # kept: may be used further down the file
count1 = 0  # number of tested usernames the filter reports as present
for word in word_tobe_tested:
    if bloomf.check(word):
        count1 = count1 + 1
) except: # invalid data or mistake pass filter_visual_window.close() pass except: pass elif event == 'Insert new password': try: if len(values['-NEW-PASSWORD-']) != 0: if bloomf.check_if_add(values['-NEW-PASSWORD-']): sg.PopupError("inserted not successfully This word[ " + str(values['-NEW-PASSWORD-']) + " ] in bloom filter , Try Again!") else: bloomf.add(values['-NEW-PASSWORD-']) sg.PopupOK( "The password has been inserted successfully [ we Found overlap " + str(bloomf.c) + "bits]") if bloomf.c == sizeofhashs: sg.popup_ok("ohh ! , we Found False positive ") false_positive += 1 else: sg.PopupError( "inserted not successfully Null input, Try Again!") except: pass elif event == 'Show complete password strength analysis': try: if len(values['-NEW-PASSWORD-']) == 0: sg.popup_error("Null input :(")
#address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv' # our test #last_block_hex = '0000000017e6fbd8931bce659d45d92040a4674950f2ae5416d0bf1a239641f9' last_block_hex = '00000000970369111c044804ec0319792c9e1aa29f59a622c5d14b3544ae4eba' #0000000017e6fbd8931bce659d45d92040a4674950f2ae5416d0bf1a239641f9 #last_block_hex = '0000000000000004fea90996fdf40772e2c2c76205a1fb57fae465194fdaffb9' address = 'mvEg6eZ3sUApodedYQrkpEPMMALsr1K1k1' h160 = decode_base58(address) node = SimpleNode('testnet.programmingbitcoin.com', testnet=True, logging=False) bf = BloomFilter(size=30, function_count=5, tweak=90210) bf.add(h160) node.handshake() node.send(bf.filterload()) start_block = bytes.fromhex(last_block_hex) getheaders = GetHeadersMessage(start_block=start_block) node.send(getheaders) print('ok2') headers = node.wait_for(HeadersMessage) print('ok3') getdata = GetDataMessage() for b in headers.blocks: if not b.check_pow(): raise RuntimeError('proof of work is invalid') getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash()) node.send(getdata) found = False
if __name__ == '__main__':
    # Demo (Python 2): round-trip a Bloom filter through JSON and verify
    # the bit data survives. `sample` and `ascii_letters` are imported but
    # unused in this visible block — presumably used further down.
    from random import sample
    from string import ascii_letters
    # Whitespace-separated US state names; split() ignores the layout.
    states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
    bf1 = BloomFilter(ideal_num_elements_n=100000, error_rate_p=0.001)
    for state in states:
        bf1.add(state)
    # Serialize, report the payload size, then deserialize.
    json_bf = bf1.toJSON()
    print "##################"
    print json_bf
    print "##################"
    len_json = len(json_bf)
    print "data size: %s bytes"%len_json
    bf2 = BloomFilter.fromJSON(json_bf)
    # The reconstructed filter must carry identical bit data.
    assertListEquals(bf1.data, bf2.data)
    new_data = bf2.get_data()