def build_bf(n, p, ref_fasta):
    """Build a Bloom filter from the sequences in a FASTA-style file.

    The file is consumed two lines at a time: an identifier line followed by
    a sequence line. Sequences containing an uncalled base ('N') are counted
    but not inserted into the filter.

    Args:
        n: Expected number of items, forwarded to ``BloomFilter``.
        p: Target false-positive probability, forwarded to ``BloomFilter``.
        ref_fasta: Path of the file to read.

    Returns:
        The populated ``BloomFilter`` instance.
    """
    # call bloom filter class and output stats
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))

    N_count = 0
    read_count = 0
    # The context manager closes the file even when reading or adding raises.
    with open(ref_fasta, 'r') as mycoplasma_fasta:
        while True:
            name = mycoplasma_fasta.readline()  # read id
            if not name:
                break  # end of file
            read = mycoplasma_fasta.readline().strip()
            if 'N' in read:
                # do not add any uncalled bases
                N_count += 1
            else:
                bloomf.add(read)
                read_count += 1

    print('N_count = %s' % N_count)
    print('read_count = %s' % read_count)
    return bloomf
def _decode_similarity_destination(self, meta_message, authentication_impl):
    """Implement the destination from our own and the sender's similarity.

    Both similarity values are read from the ``similarity`` table and
    wrapped in Bloom filters. The destination is implemented with the
    result of ``bic_occurrence`` between the two filters.

    Raises:
        DropPacket: our own similarity row is missing.
        DelayPacketBySimilarity: the sender's similarity row is missing.
    """
    if __debug__:
        from authentication import Authentication
        assert isinstance(meta_message, Message)
        assert isinstance(authentication_impl, Authentication.Implementation)

    lookup_sql = u"SELECT similarity FROM similarity WHERE community = ? AND user = ? AND cluster = ?"

    # Our own similarity must exist; treat a missing row as a fatal packet error.
    try:
        own_row = self._dispersy_database.execute(
            lookup_sql,
            (self._community.database_id,
             self._community._my_member.database_id,
             meta_message.destination.cluster)).next()
    except StopIteration:
        raise DropPacket("We don't know our own similarity... should not happen")
    own_blob, = own_row
    own_filter = BloomFilter(str(own_blob), 0)

    # The sender's similarity may simply not have arrived yet: delay instead.
    try:
        sender_row = self._dispersy_database.execute(
            lookup_sql,
            (self._community.database_id,
             authentication_impl.member.database_id,
             meta_message.destination.cluster)).next()
    except StopIteration:
        raise DelayPacketBySimilarity(self._community, authentication_impl.member, meta_message.destination)
    sender_blob, = sender_row
    sender_filter = BloomFilter(str(sender_blob), 0)

    overlap = own_filter.bic_occurrence(sender_filter)
    return meta_message.destination.implement(overlap)
def _sync(self):
    """Rebuild ``self.bf`` from every key stored in the file at ``self.path``."""
    # The fresh filter is published on self before the file is opened.
    bloom = self.bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
    with kv_reader(self.path) as reader:
        while reader.has_next():
            bloom.add(reader.read_key())
            # Only keys are indexed; values are skipped over.
            reader.skip_value()
def test_all_test_positive_when_hashes_collide(self):
    """BloomFilter.test_by_hash() returns False for a value that was not added."""
    subject = BloomFilter(1000000, 1e-3)
    subject.add_by_hash("abc")
    found = subject.test_by_hash("def")
    self.assertEqual(found, False)
def test_returns_positive_when_hashes_collide(self):
    """BloomFilter.test_by_hash() finds u"abc" after "abc" was added."""
    subject = BloomFilter(1000000, 1e-3)
    subject.add_by_hash("abc")
    found = subject.test_by_hash(u"abc")
    self.assertEqual(found, True)
def test_bloom():
    """Insert 100000 UUID strings into a filter and print its container size."""
    bloom = BloomFilter(100000, 0.0001)
    uuids = (str(uuid.uuid1()) for i in range(100000))
    for candidate in uuids:
        if candidate in bloom:
            continue
        bloom.add(candidate)
    report = "{name} costs {bytes} bytes.".format(
        name=sys._getframe().f_code.co_name,
        bytes=bloom.container_size())
    print(report)
def __init__(self):
    """Load items via ``ConnectToCassandra`` and index their dates in a Bloom filter.

    ``get_id()`` supplies the item count (used as the filter capacity) and
    the items themselves. Each item's date is converted with ``to_integer``
    and stored as the ASCII text of that number.
    """
    self.cc = ConnectToCassandra()
    self.n, self.word_present = self.cc.get_id()  # no of items to add
    self.p = 0.05  # false positive probability
    self.bloomf = BloomFilter(self.n, self.p)
    for item in self.word_present:
        # bytes(<int>) on Python 3 builds a zero-filled buffer of that length
        # rather than the digits; encode the decimal text explicitly, which
        # matches what bytes() produced on Python 2.
        # NOTE(review): lookups must encode their keys the same way — confirm
        # against the code that queries this filter.
        key = str(to_integer(item.date())).encode('ascii')
        self.bloomf.add(key)
class BloomFilterMR(MRJob):
    """Map-only job that keeps records whose temperature passes a Bloom filter.

    The filter is seeded with the string form of every value in
    ``hot_list``. Because it is a Bloom filter, values outside the list can
    occasionally pass as well.
    """

    def __init__(self, *args, **kwargs):
        super(BloomFilterMR, self).__init__(*args, **kwargs)
        self.hot_list = [1, 8, 14, 12, 23, 31, 55]
        self.n = 20
        self.p = 0.05

    def steps(self):
        """Return the single map-only step of this job."""
        only_step = MRStep(mapper_init=self.mapper_init, mapper=self.mapper)
        return [only_step]

    def mapper_init(self):
        """Create the filter and insert each hot value as a string."""
        bloom = self.bloomf = BloomFilter(self.n, self.p)
        for hot_value in self.hot_list:
            bloom.add(str(hot_value))

    def mapper(self, _, line):
        """Emit ``city, (temp, timestamp)`` when ``temp`` passes the filter.

        ``line`` must consist of exactly three '|'-separated fields.
        """
        city, temp, timestamp = line.split('|')
        if not self.bloomf.check(temp):
            return
        yield city, (temp, timestamp)
def main():
    """Demonstrate a Bloom filter on a small word list.

    Every word of ``word_present`` is inserted, then five present and five
    absent words are sampled, shuffled and checked. The outcome is printed
    for each one.
    """
    number_of_items = 20
    false_positive_probability = 0.1
    bloom = BloomFilter(number_of_items, false_positive_probability)

    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable',
        'bloom', 'blossom', 'bolster', 'bonny', 'bonus', 'bonuses',
        'coherent', 'cohesive', 'colorful', 'comely', 'comfort',
        'gems', 'generosity', 'generous', 'generously', 'genial',
    ]
    word_absent = [
        'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism',
        'hurt', 'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter',
    ]

    print('bloomfilter size: ', bloom.bit_size)
    print('false_positive_probability', bloom.false_positive_probability)
    print('hash_count: ', bloom.hash_count)

    for known_word in word_present:
        bloom.add(known_word)

    # Shuffle order matters for reproducibility under a fixed random seed.
    shuffle(word_present)
    shuffle(word_absent)
    random_list = word_present[:5] + word_absent[:5]
    shuffle(random_list)

    for word in random_list:
        print('word: ', word)
        if not bloom.check(word):
            print('word not present')
        elif word in word_absent:
            print('false positive')
        else:
            print('word most likely member')
def test_all_test_positive_when_hashes_collide(self):
    '''BloomFilter.test_by_hash() returns False for a value that was not added'''
    subject = BloomFilter(1000000, 1e-3)
    subject.add_by_hash('abc')
    found = subject.test_by_hash('def')
    self.assertEqual(found, False)
def test_returns_true_positive_when_value_had_been_added(self):
    '''BloomFilter.test_by_hash() returns True once the item has been added'''
    subject = BloomFilter(1000000, 1e-3)
    subject.add_by_hash('abc')
    found = subject.test_by_hash('abc')
    self.assertEqual(found, True)
def test_returns_positive_when_hashes_collide(self):
    '''BloomFilter.test_by_hash() finds u'abc' after 'abc' was added'''
    subject = BloomFilter(1000000, 1e-3)
    subject.add_by_hash('abc')
    found = subject.test_by_hash(u'abc')
    self.assertEqual(found, True)
class TestBloomFilter(unittest.TestCase):
    """Lookup tests against a filter pre-loaded with four short words."""

    def setUp(self):
        """Create a 500000 / 7 filter and insert the fixture words."""
        self.size = 500000
        self.hash_count = 7
        self.bf = BloomFilter(self.size, self.hash_count)
        for word in ['abc', 'xyz', 'foo', 'bar']:
            self.bf.add(word)

    def _initialize(self):
        """Placeholder hook; does nothing."""
        return None

    def _cleanup(self):
        """Drop the filter, leaving ``self.bf`` set to None."""
        if not self.bf:
            return
        del self.bf
        self.bf = None

    def test_lookup_yes(self):
        """An inserted word is reported as present."""
        found = self.bf.lookup('foo')
        self.assertEqual(found, True)

    def test_lookup_no(self):
        """A word that was never inserted is reported as absent."""
        found = self.bf.lookup('hello')
        self.assertEqual(found, False)

    def tearDown(self):
        self._cleanup()
def test_exercise_4(self):
    """Build and sign a spend of a testnet output found via a Bloom-filtered node.

    The test loads a Bloom filter for our own hash160 on a remote testnet
    node, requests the headers following ``last_block_hex``, asks for the
    filtered blocks, waits for a transaction paying ``addr``, then spends
    that output back to ``target_address`` and compares the serialized
    signed transaction with a fixed hex string.

    Requires network access to the node named below.
    """
    last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
    last_block = bytes.fromhex(last_block_hex)
    secret = little_endian_to_int(
        hash256(b'Jimmy Song Programming Blockchain'))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    h160 = decode_base58(addr)
    target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    self.assertEqual(addr, target_address)
    filter_size = 30
    filter_num_functions = 5
    filter_tweak = 90210
    # FILL THIS IN (exercise marker: the solution follows)
    target_h160 = decode_base58(target_address)
    target_script = p2pkh_script(target_h160)
    fee = 5000  # fee in satoshis
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True, logging=False)
    bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
    bf.add(h160)
    node.handshake()
    # The filter is loaded before any block data is requested.
    node.send(b'filterload', bf.filterload())
    getheaders_message = GetHeadersMessage(start_block=last_block)
    node.send(getheaders_message.command, getheaders_message.serialize())
    headers_envelope = node.wait_for_commands([HeadersMessage.command])
    stream = headers_envelope.stream()
    headers = HeadersMessage.parse(stream)
    get_data_message = GetDataMessage()
    for block in headers.blocks:
        self.assertTrue(block.check_pow())
        # Each header must chain onto the previous one.
        if last_block is not None:
            self.assertEqual(block.prev_block, last_block)
        last_block = block.hash()
        get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, last_block)
    node.send(get_data_message.command, get_data_message.serialize())
    prev_tx = None
    # Keep reading merkleblock/tx messages until an output paying addr shows up.
    while prev_tx is None:
        envelope = node.wait_for_commands([b'merkleblock', b'tx'])
        stream = envelope.stream()
        if envelope.command == b'merkleblock':
            mb = MerkleBlock.parse(stream)
            self.assertTrue(mb.is_valid())
        else:
            prev = Tx.parse(stream, testnet=True)
            for i, tx_out in enumerate(prev.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == addr:
                    prev_tx = prev.hash()
                    prev_index = i
                    prev_amount = tx_out.amount
                    break
    tx_in = TxIn(prev_tx, prev_index)
    output_amount = prev_amount - fee
    tx_out = TxOut(output_amount, target_script)
    tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
    tx_obj.sign_input(0, private_key)
    self.assertEqual(
        tx_obj.serialize().hex(),
        '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
    )
def rappor(n, f, p, q, m):
    """Encode ``n`` into a Bloom filter and apply two rounds of randomized response.

    Args:
        n: Value to report; it is converted to ``str`` before encoding.
        f: Permanent-response parameter; a bit is flipped when the draw is
            at most ``f/2``.
        p: Instantaneous-response threshold for turning a 0 bit into 1.
        q: Instantaneous-response parameter; a 1 bit is cleared when the
            draw is at most ``1-q``.
        m: Second argument forwarded to ``BloomFilter.add_data``.

    Returns:
        The bit sequence returned by ``add_data`` after both rounds, which
        modify it in place.
    """
    encoder = BloomFilter()
    noisydata = encoder.add_data(str(n), m)

    # Permanent randomized response: one draw per position, flip on success.
    for pos in range(len(noisydata)):
        draw = np.random.randint(0, totalnum)
        if draw/totalnum <= f/2:
            noisydata[pos] = 0 if noisydata[pos] == 1 else 1

    # Instantaneous randomized response: one draw per position.
    for pos in range(len(noisydata)):
        draw = np.random.randint(0, totalnum)
        if noisydata[pos] == 1:
            if draw/totalnum <= 1-q:
                noisydata[pos] = 0
        elif draw/totalnum <= p:
            noisydata[pos] = 1

    return noisydata
def test_exercise_2(self):
    """Fetch a transaction for our address from one Bloom-filtered block.

    A filter containing our hash160 is loaded on a remote testnet node, the
    filtered block ``block_hash`` is requested, and the serialization of the
    transaction that comes back is compared with a fixed hex string.

    Requires network access to the node named below.
    """
    block_hash = bytes.fromhex(
        '0000000053787814ed9dd8c029d0a0a9af4ab8ec0591dc31bdc4ab31fae88ce9')
    passphrase = b'Jimmy Song Programming Blockchain'
    # FILL THIS IN (exercise marker: the solution follows)
    secret = little_endian_to_int(hash256(passphrase))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    filter_size = 30
    filter_num_functions = 5
    filter_tweak = 90210
    # FILL THIS IN (exercise marker: the solution follows)
    h160 = decode_base58(addr)
    bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
    bf.add(h160)
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True, logging=False)
    node.handshake()
    # The filter is loaded before the filtered block is requested.
    node.send(bf.filterload())
    getdata = GetDataMessage()
    getdata.add_data(FILTERED_BLOCK_DATA_TYPE, block_hash)
    node.send(getdata)
    # The merkle block is waited for first; its parsed value is not inspected here.
    mb = node.wait_for(MerkleBlock)
    tx = node.wait_for(Tx)
    self.assertEqual(
        tx.serialize().hex(),
        '0100000002a663815ab2b2ba5f53e442f9a2ea6cc11bbcd98fb1585e48a134bd870dbfbd6a000000006a47304402202151107dc2367cf5a9e2429cde0641c252374501214ce52069fbca1320180aa602201a43b5d4f91e48514c00c01521dc04f02c57f15305adc4eaad01c418f6e7a1180121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff618b00a343488bd62751cf21f368ce3be76e3a0323fdc594a0d24f27a1155cd2000000006a473044022024c4dd043ab8637c019528b549e0b10333b2dfa83e7ca66776e401ad3fc31b6702207d4d1d73ac8940c59c57c0b7daf084953324154811c10d06d0563947a88f99b20121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff0280969800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788aca0ce6594000000001976a9146e13971913b9aa89659a9f53d327baa8826f2d7588ac00000000'
    )
def create(cls, path, memtable):
    """Write every memtable entry to ``path`` and return an instance for it.

    Each key is also added to a new Bloom filter, which is handed to the
    constructor together with the path.
    """
    key_filter = BloomFilter(BF_SIZE, BF_HASH_COUNT)
    with kv_writer(path) as sink:
        for entry_key, entry_value in memtable.entries():
            sink.write_entry(entry_key, entry_value)
            key_filter.add(entry_key)
    return cls(path, key_filter)
def test_raises_error_on_out_of_range_error_rate(self):
    '''BloomFilter() rejects error rates of -1 and 2 with ValueError'''
    # Checked in the same order as before: -1 first, then 2.
    for bad_rate in (-1, 2):
        with self.assertRaises(ValueError):
            BloomFilter(5, bad_rate)
def test_exercise_6(self):
    """Locate our UTXO through a Bloom-filtered node, then build and sign a spend.

    The test loads a filter for our hash160, validates the headers that
    follow ``last_block_hex`` (proof of work and chain continuity),
    requests the filtered blocks, waits for a transaction paying ``addr``,
    and finally checks the serialized signed spend against a fixed hex
    string.

    Requires network access to the node named below.
    """
    last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
    secret = little_endian_to_int(
        hash256(b'Jimmy Song Programming Blockchain'))
    private_key = PrivateKey(secret=secret)
    addr = private_key.point.address(testnet=True)
    h160 = decode_base58(addr)
    target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
    self.assertEqual(addr, target_address)
    target_h160 = decode_base58(target_address)
    target_script = p2pkh_script(target_h160)
    fee = 5000
    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
    bf = BloomFilter(30, 5, 90210)
    bf.add(h160)
    node.handshake()
    # The filter is loaded before any block data is requested.
    node.send(bf.filterload())
    start_block = bytes.fromhex(last_block_hex)
    getheaders = GetHeadersMessage(start_block=start_block)
    node.send(getheaders)
    headers = node.wait_for(HeadersMessage)
    last_block = None
    getdata = GetDataMessage()
    for b in headers.blocks:
        if not b.check_pow():
            raise RuntimeError('proof of work is invalid')
        # From the second header on, each one must point at its predecessor.
        if last_block is not None and b.prev_block != last_block:
            raise RuntimeError('chain broken')
        getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
        last_block = b.hash()
    node.send(getdata)
    prev_tx, prev_index, prev_tx_obj = None, None, None
    # Keep reading messages until a transaction output paying addr is found.
    while prev_tx is None:
        message = node.wait_for(MerkleBlock, Tx)
        if message.command == b'merkleblock':
            if not message.is_valid():
                raise RuntimeError('invalid merkle proof')
        else:
            message.testnet = True
            for i, tx_out in enumerate(message.tx_outs):
                if tx_out.script_pubkey.address(testnet=True) == addr:
                    prev_tx = message.hash()
                    prev_index = i
                    prev_amount = tx_out.amount
                    self.assertEqual(
                        message.id(),
                        '6ec96c9eafc62c0e42a4a6965be614c61101aaa316162ac79e07e1b9ab31e694'
                    )
                    self.assertEqual(i, 0)
                    break
    tx_in = TxIn(prev_tx, prev_index)
    output_amount = prev_amount - fee
    tx_out = TxOut(output_amount, target_script)
    tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
    tx_obj.sign_input(0, private_key)
    self.assertEqual(
        tx_obj.serialize().hex(),
        '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
    )
def rabinKarp(self, patterns, txt):
    """Find every occurrence of each equal-length pattern in ``txt``.

    A rolling hash (base 256, modulo 101) is computed for each window of
    ``txt``. A Bloom filter built from the pattern hashes pre-screens the
    windows, and only windows that pass are compared against the patterns.

    Args:
        patterns: Non-empty collection of hashable patterns, all of the
            same length.
        txt: Non-empty text to search.

    Returns:
        dict mapping each pattern to the list of start offsets where it
        occurs in ``txt`` (an empty list when it does not occur).

    Raises:
        ValueError: if ``txt`` or ``patterns`` is empty, if the patterns are
            longer than ``txt``, or if the patterns differ in length.
    """
    if not txt or not patterns:
        raise ValueError('Search requires text and a pattern')

    q = 101  # a prime number
    d = 256  # base of the polynomial hash

    matches = {p: [] for p in patterns}

    patternLen = len(next(iter(patterns)))  # length of first pattern
    txtLen = len(txt)
    if txtLen < patternLen:
        raise ValueError(
            'A pattern longer than text to search cannot exist in the text.'
        )

    # h is d**(patternLen - 1) % q, the weight of the window's leading char.
    # max() keeps the exponent non-negative for zero-length patterns.
    h = pow(d, max(patternLen - 1, 0), q)

    patternHashes = []
    for pat in set(patterns):
        if patternLen != len(pat):
            raise ValueError(
                'Search only supports a fixed length pattern match.')
        patternHash = 0
        for ch in pat:
            patternHash = (d * patternHash + ord(ch)) % q
        patternHashes.append(patternHash)
    bloomf = BloomFilter(patternHashes)

    # hash of the first window
    txtHash = 0
    for ch in txt[:patternLen]:
        txtHash = (d * txtHash + ord(ch)) % q

    # slide through txt one char at a time
    left = 0
    while left + patternLen <= txtLen:
        if bloomf.contains(txtHash):
            window = txt[left:left + patternLen]
            # matches has exactly the patterns as keys, so this is an O(1)
            # membership test regardless of the container type of patterns.
            if window in matches:
                matches[window].append(left)
        if left + patternLen < txtLen:
            # drop the leading char, shift, append the next char
            txtHash = (d * (txtHash - ord(txt[left]) * h) +
                       ord(txt[left + patternLen])) % q
        left += 1
    return matches
def test_bloomfilter(self):
    """Numbers 0..49 (as strings) are found after insertion; "50" is not."""
    subject = BloomFilter(100)
    for number in xrange(50):
        subject.add(str(number))
    assert "20" in subject
    assert "25" in subject
    assert "49" in subject
    assert "50" not in subject
def test_serializes_filter_serialize_without_line_feeds(self):
    '''BloomFilter.serialize() output contains no line feed'''
    subject = BloomFilter(100, 0.1)
    subject.add_by_hash('abcdef')
    encoded = subject.serialize()
    line_feed_at = encoded.find('\n')
    self.assertEqual(line_feed_at, -1)
def test_simple(self):
    """An added key is found; a key that was never added is not."""
    added, missing = 'something', 'something else'
    subject = BloomFilter(100000)
    subject.add_key(added)
    self.assertTrue(subject.contains(added))
    # A false positive is possible here, but extremely unlikely at this size.
    self.assertFalse(subject.contains(missing))
def test_simple2(self):
    """Both keys are found after both have been added."""
    first, second = 'something', 'something else'
    subject = BloomFilter(100000)
    subject.add_key(first)
    subject.add_key(second)
    self.assertTrue(subject.contains(first))
    self.assertTrue(subject.contains(second))
def test_serializes_filter_serialize_without_line_feeds(self):
    """Serialized filter output contains no line feed."""
    subject = BloomFilter(100, 0.1)
    subject.add_by_hash("abcdef")
    encoded = subject.serialize()
    # The check uses a bytes needle, so membership is tested rather than str.find().
    has_line_feed = b"\n" in encoded
    self.assertTrue(not has_line_feed)
def __init__(self, cnt, word_present):
    """Index the given 'YYYYMMDD' date strings in a Bloom filter.

    Args:
        cnt: Number of items to add; used as the filter capacity.
        word_present: Iterable of date strings in ``%Y%m%d`` format.
    """
    self.n = cnt  # no of items to add
    self.word_present = word_present
    self.p = 0.05  # false positive probability
    self.bloomf = BloomFilter(self.n, self.p)
    for item in self.word_present:
        print(item)
        parsed = datetime.datetime.strptime(item, '%Y%m%d')
        # bytes(<int>) on Python 3 builds a zero-filled buffer of that length
        # rather than the digits; encode the decimal text explicitly, which
        # matches what bytes() produced on Python 2.
        # NOTE(review): lookups must encode their keys the same way — confirm
        # against the code that queries this filter.
        key = str(to_integer(parsed)).encode('ascii')
        self.bloomf.add(key)
def test():
    """A pickled and restored filter keeps its membership and its seeds."""
    original = BloomFilter(num_hashes=10, size_bytes=100)
    original.add('hello')
    restored = pickle.loads(pickle.dumps(original))
    assert 'hi' not in restored
    assert 'hello' in restored
    seeds_equal = original.seeds == restored.seeds
    assert seeds_equal.all()
def test_creates_filter_with_non_integral_capacity(self):
    '''BloomFilter() accepts a float capacity and sizes it like the int one'''
    from_int = BloomFilter(capacity=1000, error_rate=1e-3)
    from_float = BloomFilter(capacity=1000.2, error_rate=1e-3)
    baseline_bits = from_int.bit_count
    # The float-capacity filter may be slightly larger, by fewer than 10 bits.
    self.assertGreaterEqual(from_float.bit_count, baseline_bits)
    self.assertLess(from_float.bit_count, baseline_bits + 10)
    self.assertEqual(from_int.hash_count, from_float.hash_count)
def _build_guided_bloom(prefixes, fpp, k, num_bits, root, fib, protocol='v4'):
    '''Build a Bloom filter guided by the `root` binary search tree.

    For every (prefix, prefix length) pair the tree is walked from `root`.
    At the node whose value equals the prefix length, the encoded prefix is
    inserted with hash functions 0..bf.k and the walk stops. At each node
    whose value is smaller, the prefix masked to that length is inserted
    with the first hash function plus the "pointer" hashes chosen from the
    hit count and the BMP index, and the walk continues to the right.

    Args:
        prefixes: dict read here for the keys 'prefixes', 'minn', 'len2ix'
            and 'ix2len'.
        fpp: first positional argument of `BloomFilter`.
        k, num_bits: forwarded to `BloomFilter` unless both are falsy.
        root: root node of the search tree (nodes expose `val`, `left`,
            `right`).
        fib: forwarded to `_find_bmp`.
        protocol: key into `NUMBITS`; also forwarded to the helpers.

    Returns:
        The tuple `(bf, root)`.
    '''
    # NOTE(review): indentation reconstructed from a whitespace-collapsed
    # source — confirm against the original file.
    max_shift = NUMBITS[protocol]
    if not (k or num_bits):
        bf = BloomFilter(fpp, len(prefixes['prefixes']))
    else:
        bf = BloomFilter(fpp, len(prefixes['prefixes']), k=k, num_bits=num_bits)
    count = 0  # report progress
    for pair in prefixes['prefixes']:
        # Progress line every 10000 prefixes.
        if count % 10000 == 0:
            print('build processsed %.3f of all prefixes' % (count / len(prefixes['prefixes'])))
        count += 1
        prefix, preflen = pair
        # BMP is an index, can recover prefix length using prefixes['ix2len']
        # (fib_val is not used below.)
        bmp, fib_val = _find_bmp(prefix, bf, root, fib, preflen - 1, prefixes['minn'], prefixes['len2ix'], prefixes['ix2len'], protocol=protocol)
        current = root
        count_hit = 0
        while current:
            if preflen < current.val:
                current = current.left
            elif preflen == current.val:
                # insert using hash_1..hash_k
                pref_encoded = encode_ip_prefix_pair(prefix, preflen, protocol)
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(0, end=bf.k))
                break
            else:  # preflen > current.val
                # Keep only the leading current.val bits of the prefix.
                masked = (((1 << max_shift) - 1) << (max_shift - current.val)) & prefix
                pref_encoded = encode_ip_prefix_pair(masked, current.val, protocol)
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(0, end=1))
                count_hit += 1
                # insert pointers
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(count_hit, pattern=bmp))
                current = current.right
    return bf, root
def __init__(self):
    """Force UTF-8 as the default encoding and set up the data containers."""
    # Python 2 idiom: reload sys to regain access to setdefaultencoding.
    reload(sys)
    sys.setdefaultencoding("utf-8")

    self.dupli_count = 0
    self.raw_datas = []  # raw data
    self.datas = []  # cleaned data
    # A Bloom filter replaces the plain set() previously used for
    # de-duplication.
    self.key_infos = BloomFilter(0.001, 1000000)
def test_non_randoms_at_all(self):
    '''Consecutive integers must not collide with each other'''
    subject = BloomFilter(1000000, 1e-5)
    collisions = 0
    for value in range(1000000):
        if not subject.test_by_hash(value):
            subject.add_by_hash(value)
            continue
        collisions += 1
    self.assertEqual(collisions, 0)
def __init__(self, capacity, tolerant, redis_conn=None):
    """Size the filter and prepare the Redis connection pool.

    Args:
        capacity: Expected number of items, passed to ``_get_cap``.
        tolerant: Acceptable false-positive probability, passed to
            ``_get_cap``.
        redis_conn: Optional mapping with the keys 'host', 'port', 'db' and
            'bfkey'; missing keys fall back to '127.0.0.1', 6379, 0 and
            'bf'.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if redis_conn is None:
        redis_conn = {}
    size, hash_count = self._get_cap(capacity, tolerant)
    self.redis_pool = redis.ConnectionPool(
        host=redis_conn.get('host', '127.0.0.1'),
        port=redis_conn.get('port', 6379),
        db=redis_conn.get('db', 0),
    )
    self.bf = BloomFilter(hash_count, size)
    self.bfkey = redis_conn.get('bfkey', 'bf')
def write_bloom_filter():
    """Build a filter from ``word_present`` and write it to ``filename``.

    The module-level names ``n``, ``p``, ``word_present`` and ``filename``
    supply the inputs. The filter's size, false-positive probability and
    hash count are printed before the words are added.
    """
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))

    for word in word_present:
        bloomf.add(word)

    with open(filename, "wb") as sink:
        payload = bloomf.prepare_bloom_filter_to_write()
        sink.write(payload)
def __init__(self, maxLevel=4):
    """Initialise crawler state.

    Args:
        maxLevel: Stored as ``self.maxLevel``.
    """
    # The unused local ``proxies = None`` was removed; nothing read it.
    self.size = 100000000
    self.bloomfilter = BloomFilter(self.size)
    self.maxLevel = maxLevel
    self.threadLock = Lock()
    self.checkThreadLock = Lock()
    self.totalThreads = 0
    self.crawledPages = 0
    self.verbose = True
    self.console = Console()
def test_get_filtered_txs(self):
    """get_filtered_txs() returns the three expected transactions, in order.

    Requires network access to the node named below.
    """
    from bloomfilter import BloomFilter
    bf = BloomFilter(30, 5, 90210)
    bf.add(decode_base58('mseRGXB89UTFVkWJhTRTzzZ9Ujj4ZPbGK5'))

    node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
    node.handshake()
    node.send(bf.filterload())

    block_hash = bytes.fromhex('00000000000377db7fde98411876c53e318a395af7304de298fd47b7c549d125')
    txs = node.get_filtered_txs([block_hash])

    expected_ids = [
        '0c024b9d3aa2ae8faae96603b8d40c88df2fc6bf50b3f446295206f70f3cf6ad',
        '0886537e27969a12478e0d33707bf6b9fe4fdaec8d5d471b5304453b04135e7e',
        '23d4effc88b80fb7dbcc2e6a0b0af9821c6fe3bb4c8dc3b61bcab7c45f0f6888',
    ]
    for position, expected_id in enumerate(expected_ids):
        self.assertEqual(txs[position].id(), expected_id)
class pydrbloomfilter:
    """Bloom-filter front end that keeps its bits in a Redis string.

    Bit offsets come from ``BloomFilter.generate``. They are written and
    read with the Redis SETBIT / GETBIT commands under the key ``bfkey``.
    """

    def __init__(self, capacity, tolerant, redis_conn=None):
        """Size the filter and prepare the Redis connection pool.

        Args:
            capacity: Expected number of items.
            tolerant: Acceptable false-positive probability.
            redis_conn: Optional mapping with the keys 'host', 'port', 'db'
                and 'bfkey'; missing keys fall back to '127.0.0.1', 6379, 0
                and 'bf'.
        """
        # A None sentinel avoids sharing one mutable default dict across calls.
        if redis_conn is None:
            redis_conn = {}
        size, hash_count = self._get_cap(capacity, tolerant)
        self.redis_pool = redis.ConnectionPool(
            host=redis_conn.get('host', '127.0.0.1'),
            port=redis_conn.get('port', 6379),
            db=redis_conn.get('db', 0),
        )
        self.bf = BloomFilter(hash_count, size)
        self.bfkey = redis_conn.get('bfkey', 'bf')

    def _get_cap(self, capacity, false_prob):
        """Return ``(size, num_slices)`` for the requested capacity.

        Given M = num_bits, k = num_slices, P = error_rate, n = capacity:

            k = log2(1/P)
            n ~= M * ((ln(2) ** 2) / abs(ln(P)))
            n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
            m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))

        where m is bits_per_slice. ``size`` is ceil(log2(k * m)), i.e. the
        number of address bits needed for the total bit count.
        """
        num_slices = int(math.ceil(math.log(1.0/false_prob, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(false_prob))) /
            (num_slices * (math.log(2) ** 2))
        ))
        num_bits = num_slices * bits_per_slice
        size = math.ceil(math.log(num_bits, 2))
        return int(size), int(num_slices)

    def add(self, string):
        """Set the bit for ``string``; returns the SETBIT reply."""
        r_client = redis.Redis(connection_pool=self.redis_pool)
        return r_client.setbit(self.bfkey, self.bf.generate(string), 1)

    def exists(self, string):
        """Return the GETBIT reply for the bit of ``string``."""
        r_client = redis.Redis(connection_pool=self.redis_pool)
        return r_client.getbit(self.bfkey, self.bf.generate(string))

    def __contains__(self, string):
        return self.exists(string)

    def clear(self):
        """Delete the Redis key holding the filter bits."""
        r_client = redis.Redis(connection_pool=self.redis_pool)
        return r_client.delete(self.bfkey)
def setUp(self):
    """Create a 500000 / 7 filter and insert the four fixture words."""
    self.size = 500000
    self.hash_count = 7
    self.bf = BloomFilter(self.size, self.hash_count)
    for word in ['abc', 'xyz', 'foo', 'bar']:
        self.bf.add(word)
def __init__(self):
    """Open the Bloom filter, the visited-sites log and the search index."""
    self.count_num = 0
    # A 10000000 / 0.01 configuration was used previously.
    self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
    # The log is opened in 'w' mode, so it is truncated on every start.
    self.f_write = open('visitedsites', 'w')
    search_index = SearchIndex()
    self.si = search_index
    search_index.SearchInit()
def __init__(self):
    """Load the payload lists and compile the bytes-based HTML form patterns."""
    self.sqli_bool = self.load('sqli_bool')
    self.sqli_error = self.load('sqli_error')

    # Response bodies are matched as bytes, so every pattern is encoded first.
    form_source = r"(?i)<form .+?>.+?</form>"
    action_source = r'''(?i)<form .*?action=["']([^\s"'<>]+)'''
    method_source = r'''(?i)<form .*?method=["']([^\s"'<>]+)'''
    input_source = r'''(?i)<input .*?type=["']([^\s"'<>]+).*?name=["']([^\s"'<>]+).*?value=["']([^\s"'<>]+)'''
    self.form_pattern = re.compile(form_source.encode())
    self.action_pattern = re.compile(action_source.encode())
    self.method_pattern = re.compile(method_source.encode())
    self.input_pattern = re.compile(input_source.encode())

    # <input> types whose values are collected as form parameters.
    self.input_types = [b'', b'text', b'hidden', b'password']
    self.posted = BloomFilter()
class DuplicatesPipeline(object):
    """Item pipeline that drops items whose URL (query string removed) was seen.

    Seen URLs are tracked in a Bloom filter. Every new item is appended to
    the 'visitedsites' log and added to the search index.
    """

    def __init__(self):
        """Open the Bloom filter, the visited-sites log and the search index."""
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0

    @staticmethod
    def _strip_query(url):
        """Return ``url`` up to, but excluding, the first '?'."""
        # Slicing with str.find() dropped the last character whenever the
        # URL had no '?', because find() returns -1; split() has no such
        # edge case.
        return url.split('?', 1)[0]

    def process_item(self, item, spider):
        """Drop the item if its query-less URL was seen, else record it.

        Raises:
            DropItem: if the Bloom filter reports the URL as already seen.
        """
        base_url = self._strip_query(item['url'])
        if self.bf.lookup(base_url):
            raise DropItem("Duplicate item found: %s" % item)
        self.count_num += 1
        self.bf.add(base_url)
        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        print(self.count_num)
        return item

    def save_to_file(self, url, utitle):
        """Append one 'url<TAB>title' line to the visited-sites log."""
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """Close the log file and finalise the search index."""
        self.f_write.close()
        self.si.IndexDone()
class TestBloomFilter(unittest.TestCase):
    """Membership test for a 256-sized filter loaded with a dozen strings."""

    def setUp(self):
        """Insert the known strings and prepare the list of unknown ones."""
        self.bf = BloomFilter(256)
        self.existing_strings = [
            'tiny', 'bloom', 'rate', 'back', 'apple', 'google',
            'dijkstra', 'limiter', 'url', 'travel', 'man', '2',
        ]
        for known in self.existing_strings:
            self.bf.insert(known)
        self.non_existing_strings = [
            'multi', 'short', 'path', 'components', 'connect', 'unit', 'test',
        ]

    def test_of_bloomfilter(self):
        """Inserted strings are found; the others are expected to be absent."""
        for known in self.existing_strings:
            self.assertTrue(self.bf.lookup(known))
        # A false positive on any of these would make the test fail.
        for unknown in self.non_existing_strings:
            self.assertFalse(self.bf.lookup(unknown))
def setUp(self):
    """Insert the known strings and prepare the list of unknown ones."""
    self.bf = BloomFilter(256)
    self.existing_strings = [
        'tiny', 'bloom', 'rate', 'back', 'apple', 'google',
        'dijkstra', 'limiter', 'url', 'travel', 'man', '2',
    ]
    for known in self.existing_strings:
        self.bf.insert(known)
    self.non_existing_strings = [
        'multi', 'short', 'path', 'components', 'connect', 'unit', 'test',
    ]
def __init__ (self, indiv_instance, ff_name, prefix="/tmp/exelixi"):
    """Set up an empty shard configured from the named feature factory.

    Args:
        indiv_instance: Any instance of the Individual class to breed; only
            its class is kept.
        ff_name: Name passed to ``instantiate_class`` to build the feature
            factory.
        prefix: Base path stored as ``self.prefix``.
    """
    self.indiv_class = indiv_instance.__class__
    factory = instantiate_class(ff_name)
    self.feature_factory = factory
    self.prefix = prefix

    # hash-ring placement; left unset until the ring is configured
    self._shard_id = None
    self._exe_dict = None
    self._hash_ring = None

    # evolution parameters copied from the feature factory
    self.n_pop = factory.n_pop
    self._total_indiv = 0
    self._term_limit = factory.term_limit
    self._hist_granularity = factory.hist_granularity
    self._selection_rate = factory.selection_rate
    self._mutation_rate = factory.mutation_rate

    # local individuals, plus a Bloom filter of the keys seen so far
    self._shard = {}
    self._bf = BloomFilter(num_bytes=125, num_probes=14, iterable=[])
import random import math from bloomfilter import BloomFilter bf = BloomFilter(101, 5) bf.insert(56) assert not bf.contains(99) assert bf.contains(56) bitsize = 16000000 bf_num_count = 1000000 test_num_count = 100000000 hash_count = int(math.log(2) * bitsize / bf_num_count) print 'number of hash functions to use:', hash_count bf = BloomFilter(bitsize, hash_count) bf_nums = set() for _ in xrange(bf_num_count): rand_int = random.randint(0, 2**64) bf_nums.add(rand_int) bf.insert(rand_int) false_alarms = 0 for _ in xrange(test_num_count): rand_num = random.randint(0, 2**64) if bf.contains(rand_num) and not (rand_num in bf_nums): false_alarms += 1
# NOTE(review): this loop looks like the tail of a helper whose `def` line
# lies outside this excerpt (the script below calls assertListEquals) —
# confirm its enclosing definition and indentation against the full file.
for i in xrange(len(l1)):
    assert l1[i] == l2[i]

# Round-trip demo: fill a filter with US state names, serialise it to JSON,
# rebuild a second filter from that JSON and check both hold the same data.
if __name__ == '__main__':
    from random import sample
    from string import ascii_letters
    states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
    bf1 = BloomFilter(ideal_num_elements_n=100000, error_rate_p=0.001)
    for state in states:
        bf1.add(state)
    json_bf = bf1.toJSON()
    print "##################"
    print json_bf
    print "##################"
    # Size of the JSON text as reported by len().
    len_json = len(json_bf)
    print "data size: %s bytes"%len_json
    bf2 = BloomFilter.fromJSON(json_bf)
    assertListEquals(bf1.data, bf2.data)
# NOTE(review): indentation reconstructed from a whitespace-collapsed source —
# confirm nesting (notably in evict and get_fitness_cutoff) against the
# original file. The code relies on Python 2 behaviour (xrange, print
# statement, list-returning items()/map()/filter()).
class Population (object):
    """Local shard of a population of Individuals, optionally spread over a hash ring.

    Individuals are kept in ``self._shard`` keyed by ``indiv.key``. A Bloom
    filter of the keys seen so far makes sure each key is admitted at most
    once.
    """

    def __init__ (self, indiv_instance, ff_name, prefix="/tmp/exelixi"):
        """Configure an empty shard from the feature factory named ``ff_name``."""
        self.indiv_class = indiv_instance.__class__
        self.feature_factory = instantiate_class(ff_name)
        self.prefix = prefix
        self._shard_id = None
        self._exe_dict = None
        self._hash_ring = None
        self.n_pop = self.feature_factory.n_pop
        self._total_indiv = 0
        self._term_limit = self.feature_factory.term_limit
        self._hist_granularity = self.feature_factory.hist_granularity
        self._selection_rate = self.feature_factory.selection_rate
        self._mutation_rate = self.feature_factory.mutation_rate
        self._shard = {}
        self._bf = BloomFilter(num_bytes=125, num_probes=14, iterable=[])

    def set_ring (self, shard_id, exe_dict):
        """Initialize the HashRing from the keys of ``exe_dict``."""
        self._shard_id = shard_id
        self._exe_dict = exe_dict
        self._hash_ring = HashRing(exe_dict.keys())

    ######################################################################
    ## Individual lifecycle within the local subset of the Population

    def populate (self, current_gen):
        """Initialize the population with ``n_pop`` freshly generated Individuals."""
        for _ in xrange(self.n_pop):
            # constructor pattern
            indiv = self.indiv_class()
            indiv.populate(current_gen, self.feature_factory.generate_features())
            # add the generated Individual to the Population
            # failure semantics: must filter nulls from initial population
            self.reify(indiv)

    def reify (self, indiv):
        """Test/add a newly generated Individual into the Population (birth).

        Returns False when the request was forwarded to another shard,
        otherwise the result of ``_reify_locally``.
        """
        neighbor_shard_id = None
        exe_uri = None
        if self._hash_ring:
            neighbor_shard_id = self._hash_ring.get_node(indiv.key)
            # exe_uri stays None when the key maps to this shard.
            if neighbor_shard_id != self._shard_id:
                exe_uri = self._exe_dict[neighbor_shard_id]
        # distribute this operation over the hash ring, through a remote queue
        if exe_uri:
            msg = { "key": indiv.key, "gen": indiv.gen, "feature_set": loads(indiv.get_json_feature_set()) }
            # The response is not inspected.
            lines = post_exe_rest(self.prefix, neighbor_shard_id, exe_uri, "pop/reify", msg)
            return False
        else:
            return self._reify_locally(indiv)

    def receive_reify (self, key, gen, feature_set):
        """Test/add a received reify request.

        ``key`` is not used here; the Individual is rebuilt from ``gen`` and
        ``feature_set``.
        """
        indiv = self.indiv_class()
        indiv.populate(gen, feature_set)
        self._reify_locally(indiv)

    def _reify_locally (self, indiv):
        """Test/add a newly generated Individual into the Population locally (birth).

        Returns True when the key was not yet in the Bloom filter and the
        Individual was stored, False otherwise.
        """
        if not indiv.key in self._bf:
            self._bf.update([indiv.key])
            self._total_indiv += 1
            # potentially the most expensive operation, deferred until remote reification
            indiv.get_fitness(self.feature_factory, force=True)
            self._shard[indiv.key] = indiv
            return True
        else:
            return False

    def evict (self, indiv):
        """Remove an Individual from the Population (death)."""
        if indiv.key in self._shard:
            # Individual only needs to be removed locally
            del self._shard[indiv.key]
            # NB: serialize to disk (write behinds)
            # The path is computed but nothing is written here.
            url = self._get_storage_path(indiv)

    def get_part_hist (self):
        """Tally counts for the partial histogram of the fitness distribution.

        Returns a list of (rounded fitness, count) pairs sorted in
        descending order.
        """
        d = (Counter([ round(indiv.get_fitness(self.feature_factory, force=False), self._hist_granularity) for indiv in self._shard.values() ])).items()
        d.sort(reverse=True)
        return d

    def get_fitness_cutoff (self, hist):
        """Determine fitness cutoff (bin lower bounds) for the parent selection filter.

        Bins are visited from highest to lowest. The loop runs one more
        iteration after the cumulative share first reaches
        ``_selection_rate``, so the returned bin is the one following the
        threshold bin when such a bin exists.
        """
        h = hist.items()
        h.sort(reverse=True)
        logging.debug("fit: %s", h)
        n_indiv = sum([ count for bin, count in h ])
        part_sum = 0
        break_next = False
        for bin, count in h:
            if break_next:
                break
            part_sum += count
            percentile = part_sum / float(n_indiv)
            break_next = percentile >= self._selection_rate
        logging.debug("fit: percentile %f part_sum %d n_indiv %d bin %f", percentile, part_sum, n_indiv, bin)
        return bin

    def _get_storage_path (self, indiv):
        """Create a path for durable storage of an Individual."""
        return self.prefix + "/" + indiv.key

    def _boost_diversity (self, current_gen, indiv):
        """Randomly mutate or evict the given Individual, to promote genetic diversity."""
        if self._mutation_rate > random():
            indiv.mutate(self, current_gen, self.feature_factory)
        elif len(self._shard.values()) >= 3:
            # ensure that there are at least three parents
            self.evict(indiv)

    def _select_parents (self, current_gen, fitness_cutoff):
        """Select the parents for the next generation.

        Individuals below the cutoff are passed to ``_boost_diversity``;
        whatever remains in the shard afterwards is returned.
        """
        partition = map(lambda x: (round(x.get_fitness(), self._hist_granularity) >= fitness_cutoff, x), self._shard.values())
        # good_fit is computed but not used below.
        good_fit = map(lambda x: x[1], filter(lambda x: x[0], partition))
        poor_fit = map(lambda x: x[1], filter(lambda x: not x[0], partition))
        # randomly select other individuals to promote genetic diversity, while removing the remnant
        for indiv in poor_fit:
            self._boost_diversity(current_gen, indiv)
        return self._shard.values()

    def next_generation (self, current_gen, fitness_cutoff):
        """Select/mutate/crossover parents to produce a new generation."""
        parents = self._select_parents(current_gen, fitness_cutoff)
        for _ in xrange(self.n_pop - len(parents)):
            f, m = sample(parents, 2)
            # The breeding result is not inspected.
            success = f.breed(self, current_gen, m, self.feature_factory)
        # backfill to avoid the dreaded Population collapse
        for _ in xrange(self.n_pop - len(self._shard.values())):
            # constructor pattern
            indiv = self.indiv_class()
            indiv.populate(current_gen, self.feature_factory.generate_features())
            self.reify(indiv)
        logging.info("gen: %d shard %s size %d total %d", current_gen, self._shard_id, len(self._shard.values()), self._total_indiv)

    def test_termination (self, current_gen, hist):
        """Evaluate the terminating condition for this generation and report progress."""
        return self.feature_factory.test_termination(current_gen, self._term_limit, hist)

    def enum (self, fitness_cutoff):
        """Enum all Individuals whose fitness is at least the given cutoff."""
        return [[ "%0.4f" % indiv.get_fitness(), str(indiv.gen), indiv.get_json_feature_set() ] for indiv in filter(lambda x: x.get_fitness() >= fitness_cutoff, self._shard.values()) ]

    def report_summary (self):
        """Report a summary of the evolution, fittest Individuals first."""
        for indiv in sorted(self._shard.values(), key=lambda x: x.get_fitness(), reverse=True):
            print self._get_storage_path(indiv)
            print "\t".join(["%0.4f" % indiv.get_fitness(), "%d" % indiv.gen, indiv.get_json_feature_set()])
class Scanner:
    """Probe a URL and the HTML forms it serves for boolean-based blind SQL injection.

    Payloads are pairs of strings read from ``payloads/<name>.xml``.  For each
    injectable parameter the two halves of a pair are sent in two separate
    requests, and a finding is printed when the lengths of the two response
    bodies differ strongly (see ``_length_gap``).

    ``sqli_error`` is loaded at construction time but is not used by the
    methods of this class.
    """

    # A relative body-length gap above this value is reported as a finding.
    _GAP_THRESHOLD = 0.98

    def __init__(self):
        self.sqli_bool = self.load('sqli_bool')
        self.sqli_error = self.load('sqli_error')
        # The patterns are bytes patterns: they are applied directly to the
        # body returned by ``response.read()`` in ``get_forms``.
        self.form_pattern = re.compile(r"(?i)<form .+?>.+?</form>".encode())
        self.action_pattern = re.compile(r'''(?i)<form .*?action=["']([^\s"'<>]+)'''.encode())
        self.method_pattern = re.compile(r'''(?i)<form .*?method=["']([^\s"'<>]+)'''.encode())
        self.input_pattern = re.compile(r'''(?i)<input .*?type=["']([^\s"'<>]+).*?name=["']([^\s"'<>]+).*?value=["']([^\s"'<>]+)'''.encode())
        # Only inputs whose ``type`` is listed here become form parameters.
        self.input_types = [b'', b'text', b'hidden', b'password']
        # Remembers the forms already probed so that each one is submitted once.
        self.posted = BloomFilter()

    def load(self, tp):
        """Parse ``payloads/<tp>.xml`` and return its payloads.

        Returns a list of 2-tuples holding the text of the first two children
        of every ``<payload>`` element directly under the root.
        """
        root = ET.parse("payloads/" + tp + ".xml").getroot()
        return [(node[0].text, node[1].text) for node in root.findall('payload')]

    async def get_forms(self, url, session):
        """Fetch *url* and describe the forms found in the response.

        Returns a list of dicts with the keys ``'action'`` (resolved against
        *url*), ``'method'`` and ``'params'`` (input name -> value).  The body
        is only inspected when the content type is ``text/html`` or
        ``application/xml``.  Any exception is printed and whatever was
        collected so far is returned.
        """
        response = None
        Forms = []
        try:
            response = await session.get(url)
            # myfunc is a project helper; presumably this lower-cases the
            # header names so the lookup below is case-insensitive.
            response.headers = myfunc.tolower(response.headers)
            content_type = response.headers.get('content-type', '').split(';')[0]
            if content_type in ['text/html', 'application/xml']:
                body = await response.read()
                for form in self.form_pattern.findall(body):
                    tmp = self.action_pattern.search(form)
                    action = tmp.group(1) if tmp else b''
                    tmp = self.method_pattern.search(form)
                    method = tmp.group(1) if tmp else b'get'
                    params = dict()
                    for ip in self.input_pattern.findall(form):
                        if ip[0] in self.input_types:
                            params[myfunc.decode(ip[1])] = myfunc.decode(ip[2])
                    # Key order matters: scan() serialises this dict to
                    # recognise forms it has already probed.
                    Forms.append({
                        'action': urljoin(url, action.decode()),
                        'method': method.decode(),
                        'params': params,
                    })
        except Exception as e:
            print(e)
        finally:
            # The request itself may have failed, leaving nothing to release.
            if response is not None:
                await response.release()
        return Forms

    async def scan(self, url, session):
        """Probe the query string of *url*, then every form served at *url*.

        Findings are printed; nothing is returned.  A failed request is
        printed and only skips the payload it belonged to.
        """
        await self._scan_query(url, session)
        await self._scan_forms(url, session)

    async def _scan_query(self, url, session):
        """Inject each boolean payload pair into every query parameter of *url*."""
        # Imported here so the method does not rely on the file's import header.
        from urllib.parse import parse_qsl

        parse = urlparse(url)
        # parse_qsl copes with an empty query, parameters without '=', and
        # values containing '='; it also unquotes values so that urlencode()
        # below does not escape them a second time.
        pairs = parse_qsl(parse.query, keep_blank_values=True)
        for name, value in pairs:
            # Boolean blind injection.
            for payload in self.sqli_bool:
                q1 = self._probe_url(parse, pairs, name, value + payload[0])
                q2 = self._probe_url(parse, pairs, name, value + payload[1])
                try:
                    body1 = await self._read_body(await session.get(q1))
                    body2 = await self._read_body(await session.get(q2))
                except Exception as e:
                    # Best effort: without both bodies there is nothing to compare.
                    print(e)
                    continue
                gap = self._length_gap(body1, body2)
                if gap > self._GAP_THRESHOLD:
                    print(gap)
                    print("Type: Bool_Sqli[GET]")
                    print("Url: " + q1)
                    print("Payload: " + payload[0])
                    print("Param: " + name)
                    # One finding per parameter is enough.
                    break

    async def _scan_forms(self, url, session):
        """Inject each boolean payload pair into every parameter of every new form."""
        for Form in await self.get_forms(url, session):
            Form_json = json.dumps(Form)
            if self.posted.isContain(Form_json):
                continue
            self.posted.insert(Form_json)
            method = Form['method'].lower()
            if method not in ('get', 'post'):
                # No request would be sent for any other method.
                continue
            for param_key in Form['params']:
                for payload in self.sqli_bool:
                    new_params1, new_params2 = self._inject(Form['params'], param_key, payload)
                    try:
                        body1 = await self._submit(session, Form['action'], method, new_params1)
                        body2 = await self._submit(session, Form['action'], method, new_params2)
                    except Exception as e:
                        print(e)
                        continue
                    gap = self._length_gap(body1, body2)
                    if gap > self._GAP_THRESHOLD:
                        print(gap)
                        print("Type: Bool_Sqli[POST]")
                        print("Url: " + url)
                        print("Payload: " + payload[0])
                        print("Param: " + param_key)
                        break

    async def _submit(self, session, action, method, params):
        """Send *params* to *action* with the given form method and return the body."""
        if method == 'post':
            response = await session.post(action, data=params)
        else:
            response = await session.get(action, params=params)
        return await self._read_body(response)

    @staticmethod
    async def _read_body(response):
        """Read the whole body of *response* and release it, even on failure."""
        try:
            return await response.read()
        finally:
            await response.release()

    @staticmethod
    def _probe_url(parts, pairs, name, injected):
        """Rebuild the URL in *parts* with query parameter *name* set to *injected*."""
        query = dict(pairs)
        query[name] = injected
        return urlunparse((parts.scheme, parts.netloc, parts.path,
                           parts.params, urlencode(query), parts.fragment))

    @staticmethod
    def _inject(params, key, payload):
        """Return two independent copies of *params* with the payload halves appended.

        The first copy has ``payload[0]`` appended to ``params[key]``, the
        second ``payload[1]``.  *params* itself is left untouched.
        """
        first = dict(params)
        first[key] = params[key] + payload[0]
        second = dict(params)
        second[key] = params[key] + payload[1]
        return first, second

    @staticmethod
    def _length_gap(body1, body2):
        """Return ``abs(1 - len(body1) / len(body2))`` without dividing by zero.

        Two empty bodies count as identical (0.0); a non-empty first body
        against an empty second one is the limit case (infinity).
        """
        if not body2:
            return float('inf') if body1 else 0.0
        return abs(1 - len(body1) / len(body2))