def build_bf(n, p, ref_fasta):
    # construct the Bloom filter and report its parameters
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))

    mycoplasma_fasta = open(ref_fasta, 'r')
    N_count = 0
    read_count = 0
    while True:
        name = mycoplasma_fasta.readline()  # read id
        if len(name) == 0:
            break  # end of file
        read = mycoplasma_fasta.readline().strip()
        if 'N' not in read:
            # do not add any uncalled bases
            bloomf.add(read)
            read_count += 1
        else:
            N_count += 1
    print('N_count = %s' % N_count)
    print('read_count = %s' % read_count)
    mycoplasma_fasta.close()
    return bloomf
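
A minimal driver for build_bf, as a sketch: the loop above assumes a two-line-per-record FASTA file (one header line, then the whole sequence on a single line). The file name and parameters below are hypothetical, and the membership call assumes this BloomFilter variant exposes check(), as in other examples on this page.

n = 1000000   # expected number of reads (hypothetical)
p = 0.01      # target false positive probability (hypothetical)
bf = build_bf(n, p, 'mycoplasma_reference.fasta')  # hypothetical path
print(bf.check('ACGTACGTACGT'))  # True if (probably) present, False if definitely absent
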
Example 2
    def _decode_similarity_destination(self, meta_message, authentication_impl):
        if __debug__:
            from authentication import Authentication
        assert isinstance(meta_message, Message)
        assert isinstance(authentication_impl, Authentication.Implementation)

        try:
            my_similarity, = self._dispersy_database.execute(u"SELECT similarity FROM similarity WHERE community = ? AND user = ? AND cluster = ?",
                                                             (self._community.database_id,
                                                              self._community._my_member.database_id,
                                                              meta_message.destination.cluster)).next()
        except StopIteration:
            raise DropPacket("We don't know our own similarity... should not happen")
        my_similarity = BloomFilter(str(my_similarity), 0)

        try:
            sender_similarity, = self._dispersy_database.execute(u"SELECT similarity FROM similarity WHERE community = ? AND user = ? AND cluster = ?",
                                                                 (self._community.database_id,
                                                                  authentication_impl.member.database_id,
                                                                  meta_message.destination.cluster)).next()
        except StopIteration:
            raise DelayPacketBySimilarity(self._community, authentication_impl.member, meta_message.destination)
        sender_similarity = BloomFilter(str(sender_similarity), 0)

        return meta_message.destination.implement(my_similarity.bic_occurrence(sender_similarity))
Example 3
 def _sync(self):
     self.bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
     with kv_reader(self.path) as r:
         while r.has_next():
             key = r.read_key()
             self.bf.add(key)
             r.skip_value()
Example 4

    def test_all_test_positive_when_hashes_collide(self):
        """BloomFilter.test_by_hash() returns False when the hashes do not collide."""
        bloom_filter = BloomFilter(1000000, 1e-3)

        bloom_filter.add_by_hash("abc")

        self.assertEqual(bloom_filter.test_by_hash("def"), False)
Example 5

    def test_returns_positive_when_hashes_collide(self):
        """BloomFilter.test_by_hash() returns True when hashes collide."""
        bloom_filter = BloomFilter(1000000, 1e-3)

        bloom_filter.add_by_hash("abc")

        self.assertEqual(bloom_filter.test_by_hash(u"abc"), True)
Example 6
def test_bloom():
    data = (str(uuid.uuid1()) for i in range(100000))
    filter = BloomFilter(100000, 0.0001)
    for item in data:
        if not item in filter:
            filter.add(item)
    print "{name} costs {bytes} bytes.".format(name=sys._getframe().f_code.co_name, bytes=filter.container_size())
Example 7
 def __init__(self):
     self.cc = ConnectToCassandra()
     self.n, self.word_present = self.cc.get_id()  #no of items to add
     self.p = 0.05  #false positive probability
     self.bloomf = BloomFilter(self.n, self.p)
     for item in self.word_present:
         self.bloomf.add(bytes(to_integer(item.date())))
Example 8
class BloomFilterMR(MRJob):

    def __init__(self, *args, **kwargs):
        super(BloomFilterMR, self).__init__(*args, **kwargs)
        self.n = 20
        self.p = 0.05
        self.hot_list = [1, 8, 14, 12, 23, 31, 55]

    # defining steps
    def steps(self):
        return [
            MRStep(mapper_init=self.mapper_init,
                   mapper=self.mapper)
        ]

    def mapper_init(self):
        self.bloomf = BloomFilter(self.n, self.p)
        for elem in self.hot_list:
            self.bloomf.add(str(elem))

    # MapReduce Phase 1: convert temperature data into city,day,temp,temp_count
    def mapper(self, _, line):
        (city, temp, timestamp) = line.split('|')
        if self.bloomf.check(temp):
            yield city, (temp, timestamp)
Example 9
def main():
    number_of_items = 20
    false_positive_probability = 0.1
    bloom = BloomFilter(number_of_items, false_positive_probability)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]
    word_absent = [
        'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism', 'hurt',
        'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter'
    ]
    print('bloomfilter size: ', bloom.bit_size)
    print('false_positive_probability', bloom.false_positive_probability)
    print('hash_count: ', bloom.hash_count)

    for item in word_present:
        bloom.add(item)
    shuffle(word_present)
    shuffle(word_absent)
    random_list = word_present[:5] + word_absent[:5]
    shuffle(random_list)
    for word in random_list:
        print('word: ', word)
        if bloom.check(word):
            if word in word_absent:
                print('false positive')
            else:
                print('word most likely member')
        else:
            print('word not present')
Example 10
    def test_all_test_positive_when_hashes_collide(self):
        '''BloomFilter.test_by_hash() returns False when the hashes do not collide'''
        bloom_filter = BloomFilter(1000000, 1e-3)

        bloom_filter.add_by_hash('abc')

        self.assertEqual(bloom_filter.test_by_hash('def'), False)
Example 11
    def test_returns_true_positive_when_value_had_been_added(self):
        '''BloomFilter.test_by_hash() returns True after the item added'''
        bloom_filter = BloomFilter(1000000, 1e-3)

        bloom_filter.add_by_hash('abc')

        self.assertEqual(bloom_filter.test_by_hash('abc'), True)
Example 12
    def test_returns_positive_when_hashes_collide(self):
        '''BloomFilter.test_by_hash() returns True when hashes collide'''
        bloom_filter = BloomFilter(1000000, 1e-3)

        bloom_filter.add_by_hash('abc')

        self.assertEqual(bloom_filter.test_by_hash(u'abc'), True)
Example 13
class TestBloomFilter(unittest.TestCase):
    def setUp(self):
        self.size = 500000
        self.hash_count = 7

        self.bf = BloomFilter(self.size, self.hash_count)
        lst = ['abc', 'xyz', 'foo', 'bar']
        for item in lst:
            self.bf.add(item)

    def _initialize(self):
        pass

    def _cleanup(self):
        if self.bf:
            del(self.bf)
            self.bf = None

    def test_lookup_yes(self):
        self.assertEqual(self.bf.lookup('foo'), True)

    def test_lookup_no(self):
        self.assertEqual(self.bf.lookup('hello'), False)

    def tearDown(self):
        self._cleanup()
Example 14

 def test_exercise_4(self):
     last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
     last_block = bytes.fromhex(last_block_hex)
     secret = little_endian_to_int(
         hash256(b'Jimmy Song Programming Blockchain'))
     private_key = PrivateKey(secret=secret)
     addr = private_key.point.address(testnet=True)
     h160 = decode_base58(addr)
     target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
     self.assertEqual(addr, target_address)
     filter_size = 30
     filter_num_functions = 5
     filter_tweak = 90210  # FILL THIS IN
     target_h160 = decode_base58(target_address)
     target_script = p2pkh_script(target_h160)
     fee = 5000  # fee in satoshis
     node = SimpleNode('tbtc.programmingblockchain.com',
                       testnet=True,
                       logging=False)
     bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
     bf.add(h160)
     node.handshake()
     node.send(b'filterload', bf.filterload())
     getheaders_message = GetHeadersMessage(start_block=last_block)
     node.send(getheaders_message.command, getheaders_message.serialize())
     headers_envelope = node.wait_for_commands([HeadersMessage.command])
     stream = headers_envelope.stream()
     headers = HeadersMessage.parse(stream)
     get_data_message = GetDataMessage()
     for block in headers.blocks:
         self.assertTrue(block.check_pow())
         if last_block is not None:
             self.assertEqual(block.prev_block, last_block)
         last_block = block.hash()
         get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, last_block)
     node.send(get_data_message.command, get_data_message.serialize())
     prev_tx = None
     while prev_tx is None:
         envelope = node.wait_for_commands([b'merkleblock', b'tx'])
         stream = envelope.stream()
         if envelope.command == b'merkleblock':
             mb = MerkleBlock.parse(stream)
             self.assertTrue(mb.is_valid())
         else:
             prev = Tx.parse(stream, testnet=True)
             for i, tx_out in enumerate(prev.tx_outs):
                 if tx_out.script_pubkey.address(testnet=True) == addr:
                     prev_tx = prev.hash()
                     prev_index = i
                     prev_amount = tx_out.amount
                     break
     tx_in = TxIn(prev_tx, prev_index)
     output_amount = prev_amount - fee
     tx_out = TxOut(output_amount, target_script)
     tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
     tx_obj.sign_input(0, private_key)
     self.assertEqual(
         tx_obj.serialize().hex(),
         '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
     )
Example 15
def rappor(n, f, p, q, m):
    n = str(n)
    bloom = BloomFilter()
    noisydata = bloom.add_data(n, m)

    # Permanent randomized response
    for i in range(len(noisydata)):
        choose = np.random.randint(0, totalnum)
        if noisydata[i] == 1:
            if choose / totalnum <= f / 2:
                noisydata[i] = 0
        else:
            if choose / totalnum <= f / 2:
                noisydata[i] = 1

    # Instantaneous randomized response
    for i in range(len(noisydata)):
        choose = np.random.randint(0, totalnum)
        if noisydata[i] == 1:
            if choose / totalnum <= 1 - q:
                noisydata[i] = 0
        else:
            if choose / totalnum <= p:
                noisydata[i] = 1

    return noisydata
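
For reference, a sketch of the reporting probabilities the two loops above implement (standard RAPPOR analysis; totalnum is assumed to be a module-level constant setting the randomness resolution): a set Bloom bit survives the permanent step with probability 1 - f/2, and the instantaneous step then reports 1 with probability q for a set bit and p for a cleared bit.

def report_prob_one(b, f, p, q):
    # Probability that a reported bit is 1, given the true Bloom-filter bit b.
    prr_one = (1 - f / 2) if b == 1 else (f / 2)  # after permanent randomized response
    return prr_one * q + (1 - prr_one) * p        # after instantaneous randomized response

print(report_prob_one(1, f=0.5, p=0.5, q=0.75))  # 0.6875
print(report_prob_one(0, f=0.5, p=0.5, q=0.75))  # 0.5625
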
Example 16
 def test_exercise_2(self):
     block_hash = bytes.fromhex(
         '0000000053787814ed9dd8c029d0a0a9af4ab8ec0591dc31bdc4ab31fae88ce9')
     passphrase = b'Jimmy Song Programming Blockchain'  # FILL THIS IN
     secret = little_endian_to_int(hash256(passphrase))
     private_key = PrivateKey(secret=secret)
     addr = private_key.point.address(testnet=True)
     filter_size = 30
     filter_num_functions = 5
     filter_tweak = 90210  # FILL THIS IN
     h160 = decode_base58(addr)
     bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
     bf.add(h160)
     node = SimpleNode('tbtc.programmingblockchain.com',
                       testnet=True,
                       logging=False)
     node.handshake()
     node.send(bf.filterload())
     getdata = GetDataMessage()
     getdata.add_data(FILTERED_BLOCK_DATA_TYPE, block_hash)
     node.send(getdata)
     mb = node.wait_for(MerkleBlock)
     tx = node.wait_for(Tx)
     self.assertEqual(
         tx.serialize().hex(),
         '0100000002a663815ab2b2ba5f53e442f9a2ea6cc11bbcd98fb1585e48a134bd870dbfbd6a000000006a47304402202151107dc2367cf5a9e2429cde0641c252374501214ce52069fbca1320180aa602201a43b5d4f91e48514c00c01521dc04f02c57f15305adc4eaad01c418f6e7a1180121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff618b00a343488bd62751cf21f368ce3be76e3a0323fdc594a0d24f27a1155cd2000000006a473044022024c4dd043ab8637c019528b549e0b10333b2dfa83e7ca66776e401ad3fc31b6702207d4d1d73ac8940c59c57c0b7daf084953324154811c10d06d0563947a88f99b20121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff0280969800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788aca0ce6594000000001976a9146e13971913b9aa89659a9f53d327baa8826f2d7588ac00000000'
     )
Example 17
 def create(cls, path, memtable):
     bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
     with kv_writer(path) as writer:
         for key, value in memtable.entries():
             writer.write_entry(key, value)
             bf.add(key)
     return cls(path, bf)
Example 18
    def test_raises_error_on_out_of_range_error_rate(self):
        '''BloomFilter() raises on out-of-range error rate'''
        with self.assertRaises(ValueError):
            BloomFilter(5, -1)

        with self.assertRaises(ValueError):
            BloomFilter(5, 2)
Example 19
 def test_exercise_6(self):
     last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
     secret = little_endian_to_int(
         hash256(b'Jimmy Song Programming Blockchain'))
     private_key = PrivateKey(secret=secret)
     addr = private_key.point.address(testnet=True)
     h160 = decode_base58(addr)
     target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
     self.assertEqual(addr, target_address)
     target_h160 = decode_base58(target_address)
     target_script = p2pkh_script(target_h160)
     fee = 5000
     node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
     bf = BloomFilter(30, 5, 90210)
     bf.add(h160)
     node.handshake()
     node.send(bf.filterload())
     start_block = bytes.fromhex(last_block_hex)
     getheaders = GetHeadersMessage(start_block=start_block)
     node.send(getheaders)
     headers = node.wait_for(HeadersMessage)
     last_block = None
     getdata = GetDataMessage()
     for b in headers.blocks:
         if not b.check_pow():
             raise RuntimeError('proof of work is invalid')
         if last_block is not None and b.prev_block != last_block:
             raise RuntimeError('chain broken')
         getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
         last_block = b.hash()
     node.send(getdata)
     prev_tx, prev_index, prev_tx_obj = None, None, None
     while prev_tx is None:
         message = node.wait_for(MerkleBlock, Tx)
         if message.command == b'merkleblock':
             if not message.is_valid():
                 raise RuntimeError('invalid merkle proof')
         else:
             message.testnet = True
             for i, tx_out in enumerate(message.tx_outs):
                 if tx_out.script_pubkey.address(testnet=True) == addr:
                     prev_tx = message.hash()
                     prev_index = i
                     prev_amount = tx_out.amount
                     self.assertEqual(
                         message.id(),
                         '6ec96c9eafc62c0e42a4a6965be614c61101aaa316162ac79e07e1b9ab31e694'
                     )
                     self.assertEqual(i, 0)
                     break
     tx_in = TxIn(prev_tx, prev_index)
     output_amount = prev_amount - fee
     tx_out = TxOut(output_amount, target_script)
     tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
     tx_obj.sign_input(0, private_key)
     self.assertEqual(
         tx_obj.serialize().hex(),
         '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
     )
Example 20
    def rabinKarp(self, patterns, txt):
        if (not txt or not patterns):
            raise ValueError('Search requires text and a pattern')

        q = 101  # a prime number
        d = 256
        h = 1

        matches = dict()
        for p in patterns:
            matches[p] = []

        patternHashes = []
        patternLen = len(next(iter(patterns)))  #length of first pattern
        txtLen = len(txt)

        if (txtLen < patternLen):
            raise ValueError(
                'A pattern longer than text to search cannot exist in the text.'
            )

        # The value of h would be "pow(d, M-1)%q"
        for i in range(patternLen - 1):
            h = (h * d) % q

        numPat = len(patterns)
        if (numPat < 1):
            raise ValueError('Search requires a pattern')

        for pat in set(patterns):
            if (patternLen != len(pat)):
                raise ValueError(
                    'Search only supports a fixed length pattern match.')

            patternHash = 0
            for i in range(patternLen):
                patternHash = (d * patternHash + ord(pat[i])) % q
            patternHashes.append(patternHash)
        bloomf = BloomFilter(patternHashes)

        # setup the first comparison based on length of pattern
        left = 0
        right = patternLen
        txtHash = 0
        for j in range(patternLen):
            txtHash = (d * txtHash + ord(txt[j])) % q

        #scoot through txt 1 char at a time
        while (right <= txtLen):
            if (bloomf.contains(txtHash)):
                if (txt[left:right] in patterns):
                    matches[txt[left:right]].append(left)
            if (left + patternLen < txtLen):
                txtHash = (d * (txtHash - ord(txt[left]) * h) +
                           ord(txt[left + patternLen])) % q
            left += 1
            right += 1

        return matches
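
A hypothetical call, assuming the method above lives on some searcher class (named MultiPatternSearch here purely for illustration) together with the custom BloomFilter(patternHashes) it uses: all patterns must share one length, and the result maps each pattern to its match offsets.

searcher = MultiPatternSearch()  # hypothetical host class for rabinKarp()
matches = searcher.rabinKarp({'abra', 'cada'}, 'abracadabra')
# -> {'abra': [0, 7], 'cada': [4]}
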
Example 21
 def test_bloomfilter(self):
     bloom = BloomFilter(100)
     for i in xrange(50):
         bloom.add(str(i))
     assert "20" in bloom
     assert "25" in bloom
     assert "49" in bloom
     assert "50" not in bloom
Example 22
 def test_bloomfilter(self):
     bloom = BloomFilter(100)
     for i in xrange(50):
         bloom.add(str(i))
     assert "20" in bloom
     assert "25" in bloom
     assert "49" in bloom
     assert "50" not in bloom
Example 23
    def test_serializes_filter_serialize_without_line_feeds(self):
        '''BloomFilter serializes with base64 shield without line feeds'''
        bloom_filter = BloomFilter(100, 0.1)
        bloom_filter.add_by_hash('abcdef')

        serialized_filter = bloom_filter.serialize()

        self.assertEqual(serialized_filter.find('\n'), -1)
Example 24
 def test_simple(self):
     ELM1 = 'something'
     ELM2 = 'something else'
     bf = BloomFilter(100000)
     bf.add_key(ELM1)
     self.assertTrue(bf.contains(ELM1))
     #very insignificant chance that this next assertion will FAIL
     self.assertTrue(not(bf.contains(ELM2)))
Example 25
 def test_simple2(self):
     ELM1 = 'something'
     ELM2 = 'something else'
     bf = BloomFilter(100000)
     bf.add_key(ELM1)
     bf.add_key(ELM2)
     self.assertTrue(bf.contains(ELM1))
     self.assertTrue(bf.contains(ELM2))
Example 26
    def test_serializes_filter_serialize_without_line_feeds(self):
        """Serializes with base64 shield without line feeds."""
        bloom_filter = BloomFilter(100, 0.1)
        bloom_filter.add_by_hash("abcdef")

        serialized_filter = bloom_filter.serialize()

        # self.assertEqual(serialized_filter.find("\n"), -1)
        self.assertTrue(b"\n" not in serialized_filter)
Example 27
 def __init__(self, cnt, word_present):
     self.n = cnt
     self.word_present = word_present  #no of items to add
     self.p = 0.05  #false positive probability
     self.bloomf = BloomFilter(self.n, self.p)
     for item in self.word_present:
         print(item)
         self.bloomf.add(
             bytes(to_integer(datetime.datetime.strptime(item, '%Y%m%d'))))
Example 28
def test():
    bf = BloomFilter(num_hashes=10, size_bytes=100)
    bf.add('hello')
    s = pickle.dumps(bf)

    bf2 = pickle.loads(s)
    assert 'hi' not in bf2
    assert 'hello' in bf2
    assert (bf.seeds == bf2.seeds).all()
Example 29
    def test_creates_filter_with_non_integral_capacity(self):
        '''BloomFilter() creates filter with non-integral capacity'''
        float_filter = BloomFilter(capacity=1000.2, error_rate=1e-3)
        int_filter = BloomFilter(capacity=1000, error_rate=1e-3)

        bit_count = int_filter.bit_count
        self.assertGreaterEqual(float_filter.bit_count, bit_count)
        self.assertLess(float_filter.bit_count, bit_count + 10)
        self.assertEqual(int_filter.hash_count, float_filter.hash_count)
Example 30
def _build_guided_bloom(prefixes, fpp, k, num_bits, root, fib, protocol='v4'):
    '''Returns a Bloom filter optimized for the `root` binary search tree,
        and an `encoded_pref_lens` dict for looking up the BMP prefix length
        from hash-encoded bit sequence.
    '''
    max_shift = NUMBITS[protocol]

    if not (k or num_bits):
        bf = BloomFilter(fpp, len(prefixes['prefixes']))
    else:
        bf = BloomFilter(fpp,
                         len(prefixes['prefixes']),
                         k=k,
                         num_bits=num_bits)

    count = 0  # report progress
    for pair in prefixes['prefixes']:
        if count % 10000 == 0:
            print('build processed %.3f of all prefixes' %
                  (count / len(prefixes['prefixes'])))
        count += 1

        prefix, preflen = pair
        # BMP is an index, can recover prefix length using prefixes['ix2len']
        bmp, fib_val = _find_bmp(prefix,
                                 bf,
                                 root,
                                 fib,
                                 preflen - 1,
                                 prefixes['minn'],
                                 prefixes['len2ix'],
                                 prefixes['ix2len'],
                                 protocol=protocol)

        current = root
        count_hit = 0
        while current:
            if preflen < current.val:
                current = current.left
            elif preflen == current.val:
                # insert using hash_1..hash_k
                pref_encoded = encode_ip_prefix_pair(prefix, preflen, protocol)
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(0, end=bf.k))
                break
            else:  # preflen > current.val
                masked = (((1 << max_shift) - 1) <<
                          (max_shift - current.val)) & prefix
                pref_encoded = encode_ip_prefix_pair(masked, current.val,
                                                     protocol)
                bf.insert(pref_encoded, hashes=_choose_hash_funcs(0, end=1))
                count_hit += 1
                # insert pointers
                bf.insert(pref_encoded,
                          hashes=_choose_hash_funcs(count_hit, pattern=bmp))
                current = current.right
    return bf, root
Example 31
    def __init__(self):
        reload(sys)
        sys.setdefaultencoding("utf-8")
        self.datas = []  # cleaned data
        self.raw_datas = []  # raw data

        # self.key_infos = set()      # helps deduplicate data
        self.key_infos = BloomFilter(0.001, 1000000)  # practicing with BloomFilter

        self.dupli_count = 0
Example 32

 def test_non_randoms_at_all(self):
     '''Ensure that small bit differences do not cause spurious collisions'''
     bloom_filter = BloomFilter(1000000, 1e-5)
     collision_count = 0
     for ix in range(1000000):
         if bloom_filter.test_by_hash(ix):
             collision_count += 1
         else:
             bloom_filter.add_by_hash(ix)
     self.assertEqual(collision_count, 0)
Example 33

 def __init__(
         self, capacity, tolerant,
         redis_conn={}):
     size, hash_count = self._get_cap(capacity, tolerant)
     self.redis_pool = redis.ConnectionPool(
         host=redis_conn.get('host', '127.0.0.1'),
         port=redis_conn.get('port', 6379),
         db=redis_conn.get('db', 0),
     )
     self.bf = BloomFilter(hash_count, size)
     self.bfkey = redis_conn.get('bfkey', 'bf')
Example 34
def write_bloom_filter():
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))

    for item in word_present:
        bloomf.add(item)

    with open(filename, "wb") as outfile:
        outfile.write(bloomf.prepare_bloom_filter_to_write())
Example 35
 def __init__(self, maxLevel=4):
     proxies = None
     self.size = 100000000
     self.bloomfilter = BloomFilter(self.size)
     self.maxLevel = maxLevel
     self.threadLock = Lock()
     self.checkThreadLock = Lock()
     self.totalThreads = 0
     self.crawledPages = 0
     self.verbose = True
     self.console = Console()
Example 36
 def test_get_filtered_txs(self):
     from bloomfilter import BloomFilter
     bf = BloomFilter(30, 5, 90210)
     h160 = decode_base58('mseRGXB89UTFVkWJhTRTzzZ9Ujj4ZPbGK5')
     bf.add(h160)
     node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
     node.handshake()
     node.send(bf.filterload())
     block_hash = bytes.fromhex('00000000000377db7fde98411876c53e318a395af7304de298fd47b7c549d125')
     txs = node.get_filtered_txs([block_hash])
     self.assertEqual(txs[0].id(), '0c024b9d3aa2ae8faae96603b8d40c88df2fc6bf50b3f446295206f70f3cf6ad')
     self.assertEqual(txs[1].id(), '0886537e27969a12478e0d33707bf6b9fe4fdaec8d5d471b5304453b04135e7e')
     self.assertEqual(txs[2].id(), '23d4effc88b80fb7dbcc2e6a0b0af9821c6fe3bb4c8dc3b61bcab7c45f0f6888')
Example 37
class pydrbloomfilter:

    def __init__(
            self, capacity, tolerant,
            redis_conn={}):
        size, hash_count = self._get_cap(capacity, tolerant)
        self.redis_pool = redis.ConnectionPool(
            host=redis_conn.get('host', '127.0.0.1'),
            port=redis_conn.get('port', 6379),
            db=redis_conn.get('db', 0),
        )
        self.bf = BloomFilter(hash_count, size)
        self.bfkey = redis_conn.get('bfkey', 'bf')

    '''
    given M = num_bits, k = num_slices, P = error_rate, n = capacity
    k = log2(1/P)
    solving for m = bits_per_slice
    n ~= M * ((ln(2) ** 2) / abs(ln(P)))
    n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
    m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))

    '''
    def _get_cap(self, capacity, false_prob):
        num_slices = int(math.ceil(math.log(1.0/false_prob, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(false_prob))) /
            (num_slices * (math.log(2) ** 2))
            ))
        num_bits = num_slices * bits_per_slice
        size = math.ceil(math.log(num_bits, 2))
        return int(size), int(num_slices)

    def add(self, string):
        r_client = redis.Redis(connection_pool=self.redis_pool)
        return r_client.setbit(self.bfkey, self.bf.generate(string), 1)

    def exists(self, string):
        r_client = redis.Redis(connection_pool=self.redis_pool)
        return r_client.getbit(self.bfkey, self.bf.generate(string))

    def __contains__(self, string):
        return self.exists(string)

    def clear(self):
        r_client = redis.Redis(connection_pool=self.redis_pool)
        return r_client.delete(self.bfkey)
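
A quick numeric check of the sizing formulas in the docstring above, as a sketch; the numbers follow directly from _get_cap:

import math

capacity, false_prob = 100000, 0.001
num_slices = int(math.ceil(math.log(1.0 / false_prob, 2)))          # k = ceil(log2(1/P)) = 10
bits_per_slice = int(math.ceil((capacity * abs(math.log(false_prob))) /
                               (num_slices * (math.log(2) ** 2))))  # m = 143776
num_bits = num_slices * bits_per_slice                              # M = k * m = 1437760 bits
print(num_slices, bits_per_slice, num_bits)

Note that _get_cap itself returns ceil(log2(num_bits)) (21 here), not num_bits, as the size it passes to BloomFilter.
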
Example 38
    def setUp(self):
        self.size = 500000
        self.hash_count = 7

        self.bf = BloomFilter(self.size, self.hash_count)
        lst = ['abc', 'xyz', 'foo', 'bar']
        for item in lst:
            self.bf.add(item)
Example 39
    def __init__(self):
#         self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
Example 40
 def __init__(self):
     self.sqli_bool = self.load('sqli_bool')
     self.sqli_error = self.load('sqli_error')
     self.form_pattern = re.compile(r"(?i)<form .+?>.+?</form>".encode())
     self.action_pattern = re.compile(r'''(?i)<form .*?action=["']([^\s"'<>]+)'''.encode())
     self.method_pattern = re.compile(r'''(?i)<form .*?method=["']([^\s"'<>]+)'''.encode())
     self.input_pattern = re.compile(r'''(?i)<input .*?type=["']([^\s"'<>]+).*?name=["']([^\s"'<>]+).*?value=["']([^\s"'<>]+)'''.encode())
     self.input_types = [b'',b'text',b'hidden',b'password']
     self.posted = BloomFilter()
Example 41
class DuplicatesPipeline(object):
    def __init__(self):
#         self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
        

    def process_item(self, item, spider):
#         print '************%d pages visited!*****************' %len(self.bf)
        temp='?'
        str1=item['url']
        str2=str1[:str1.find(temp)]
#         if self.bf.add(item['url']):#True if item in the BF
#         if self.bf.lookup(item['url']):
        if self.bf.lookup(str2):   
            raise DropItem("Duplicate item found: %s" % item)
        else:
#             print '%d pages visited!'% len(self.url_seen)
            self.count_num+=1
#             self.bf.add(item['url'])
#             self.save_to_file(item['url'],item['title'])
            self.bf.add(str2)
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            print self.count_num
            return item

    def save_to_file(self,url,utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()
Example 42

class TestBloomFilter(unittest.TestCase):

    def setUp(self):
        self.bf = BloomFilter(256)
        self.existing_strings = [
            'tiny', 'bloom', 'rate', 'back', 'apple', 'google',
            'dijkstra', 'limiter', 'url', 'travel', 'man',
            '2',
        ]
        for each in self.existing_strings:
            self.bf.insert(each)
        self.non_existing_strings = [
            'multi', 'short', 'path', 'components', 'connect',
            'unit', 'test',
        ]

    def test_of_bloomfilter(self):
        for each in self.existing_strings:
            self.assertTrue(self.bf.lookup(each))
        for each in self.non_existing_strings:
            # with small false positive, this will fail :)
            self.assertFalse(self.bf.lookup(each))
Example 43

 def setUp(self):
     self.bf = BloomFilter(256)
     self.existing_strings = [
         'tiny', 'bloom', 'rate', 'back', 'apple', 'google',
         'dijkstra', 'limiter', 'url', 'travel', 'man',
         '2',
     ]
     for each in self.existing_strings:
         self.bf.insert(each)
     self.non_existing_strings = [
         'multi', 'short', 'path', 'components', 'connect',
         'unit', 'test',
     ]
Example 44
File: ga.py Project: smorin/exelixi
    def __init__ (self, indiv_instance, ff_name, prefix="/tmp/exelixi"):
        self.indiv_class = indiv_instance.__class__
        self.feature_factory = instantiate_class(ff_name)

        self.prefix = prefix
        self._shard_id = None
        self._exe_dict = None
        self._hash_ring = None

        self.n_pop = self.feature_factory.n_pop
        self._total_indiv = 0
        self._term_limit = self.feature_factory.term_limit
        self._hist_granularity = self.feature_factory.hist_granularity

        self._selection_rate = self.feature_factory.selection_rate
        self._mutation_rate = self.feature_factory.mutation_rate

        self._shard = {}
        self._bf = BloomFilter(num_bytes=125, num_probes=14, iterable=[])
Example 45
import random
import math
from bloomfilter import BloomFilter

bf = BloomFilter(101, 5)
bf.insert(56)
assert not bf.contains(99)
assert bf.contains(56)


bitsize = 16000000
bf_num_count =  1000000
test_num_count = 100000000

hash_count = int(math.log(2) * bitsize / bf_num_count)
print 'number of hash functions to use:', hash_count

bf = BloomFilter(bitsize, hash_count) 

bf_nums = set()
for _ in xrange(bf_num_count):
    rand_int = random.randint(0, 2**64)
    bf_nums.add(rand_int)
    bf.insert(rand_int)

false_alarms = 0
for _ in xrange(test_num_count):
    rand_num = random.randint(0, 2**64)
    if bf.contains(rand_num) and not (rand_num in bf_nums):
        false_alarms += 1
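
As a sanity check on the measurement above (a sketch; the original snippet stops before reporting false_alarms): with m = 16,000,000 bits, n = 1,000,000 inserted keys, and k = 11 hash functions as computed above, the classical estimate of the false positive rate is (1 - e^(-k*n/m))^k.

import math

bitsize, bf_num_count, test_num_count = 16000000, 1000000, 100000000
k = int(math.log(2) * bitsize / bf_num_count)                  # 11, as above
fpr = (1 - math.exp(-float(k) * bf_num_count / bitsize)) ** k  # ~4.6e-4
print('expected false positive rate: %.2e' % fpr)
print('expected false alarms: ~%d' % int(fpr * test_num_count))  # roughly 46,000 of the 100M probes
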
Example 46

def assertListEquals(l1, l2):
    for i in xrange(len(l1)):
        assert l1[i] == l2[i]

if __name__ == '__main__':
    from random import sample
    from string import ascii_letters

    states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
        Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
        Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
        Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
        NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
        Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
        Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()

    bf1 = BloomFilter(ideal_num_elements_n=100000, error_rate_p=0.001)
    for state in states:
        bf1.add(state)

    json_bf = bf1.toJSON()

    print "##################"
    print json_bf
    print "##################"

    len_json = len(json_bf)
    print "data size: %s bytes"%len_json

    bf2 = BloomFilter.fromJSON(json_bf)
    assertListEquals(bf1.data, bf2.data)
Example 47
File: ga.py Project: smorin/exelixi
class Population (object):
    def __init__ (self, indiv_instance, ff_name, prefix="/tmp/exelixi"):
        self.indiv_class = indiv_instance.__class__
        self.feature_factory = instantiate_class(ff_name)

        self.prefix = prefix
        self._shard_id = None
        self._exe_dict = None
        self._hash_ring = None

        self.n_pop = self.feature_factory.n_pop
        self._total_indiv = 0
        self._term_limit = self.feature_factory.term_limit
        self._hist_granularity = self.feature_factory.hist_granularity

        self._selection_rate = self.feature_factory.selection_rate
        self._mutation_rate = self.feature_factory.mutation_rate

        self._shard = {}
        self._bf = BloomFilter(num_bytes=125, num_probes=14, iterable=[])


    def set_ring (self, shard_id, exe_dict):
        """initialize the HashRing"""
        self._shard_id = shard_id
        self._exe_dict = exe_dict
        self._hash_ring = HashRing(exe_dict.keys())


    ######################################################################
    ## Individual lifecycle within the local subset of the Population

    def populate (self, current_gen):
        """initialize the population"""
        for _ in xrange(self.n_pop):
            # constructor pattern
            indiv = self.indiv_class()
            indiv.populate(current_gen, self.feature_factory.generate_features())

            # add the generated Individual to the Population
            # failure semantics: must filter nulls from initial population
            self.reify(indiv)


    def reify (self, indiv):
        """test/add a newly generated Individual into the Population (birth)"""
        neighbor_shard_id = None
        exe_uri = None

        if self._hash_ring:
            neighbor_shard_id = self._hash_ring.get_node(indiv.key)

            if neighbor_shard_id != self._shard_id:
                exe_uri = self._exe_dict[neighbor_shard_id]

        # distribute this operation over the hash ring, through a remote queue
        if exe_uri:
            msg = { "key": indiv.key, "gen": indiv.gen, "feature_set": loads(indiv.get_json_feature_set()) }
            lines = post_exe_rest(self.prefix, neighbor_shard_id, exe_uri, "pop/reify", msg)
            return False
        else:
            return self._reify_locally(indiv)


    def receive_reify (self, key, gen, feature_set):
        """test/add a received reify request """
        indiv = self.indiv_class()
        indiv.populate(gen, feature_set)
        self._reify_locally(indiv)


    def _reify_locally (self, indiv):
        """test/add a newly generated Individual into the Population locally (birth)"""
        if not indiv.key in self._bf:
            self._bf.update([indiv.key])
            self._total_indiv += 1

            # potentially the most expensive operation, deferred until remote reification
            indiv.get_fitness(self.feature_factory, force=True)
            self._shard[indiv.key] = indiv

            return True
        else:
            return False


    def evict (self, indiv):
        """remove an Individual from the Population (death)"""
        if indiv.key in self._shard:
            # Individual only needs to be removed locally
            del self._shard[indiv.key]

            # NB: serialize to disk (write behinds)
            url = self._get_storage_path(indiv)


    def get_part_hist (self):
        """tally counts for the partial histogram of the fitness distribution"""
        d = (Counter([ round(indiv.get_fitness(self.feature_factory, force=False), self._hist_granularity) for indiv in self._shard.values() ])).items()
        d.sort(reverse=True)
        return d


    def get_fitness_cutoff (self, hist):
        """determine fitness cutoff (bin lower bounds) for the parent selection filter"""
        h = hist.items()
        h.sort(reverse=True)
        logging.debug("fit: %s", h)

        n_indiv = sum([ count for bin, count in h ])
        part_sum = 0
        break_next = False

        for bin, count in h:
            if break_next:
                break

            part_sum += count
            percentile = part_sum / float(n_indiv)
            break_next = percentile >= self._selection_rate

        logging.debug("fit: percentile %f part_sum %d n_indiv %d bin %f", percentile, part_sum, n_indiv, bin)
        return bin


    def _get_storage_path (self, indiv):
        """create a path for durable storage of an Individual"""
        return self.prefix + "/" + indiv.key


    def _boost_diversity (self, current_gen, indiv):
        """randomly select other individuals and mutate them, to promote genetic diversity"""
        if self._mutation_rate > random():
            indiv.mutate(self, current_gen, self.feature_factory)
        elif len(self._shard.values()) >= 3:
            # ensure that there are at least three parents
            self.evict(indiv)


    def _select_parents (self, current_gen, fitness_cutoff):
        """select the parents for the next generation"""
        partition = map(lambda x: (round(x.get_fitness(), self._hist_granularity) >= fitness_cutoff, x), self._shard.values())
        good_fit = map(lambda x: x[1], filter(lambda x: x[0], partition))
        poor_fit = map(lambda x: x[1], filter(lambda x: not x[0], partition))

        # randomly select other individuals to promote genetic diversity, while removing the remnant
        for indiv in poor_fit:
            self._boost_diversity(current_gen, indiv)

        return self._shard.values()


    def next_generation (self, current_gen, fitness_cutoff):
        """select/mutate/crossover parents to produce a new generation"""
        parents = self._select_parents(current_gen, fitness_cutoff)

        for _ in xrange(self.n_pop - len(parents)):
            f, m = sample(parents, 2) 
            success = f.breed(self, current_gen, m, self.feature_factory)

        # backfill to avoid the dreaded Population collapse
        for _ in xrange(self.n_pop - len(self._shard.values())):
            # constructor pattern
            indiv = self.indiv_class()
            indiv.populate(current_gen, self.feature_factory.generate_features())
            self.reify(indiv)

        logging.info("gen: %d shard %s size %d total %d", current_gen, self._shard_id, len(self._shard.values()), self._total_indiv)


    def test_termination (self, current_gen, hist):
        """evaluate the terminating condition for this generation and report progress"""
        return self.feature_factory.test_termination(current_gen, self._term_limit, hist)


    def enum (self, fitness_cutoff):
        """enum all Individuals that exceed the given fitness cutoff"""
        return [[ "%0.4f" % indiv.get_fitness(), str(indiv.gen), indiv.get_json_feature_set() ]
                for indiv in filter(lambda x: x.get_fitness() >= fitness_cutoff, self._shard.values()) ]


    def report_summary (self):
        """report a summary of the evolution"""
        for indiv in sorted(self._shard.values(), key=lambda x: x.get_fitness(), reverse=True):
            print self._get_storage_path(indiv)
            print "\t".join(["%0.4f" % indiv.get_fitness(), "%d" % indiv.gen, indiv.get_json_feature_set()])
Example 48
class Scanner():
    def __init__(self):
        self.sqli_bool = self.load('sqli_bool')
        self.sqli_error = self.load('sqli_error')
        self.form_pattern = re.compile(r"(?i)<form .+?>.+?</form>".encode())
        self.action_pattern = re.compile(r'''(?i)<form .*?action=["']([^\s"'<>]+)'''.encode())
        self.method_pattern = re.compile(r'''(?i)<form .*?method=["']([^\s"'<>]+)'''.encode())
        self.input_pattern = re.compile(r'''(?i)<input .*?type=["']([^\s"'<>]+).*?name=["']([^\s"'<>]+).*?value=["']([^\s"'<>]+)'''.encode())
        self.input_types = [b'',b'text',b'hidden',b'password']
        self.posted = BloomFilter()

    def load(self,tp):
        tree = ET.parse("payloads/"+tp+".xml")
        root = tree.getroot()
        payloads = []
        for x in root.findall('payload'):
            payloads.append((x[0].text,x[1].text))
        return payloads

    async def get_forms(self,url,session):
        response = None
        Forms = []
        try:
            response = await session.get(url)
            response.headers = myfunc.tolower(response.headers)
            content_type = response.headers.get('content-type','').split(';')[0]
            if content_type in ['text/html', 'application/xml']:
                body = await response.read()
                forms = self.form_pattern.findall(body)
                for form in forms:
                    Form = dict()
                    tmp = self.action_pattern.search(form)
                    action = tmp.group(1) if tmp else b''
                    tmp = self.method_pattern.search(form)
                    method = tmp.group(1) if tmp else b'get'
                    inputs = self.input_pattern.findall(form)
                    params = dict()
                    for ip in inputs:
                        if ip[0] in self.input_types:
                            params[myfunc.decode(ip[1])] = myfunc.decode(ip[2])
                    Form['action'] = urljoin(url,action.decode())
                    Form['method'] = method.decode()
                    Form['params'] = params
                    Forms.append(Form)
        except Exception as e:
            print(e)
        finally:
            await response.release()
        return Forms

    async def scan(self,url,session):
        # GET-type parameters
        parse = urlparse(url)
        query = list(map(lambda x:x.split('='),parse.query.split('&')))
        for i in range(len(query)):
            if len(query[0])<2:
                break
            # boolean-based blind SQL injection
            for payload in self.sqli_bool:
                q1 = dict()
                q2 = dict()
                for x in query:
                    q1[x[0]] = x[1]
                    q2[x[0]] = x[1]
                q1[query[i][0]] = query[i][1]+payload[0]
                q2[query[i][0]] = query[i][1]+payload[1]
                body1 = body2 = ''
                try:
                    q1 = urlunparse((parse.scheme,parse.netloc,parse.path,parse.params,urlencode(q1),parse.fragment))
                    q2 = urlunparse((parse.scheme,parse.netloc,parse.path,parse.params,urlencode(q2),parse.fragment))
                    resp1 = await session.get(q1)
                    body1 = await resp1.read()
                    resp2 = await session.get(q2)
                    body2 = await resp2.read()
                except:
                    pass
                if abs(1-len(body1)/len(body2))>0.98:
                    print(abs(1-len(body1)/len(body2)))
                    print("Type: Bool_Sqli[GET]")
                    print("Url: "+q1)
                    print("Payload: "+payload[0])
                    print("Param: "+query[i][0])
                    break
        # POST-type forms
        Forms = await self.get_forms(url,session)

        for Form in Forms:
            Form_json = json.dumps(Form)
            if self.posted.isContain(Form_json):
                continue
            self.posted.insert(Form_json)
            for param_key in Form['params'].keys():
                for payload in self.sqli_bool:
                    # copy the params so the two payload variants do not share one dict
                    new_params1 = dict(Form['params'])
                    new_params2 = dict(Form['params'])
                    new_params1[param_key] = new_params1[param_key] + payload[0]
                    new_params2[param_key] = new_params2[param_key] + payload[1]
                    response1 = response2 = None
                    try:
                        if Form['method'].lower()=='get':
                            response1 = await session.get(Form['action'],params=new_params1)
                            body1 = await response1.read()
                            response2 = await session.get(Form['action'],params=new_params2)
                            body2 = await response2.read()
                        if Form['method'].lower()=='post':
                            response1 = await session.post(Form['action'],data=new_params1)
                            body1 = await response1.read()
                            response2 = await session.post(Form['action'],data=new_params2)
                            body2 = await response2.read()
                    finally:
                        body1 = await response1.read()
                        await response1.release()
                        body2 = await response2.read()
                        await response2.release()
                        if abs(1-len(body1)/len(body2))>0.98:
                            print(abs(1-len(body1)/len(body2)))
                            print("Type: Bool_Sqli[POST]")
                            print("Url: "+url)
                            print("Payload: "+payload[0])
                            print("Param: "+param_key)
                            break