Example #1
class TestBloomFilter(unittest.TestCase):
    def setUp(self):
        self.size = 500000
        self.hash_count = 7

        self.bf = BloomFilter(self.size, self.hash_count)
        lst = ['abc', 'xyz', 'foo', 'bar']
        for item in lst:
            self.bf.add(item)

    def _initialize(self):
        pass

    def _cleanup(self):
        if self.bf:
            del self.bf
            self.bf = None

    def test_lookup_yes(self):
        self.assertTrue(self.bf.lookup('foo'))

    def test_lookup_no(self):
        self.assertFalse(self.bf.lookup('hello'))

    def tearDown(self):
        self._cleanup()
Example #2
def build_bf(n, p, ref_fasta):
    # call bloom filter class and output stats
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))

    mycoplasma_fasta = open(ref_fasta, 'r')
    N_count = 0
    read_count = 0
    while True:
        name = mycoplasma_fasta.readline()  # read id
        if len(name) == 0:
            break  # end of file
        read = mycoplasma_fasta.readline().strip()
        if 'N' not in read:
            # do not add any uncalled bases
            bloomf.add(read)
            read_count += 1
        else:
            N_count += 1
    print('N_count = %s' % N_count)
    print('read_count = %s' % read_count)
    mycoplasma_fasta.close()
    return bloomf
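Note: every snippet in this collection calls into a BloomFilter class defined elsewhere in its own repository. As a point of reference only, the sketch below shows a minimal, hypothetical filter with the interface Examples #1 and #2 assume (a constructor taking a bit-array size and a hash count, plus add and lookup); it is not the implementation behind any of these examples, whose constructor signatures and query methods (lookup, check, contains, is_member, "in") visibly differ.

import hashlib


class BloomFilter:
    def __init__(self, size, hash_count):
        self.size = size              # number of bits
        self.hash_count = hash_count  # number of hash functions
        self.bits = bytearray(size)   # one byte per bit, kept simple

    def _indexes(self, item):
        # derive hash_count bit positions from the item via double hashing
        data = item.encode('utf-8') if isinstance(item, str) else bytes(item)
        h1 = int.from_bytes(hashlib.sha256(data).digest()[:8], 'big')
        h2 = int.from_bytes(hashlib.md5(data).digest()[:8], 'big')
        return [(h1 + i * h2) % self.size for i in range(self.hash_count)]

    def add(self, item):
        for idx in self._indexes(item):
            self.bits[idx] = 1

    def lookup(self, item):
        # False means definitely absent; True means probably present
        return all(self.bits[idx] for idx in self._indexes(item))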
Example #3
class BloomFilterMR(MRJob):

    def __init__(self, *args, **kwargs):
        super(BloomFilterMR, self).__init__(*args, **kwargs)
        self.n = 20
        self.p = 0.05
        self.hot_list = [1, 8, 14, 12, 23, 31, 55]

    # defining steps
    def steps(self):
        return [
            MRStep(mapper_init=self.mapper_init,
                   mapper=self.mapper)
        ]

    def mapper_init(self):
        # build the bloom filter once per mapper and preload the hot list
        self.bloomf = BloomFilter(self.n, self.p)
        for elem in self.hot_list:
            self.bloomf.add(str(elem))

    # MapReduce phase 1: emit (city, (temp, timestamp)) only when temp hits the hot-list filter
    def mapper(self, _, line):
        (city,temp,timestamp) = line.split('|')
        if self.bloomf.check(temp):
            yield city,(temp,timestamp)
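The job's input format is not shown; based on the line.split('|') call in the mapper, each record is assumed to be a 'city|temp|timestamp' line. The snippet below only illustrates that assumed shape, it is not data from the original project.

sample_lines = [
    "Boston|40|2016-07-01 09:00:00",  # 40 is not in hot_list, so it is skipped
    "Boston|55|2016-07-01 15:00:00",  # 55 is in hot_list, so the mapper would emit it
]
for line in sample_lines:
    city, temp, timestamp = line.split('|')
    print(city, temp, timestamp)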
Example #4
 def test_exercise_2(self):
     block_hash = bytes.fromhex(
         '0000000053787814ed9dd8c029d0a0a9af4ab8ec0591dc31bdc4ab31fae88ce9')
     passphrase = b'Jimmy Song Programming Blockchain'  # FILL THIS IN
     secret = little_endian_to_int(hash256(passphrase))
     private_key = PrivateKey(secret=secret)
     addr = private_key.point.address(testnet=True)
     filter_size = 30
     filter_num_functions = 5
     filter_tweak = 90210  # FILL THIS IN
     h160 = decode_base58(addr)
     bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
     bf.add(h160)
     node = SimpleNode('tbtc.programmingblockchain.com',
                       testnet=True,
                       logging=False)
     node.handshake()
     node.send(bf.filterload())
     getdata = GetDataMessage()
     getdata.add_data(FILTERED_BLOCK_DATA_TYPE, block_hash)
     node.send(getdata)
     mb = node.wait_for(MerkleBlock)
     tx = node.wait_for(Tx)
     self.assertEqual(
         tx.serialize().hex(),
         '0100000002a663815ab2b2ba5f53e442f9a2ea6cc11bbcd98fb1585e48a134bd870dbfbd6a000000006a47304402202151107dc2367cf5a9e2429cde0641c252374501214ce52069fbca1320180aa602201a43b5d4f91e48514c00c01521dc04f02c57f15305adc4eaad01c418f6e7a1180121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff618b00a343488bd62751cf21f368ce3be76e3a0323fdc594a0d24f27a1155cd2000000006a473044022024c4dd043ab8637c019528b549e0b10333b2dfa83e7ca66776e401ad3fc31b6702207d4d1d73ac8940c59c57c0b7daf084953324154811c10d06d0563947a88f99b20121031dbe3aff7b9ad64e2612b8b15e9f5e4a3130663a526df91abfb7b1bd16de5d6effffffff0280969800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788aca0ce6594000000001976a9146e13971913b9aa89659a9f53d327baa8826f2d7588ac00000000'
     )
Example #5
def main():
    number_of_items = 20
    false_positive_probability = 0.1
    bloom = BloomFilter(number_of_items, false_positive_probability)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]
    word_absent = [
        'bluff', 'cheater', 'hate', 'war', 'humanity', 'racism', 'hurt',
        'nuke', 'gloomy', 'facebook', 'geeksforgeeks', 'twitter'
    ]
    print('bloomfilter size: ', bloom.bit_size)
    print('false_positive_probability', bloom.false_positive_probability)
    print('hash_count: ', bloom.hash_count)

    for item in word_present:
        bloom.add(item)
    shuffle(word_present)
    shuffle(word_absent)
    random_list = word_present[:5] + word_absent[:5]
    shuffle(random_list)
    for word in random_list:
        print('word: ', word)
        if bloom.check(word):
            if word in word_absent:
                print('false positive')
            else:
                print('word most likely member')
        else:
            print('word not present')
Example #6
 def test_exercise_4(self):
     last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
     last_block = bytes.fromhex(last_block_hex)
     secret = little_endian_to_int(
         hash256(b'Jimmy Song Programming Blockchain'))
     private_key = PrivateKey(secret=secret)
     addr = private_key.point.address(testnet=True)
     h160 = decode_base58(addr)
     target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
     self.assertEqual(addr, target_address)
     filter_size = 30
     filter_num_functions = 5
     filter_tweak = 90210  # FILL THIS IN
     target_h160 = decode_base58(target_address)
     target_script = p2pkh_script(target_h160)
     fee = 5000  # fee in satoshis
     node = SimpleNode('tbtc.programmingblockchain.com',
                       testnet=True,
                       logging=False)
     bf = BloomFilter(filter_size, filter_num_functions, filter_tweak)
     bf.add(h160)
     node.handshake()
     node.send(b'filterload', bf.filterload())
     getheaders_message = GetHeadersMessage(start_block=last_block)
     node.send(getheaders_message.command, getheaders_message.serialize())
     headers_envelope = node.wait_for_commands([HeadersMessage.command])
     stream = headers_envelope.stream()
     headers = HeadersMessage.parse(stream)
     get_data_message = GetDataMessage()
     for block in headers.blocks:
         self.assertTrue(block.check_pow())
         if last_block is not None:
             self.assertEqual(block.prev_block, last_block)
         last_block = block.hash()
         get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, last_block)
     node.send(get_data_message.command, get_data_message.serialize())
     prev_tx = None
     while prev_tx is None:
         envelope = node.wait_for_commands([b'merkleblock', b'tx'])
         stream = envelope.stream()
         if envelope.command == b'merkleblock':
             mb = MerkleBlock.parse(stream)
             self.assertTrue(mb.is_valid())
         else:
             prev = Tx.parse(stream, testnet=True)
             for i, tx_out in enumerate(prev.tx_outs):
                 if tx_out.script_pubkey.address(testnet=True) == addr:
                     prev_tx = prev.hash()
                     prev_index = i
                     prev_amount = tx_out.amount
                     break
     tx_in = TxIn(prev_tx, prev_index)
     output_amount = prev_amount - fee
     tx_out = TxOut(output_amount, target_script)
     tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
     tx_obj.sign_input(0, private_key)
     self.assertEqual(
         tx_obj.serialize().hex(),
         '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
     )
Example #7
def test_bloom():
    data = (str(uuid.uuid1()) for i in range(100000))
    filter = BloomFilter(100000, 0.0001)
    for item in data:
        if item not in filter:
            filter.add(item)
    print("{name} costs {bytes} bytes.".format(
        name=sys._getframe().f_code.co_name, bytes=filter.container_size()))
Example #8
 def create(cls, path, memtable):
     bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
     with kv_writer(path) as writer:
         for key, value in memtable.entries():
             writer.write_entry(key, value)
             bf.add(key)
     return cls(path, bf)
Example #9
 def test_exercise_6(self):
     last_block_hex = '000000000d65610b5af03d73ed67704713c9b734d87cf4b970d39a0416dd80f9'
     secret = little_endian_to_int(
         hash256(b'Jimmy Song Programming Blockchain'))
     private_key = PrivateKey(secret=secret)
     addr = private_key.point.address(testnet=True)
     h160 = decode_base58(addr)
     target_address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
     self.assertEqual(addr, target_address)
     target_h160 = decode_base58(target_address)
     target_script = p2pkh_script(target_h160)
     fee = 5000
     node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
     bf = BloomFilter(30, 5, 90210)
     bf.add(h160)
     node.handshake()
     node.send(bf.filterload())
     start_block = bytes.fromhex(last_block_hex)
     getheaders = GetHeadersMessage(start_block=start_block)
     node.send(getheaders)
     headers = node.wait_for(HeadersMessage)
     last_block = None
     getdata = GetDataMessage()
     for b in headers.blocks:
         if not b.check_pow():
             raise RuntimeError('proof of work is invalid')
         if last_block is not None and b.prev_block != last_block:
             raise RuntimeError('chain broken')
         getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
         last_block = b.hash()
     node.send(getdata)
     prev_tx, prev_index, prev_tx_obj = None, None, None
     while prev_tx is None:
         message = node.wait_for(MerkleBlock, Tx)
         if message.command == b'merkleblock':
             if not message.is_valid():
                 raise RuntimeError('invalid merkle proof')
         else:
             message.testnet = True
             for i, tx_out in enumerate(message.tx_outs):
                 if tx_out.script_pubkey.address(testnet=True) == addr:
                     prev_tx = message.hash()
                     prev_index = i
                     prev_amount = tx_out.amount
                     self.assertEqual(
                         message.id(),
                         '6ec96c9eafc62c0e42a4a6965be614c61101aaa316162ac79e07e1b9ab31e694'
                     )
                     self.assertEqual(i, 0)
                     break
     tx_in = TxIn(prev_tx, prev_index)
     output_amount = prev_amount - fee
     tx_out = TxOut(output_amount, target_script)
     tx_obj = Tx(1, [tx_in], [tx_out], 0, testnet=True)
     tx_obj.sign_input(0, private_key)
     self.assertEqual(
         tx_obj.serialize().hex(),
         '010000000194e631abb9e1079ec72a1616a3aa0111c614e65b96a6a4420e2cc6af9e6cc96e000000006a47304402203cc8c56abe1c0dd043afa9eb125dafbebdde2dd4cd7abf0fb1aae0667a22006e02203c95b74d0f0735bbf1b261d36e077515b6939fc088b9d7c1b7030a5e494596330121021cdd761c7eb1c90c0af0a5963e94bf0203176b4662778d32bd6d7ab5d8628b32ffffffff01f8829800000000001976a914ad346f8eb57dee9a37981716e498120ae80e44f788ac00000000'
     )
Example #10
 def test_bloomfilter(self):
     bloom = BloomFilter(100)
     for i in range(50):
         bloom.add(str(i))
     assert "20" in bloom
     assert "25" in bloom
     assert "49" in bloom
     assert "50" not in bloom
Example #12
def test():
    bf = BloomFilter(num_hashes=10, size_bytes=100)
    bf.add('hello')
    s = pickle.dumps(bf)

    bf2 = pickle.loads(s)
    assert 'hi' not in bf2
    assert 'hello' in bf2
    assert (bf.seeds == bf2.seeds).all()
Example #13
    def test_excluded(self):
        bf = BloomFilter()
        bf.add('t1')
        bf.add('t2')

        test1 = bf.test("t3")
        test2 = bf.test("t4")
        test3 = bf.test("t5")
        # make a few checks to reduce the chance of the test failing on a false positive
        self.assertFalse(test1 and test2 and test3)
Example #14
def write_bloom_filter():
    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))

    for item in word_present:
        bloomf.add(item)

    with open(filename, "wb") as outfile:
        outfile.write(bloomf.prepare_bloom_filter_to_write())
Example #15
    def create_bloomfilter_file(self):

        bloomf = BloomFilter(self.unique_word_count,
                             self.false_positive_probability)
        try:
            for word in self.ta_words_unique:
                bloomf.add(word)
            bloomf.writetofile(self.bloom_file_path)
        except Exception as e:
            track = traceback.format_exc()
            print(track)
Example #16
    def create_csv_bloomfilter_files(self):
        items_count = len(self.dict_tamil_word)
        falsepositive_probability = 0.001
        bloomf = BloomFilter(items_count, falsepositive_probability)

        with open(self.csv_file_path, "w") as f:
            for word, count in self.dict_tamil_word.items():
                write_line = word + "," + str(count) + os.linesep
                bloomf.add(word)
                f.write(write_line)

        bloomf.writetofile(self.bloomfilter_file_path)
Example #17
 def test_get_filtered_txs(self):
     from bloomfilter import BloomFilter
     bf = BloomFilter(30, 5, 90210)
     h160 = decode_base58('mseRGXB89UTFVkWJhTRTzzZ9Ujj4ZPbGK5')
     bf.add(h160)
     node = SimpleNode('tbtc.programmingblockchain.com', testnet=True)
     node.handshake()
     node.send(bf.filterload())
     block_hash = bytes.fromhex('00000000000377db7fde98411876c53e318a395af7304de298fd47b7c549d125')
     txs = node.get_filtered_txs([block_hash])
     self.assertEqual(txs[0].id(), '0c024b9d3aa2ae8faae96603b8d40c88df2fc6bf50b3f446295206f70f3cf6ad')
     self.assertEqual(txs[1].id(), '0886537e27969a12478e0d33707bf6b9fe4fdaec8d5d471b5304453b04135e7e')
     self.assertEqual(txs[2].id(), '23d4effc88b80fb7dbcc2e6a0b0af9821c6fe3bb4c8dc3b61bcab7c45f0f6888')
Example #18
def test_bloom_filter(num_of_items, fp_prob):
    bloomf = BloomFilter(num_of_items,fp_prob) 
    
    # words to be added 
    word_present = ['abound','abounds','abundance','abundant','accessable', 
                    'bloom','blossom','bolster','bonny','bonus','bonuses', 
                    'coherent','cohesive','colorful','comely','comfort', 
                    'gems','generosity','generous','generously','genial']
    
    # word not added
    word_absent = ['bluff','cheater','hate','war','humanity', 
                'racism','hurt','nuke','gloomy','facebook', 
                'geeksforgeeks','twitter']
    
    top_passwords_last_years = ['123456', '123456789', 'qwerty', 'password',
                                'football', '1234567', '12345678', 'letmein',
                                '1234', '1234567890', 'dragon', 'baseball',
                                'sunshine', 'iloveyou','trustno1', 'princess',
                                'adobe123', '123123', 'welcome', 'login', 'admin',
                                '111111', 'qwerty123', 'solo', '1q2w3e4r', 'master',
                                'abc123', '666666', 'photoshop', '1qaz2wsx', 'qwertyuiop',
                                'ashley', 'mustang', '121212', 'starwars', '654321',
                                'bailey', 'access', 'flower', '555555', 'passw0rd',
                                'monkey', 'lovely', 'shadow', '7777777', '12345', 'michael',
                                '!@#$%^&*', 'jesus', 'password1', 'superman', 'hello',
                                'charlie', '888888', '696969', 'hottie', 'freedom', 'aa123456',
                                'qazwsx', 'ninja', 'azerty', 'loveme', 'whatever', 'donald',
                                'batman', 'zaq1zaq1', 'Football', '0', '123qwe', '1111111',
                                '12345', '000000', '1234', '1q2w3e4r5t', '123', '987654321',
                                '12345679', 'mynoob', '123321', '18atcskd2w', '3rjs1la7qe',
                                'google', 'zxcvbnm', '1q2w3e', ]
    
    for item in word_present:
        bloomf.add(item)

    shuffle(word_present) 
    shuffle(word_absent)
    
    test_words = word_present[:10] + word_absent 
    shuffle(test_words)
    
    for word in test_words:
        if bloomf.check(word):
            if word in word_absent:
                print("'{}' is a false positive!".format(word))
            else:
                print("'{}' is probably present!".format(word))
        else:
            print("'{}' is definitely not present!".format(word))
Example #19
    def build_index(self, document_identifier, kpriv, list_of_words):
        # Create an empty list to hold the trapdoors for the word (x1, x2, ..., xr)
        trapdoor = []
        # Create an empty list to hold the codewords for the word (y1, y2, ..., yr)
        codewords = []

        for word in list_of_words:
            '''
            Create a trapdoor for each unique word
            '''
            # Takes the word and creates a trapdoor
            for i in range(0,self.r):
                # Converts kpriv[i] from hex to a bytes object - Necessary to use HMAC
                key = bytes.fromhex(kpriv[i])
                w = bytes(word, 'utf-8')
                trapdoor_digest = hmac.new(key, msg=w, digestmod=hashlib.sha1)
                trapdoor_digest = trapdoor_digest.hexdigest()
                trapdoor.append(trapdoor_digest)

        # Take each word and hash it again with the document_identifier as the key to generate y1, y2, ..., yr
        for i in range(0, len(trapdoor)):
            # encode the document identifier and the trapdoor[i]
            d_id = bytes(document_identifier, 'utf-8')
            message = bytes(trapdoor[i], 'utf-8')
            codeword_digest = hmac.new(message, msg=d_id, digestmod=hashlib.sha1)
            codeword_digest = codeword_digest.hexdigest()
            codewords.append(codeword_digest)

        #Create a bloom filter and insert the codewords into the bloom filter

        # Creates a bloom filter
        bf = BloomFilter()

        # For each value in the list of codewords, add the codeword to the bloom filter
        for codeword in codewords:
            bf.add(codeword)

        # add noise: set (unique_word_count - len(list_of_words)) * r random bits in the filter
        for i in range(0, (self.unique_word_count - len(list_of_words)) * self.r):
            # pick a random index into the bit array
            index = random.randrange(0, bf.size - 1)
            bf.set_index(index)

        return(document_identifier, bf)
Example #20
class CreateBloomFilter():
    def __init__(self):
        self.cc = ConnectToCassandra()
        self.n, self.word_present = self.cc.get_id()  #no of items to add
        self.p = 0.05  #false positive probability
        self.bloomf = BloomFilter(self.n, self.p)
        for item in self.word_present:
            self.bloomf.add(bytes(to_integer(item.date())))

    def createfilter(self):
        for item in self.word_present:
            self.bloomf.add(bytes(to_integer(item)))

    def testdate(self, todate):
        todate = to_integer(todate)
        if self.bloomf.check(bytes(todate)):
            return 1
        else:
            return 0
Example #21
def encryptData(data, size, fp=0.01, bigrams=2, bpower=8, p=None):
    """
        Encrypts a string.

        bigrams : n-gram size (2 = bigrams)
        size : size of the Bloom filter
        fp : false positive rate
    """
    bloomfilter = BloomFilter(size, fp, bfpower=bpower)
    if p is not None:
        bloomfilter.set_hashfunction_by_p(p)

    index = ngram.NGram(N=bigrams)
    bigrams = list(index.ngrams(index.pad(str(data))))

    for bigram in bigrams:
        bloomfilter.add(str(bigram))

    return bloomfilter
Example #22
class Document:
    def __init__(self, terms, doc_id):
        self.id = doc_id
        self.terms = terms
        self.terms = tokenize_terms(self.terms)
        self.signature = BloomFilter()
        self.signature.add(self.terms)

    def verify(self, query):
        if isinstance(query, str):
            if query not in self.terms:
                return False
        elif isinstance(query, list):
            for term in query:
                if term not in self.terms:
                    return False
        else:
            return False
        return True
Example #23
 def test_example_5(self):
     last_block_hex = '00000000000538d5c2246336644f9a4956551afb44ba47278759ec55ea912e19'
     address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'
     h160 = decode_base58(address)
     node = SimpleNode('tbtc.programmingblockchain.com',
                       testnet=True,
                       logging=False)
     bf = BloomFilter(30, 5, 90210)
     bf.add(h160)
     node.handshake()
     node.send(b'filterload', bf.filterload())
     start_block = bytes.fromhex(last_block_hex)
     getheaders_message = GetHeadersMessage(start_block=start_block)
     node.send(b'getheaders', getheaders_message.serialize())
     headers_envelope = node.wait_for_commands({b'headers'})
     stream = headers_envelope.stream()
     headers = HeadersMessage.parse(stream)
     get_data_message = GetDataMessage()
     for b in headers.blocks:
         if not b.check_pow():
             raise RuntimeError('proof of work is invalid')
         get_data_message.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
     node.send(b'getdata', get_data_message.serialize())
     found = False
     while not found:
         envelope = node.wait_for_commands({b'merkleblock', b'tx'})
         stream = envelope.stream()
         if envelope.command == b'merkleblock':
             mb = MerkleBlock.parse(stream)
             if not mb.is_valid():
                 raise RuntimeError('invalid merkle proof')
         else:
             prev_tx_obj = Tx.parse(stream, testnet=True)
             for i, tx_out in enumerate(prev_tx_obj.tx_outs):
                 if tx_out.script_pubkey.address(testnet=True) == address:
                     self.assertEqual(
                         prev_tx_obj.id(),
                         'e3930e1e566ca9b75d53b0eb9acb7607f547e1182d1d22bd4b661cfe18dcddf1'
                     )
                     self.assertEqual(i, 0)
                     found = True
                     break
Example #24
def bloom(word_present):
    n = 20  #no of items to add
    p = 0.05  #false positive probability

    bloomf = BloomFilter(n, p)
    print("Size of bit array:{}".format(bloomf.size))
    print("False positive Probability:{}".format(bloomf.fp_prob))
    print("Number of hash functions:{}".format(bloomf.hash_count))
    role = [
        'Financial analyst', 'Jr. Maintenance Engineer', 'Marketing manager',
        'Quantitative analyst', 'Sales Consultant', 'Sales Executive',
        'Sales Representative', 'Sr. Software engineer', 'Sr. Software tester',
        'Technical support', 'Web developer', 'Jr. Software engineer',
        'Jr. Software tester', 'Sr. Maintenance Engineer'
    ]
    # words to be added
    #word_present = ['abound','abounds','abundance','abundant','accessable',
    #               'bloom','blossom','bolster','bonny','bonus','bonuses',
    #              'coherent','cohesive','colorful','comely','comfort',
    #             'gems','generosity','generous','generously','genial']

    # word not added
    word_absent = []
    for i in role:
        if i not in word_present:
            word_absent.append(i)

    for item in word_present:
        bloomf.add(item)

    shuffle(word_present)
    shuffle(word_absent)
    print(word_present)
    for word in word_present:
        print(word)
        if bloomf.check(word):
            if word in word_absent:
                print("'{}' is a false positive!".format(word))
            else:
                print("'{}' is probably present!".format(word))
        else:
            print("'{}' is definitely not present!".format(word))
Example #25
class DuplicatesPipeline(object):
    def __init__(self):
#         self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        
        self.bf = BloomFilter(10000, 0.0001, 'filter.bloom')
        self.f_write = open('visitedsites','w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self.count_num = 0
        

    def process_item(self, item, spider):
#         print '************%d pages visited!*****************' %len(self.bf)
        # strip the query string from the url before checking the filter
        str1 = item['url']
        str2 = str1.split('?')[0]
#         if self.bf.add(item['url']):#True if item in the BF
#         if self.bf.lookup(item['url']):
        if self.bf.lookup(str2):   
            raise DropItem("Duplicate item found: %s" % item)
        else:
#             print '%d pages visited!'% len(self.url_seen)
            self.count_num += 1
#             self.bf.add(item['url'])
#             self.save_to_file(item['url'],item['title'])
            self.bf.add(str2)
            self.save_to_file(item['url'],item['title'])
            self.si.AddIndex(item)
            print(self.count_num)
            return item

    def save_to_file(self,url,utitle):
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(utitle.encode('utf-8'))
        self.f_write.write('\n')

    def __del__(self):
        """docstring for __del__"""
        self.f_write.close()
        self.si.IndexDone()
Example #26
def dblookuptimetest():
    print("Testing DB lookup time using bloom filter\n")
    bf = BloomFilter(500000, 7)
    huge = []

    lines = open("/usr/share/dict/american-english").read().splitlines()
    for line in lines:
        bf.add(line)
        huge.append(line)

    import datetime

    start = datetime.datetime.now()
    bf.contains("google")
    finish = datetime.datetime.now()
    print('Checking "google" using bloom filter in dictionary\n')
    print((finish - start).microseconds)

    start = datetime.datetime.now()
    for word in huge:
        if word == "google":
            break
    finish = datetime.datetime.now()
    print('Checking "google" without  using bloom filter in dictionary\n')
    print((finish - start).microseconds)

    print(bf.contains("Max"))
    print(bf.contains("mice"))
    print(bf.contains("3"))

    start = datetime.datetime.now()
    bf.contains("apple")
    finish = datetime.datetime.now()
    print((finish - start).microseconds)

    start = datetime.datetime.now()
    for word in huge:
        if word == "apple":
            break
    finish = datetime.datetime.now()
    print((finish - start).microseconds)
Example #27
class SSTable:
    """Represents a Sorted-String-Table (SSTable) on disk"""

    def __init__(self, path, bf=None):
        self.path = path
        self.bf = bf
        if not self.bf:
            self._sync()

    def _sync(self):
        self.bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
        with kv_reader(self.path) as r:
            while r.has_next():
                key = r.read_key()
                self.bf.add(key)
                r.skip_value()

    @classmethod
    def create(cls, path, memtable):
        bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
        with kv_writer(path) as writer:
            for key, value in memtable.entries():
                writer.write_entry(key, value)
                bf.add(key)
        return cls(path, bf)

    def search(self, search_key):
        if not self.bf.exists(search_key):
            return None
        with kv_reader(self.path) as r:
            while r.has_next():
                key = r.read_key()
                # stop if the key is too big
                if key > search_key:
                    return None
                if key == search_key:
                    return r.read_value()
                r.skip_value()
        return None
Example #28
 def merge(cls, sstables: List[SSTable]) -> SSTable:
     new_path = sstables[0].path.replace(".dat", "-compacted.dat")
     new_index = sstables[0].index
     new_bf = BloomFilter(BF_SIZE, BF_HASH_COUNT)
     readers = [cls.Entries(sstable) for sstable in sstables
                 if sstable.size > 0]
     with kv_writer(new_path) as writer:
         while readers:
             min_reader = min(
                 readers,
                 key=lambda r: (r.current_pair[0], r.sstable.index * -1),
             )
             for reader in readers:
                 if reader is min_reader:
                     continue
                 if reader.current_pair[0] == min_reader.current_pair[0]:
                     reader.advance()
             if min_reader.current_pair[1] is not TOMBSTONE:
                 writer.write_entry(*min_reader.current_pair)
                 new_bf.add(min_reader.current_pair[0])
             min_reader.advance()
             readers = [reader for reader in readers if reader.has_next]
     return cls(new_path, new_index, new_bf)
Example #29
def main():
    m = 1000000  # max hash value
    h = 2000  # number of hash functions
    jaccard = 0.8
    N = np.linspace(10, 10**3, num=10).astype('int')

    jaccard_minhash = []
    jaccard_bloom = []
    jaccard_true = []

    for n in N:
        d1 = set([str(x) for x in range(n)])
        min_d2 = int(n*(1.-jaccard)/(1. + jaccard))
        d2 = set([str(x) for x in range(min_d2, min_d2 + n)])

        b1 = BloomFilter(m, h)
        b2 = BloomFilter(m, h)

        mh1 = MinHash(h)
        mh2 = MinHash(h)

        for s1, s2 in zip(d1, d2):
            b1.add(s1)
            b2.add(s2)
        mh1.hash(d1)
        mh2.hash(d2)

        jaccard_minhash.append(1.-hamming(mh1.vec, mh2.vec))
        jaccard_bloom.append(1-2*float(sum(np.not_equal(b1.bit_array, b2.bit_array)))/(sum(b1.bit_array) + sum(b2.bit_array)))
        jaccard_true.append(float(len(d1.intersection(d2)))/len(d1.union(d2)))

    plt.plot(N, np.array([jaccard_bloom, jaccard_minhash, jaccard_true]).T)
    plt.legend(['Bloom Filter', 'MinHash', 'True'], loc='upper left')
    plt.xlabel('Number of strings')
    plt.ylabel('Jaccard Coefficient')
    plt.title('Jaccard Approximation Through Hashing')
    plt.show()
Example #30
def test_bloom_filter():
    bloomfilter = BloomFilter(NUM_KEYS, FALSE_POSITIVE_PROBABILITY)
    word_present = [
        'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'bloom',
        'blossom', 'bolster', 'bonny', 'bonus', 'bonuses', 'coherent',
        'cohesive', 'colorful', 'comely', 'comfort', 'gems', 'generosity',
        'generous', 'generously', 'genial'
    ]

    word_absent = ['facebook', 'twitter']

    for item in word_present:
        bloomfilter.add(item)

    test_words = word_present[:10] + word_absent
    shuffle(test_words)
    for word in test_words:
        if bloomfilter.is_member(word):
            if word in word_absent:
                print(f"'{word}' is a false positive!")
            else:
                print(f"'{word}' is probably present!")
        else:
            print(f"'{word}' is definitely not present!")
Example #31
class CreateBloomFilter():
    def __init__(self, cnt, word_present):
        self.n = cnt  # no of items to add
        self.word_present = word_present
        self.p = 0.05  # false positive probability
        self.bloomf = BloomFilter(self.n, self.p)
        for item in self.word_present:
            print(item)
            self.bloomf.add(
                bytes(to_integer(datetime.datetime.strptime(item, '%Y%m%d'))))

    def createfilter(self, cnt, word_present):
        self.p = 0.05  #false positive probability
        self.bloomf = BloomFilter(cnt, self.p)
        for item in word_present:
            self.bloomf.add(bytes(to_integer(item)))

    def testdate(self, todate):
        todate = datetime.datetime.strptime(todate, '%Y%m%d')
        todate = to_integer(todate)
        if self.bloomf.check(bytes(todate)):
            return 1
        else:
            return 0
Example #32
word_present = []
inFile = open("/Users/siddhartharoynandi/Desktop/listed_username_30.txt")
for line in inFile:
    word_present.append(line)

n = len(word_present)  # no of items to add
p = 0.05  # false positive probability

bloomf = BloomFilter(n, p)
#print("Size of bit array:{}".format(bloomf.size))
#print("False positive Probability:{}".format(bloomf.fp_prob))
#print("Number of hash functions:{}".format(bloomf.hash_count))

for item in word_present:
    bloomf.add(item)

word_tobe_tested = []
inFile = open("/Users/siddhartharoynandi/Desktop/listed_username_365.txt")
for line in inFile:
    word_tobe_tested.append(line)

shuffle(word_present)
shuffle(word_tobe_tested)

count = 0
count1 = 0

for word in word_tobe_tested:
    if bloomf.check(word):
        count1 = count1 + 1
Example #33
                         )
                 except:  # invalid data or mistake
                     pass
         filter_visual_window.close()
         pass
     except:
         pass
 elif event == 'Insert new password':
     try:
         if len(values['-NEW-PASSWORD-']) != 0:
             if bloomf.check_if_add(values['-NEW-PASSWORD-']):
                 sg.PopupError("Insert failed: the word [ " +
                               str(values['-NEW-PASSWORD-']) +
                               " ] is already in the bloom filter, try again!")
             else:
                 bloomf.add(values['-NEW-PASSWORD-'])
                 sg.PopupOK(
                     "The password was inserted successfully [found overlap of "
                     + str(bloomf.c) + " bits]")
             if bloomf.c == sizeofhashs:
                 sg.popup_ok("Found a false positive!")
                 false_positive += 1
         else:
             sg.PopupError(
                 "Insert failed: empty input, try again!")
     except:
         pass
 elif event == 'Show complete password strength analysis':
     try:
         if len(values['-NEW-PASSWORD-']) == 0:
             sg.popup_error("Null input :(")
Example #34
#address = 'mwJn1YPMq7y5F8J3LkC5Hxg9PHyZ5K4cFv'

# our test

#last_block_hex = '0000000017e6fbd8931bce659d45d92040a4674950f2ae5416d0bf1a239641f9'
last_block_hex = '00000000970369111c044804ec0319792c9e1aa29f59a622c5d14b3544ae4eba'
#0000000017e6fbd8931bce659d45d92040a4674950f2ae5416d0bf1a239641f9
#last_block_hex = '0000000000000004fea90996fdf40772e2c2c76205a1fb57fae465194fdaffb9'
address = 'mvEg6eZ3sUApodedYQrkpEPMMALsr1K1k1'

h160 = decode_base58(address)
node = SimpleNode('testnet.programmingbitcoin.com',
                  testnet=True,
                  logging=False)
bf = BloomFilter(size=30, function_count=5, tweak=90210)
bf.add(h160)
node.handshake()
node.send(bf.filterload())
start_block = bytes.fromhex(last_block_hex)
getheaders = GetHeadersMessage(start_block=start_block)
node.send(getheaders)
print('ok2')
headers = node.wait_for(HeadersMessage)
print('ok3')
getdata = GetDataMessage()
for b in headers.blocks:
    if not b.check_pow():
        raise RuntimeError('proof of work is invalid')
    getdata.add_data(FILTERED_BLOCK_DATA_TYPE, b.hash())
node.send(getdata)
found = False
Example #35
if __name__ == '__main__':
    from random import sample
    from string import ascii_letters

    states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
        Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
        Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
        Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
        NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
        Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
        Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()

    bf1 = BloomFilter(ideal_num_elements_n=100000, error_rate_p=0.001)
    for state in states:
        bf1.add(state)

    json_bf = bf1.toJSON()

    print "##################"
    print json_bf
    print "##################"

    len_json = len(json_bf)
    print "data size: %s bytes"%len_json

    bf2 = BloomFilter.fromJSON(json_bf)
    assertListEquals(bf1.data, bf2.data)

    new_data = bf2.get_data()