Beispiel #1
0
 def testNumberOfSetBitsNeverDecreases(self):
     """The value of ``count(True)`` must never drop as keys are inserted."""
     capacity = 10
     bloom = BloomFilter(capacity, capacity)
     last_count = 0
     for key in map(str, range(capacity)):
         bloom.insert(key)
         current_count = bloom.count(True)
         # Compare against the count observed after the previous insert.
         self.assertTrue(current_count >= last_count)
         last_count = current_count
Beispiel #2
0
 def testFalseNegativeNeverHappens(self):
     """After every insert, all values inserted so far must query as present."""
     capacity = 10
     bloom = BloomFilter(capacity, capacity)
     seen = []
     for value in range(capacity):
         # NOTE(review): the key is inserted as a string but queried below as
         # an int -- confirm that BloomFilter treats both forms identically.
         bloom.insert(str(value))
         seen.append(value)
         for earlier in seen:
             self.assertTrue(bloom.query(earlier))
class WeatherSpider(scrapy.Spider):
  """Spider that walks weather.sina.com.cn and yields one WeatherItem per page.

  Every parsed URL is recorded in a Bloom filter (``self._bf``) so that links
  pointing back to pages that were already parsed are not followed again.
  """
  name = "myweather"
  allowed_domains = ["sina.com.cn"]
  start_urls = ['http://weather.sina.com.cn/beijing']

  def __init__(self, *args, **kwargs):
    """Initialise the base spider and create the visited-URL Bloom filter.

    Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
    that spider arguments keep working.
    """
    super(WeatherSpider, self).__init__(*args, **kwargs)
    self._bf = BloomFilter(0.0001,100000)

  def parse(self, response):
    """Extract the weather fields of one page and follow its weather links.

    Yields the populated ``WeatherItem`` first, then one request per link
    accepted by ``_cut_urls``. Fields that cannot be found on the page are
    left as empty strings.
    """
    html_doc = response.body
    self._bf.insert_element(response.url)
    soup = BeautifulSoup(html_doc,'html.parser',from_encoding='utf-8')
    item = WeatherItem()
    item['ourl'] = response.url
    for field in ('city', 'temp', 'qihou', 'wind', 'shidu', 'wuran'):
      item[field] = ''
    soup_city = soup.find(id='slider_ct_name')
    # attrs must be a dict; the previous set literal {'class', '...'} only
    # worked by accident and also matched elements whose class is "class".
    soup_temp = soup.find('div', {'class': 'slider_degree'})
    soup_xijie = soup.find('p', {'class': 'slider_detail'})
    soup_wuran = soup.find('div', {'class': 'slider_warn_i_tt'})
    # soup_xijie is dereferenced below, so it has to be present as well.
    if soup_city and soup_temp and soup_xijie and soup_wuran:
      item['city'] = soup_city.get_text()
      item['temp'] = soup_temp.get_text()
      wuran_p = soup_wuran.find('p')
      if wuran_p is not None:
        item['wuran'] = wuran_p.get_text()
      # The detail line has the form "<climate> | <wind> | <label>:<humidity>".
      details = [part.strip() for part in soup_xijie.get_text().split('|')]
      if len(details) >= 3:
        item['qihou'] = details[0]
        item['wind'] = details[1]
        shidu_parts = details[2].split(u':')
        # Keep the raw text when the "label:value" separator is missing.
        item['shidu'] = shidu_parts[1] if len(shidu_parts) > 1 else details[2]
    urls = [anchor.get('href') for anchor in soup.find_all('a')]
    yield item
    for url in self._cut_urls(urls):
      # NOTE(review): make_requests_from_url is deprecated in newer Scrapy
      # releases -- confirm the Scrapy version this project targets.
      yield self.make_requests_from_url(url)

  def _cut_urls(self,urls):
    """Return the links worth following, in their original order.

    A link is kept when it starts with the weather.sina.com.cn prefix, is
    not reported as known by the Bloom filter, and has not already been
    kept earlier in the same call. ``None`` entries (anchors without an
    href) are skipped.
    """
    cut_urls=[]
    accepted = set()
    # Dots are escaped so that they only match a literal ".".
    pattern = re.compile(r'http://weather\.sina\.com\.cn/')
    for url in urls:
      try:
        match = pattern.match(url)
      except TypeError:
        # href was missing (None) or otherwise not a string.
        continue
      if not match or url in accepted:
        continue
      if not self._bf.is_element_exist(url):
        accepted.add(url)
        cut_urls.append(url)
    return cut_urls
Beispiel #4
0
 def __init__(self, addrs, params=MAINNET, user_agent="/pyBitcoin:0.1/", max_connections=10):
     """Store the connection settings, create empty state and connect to peers."""
     # Settings supplied by the caller.
     self.addrs, self.params = addrs, params
     self.user_agent, self.max_connections = user_agent, max_connections
     # Bookkeeping containers; all start out empty.
     self.peers = []
     self.inventory, self.pending_txs, self.subscriptions = {}, {}, {}
     # The third argument is a fresh random 32-bit value for every client.
     self.bloom_filter = BloomFilter(
         3, 0.01, random.getrandbits(32), BloomFilter.UPDATE_NONE)
     # Must run last: it reads the attributes assigned above.
     self.connect_to_peers()
Beispiel #5
0
 def testInputIsFalsePositiveUntilInserted(self):
     """A value that once queried as a false positive must keep doing so.

     Also checks that at least one false positive shows up, because more
     elements are inserted than the first BloomFilter argument (n // 2).
     """
     n = 100
     # Floor division keeps the argument an int on Python 3 as well.
     bf = BloomFilter(n // 2, n)
     fp = [False] * n
     inserted = [False] * n
     fp_cnt = 0
     for i in range(n):
         # NOTE(review): keys are inserted as strings but queried as ints
         # below -- confirm that BloomFilter treats both forms identically.
         bf.insert(str(i))
         inserted[i] = True
         for j in range(n):
             if inserted[j]:
                 continue
             # It was false positive before, it must continue
             # to be (since it was not inserted).
             if fp[j]:
                 self.assertTrue(bf.query(j))
             # Update false positives list
             elif bf.query(j):
                 fp[j] = True
                 fp_cnt += 1
     # We're inserting more elements than the size of the array, so
     # there must be false positives.
     self.assertTrue(fp_cnt > 0)
def bloom_filter_run(n, m, k=None):
    """Measure the false-positive rate after each of ``n`` insertions.

    The keys ``0 .. n-1`` are inserted in random order into
    ``BloomFilter(m, n, k)``. After every insertion all keys that have not
    been inserted yet are queried, and the fraction that is reported as
    present is recorded.

    Returns:
        A list of false-positive rates, one per insertion that still left at
        least one key uninserted (so the final insertion adds no entry).
    """
    # Random sampling without replacement. list() is required because
    # random.shuffle cannot shuffle a Python 3 range object in place.
    keys = list(range(n))
    random.shuffle(keys)
    probs = []
    bloom = BloomFilter(m, n, k)
    inserted = [False] * n

    for entry in keys:
        bloom.insert(str(entry))
        inserted[entry] = True

        # Probe only the keys that have not been inserted yet.
        absent = [probe for probe in range(n) if not inserted[probe]]
        if absent:
            false_positives = sum(1 for probe in absent if bloom.query(probe))
            probs.append(false_positives * 1.0 / len(absent))
    return probs
Beispiel #7
0
 def test_repr(self):
     'repr() of a keyed BloomFilter shows its key'
     expected = '<BloomFilter key=dilberts>'
     dilberts = BloomFilter(key='dilberts')
     assert repr(dilberts) == expected
Beispiel #8
0
    def server_handler(self):
        """Accept one connection and process its messages until the peer closes.

        Binds and listens on ``(self.ip, self.port)``, accepts a single
        connection and then reads messages in a loop. Every message starts
        with a one-byte type (compared against ``NetworkMsg`` values); the
        payload layout depends on the type and is either fixed-size or
        prefixed with a 3-byte big-endian length. Received data is stored on
        ``self`` and the byte counters ``self.total_received`` /
        ``self.total_ord_received`` are updated.

        The connection is always closed on exit. If the peer disconnects in
        the middle of a message, a notice is printed and the loop stops.
        """

        def recv_exact(conn, size):
            """Read exactly ``size`` bytes; raise EOFError if the peer closes early.

            ``socket.recv(n)`` may return fewer than ``n`` bytes, so a single
            call is not enough to read a complete field.
            """
            chunks = []
            remaining = size
            while remaining > 0:
                chunk = conn.recv(remaining)
                if not chunk:
                    raise EOFError("connection closed with %d bytes outstanding" % remaining)
                chunks.append(chunk)
                remaining -= len(chunk)
            return b''.join(chunks)

        def recv_block(conn):
            """Read a 3-byte big-endian length and that many payload bytes."""
            length = int.from_bytes(recv_exact(conn, 3), 'big')
            return recv_exact(conn, length), length

        self.sock.bind((self.ip, self.port))
        self.sock.listen(1)
        print("Starting server on %s : %d" % (self.ip, self.port))

        try:
            conn, addr = self.sock.accept()
        except OSError:
            # Covers socket timeouts and a listening socket that was closed.
            print("Server never received a connection...closing")
            return

        print("Connection from %s" % (addr[0]))
        try:
            while True:
                data = conn.recv(1)
                if not data:
                    break
                # First we're always going to get the NodeServerMsg data
                typ = int.from_bytes(data, 'big')

                if typ == NetworkMsg.INV.value:
                    # Receive INV
                    print('Receiving INV...')
                    self.merkle_root = recv_exact(conn, 32)
                    print("Received INV of size 32 bytes")
                    self.total_received += 32
                elif typ == NetworkMsg.GET_GLBLK.value:
                    # Receive GET_GLBLK
                    print('Receiving GET_GLBLK...')
                    data = recv_exact(conn, 3)
                    self.other_txpool_size = int.from_bytes(data, 'big')
                    print("Received GET_GLBLK of size 3 bytes")
                    self.total_received += 3
                elif typ == NetworkMsg.GLBLK.value:
                    # Receive TX Bloom
                    print('Receiving GLBLK...')
                    data, tx_bloom_len = recv_block(conn)
                    tx_bloom = BloomFilter.deserialise(data)
                    print("Received TX Bloom of size %d bytes" % tx_bloom_len)

                    # Receive TX IBLT
                    data, tx_iblt_len = recv_block(conn)
                    tx_iblt = SIBLT.deserialise(data)
                    print("Received TX IBLT of size %d bytes" % tx_iblt_len)
                    self.total_received += tx_bloom_len + tx_iblt_len

                    # Construct GLBLK
                    self.tx_filters = [tx_bloom, tx_iblt]
                elif typ == NetworkMsg.GET_GLBLKDAT.value:
                    # Receive GET_GLBLKDAT
                    print('Receiving GET_GLBLKDAT...')
                    data, pair_filter_len = recv_block(conn)
                    # SECURITY: unpickling data received from the network can
                    # execute arbitrary code; only safe with trusted peers.
                    self.tx_missing_ids = cpickle.loads(data)
                    print("Received GET_GLBLKDAT of size %d bytes" % pair_filter_len)
                    self.total_received += pair_filter_len
                elif typ == NetworkMsg.GLBLKTX.value:
                    # Receive GLBLKTX
                    print('Receiving GLBLKTX...')
                    data, tx_missing_length = recv_block(conn)
                    # SECURITY: see the note on cpickle.loads above.
                    self.tx_missing = cpickle.loads(data)
                    print("Received GLBLKTX of size %d bytes" % tx_missing_length)
                    self.total_received += tx_missing_length
                elif typ == NetworkMsg.GLBLKORD.value:
                    # Receive Bloom
                    print('Receiving GLBLKORD...')
                    data, bloom_len = recv_block(conn)
                    pair_bloom = BloomFilter.deserialise(data)
                    print("Received Bloom of size %d bytes" % bloom_len)

                    # Receive IBLT
                    data, iblt_len = recv_block(conn)
                    pair_iblt = SIBLT.deserialise(data)
                    print("Received IBLT of size %d bytes" % iblt_len)
                    self.total_received += bloom_len + iblt_len
                    self.total_ord_received += bloom_len + iblt_len

                    # Record the (Bloom, IBLT) pair
                    self.pair_filters.append([pair_bloom, pair_iblt])
                elif typ == NetworkMsg.COMPLETE.value:
                    # Receive reconciliation analytics
                    print('Receiving reconciliation analytics...')
                    # TODO: Implement
                    self.complete = recv_exact(conn, 1)
                    print('Received analytics of size %d bytes' % 1)
                    self.total_received += 1
        except EOFError:
            print("Connection closed in the middle of a message")
        finally:
            # Release the connection even when a handler above raises.
            conn.close()
Beispiel #9
0
 def test_repr(self):
     """repr() spells out the num_bits and num_hashers arguments."""
     bloom = BloomFilter(num_bits=10, num_hashers=5)
     self.assertEqual(repr(bloom),
                      "BloomFilter(num_bits=10, num_hashers=5)")
def test():
    ''' Exercise BloomFilter and BloomFilterOnDisk and print the results.

        Writes ./dist/py_bloom.blm and ./dist/py_ondisk.blm as a side effect.
        Nothing is asserted; the output is meant to be inspected by hand. '''
    blm = BloomFilter()
    blm.init(10, 0.05)
    blm.add("this is a test")
    print(blm.check("this is a test"))
    print(blm.check("blah"))
    print(blm)
    print(blm.bloom_array)
    blm.export('./dist/py_bloom.blm')

    print('\n\ncheck imported BloomFilter!')

    blm2 = BloomFilter()
    blm2.load('./dist/py_bloom.blm')
    print(blm2.check("this is a test"))
    print(blm2.check("blah"))
    print(blm2)
    print(blm2.bloom_array)

    blm2.add('yet another test')

    print("\n\ncheck intersection")
    blm3 = blm.intersection(blm2)
    print(blm3)
    print(blm3.check("this is a test"))
    print(blm3.check("yet another test"))

    print("\n\ncheck union")
    blm3 = blm.union(blm2)
    print(blm3)
    print(blm3.check("this is a test"))
    print('\n\ntest using `in`')
    print("this is a test" in blm3)
    print(blm3.check("yet another test"))
    print(blm3.estimate_elements())

    print(blm.jaccard_index(blm2))

    print('\n\nexport to hex')
    hex_out = blm.export_hex()
    print(hex_out)
    print('import hex')
    blm4 = BloomFilter()
    blm4.load_hex(hex_out)
    print(blm4)

    # on disk code check
    print('\n\nbloom filter on disk')
    blmd = BloomFilterOnDisk()
    blmd.initialize('./dist/py_ondisk.blm', 10, 0.05)
    # Close the on-disk filter even if one of the checks below raises.
    try:
        blmd.add("this is a test")
        print(blmd.check('this is a test'))
        print('Check use of in keyword ("this is a test" in blmd): ',
              'this is a test' in blmd)
        print(blmd.check('yet another test'))
        # blmd.union(blm4)
        # blmd.intersection(blm)
        # print(blmd.jaccard_index(blm2))
        print(blmd)
        # print ('\n\nexport to hex')
        # hex_out = blmd.export_hex()
        # print(hex_out)
    finally:
        blmd.close()
Beispiel #11
0
 def test_update_gets_stored(self):
     'After update(), a new filter on the same key must expose an equal bit array'
     new_names = {'dan', 'eric'}
     self.dilberts.update(new_names)
     # A second filter built on the same key should see what was stored.
     reloaded = BloomFilter(key='dilberts')
     assert reloaded._bit_array == self.dilberts._bit_array
Beispiel #12
0
 def test_init_gets_stored(self):
     'A new filter on the same key must expose the bit array built in setUp'
     reloaded = BloomFilter(key='dilberts')
     assert reloaded._bit_array == self.dilberts._bit_array
Beispiel #13
0
class BitcoinClient(object):
    """Client that keeps a set of peer connections and relays data through them.

    State kept on the instance:
      * ``peers``         -- PeerFactory objects for the current connections.
      * ``inventory``     -- transactions we broadcast, keyed by tx hash.
      * ``pending_txs``   -- per tx hash: ``[announce_count, half_peer_count,
                             deferred, timeout_call]``.
      * ``subscriptions`` -- per address: ``[{txhash: [count, threshold]},
                             callback]``.
    """

    def __init__(self, addrs, params=MAINNET, user_agent="/pyBitcoin:0.1/", max_connections=10):
        """Store the settings, create empty state and start connecting."""
        self.addrs = addrs
        self.params = params
        self.user_agent = user_agent
        self.max_connections = max_connections
        self.peers = []
        self.inventory = {}
        self.pending_txs = {}
        self.subscriptions = {}
        self.bloom_filter = BloomFilter(3, 0.01, random.getrandbits(32), BloomFilter.UPDATE_NONE)
        self.connect_to_peers()

    def connect_to_peers(self):
        """Open connections until ``max_connections`` is reached or addrs run out.

        Shuffles ``self.addrs`` in place and removes every address it uses.
        """
        if len(self.peers) < self.max_connections:
            shuffle(self.addrs)
            for i in range(self.max_connections - len(self.peers)):
                if len(self.addrs) > 0:
                    addr = self.addrs.pop(0)
                    peer = PeerFactory(self.params, self.user_agent, self.inventory,
                                       self.bloom_filter, self.on_peer_disconnected)
                    reactor.connectTCP(addr[0], addr[1], peer)
                    self.peers.append(peer)

    def on_peer_disconnected(self, peer):
        """Forget a disconnected peer and try to replace it."""
        self.peers.remove(peer)
        self.connect_to_peers()

    def broadcast_tx(self, tx):
        """
        Send the tx to half our peers, wait for half of the remainder to announce the tx before
        calling back True.

        Returns a Deferred that fires with True once enough peers announced
        the tx, or with False after a 10 second timeout.
        """
        def on_peer_anncounce(txid):
            self.pending_txs[txid][0] += 1
            # Floor division keeps the threshold an integer on Python 3 too.
            if self.pending_txs[txid][0] >= self.pending_txs[txid][1] // 2:
                # Only fire once: the timeout call is active until either it
                # ran or we cancelled it here.
                if self.pending_txs[txid][3].active():
                    self.pending_txs[txid][3].cancel()
                    self.pending_txs[txid][2].callback(True)

        d = defer.Deferred()
        txid = bitcoin.txhash(tx)
        # Must be an int: it is used as a slice index below ("/" yields a
        # float on Python 3 and would raise TypeError).
        half = len(self.peers) // 2
        self.inventory[txid] = tx
        inv_packet = inv("TX", txid)
        self.bloom_filter.insert(bitcoin.bin_txhash(tx))
        self.pending_txs[txid] = [0, half, d, reactor.callLater(10, d.callback, False)]
        # Second half of the peers: listen for the tx being announced back.
        for peer in self.peers[half:]:
            peer.protocol.update_filter()
            peer.protocol.add_inv_callback(txid, on_peer_anncounce)
        # First half of the peers: send them the inv packet.
        for peer in self.peers[:half]:
            peer.protocol.send_message(message(inv_packet, self.params))
        return d

    def subscribe_address(self, address, callback):
        """
        Listen for transactions on an address. Since we can't validate the transaction, we will only
        callback if a majority of our peers relay it. If less than a majority relay it, we will have
        to wait for block inclusion to callback.
        """
        def on_peer_announce(tx):
            txhash = bitcoin.txhash(bitcoin.serialize(tx["tx"]))
            seen = self.subscriptions[address][0]
            if txhash in seen and seen[txhash][0] != "complete":
                seen[txhash][0] += 1
                if seen[txhash][0] >= seen[txhash][1]:
                    # Mark as done so the callback runs only once per tx.
                    seen[txhash][0] = "complete"
                    self.subscriptions[address][1](tx["tx"])
            elif txhash not in seen:
                # First announcement: start counting towards half the peers.
                seen[txhash] = [1, len(self.peers) // 2]

        self.subscriptions[address] = [{}, callback]
        self.bloom_filter.insert(unhexlify(bitcoin.b58check_to_hex(address)))
        for peer in self.peers:
            peer.protocol.add_inv_callback(bitcoin.b58check_to_hex(address), on_peer_announce)
            peer.protocol.update_filter()
Beispiel #14
0
 def __init__(self):
   """Create the Bloom filter stored on ``self._bf``."""
   bloom = BloomFilter(0.0001, 100000)
   self._bf = bloom
Beispiel #15
0
class RecentlyConsumedSimulationTests(unittest.TestCase):
    "Simulate reddit's recently consumed problem to test our Bloom filter."

    def setUp(self):
        """Build disjoint seen/unseen link sets and a filter holding the seen set."""
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this test case is subclassed.
        super(RecentlyConsumedSimulationTests, self).setUp()

        # Construct a set of links that the user has seen.
        self.seen_links = set()
        while len(self.seen_links) < 100:
            fullname = self.random_fullname()
            self.seen_links.add(fullname)

        # Construct a set of links that the user hasn't seen.  Ensure that
        # there's no intersection between the seen set and the unseen set.
        self.unseen_links = set()
        while len(self.unseen_links) < 100:
            fullname = self.random_fullname()
            if fullname not in self.seen_links:
                self.unseen_links.add(fullname)

        # Initialize the recently consumed Bloom filter on the seen set.
        self.recently_consumed = BloomFilter(
            num_values=1000,
            false_positives=0.001,
            key='recently-consumed',
        )
        self.recently_consumed.clear()
        self.recently_consumed.update(self.seen_links)

    def tearDown(self):
        """Delete the filter's key from its memcache client."""
        self.recently_consumed.memcache.delete(self.recently_consumed.key)
        super(RecentlyConsumedSimulationTests, self).tearDown()

    @staticmethod
    def random_fullname(prefix='t3_', size=6):
        """Return ``prefix`` followed by ``size`` random base-36 characters."""
        alphabet36 = string.digits + string.ascii_lowercase
        # range() works on both Python 2 and 3; xrange() is Python 2 only.
        id36 = [random.choice(alphabet36) for _ in range(size)]
        return prefix + ''.join(id36)

    @staticmethod
    def round(number, sig_digits=1):
        '''Round a float to the specified number of significant digits.

        Reference implementation:
            https://github.com/ActiveState/code/blob/3b27230f418b714bc9a0f897cb8ea189c3515e99/recipes/Python/578114_Round_number_specified_number_significant/recipe-578114.py
        '''
        try:
            ndigits = sig_digits - 1 - int(math.floor(math.log10(abs(number))))
        except ValueError:
            # math.log10(number) raised a ValueError, so number must be 0.0.
            # No need to round 0.0.
            return number
        else:
            # Inside this function ``round`` is the builtin, not this method.
            return round(number, ndigits)

    def test_zero_false_negatives(self):
        'Ensure that we produce zero false negatives'
        for seen_link in self.seen_links:
            assert seen_link in self.recently_consumed

    def test_acceptable_false_positives(self):
        'Ensure that we produce false positives at an acceptable rate'
        acceptable = self.recently_consumed.false_positives

        # Count the unseen links that the filter wrongly reports as present.
        hits = sum(
            1 for unseen_link in self.unseen_links
            if unseen_link in self.recently_consumed
        )
        actual = hits / float(len(self.unseen_links))
        actual = self.round(actual, sig_digits=1)

        message = 'acceptable: {}; actual: {}'.format(acceptable, actual)
        assert actual <= acceptable, message
 def test_item_in_filter(self):
     """A word added to the filter is reported as a member."""
     bloom = BloomFilter(10)
     word = "dog"
     bloom.add(word)
     self.assertIn(word, bloom)
Beispiel #17
0
 def setUp(self):
     'Create the shared ``dilberts`` filter used by the tests'
     # NOTE(review): super(self.__class__, self) recurses without end if this
     # class is ever subclassed -- prefer naming the class explicitly.
     super(self.__class__, self).setUp()
     seed_names = {'rajiv', 'raj'}
     self.dilberts = BloomFilter(seed_names, key='dilberts')
Beispiel #18
0
    for x, c in counts.items():
        cm.update(x, c)
    plot_stats(counts, cm)

    plt.show()


def test_bloom():
    # Generate N distinct values in the range [0, 100000]
    N = 50_000
    s = set(np.random.choice(1_000_000, N, replace=False))
    print(f"Num inserted values: {N}, min: {min(s)}, max: {max(s)}")

    desired_error_prob = [0.05, 0.1, 0.2]
    for p in desired_error_prob:
        M, d = BloomFilter.optimal_size(p, N)
        print(f"Desired FP rate: {100*p:.2f}%, size of Bloom filter: "
              f"{M} bits (with {d} hash functions)")

        bf = BloomFilter(M, d, s)

        test_set = np.random.choice(1_000_000, 100_000)
        fp, tn = 0, 0
        for x in test_set:
            if x not in s:
                if x in bf:
                    fp += 1
                else:
                    tn += 1
        print(
            f"Estimated FP rate: {100*fp/(fp + tn):.2f}% (from {fp + tn} negative samples)"
Beispiel #19
0
 def test_add_gets_stored(self):
     'After add(), a new filter on the same key must expose an equal bit array'
     self.dilberts.add('dan')
     # A second filter built on the same key should see what was stored.
     reloaded = BloomFilter(key='dilberts')
     assert reloaded._bit_array == self.dilberts._bit_array
Beispiel #20
0
class BitcoinClient(object):
    """Client that keeps a set of peer connections and relays data through them.

    State kept on the instance:
      * ``peers``         -- PeerFactory objects for the current connections.
      * ``inventory``     -- transactions we broadcast, keyed by tx hash.
      * ``pending_txs``   -- per tx hash: ``[announce_count, half_peer_count,
                             deferred, timeout_call]``.
      * ``subscriptions`` -- per address: ``[{txhash: [count, threshold]},
                             callback]``.
    """
    def __init__(self,
                 addrs,
                 params=MAINNET,
                 user_agent="/pyBitcoin:0.1/",
                 max_connections=10):
        """Store the settings, create empty state and start connecting."""
        self.addrs = addrs
        self.params = params
        self.user_agent = user_agent
        self.max_connections = max_connections
        self.peers = []
        self.inventory = {}
        self.pending_txs = {}
        self.subscriptions = {}
        self.bloom_filter = BloomFilter(3, 0.01, random.getrandbits(32),
                                        BloomFilter.UPDATE_NONE)
        self.connect_to_peers()

    def connect_to_peers(self):
        """Open connections until ``max_connections`` is reached or addrs run out.

        Shuffles ``self.addrs`` in place and removes every address it uses.
        """
        if len(self.peers) < self.max_connections:
            shuffle(self.addrs)
            for i in range(self.max_connections - len(self.peers)):
                if len(self.addrs) > 0:
                    addr = self.addrs.pop(0)
                    peer = PeerFactory(self.params, self.user_agent,
                                       self.inventory, self.bloom_filter,
                                       self.on_peer_disconnected)
                    reactor.connectTCP(addr[0], addr[1], peer)
                    self.peers.append(peer)

    def on_peer_disconnected(self, peer):
        """Forget a disconnected peer and try to replace it."""
        self.peers.remove(peer)
        self.connect_to_peers()

    def broadcast_tx(self, tx):
        """
        Send the tx to half our peers, wait for half of the remainder to announce the tx before
        calling back True.

        Returns a Deferred that fires with True once enough peers announced
        the tx, or with False after a 10 second timeout.
        """
        def on_peer_anncounce(txid):
            self.pending_txs[txid][0] += 1
            # Floor division keeps the threshold an integer on Python 3 too.
            if self.pending_txs[txid][0] >= self.pending_txs[txid][1] // 2:
                # Only fire once: the timeout call is active until either it
                # ran or we cancelled it here.
                if self.pending_txs[txid][3].active():
                    self.pending_txs[txid][3].cancel()
                    self.pending_txs[txid][2].callback(True)

        d = defer.Deferred()
        txid = bitcoin.txhash(tx)
        # Must be an int: it is used as a slice index below ("/" yields a
        # float on Python 3 and would raise TypeError).
        half = len(self.peers) // 2
        self.inventory[txid] = tx
        inv_packet = inv("TX", txid)
        self.bloom_filter.insert(bitcoin.bin_txhash(tx))
        self.pending_txs[txid] = [
            0,
            half, d,
            reactor.callLater(10, d.callback, False)
        ]
        # Second half of the peers: listen for the tx being announced back.
        for peer in self.peers[half:]:
            peer.protocol.update_filter()
            peer.protocol.add_inv_callback(txid, on_peer_anncounce)
        # First half of the peers: send them the inv packet.
        for peer in self.peers[:half]:
            peer.protocol.send_message(message(inv_packet, self.params))
        return d

    def subscribe_address(self, address, callback):
        """
        Listen for transactions on an address. Since we can't validate the transaction, we will only
        callback if a majority of our peers relay it. If less than a majority relay it, we will have
        to wait for block inclusion to callback.
        """
        def on_peer_announce(tx):
            txhash = bitcoin.txhash(bitcoin.serialize(tx["tx"]))
            seen = self.subscriptions[address][0]
            if txhash in seen and seen[txhash][0] != "complete":
                seen[txhash][0] += 1
                if seen[txhash][0] >= seen[txhash][1]:
                    # Mark as done so the callback runs only once per tx.
                    seen[txhash][0] = "complete"
                    self.subscriptions[address][1](tx["tx"])
            elif txhash not in seen:
                # First announcement: start counting towards half the peers.
                seen[txhash] = [
                    1, len(self.peers) // 2
                ]

        self.subscriptions[address] = [{}, callback]
        self.bloom_filter.insert(unhexlify(bitcoin.b58check_to_hex(address)))
        for peer in self.peers:
            peer.protocol.add_inv_callback(bitcoin.b58check_to_hex(address),
                                           on_peer_announce)
            peer.protocol.update_filter()
Beispiel #21
0
 def test_clear_gets_stored(self):
     'After clear(), a new filter on the same key must expose an equal bit array'
     self.dilberts.clear()
     # A second filter built on the same key should see what was stored.
     reloaded = BloomFilter(key='dilberts')
     assert reloaded._bit_array == self.dilberts._bit_array
Beispiel #22
0
    return ''.join(random.choice(chars) for x in range(size))


def generateOther():
    """Return generateValue(10, <lowercase ASCII letters followed by digits>)."""
    alphabet = string.ascii_lowercase + string.digits
    return generateValue(10, alphabet)


# Derive the filter dimensions from MAX_SIZE and ERROR_PROBABILITY:
#   size  = -(MAX_SIZE * log(ERROR_PROBABILITY)) / log(2)^2
#   count = (size / MAX_SIZE) * log(2)
# NOTE(review): this matches the textbook optimal bit count / hash count
# formulas only if `log` is the natural logarithm -- confirm the import.
size = -(MAX_SIZE * log(ERROR_PROBABILITY)) / (log(2) * log(2))
count = (size / MAX_SIZE) * log(2)

# Take the real part and truncate it to an int.
optimalSize = int(complex(size).real)
optimalCount = int(complex(count).real)

# Python 2 print statement: this script does not run on Python 3 as written.
print optimalSize, optimalCount

bloom = BloomFilter(optimalSize, optimalCount)

falsePositive = False
inserted = 0;
falseValue = "";

# Alternate between inserting a batch of values and probing with a batch of
# other values until a probe is reported as present.
while not falsePositive:

    # Insert MAX_SIZE / 100 generated values and print the filter.
    for i in range(MAX_SIZE / 100):
        bloom.add(generateValue())
        inserted += 1
    print(bloom)

    # NOTE(review): falsePositive is overwritten on every iteration, so only
    # the last probe of each batch decides whether the outer loop stops --
    # confirm that this is intended.
    for i in range(MAX_SIZE / 100):
        falseValue = generateOther()
        falsePositive = (falseValue in bloom)