def test_union_k_fail(self):
    """Union of filters built with different error rates must be rejected.

    Both filters share capacity 100 but use error rates 0.01 and 0.001;
    ``union`` is expected to raise ``ValueError`` for that pair.
    (Presumably the differing error rates give a different hash count
    ``k`` -- confirm against the BloomFilter implementation.)
    """
    loose_filter = BloomFilter(100, 0.01)
    strict_filter = BloomFilter(100, 0.001)
    # Hand the bound method to assertRaises so it performs the call itself.
    self.assertRaises(ValueError, loose_filter.union, strict_filter)
def test_intersection_capacity_fail(self):
    """Intersection of filters with different capacities must be rejected.

    The filters share error rate 0.001 but have capacities 1000 and 100;
    ``intersection`` is expected to raise ``ValueError`` for that pair.
    """
    large_filter = BloomFilter(1000, 0.001)
    small_filter = BloomFilter(100, 0.001)
    # Hand the bound method to assertRaises so it performs the call itself.
    self.assertRaises(ValueError, large_filter.intersection, small_filter)
def test_union(self):
    """The union of two filters must report every element added to either.

    The characters with code points 97..122 ('a'..'z') are split in half:
    the second half goes into one filter, the first half into the other.
    Every character must then be found in the filter returned by ``union``.
    """
    first = BloomFilter(100, 0.001)
    second = BloomFilter(100, 0.001)

    letters = list(map(chr, range_fn(97, 123)))
    midpoint = int(len(letters) / 2)
    upper_half = letters[midpoint:]
    lower_half = letters[:midpoint]

    for letter in upper_half:
        first.add(letter)
    for letter in lower_half:
        second.add(letter)

    merged = first.union(second)
    for letter in letters:
        self.assertTrue(letter in merged)
def bloomf(self):
    """Return a new BloomFilter with capacity 1000 and error rate 0.001."""
    return BloomFilter(capacity=1000, error_rate=0.001)
from __future__ import print_function import time from pybloom_live.pybloom import BloomFilter try: range = xrange except NameError: pass NS = 10**9 for _p in range(1, 3): p = 10 ** _p for e in range(9): X = int(1000 * 10 ** (e / 2.0)) print(X, p, end='') bloomfilter = BloomFilter(X + 1, 1.0/p) t = time.time() for x in range(X): bloomfilter.add(x) print((time.time() - t) / X * NS, end='') t = time.time() for x in range(X): x in bloomfilter print((time.time() - t) / X * NS, end='') t = time.time() for x in range(X, 2*X): x in bloomfilter print((time.time() - t) / X * NS)
import re

from pybloom_live.pybloom import BloomFilter

# HTTP request headers defined for tieba.baidu.com (browser-like User-Agent).
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'tieba.baidu.com',
}

# Site roots, with and without TLS.
baidu_base_url = 'https://tieba.baidu.com'
baidu_base_url_no_https = 'http://tieba.baidu.com'

# Module-level Bloom filter; capacity 2 << 15 == 65536, 1% error rate.
# NOTE(review): the name suggests it de-duplicates title URLs -- confirm
# against the code that uses it.
title_url_bloom = BloomFilter(capacity=2 << 15, error_rate=0.01)


class BaiduTiebaSpider(scrapy.Spider):
    """Scrapy spider named "tieba", restricted to tieba.baidu.com."""

    name = "tieba"
    allowed_domains = ['tieba.baidu.com']
    # Listing URL with a URL-encoded ``kw`` keyword; ends in ``pn=`` so a
    # value can be appended (presumably a page offset -- TODO confirm).
    root_url = 'https://tieba.baidu.com/f?kw=%E7%9B%B8%E4%BA%B2&ie=utf-8&pn='
    # Per-spider settings: route items through TiebaPipeline at priority 300.
    custom_settings = {
        'ITEM_PIPELINES': {
            'spider.pipelines.TiebaPipeline': 300
        },
    }
    # NOTE(review): looks like an upper bound on crawl depth/page index;
    # its use is not visible in this chunk.
    MAX_DEEP_INDEX = 1000