def __prepare_bloom(self):
    """Fill a new scalable bloom filter with the existing update-key values.

    Selects the columns named by the update keys from the table (with the
    ``stream_results`` execution option enabled) and adds every returned
    row to the filter as a plain tuple.
    """
    existing = pybloom_live.ScalableBloomFilter()
    self.__bloom = existing
    key_columns = [getattr(self.__table.c, name) for name in self.__update_keys]
    query = select(key_columns).execution_options(stream_results=True)
    for row in query.execute():
        existing.add(tuple(row))
def start_requests(self):
    """Reset the crawl state and yield the request for the first page.

    ``crawls_left`` is set to ``MAX_CRAWL`` and ``filter`` becomes a new
    scalable bloom filter holding the ``url`` of every ``News`` object
    returned by ``News.objects.all()``.
    """
    self.crawls_left = MAX_CRAWL
    known_urls = pybloom_live.ScalableBloomFilter()
    self.filter = known_urls
    for stored in News.objects.all():
        known_urls.add(stored.url)
    yield Request('https://voice.hupu.com/nba/1', headers=self.headers)
def __init__(self, black_patterns=(), white_patterns=(r"^http",), capacity=None):
    """
    constructor: compile the black/white patterns case-insensitively and keep
    either a ScalableBloomFilter (capacity is truthy) or a plain set (otherwise)
    """
    def _compile_all(patterns):
        # A falsy argument (None, empty sequence) gives an empty pattern list.
        if not patterns:
            return []
        return [re.compile(item, flags=re.IGNORECASE) for item in patterns]

    self._re_black_list = _compile_all(black_patterns)
    self._re_white_list = _compile_all(white_patterns)

    # Exactly one of the two containers is populated; the other is None.
    if capacity:
        self._url_set = None
        self._bloom_filter = pybloom_live.ScalableBloomFilter(capacity, error_rate=0.001)
    else:
        self._url_set = set()
        self._bloom_filter = None
def __init__(self, black_patterns=(CONFIG_URLPATTERN_ALL,), white_patterns=("^http",), capacity=None):
    """
    constructor, use variable of BloomFilter if capacity else variable of set

    :param black_patterns: regex patterns compiled case-insensitively into
        ``re_black_list``; ``None`` or an empty sequence gives an empty list
    :param white_patterns: regex patterns compiled case-insensitively into
        ``re_white_list``; ``None`` or an empty sequence gives an empty list
    :param capacity: when truthy it is passed as the first argument of
        ``ScalableBloomFilter`` (with ``error_rate=0.001``) and ``url_set``
        is None; otherwise ``url_set`` is an empty set and ``bloom_filter``
        is None
    """
    # "or ()" lets a caller pass None to switch a list off instead of
    # hitting a TypeError while iterating it.
    self.re_black_list = [re.compile(_pattern, flags=re.IGNORECASE) for _pattern in (black_patterns or ())]
    self.re_white_list = [re.compile(_pattern, flags=re.IGNORECASE) for _pattern in (white_patterns or ())]

    self.url_set = set() if not capacity else None
    self.bloom_filter = pybloom_live.ScalableBloomFilter(capacity, error_rate=0.001) if capacity else None
def __init__(self, black=(CONFIG_URLPATTERN_ALL, ), white=('^http', ), bloom_capacity: int = 0):
    """Compile the black/white pattern lists and pick the URL container.

    Patterns are compiled with ``re.IGNORECASE``; a falsy ``black`` or
    ``white`` gives an empty list.
    """
    def _build(patterns):
        return [re.compile(expr, flags=re.IGNORECASE) for expr in (patterns or ())]

    self.black_list = _build(black)
    self.white_list = _build(white)

    # a truthy bloom_capacity selects the bloom filter, anything else the set
    if bloom_capacity:
        self.url_set = None
        self.bloom_filter = pybloom_live.ScalableBloomFilter(
            bloom_capacity, error_rate=0.001)
    else:
        self.url_set = set()
        self.bloom_filter = None
def bloom():
    """Fill a scalable bloom filter with random strings and dump it to ``a.bloom``.

    Makes 10,000,000 attempts. Each attempt calls ``get_random_str`` with a
    length drawn from ``random.randint(100, 500)`` and adds the result to a
    ``SMALL_SET_GROWTH`` filter. The module-level ``count`` is advanced once
    per string obtained, and a progress line is printed whenever it reaches
    a multiple of 100000. The filter is finally written to the file
    ``a.bloom``.
    """
    global count
    # Named bloom_filter so the builtin ``filter`` is not shadowed.
    bloom_filter = pybloom_live.ScalableBloomFilter(
        mode=pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH)
    for _ in range(10000000):
        try:
            url = get_random_str(random.randint(100, 500))
        except Exception:
            # Best effort: a failed generation is skipped and not counted.
            continue
        # Count only after success; same net effect as incrementing first
        # and undoing the increment on failure.
        count += 1
        if count % 100000 == 0:
            print(f'count: {count}')
        bloom_filter.add(url)
    with open('a.bloom', 'wb+') as f:
        bloom_filter.tofile(f)
def __init__(self, capacity=None):
    """Create the URL container.

    A falsy ``capacity`` gives an empty ``url_set`` and no bloom filter.
    Otherwise ``url_set`` is None and ``bloom_filter`` is a
    ``ScalableBloomFilter`` built from ``capacity`` with ``error_rate=0.001``.
    """
    if capacity:
        self.url_set = None
        self.bloom_filter = pybloom_live.ScalableBloomFilter(
            capacity, error_rate=0.001)
    else:
        self.url_set = set()
        self.bloom_filter = None
def set_deduplication(self, off = False):
    """Replace ``self.bloom`` according to ``off``.

    A truthy ``off`` stores None; otherwise a new ``ScalableBloomFilter``
    created with ``LARGE_SET_GROWTH`` mode is stored.
    """
    if not off:
        growth_mode = pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH
        self.bloom = pybloom_live.ScalableBloomFilter(mode = growth_mode)
        return
    self.bloom = None
def __init__(self, black=(CONFIG_URLPATTERN_ALL,), white=("^http",), bloom_capacity=None):
    """Compile the black/white pattern lists and set up the URL container.

    Patterns are compiled with ``re.IGNORECASE``; a falsy ``black`` or
    ``white`` results in an empty list. A truthy ``bloom_capacity`` creates
    a ``ScalableBloomFilter`` (``error_rate=0.001``) and leaves ``_url_set``
    as None; otherwise an empty set is used and ``_bloom_filter`` is None.
    """
    compiled_black = []
    if black:
        for expr in black:
            compiled_black.append(re.compile(expr, flags=re.IGNORECASE))
    self._black_list = compiled_black

    compiled_white = []
    if white:
        for expr in white:
            compiled_white.append(re.compile(expr, flags=re.IGNORECASE))
    self._white_list = compiled_white

    wants_bloom = bool(bloom_capacity)
    self._url_set = None if wants_bloom else set()
    self._bloom_filter = (
        pybloom_live.ScalableBloomFilter(bloom_capacity, error_rate=0.001)
        if wants_bloom
        else None
    )
def __prepare_bloom(self):
    """Fill ``self.bloom`` with the update-key values already in the table.

    Selects the columns named in ``self.update_keys`` from ``self.table``
    (with the ``stream_results`` execution option enabled) and adds one
    entry per returned row to a new scalable bloom filter.
    """
    self.bloom = pybloom_live.ScalableBloomFilter()
    columns = [getattr(self.table.c, key) for key in self.update_keys]
    keys = select(columns).execution_options(stream_results=True).execute()
    for key in keys:
        # Store a plain tuple of the values rather than the driver's row
        # object, so the hashed representation does not depend on the row
        # class.
        # NOTE(review): pybloom_live appears to hash non-string keys via
        # str(key) - confirm membership checks also use tuples.
        self.bloom.add(tuple(key))
def new_bf():
    """Return a new ``ScalableBloomFilter`` using ``SMALL_SET_GROWTH`` mode."""
    filter_cls = pybloom_live.ScalableBloomFilter
    return filter_cls(mode=filter_cls.SMALL_SET_GROWTH)
def __init__(self):
    """Create a ``LARGE_SET_GROWTH`` scalable bloom filter and an empty set."""
    growth_mode = pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH
    self.bloom_filter = pybloom_live.ScalableBloomFilter(mode=growth_mode)
    self.set = set()