Ejemplo n.º 1
0
 def __prepare_bloom(self):
     """Fill a new scalable Bloom filter with the key tuples found in the table.

     One column is looked up on the table for every name in the update
     keys, the resulting selection is executed with ``stream_results=True``,
     and every returned row is stored in the filter as a tuple.
     """
     self.__bloom = pybloom_live.ScalableBloomFilter()
     key_columns = []
     for name in self.__update_keys:
         key_columns.append(getattr(self.__table.c, name))
     query = select(key_columns).execution_options(stream_results=True)
     for row in query.execute():
         # Rows are converted to plain tuples before being added.
         self.__bloom.add(tuple(row))
Ejemplo n.º 2
0
Archivo: bot.py Proyecto: Chrogeek/CTRL
    def start_requests(self):
        """Reset crawl state, preload known URLs, and yield the first request.

        This is a generator: the setup below only runs once iteration starts.
        The crawl budget is set to ``MAX_CRAWL``, a fresh scalable Bloom
        filter is stored on ``self.filter`` and loaded with the ``url`` of
        every ``News`` object, and a single ``Request`` for the first
        listing page is yielded with ``self.headers``.
        """
        self.crawls_left = MAX_CRAWL
        self.filter = pybloom_live.ScalableBloomFilter()

        # Seed the filter with the URL of every News object returned.
        for article in News.objects.all():
            self.filter.add(article.url)

        yield Request('https://voice.hupu.com/nba/1', headers=self.headers)
Ejemplo n.º 3
0
    def __init__(self, black_patterns=(), white_patterns=(r"^http",), capacity=None):
        """
        Compile the URL pattern lists and choose the container for seen URLs.

        :param black_patterns: regex sources compiled case-insensitively into
            ``_re_black_list``; a falsy value gives an empty list.
        :param white_patterns: regex sources compiled case-insensitively into
            ``_re_white_list``; a falsy value gives an empty list.
        :param capacity: when truthy, a ScalableBloomFilter (error rate 0.001)
            is created with it and ``_url_set`` is None; otherwise a plain
            ``set`` is used and ``_bloom_filter`` is None.
        """
        def _compile_all(patterns):
            # Falsy input (empty tuple, None, ...) means "no patterns".
            if not patterns:
                return []
            return [re.compile(source, flags=re.IGNORECASE) for source in patterns]

        self._re_black_list = _compile_all(black_patterns)
        self._re_white_list = _compile_all(white_patterns)

        # Exactly one of the two containers is active; the other stays None.
        if capacity:
            self._url_set = None
            self._bloom_filter = pybloom_live.ScalableBloomFilter(capacity, error_rate=0.001)
        else:
            self._url_set = set()
            self._bloom_filter = None
Ejemplo n.º 4
0
    def __init__(self, black_patterns=(CONFIG_URLPATTERN_ALL,), white_patterns=("^http",), capacity=None):
        """
        Compile the black/white URL patterns and choose the seen-URL container.

        Both pattern iterables are compiled case-insensitively. When
        ``capacity`` is truthy a ScalableBloomFilter (error rate 0.001) is
        created and ``url_set`` is None; otherwise ``url_set`` is an empty
        set and ``bloom_filter`` is None.
        """
        compiled_black = []
        for source in black_patterns:
            compiled_black.append(re.compile(source, flags=re.IGNORECASE))
        self.re_black_list = compiled_black

        compiled_white = []
        for source in white_patterns:
            compiled_white.append(re.compile(source, flags=re.IGNORECASE))
        self.re_white_list = compiled_white

        # Exactly one of the two containers is active; the other stays None.
        if capacity:
            self.url_set = None
            self.bloom_filter = pybloom_live.ScalableBloomFilter(capacity, error_rate=0.001)
        else:
            self.url_set = set()
            self.bloom_filter = None
Ejemplo n.º 5
0
    def __init__(self,
                 black=(CONFIG_URLPATTERN_ALL, ),
                 white=('^http', ),
                 bloom_capacity: int = 0):
        """Compile the pattern lists and select the seen-URL container.

        Args:
            black: regex sources compiled case-insensitively into
                ``black_list``; a falsy value gives an empty list.
            white: regex sources compiled case-insensitively into
                ``white_list``; a falsy value gives an empty list.
            bloom_capacity: when non-zero, a ScalableBloomFilter (error rate
                0.001) is created with it and ``url_set`` is None; otherwise
                a plain set is used and ``bloom_filter`` is None.
        """
        def _compile_all(sources):
            if not sources:
                return []
            return [re.compile(item, flags=re.IGNORECASE) for item in sources]

        self.black_list = _compile_all(black)
        self.white_list = _compile_all(white)

        # A non-zero capacity selects the Bloom filter, otherwise the set.
        if bloom_capacity:
            self.url_set = None
            self.bloom_filter = pybloom_live.ScalableBloomFilter(
                bloom_capacity, error_rate=0.001)
        else:
            self.url_set = set()
            self.bloom_filter = None
Ejemplo n.º 6
0
def bloom():
    """Add ten million random strings to a Bloom filter and save it to ``a.bloom``.

    The module-level ``count`` is incremented before each string is
    generated and decremented again if generation raises, in which case the
    round is skipped. A progress line is printed whenever ``count`` is a
    multiple of 100000.
    """
    global count
    growth_mode = pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH
    bloom_filter = pybloom_live.ScalableBloomFilter(mode=growth_mode)
    for _ in range(10000000):
        count += 1
        try:
            length = random.randint(100, 500)
            candidate = get_random_str(length)
        except Exception:
            # Generation failed: undo the increment and skip this round.
            count -= 1
            continue

        if not count % 100000:
            print(f'count: {count}')
        bloom_filter.add(candidate)

    with open('a.bloom', 'wb+') as out_file:
        bloom_filter.tofile(out_file)
Ejemplo n.º 7
0
    def __init__(self, capacity=None):
        """Choose the seen-URL container based on ``capacity``.

        A truthy ``capacity`` creates a ScalableBloomFilter (error rate
        0.001) and leaves ``url_set`` as None; otherwise ``url_set`` is an
        empty set and ``bloom_filter`` is None.
        """
        if capacity:
            self.url_set = None
            self.bloom_filter = pybloom_live.ScalableBloomFilter(
                capacity, error_rate=0.001)
        else:
            self.url_set = set()
            self.bloom_filter = None
Ejemplo n.º 8
0
 def set_deduplication(self, off=False):
   """Enable or disable the deduplication filter stored on ``self.bloom``.

   When ``off`` is truthy ``self.bloom`` is set to None; otherwise it is
   replaced by a new ScalableBloomFilter using ``LARGE_SET_GROWTH``.
   """
   if not off:
     bloom_cls = pybloom_live.ScalableBloomFilter
     self.bloom = bloom_cls(mode=bloom_cls.LARGE_SET_GROWTH)
     return
   self.bloom = None
Ejemplo n.º 9
0
 def __init__(self, black=(CONFIG_URLPATTERN_ALL,), white=("^http",), bloom_capacity=None):
     """Compile the pattern lists and select the seen-URL container.

     Falsy ``black`` / ``white`` values give empty lists; otherwise every
     entry is compiled case-insensitively. A truthy ``bloom_capacity``
     creates a ScalableBloomFilter (error rate 0.001) and leaves
     ``_url_set`` as None; otherwise a plain set is used and
     ``_bloom_filter`` is None.
     """
     def _compile_all(sources):
         if not sources:
             return []
         return [re.compile(item, flags=re.IGNORECASE) for item in sources]

     self._black_list = _compile_all(black)
     self._white_list = _compile_all(white)

     # Exactly one of the two containers is active; the other stays None.
     if bloom_capacity:
         self._url_set = None
         self._bloom_filter = pybloom_live.ScalableBloomFilter(bloom_capacity, error_rate=0.001)
     else:
         self._url_set = set()
         self._bloom_filter = None
Ejemplo n.º 10
0
 def __prepare_bloom(self):
     """Load a new scalable Bloom filter with the key rows found in the table.

     One column is looked up on ``self.table`` for every name in
     ``self.update_keys``, the selection is executed with
     ``stream_results=True``, and each returned row is added to
     ``self.bloom`` unchanged.
     """
     self.bloom = pybloom_live.ScalableBloomFilter()
     key_columns = [getattr(self.table.c, name) for name in self.update_keys]
     query = select(key_columns).execution_options(stream_results=True)
     result = query.execute()
     for row in result:
         self.bloom.add(row)
Ejemplo n.º 11
0
def new_bf():
    """Return a new, empty ScalableBloomFilter using ``SMALL_SET_GROWTH``."""
    bloom_cls = pybloom_live.ScalableBloomFilter
    return bloom_cls(mode=bloom_cls.SMALL_SET_GROWTH)
Ejemplo n.º 12
0
 def __init__(self):
     """Create the two containers: a Bloom filter and a plain set.

     ``bloom_filter`` is a ScalableBloomFilter using ``LARGE_SET_GROWTH``;
     ``set`` starts out as an empty built-in set.
     """
     growth_mode = pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH
     self.bloom_filter = pybloom_live.ScalableBloomFilter(mode=growth_mode)
     self.set = set()