Code Example #1
import redis
from redisbloom.client import Client  # assumed import for the redisbloom-py client
from scrapy_redis.dupefilter import RFPDupeFilter  # assumed base class, matching the (server, key, debug) signature


class RedisBloomDupeFilter(RFPDupeFilter):
    """
    RedisBloom-backed request duplicates filter for redis-based spiders.
    This class can also be used with Scrapy's default scheduler.
    """
    def __init__(self, server, key, debug=False, **kwargs):
        super().__init__(server, key, debug)
        spider_settings = kwargs.get('spider_settings')
        if not spider_settings:
            raise EnvironmentError(
                "Please ensure you are using 'scrapy_ddiy.utils.scheduler.SchedulerDdiy' as the SCHEDULER."
            )

        # Replace the connection from the base class with a RedisBloom-capable client.
        self.server = Client(host=spider_settings.get('REDIS_HOST'),
                             port=spider_settings.get('REDIS_PORT'),
                             **spider_settings.get('REDIS_PARAMS'))
        assert self.server.ping(), \
            'Redis failed to establish a connection, please check the settings'
        error_rate = spider_settings.getfloat('REDIS_BLOOM_ERROR_RATE')
        capacity = spider_settings.getint('REDIS_BLOOM_CAPACITY')
        assert capacity, "Please set the 'REDIS_BLOOM_CAPACITY' for the spider"
        assert error_rate, "Please set the 'REDIS_BLOOM_ERROR_RATE' for the spider"
        if not self.server.keys(self.key):
            try:
                # By default, bloom-filter is auto-scaling
                self.server.bfCreate(self.key, error_rate, capacity)
            except redis.exceptions.ResponseError:
                raise EnvironmentError(
                    'The Redis server has not loaded the RedisBloom module. See the doc [ xx ]'
                )

    def request_seen(self, request):
        """Returns True if request was already seen"""
        fp = self.request_fingerprint(request)
        # BF.ADD returns 1 if the item was newly added, 0 if it may already exist.
        added = self.server.bfAdd(self.key, fp)
        return added == 0
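
The filter above pulls its configuration from the spider's settings. A minimal settings sketch, assuming the class lives at a hypothetical myproject.dupefilters path; the SCHEDULER value and the REDIS_* key names are the ones the code itself references:

# Hypothetical Scrapy settings.py excerpt; only the key names the filter reads are taken from the source.
SCHEDULER = 'scrapy_ddiy.utils.scheduler.SchedulerDdiy'
DUPEFILTER_CLASS = 'myproject.dupefilters.RedisBloomDupeFilter'  # hypothetical module path
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_PARAMS = {}                # extra kwargs forwarded to the RedisBloom Client
REDIS_BLOOM_ERROR_RATE = 0.001   # bloom filter false positive rate
REDIS_BLOOM_CAPACITY = 100000    # expected number of request fingerprints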
Code Example #2
import csv
from os import environ

import redis
# Assumed client imports from the redisbloom-py and redistimeseries-py packages.
from redisbloom.client import Client as RedisBloom
from redistimeseries.client import Client as RedisTimeseries


def load_data():

    # Fall back to a local, unauthenticated Redis instance when no
    # environment overrides are set.
    redis_server = environ.get('REDIS_SERVER', 'localhost')
    redis_port = int(environ.get('REDIS_PORT', 6379))
    redis_password = environ.get('REDIS_PASSWORD', '')

    rdb = redis.Redis(host=redis_server,
                      port=redis_port,
                      password=redis_password)
    rb = RedisBloom(host=redis_server,
                    port=redis_port,
                    password=redis_password)
    rts = RedisTimeseries(host=redis_server,
                          port=redis_port,
                          password=redis_password)

    with open('./users.csv', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:  # skip the CSV header row
                rdb.hset("user:%s" % (row[0].replace(" ", '')),
                         mapping={
                             'Name': row[0],
                             'AgeDemo': row[1],
                             'IncomeDemo': row[2],
                             'Sex': row[3]
                         })
                rdb.lpush("USERLIST", row[0])
            line_count += 1

    with open('./campaigns.csv', encoding='utf-8') as csv_file:
        rts.create('TOTALREVENUE')
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:  # skip the CSV header row
                rdb.zadd("campaign:%s" % (row[0].replace(" ", '')),
                         {row[2]: row[1]})
                rb.bfCreate(row[2], 0.01, 1000)
                rb.set("counter:%s" % (row[2].replace(" ", '')), row[3])
                rts.create("ADVIEW:%s" % (row[2].replace(" ", '')))
                rb.sadd("AdStats", row[2])
            line_count += 1

    # Register the RedisGears scripts with the server.
    for gear in ['./adgear.py', './adstats.py']:
        with open(gear, mode='r') as file:
            g = file.read()
        rdb.execute_command('RG.PYEXECUTE', g)
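
A minimal sketch of driving load_data(), assuming a Redis instance with the RedisBloom, RedisTimeSeries and RedisGears modules loaded, and users.csv and campaigns.csv in the working directory; the three environment variable names are the ones the function checks before falling back to localhost:6379 with no password:

from os import environ

environ['REDIS_SERVER'] = 'redis.example.com'  # hypothetical host
environ['REDIS_PORT'] = '6379'
environ['REDIS_PASSWORD'] = 'secret'           # hypothetical password

load_data()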
Code Example #3
import datetime

import redis
from redisbloom.client import Client  # assumed import for the redisbloom-py client

# BaseDb, helper and InvalidFieldException come from the surrounding project.


class Redis(BaseDb):
    '''
    Proxies are stored as hashes keyed as proxy:IP:port, with the fields
    type, protocol, score and ctime.
    '''

    __slots__ = ('_filter_name',)

    @property
    def filter_name(self):
        return self._filter_name

    @filter_name.setter
    def filter_name(self, value):
        self._filter_name = value

    def __init__(self, host, pwd=None, port=6379, db=0):
        super().__init__()
        self.host = host
        self.pwd = pwd
        self.port = port
        self.db = db
        self._filter_name = ''

    def connect_to_redis(self):
        try:
            self.conn = Client(host=self.host,
                               port=self.port,
                               db=self.db,
                               password=self.pwd)
        except Exception as e:
            print(e)
            return False

        return True

    def gen_key_name(self, record):
        if 'ip' in record and 'port' in record:
            return 'Proxy:%s:%s' % (record['ip'], record['port'])
        else:
            return None

    def exists(self, key_name):
        '''
        Check whether a key already exists via a plain EXISTS call; kept only
        for comparison against the bloom filter, not used in practice.
        :param key_name:
        :return: 0 (False) / 1 (True)
        '''
        return self.conn.exists(key_name)

    def delete(self, key_name):
        return self.conn.delete(key_name)

    def delete_all(self):
        return self.conn.flushdb()

    # def hdelete(self, key_name):
    #     return self.conn.hdel(key_name)

    def hmset(self, record, validate_time):
        valid_fields = ['ip', 'port', 'proxy_type', 'protocol', 'score']
        for single_valid_field in valid_fields:
            if single_valid_field not in record:
                raise InvalidFieldException(single_valid_field)

        key_name = self.gen_key_name(record)
        field_value = {
            'proxy_type': record['proxy_type'],
            'protocol': record['protocol'],
            'score': record['score'],
            # 'ctime': record['ctime']
        }

        # HMSET is deprecated in newer redis-py; hset(key_name, mapping=field_value) is the replacement.
        self.conn.hmset(key_name, field_value)
        self.conn.expire(key_name, validate_time)

    def multi_hmset(self, records, validate_time):
        for single_record in records:
            self.hmset(single_record, validate_time)

    def time_interval_in_seconds(self, old_date_time, new_date_time):
        '''
        Compute the interval between old_date_time and new_date_time, in seconds.
        :param old_date_time:
        :param new_date_time:
        :return:    int
        '''

        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if helper.match_expect_type(old_date_time, 'str'):
                old_date_time = datetime.datetime.strptime(
                    old_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('old_date_time has an invalid format')

        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if helper.match_expect_type(new_date_time, 'str'):
                new_date_time = datetime.datetime.strptime(
                    new_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('new_date_time has an invalid format')

        return int((new_date_time - old_date_time).total_seconds())

    def expire(self, key_name, ttl):
        return self.conn.expire(key_name, ttl)

    def bf_create(self, fpp=0.001, capacity=1000, expansion=1):
        '''
        Create a bloom filter named self._filter_name.
        :param fpp: false positive probability
        :param capacity: number of elements the filter is expected to store
        :param expansion: when the filter fills up, each new sub-filter's
                          capacity is this many times the current one; 1 means the same size
        :return: 0 (create fail) / 1 (create success)
        '''
        try:
            self.conn.bfCreate(key=self._filter_name,
                               errorRate=fpp,
                               capacity=capacity,
                               expansion=expansion)
        except redis.exceptions.ResponseError:
            # Raised when the filter already exists.
            return 0
        return 1

    def bf_madd(self, records):
        # Build one key name per record and add them all in a single BF.MADD call.
        items = [self.gen_key_name(single_record) for single_record in records]
        self.conn.bfMAdd(self._filter_name, *items)

    def bf_add(self, record):
        item = self.gen_key_name(record)
        self.conn.bfAdd(self._filter_name, item)

    def bf_exists(self, item):
        return self.conn.bfExists(self._filter_name, item)

    def bf_mexists(self, items):
        '''
        :param items: a list; it is unpacked with * into bfMExists's varargs
        :return:
        '''
        return self.conn.bfMExists(self._filter_name, *items)
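
A usage sketch under assumed values; BaseDb, helper and InvalidFieldException are provided by the surrounding project, and the filter name, proxy record and TTL below are illustrative:

db = Redis(host='127.0.0.1')
if db.connect_to_redis():
    db.filter_name = 'proxy_bf'                    # hypothetical filter name
    db.bf_create(fpp=0.001, capacity=10000)        # returns 0 if the filter already exists
    record = {'ip': '1.2.3.4', 'port': 8080,
              'proxy_type': 'anonymous', 'protocol': 'http', 'score': 10}
    if not db.bf_exists(db.gen_key_name(record)):  # cheap probabilistic membership test
        db.bf_add(record)
        db.hmset(record, validate_time=3600)       # store the hash with a 1 hour TTL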
Code Example #4
File: black_list.py  Project: williamsyb/mycookbook
def create_key(key, error, capacity):
    rb = Client(connection_pool=pool)
    rb.bfCreate(key, errorRate=error, capacity=capacity)
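
The snippet assumes a module-level connection pool named pool; a minimal sketch of that setup, with placeholder host, port and filter parameters:

import redis
from redisbloom.client import Client

# Hypothetical shared pool the create_key() helper above expects.
pool = redis.ConnectionPool(host='localhost', port=6379)

# Reserve a filter sized for one million entries at a 0.1% false positive rate.
create_key('black_list', error=0.001, capacity=1000000)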
Code Example #5
import json
from typing import Union

from redis import exceptions
from redisbloom.client import Client as RedisBloom  # assumed import for the redisbloom-py client

# LayeredCache and get_client come from the surrounding project.


class FullLayeredCache(LayeredCache):
    """
    Multi-Layered key value store with bloom filter and dgraph.

    Layer 1: In Memory LRU Key Value Map
    Layer 2: Redis Key Value Store
    Layer 3: Bloom filter
    Layer 4: DGraph

    The primary difference between this class and the LayeredCache class is that this
    one includes the bloom filter and DGraph.
    """
    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize the last two layers of the cache.

        :param node_name:
        :param lru_size:
        :param p: bloom filter false positive probability
        :param n: bloom filter capacity
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)

        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True

        # Create the bloom filter client object
        self.bloom = RedisBloom(port=6378)

        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()

        # Initialize the bloom filter (if it doesn't already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)

    def __contains__(self, key: str) -> bool:
        """
        Check to see if the key is in a layer of the cache. We start at
        layer 1 and walk through each layer until we find a result,
        updating earlier layers where possible when they miss but a deeper layer hits.

        We'll return True if the key was found at a layer, False if we
        cache miss.

        :param key:
        :return:
        """

        # Check layer 1 and 2
        if super(FullLayeredCache, self).__contains__(key):
            return True

        # Check the layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name,
                                              self._get_key(key))
        if exists_in_bloom == 1:
            # Unfortunately, a bloom filter can't store the actual value, so
            # we can't update the previous layers with the value for this key.
            return True

        # All else has failed, we must now check dgraph. This is super super slow.
        query = """query all($a: string) { all(func: eq(%s, $a)) { uid } }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return True

        # Cache miss, return False
        return False

    def __getitem__(self, key: str) -> Union[str, bool, None]:
        """
        Check each layer iteratively for the key specified. If we find the result
        at a given layer, we update previous layers with the result.

        If only the bloom filter hits, the value itself is unrecoverable and
        True is returned instead; if the result was not found, return None.

        :param key:
        :return:
        """
        # Check layer 1 and 2
        item = super(FullLayeredCache, self).__getitem__(key)
        if item is not None:
            return item

        # Check layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name,
                                              self._get_key(key))
        if exists_in_bloom == 1:
            # The bloom filter only proves probable membership, not the value.
            return True

        # All else has failed, we must now check dgraph. This is super super slow.
        query = """query all($a: string) { all(func: eq(%s, $a)) { uid } }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return thing["all"][0]["uid"]

        # Cache miss, return None
        return None

    def close(self):
        """
        Close all outstanding connections

        :return:
        """

        # Close the layer 2 redis connection
        super(FullLayeredCache, self).close()

        # Close layer 3 bloom filter connection
        self.bloom.close()

        # Close layer 4 dgraph connections
        self.stub.close()
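
A usage sketch under assumed names; LayeredCache, get_client and the person_name predicate are stand-ins for whatever the surrounding project provides:

cache = FullLayeredCache('person_name', lru_size=4096)  # hypothetical predicate and size
try:
    if 'alice' in cache:      # walks layers 1-4 until one of them hits
        uid = cache['alice']  # may be True when only the bloom layer can answer
finally:
    cache.close()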