import redis
from redisbloom.client import Client
from scrapy_redis.dupefilter import RFPDupeFilter


class RedisBloomDupeFilter(RFPDupeFilter):
    """
    RedisBloom-backed request duplicates filter for scrapy-redis.
    This class can also be used with Scrapy's default scheduler.
    """

    def __init__(self, server, key, debug=False, **kwargs):
        super().__init__(server, key, debug)
        spider_settings = kwargs.get('spider_settings')
        if not spider_settings:
            raise EnvironmentError(
                "Please ensure you are using 'scrapy_ddiy.utils.scheduler.SchedulerDdiy' as the SCHEDULER."
            )
        self.server = Client(host=spider_settings.get('REDIS_HOST'),
                             port=spider_settings.get('REDIS_PORT'),
                             **spider_settings.get('REDIS_PARAMS'))
        assert self.server.ping(), 'Redis failed to establish a connection; please check the settings'
        error_rate = spider_settings.getfloat('REDIS_BLOOM_ERROR_RATE')
        capacity = spider_settings.getint('REDIS_BLOOM_CAPACITY')
        assert capacity, "Please set 'REDIS_BLOOM_CAPACITY' for the spider"
        assert error_rate, "Please set 'REDIS_BLOOM_ERROR_RATE' for the spider"
        if not self.server.keys(self.key):
            try:
                # By default, the bloom filter is auto-scaling
                self.server.bfCreate(self.key, error_rate, capacity)
            except redis.exceptions.ResponseError:
                raise EnvironmentError(
                    'Redis has not loaded the RedisBloom module. See the doc [ xx ]'
                )

    def request_seen(self, request):
        """Returns True if the request was already seen."""
        fp = self.request_fingerprint(request)
        # bfAdd returns 1 if the item was added, 0 if it already existed.
        added = self.server.bfAdd(self.key, fp)
        return added == 0
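# A minimal sketch of the Scrapy settings this filter reads in __init__ above.
# The DUPEFILTER_CLASS module path is hypothetical; the SCHEDULER value comes
# from the error message, and the rest are the keys looked up via
# spider_settings.get()/getfloat()/getint().
SCHEDULER = 'scrapy_ddiy.utils.scheduler.SchedulerDdiy'
DUPEFILTER_CLASS = 'scrapy_ddiy.utils.dupefilter.RedisBloomDupeFilter'  # hypothetical path
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_PARAMS = {'db': 0}
REDIS_BLOOM_ERROR_RATE = 0.001    # false-positive probability
REDIS_BLOOM_CAPACITY = 1000000    # expected number of distinct requests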
import csv
from os import environ

import redis
from redisbloom.client import Client as RedisBloom
from redistimeseries.client import Client as RedisTimeseries


def load_data():
    # Read connection details from the environment, with local defaults.
    redis_server = environ.get('REDIS_SERVER', 'localhost')
    redis_port = int(environ.get('REDIS_PORT', 6379))
    redis_password = environ.get('REDIS_PASSWORD', '')

    rdb = redis.Redis(host=redis_server, port=redis_port, password=redis_password)
    rb = RedisBloom(host=redis_server, port=redis_port, password=redis_password)
    rts = RedisTimeseries(host=redis_server, port=redis_port, password=redis_password)

    # Load users: one hash per user, plus a list of all user names.
    with open('./users.csv', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:  # skip the header row
                rdb.hset("user:%s" % row[0].replace(" ", ''),
                         mapping={'Name': row[0],
                                  'AgeDemo': row[1],
                                  'IncomeDemo': row[2],
                                  'Sex': row[3]})
                rdb.lpush("USERLIST", row[0])
            line_count += 1

    # Load campaigns: a sorted set per campaign, plus a per-ad bloom filter,
    # view counter, time series, and a set of all ad ids.
    with open('./campaigns.csv', encoding='utf-8') as csv_file:
        rts.create('TOTALREVENUE')
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:  # skip the header row
                rdb.zadd("campaign:%s" % row[0].replace(" ", ''), {row[2]: row[1]})
                rb.bfCreate(row[2], 0.01, 1000)
                rb.set("counter:%s" % row[2].replace(" ", ''), row[3])
                rts.create("ADVIEW:%s" % row[2].replace(" ", ''))
                rb.sadd("AdStats", row[2])
            line_count += 1

    # Register the RedisGears scripts.
    for gear in ['./adgear.py', './adstats.py']:
        with open(gear, mode='r') as file:
            rdb.execute_command('RG.PYEXECUTE', file.read())
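# A rough illustration of consuming the structures created above -- it assumes
# clients configured as in load_data(), and the ad id '1AFD3' and user 'Kelly'
# are made-up values. The per-ad bloom filter answers "has this user already
# seen this ad?" without storing every impression.
rdb = redis.Redis(host='localhost', port=6379)
rb = RedisBloom(host='localhost', port=6379)
rts = RedisTimeseries(host='localhost', port=6379)

if not rb.bfExists('1AFD3', 'Kelly'):     # 0 means definitely not seen yet
    rb.bfAdd('1AFD3', 'Kelly')            # record the impression
    rdb.incr('counter:1AFD3')             # bump the ad's view counter
    rts.add('ADVIEW:1AFD3', '*', 1)       # timestamped view event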
import datetime

import redis
from redisbloom.client import Client

# BaseDb, helper, and InvalidFieldException are assumed to come from the
# surrounding project.


class Redis(BaseDb):
    '''
    Proxies are stored as hashes keyed by Proxy:IP:port, with the fields
    type, protocol, score, and ctime.
    '''
    __slots__ = ('_filter_name',)

    @property
    def filter_name(self):
        return self._filter_name

    @filter_name.setter
    def filter_name(self, value):
        self._filter_name = value

    def __init__(self, host, pwd=None, port=6379, db=0):
        super().__init__()
        self.host = host
        self.pwd = pwd
        self.port = port
        self.db = db
        self._filter_name = ''

    def connect_to_redis(self):
        try:
            self.conn = Client(host=self.host, port=self.port, db=self.db, password=self.pwd)
        except Exception as e:
            print(e)
            return False
        return True

    def gen_key_name(self, record):
        if 'ip' in record and 'port' in record:
            return 'Proxy:%s:%s' % (record['ip'], record['port'])
        return None

    def exists(self, key_name):
        '''
        Check whether a key already exists. This is the plain Redis approach,
        kept only for comparison with the bloom filter; not used in practice.
        :param key_name:
        :return: 0 (False) / 1 (True)
        '''
        return self.conn.exists(key_name)

    def delete(self, key_name):
        return self.conn.delete(key_name)

    def delete_all(self):
        return self.conn.flushdb()

    def hmset(self, record, validate_time):
        valid_fields = ['ip', 'port', 'proxy_type', 'protocol', 'score']
        for single_valid_field in valid_fields:
            if single_valid_field not in record:
                raise InvalidFieldException(single_valid_field)
        key_name = self.gen_key_name(record)
        field_value = {
            'proxy_type': record['proxy_type'],
            'protocol': record['protocol'],
            'score': record['score'],
        }
        self.conn.hmset(key_name, field_value)
        self.conn.expire(key_name, validate_time)

    def multi_hmset(self, records, validate_time):
        for single_record in records:
            self.hmset(single_record, validate_time)

    def time_interval_in_seconds(self, old_date_time, new_date_time):
        '''
        Compute the interval between old_date_time and new_date_time, in seconds.
        :param old_date_time:
        :param new_date_time:
        :return: int
        '''
        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if helper.match_expect_type(old_date_time, 'str'):
                old_date_time = datetime.datetime.strptime(old_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('old_date_time has an invalid format')
        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if helper.match_expect_type(new_date_time, 'str'):
                new_date_time = datetime.datetime.strptime(new_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('new_date_time has an invalid format')
        return int((new_date_time - old_date_time).total_seconds())

    def expire(self, key_name, ttl):
        return self.conn.expire(key_name, ttl)

    def bf_create(self, fpp=0.001, capacity=1000, expansion=1):
        '''
        Create a bloom filter.
        :param fpp: false-positive probability
        :param capacity: number of elements the filter is sized for
        :param expansion: when the filter fills up, the capacity of the new
            sub-filter relative to the current one; 1 means the same size
        :return: 0 (create failed) / 1 (create succeeded)
        '''
        try:
            self.conn.bfCreate(key=self._filter_name, errorRate=fpp,
                               capacity=capacity, expansion=expansion)
        except redis.exceptions.ResponseError:
            # The filter already exists.
            return 0
        return 1

    def bf_madd(self, records):
        # Collect the key names into a list and unpack them as variadic args.
        items = [self.gen_key_name(single_record) for single_record in records]
        self.conn.bfMAdd(self._filter_name, *items)

    def bf_add(self, record):
        item = self.gen_key_name(record)
        self.conn.bfAdd(self._filter_name, item)

    def bf_exists(self, item):
        return self.conn.bfExists(self._filter_name, item)

    def bf_mexists(self, items):
        '''
        :param items: a list; unpacked with * into variadic arguments for bfMExists
        :return:
        '''
        return self.conn.bfMExists(self._filter_name, *items)
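# A short usage sketch, assuming a locally running Redis with the RedisBloom
# module loaded; the proxy record and filter name are made-up examples.
db = Redis(host='localhost')
if db.connect_to_redis():
    db.filter_name = 'proxy_filter'
    db.bf_create(fpp=0.001, capacity=10000)

    record = {'ip': '1.2.3.4', 'port': 8080,
              'proxy_type': 'anonymous', 'protocol': 'http', 'score': 10}
    key = db.gen_key_name(record)             # 'Proxy:1.2.3.4:8080'
    if not db.bf_exists(key):                 # probably a new proxy
        db.bf_add(record)
        db.hmset(record, validate_time=3600)  # keep the hash for one hour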
from redisbloom.client import Client


def create_key(key, error, capacity):
    # `pool` is a module-level redis ConnectionPool shared across calls.
    rb = Client(connection_pool=pool)
    rb.bfCreate(key, errorRate=error, capacity=capacity)
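# For context, a minimal sketch of how `pool` might be defined and the
# function called; the host, port, and filter parameters are assumptions.
import redis

pool = redis.ConnectionPool(host='localhost', port=6379, db=0)

create_key('seen_urls', error=0.001, capacity=100000)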
import json
from typing import Union

from redis import exceptions
from redisbloom.client import Client as RedisBloom

# LayeredCache and get_client are assumed to come from the surrounding project.


class FullLayeredCache(LayeredCache):
    """
    Multi-layered key value store with bloom filter and DGraph.

    Layer 1: in-memory LRU key value map
    Layer 2: Redis key value store
    Layer 3: bloom filter
    Layer 4: DGraph

    The primary difference between this class and the LayeredCache class is
    that this one includes the bloom filter and DGraph.
    """

    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize the last two layers of the cache.
        :param node_name:
        :param lru_size:
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)
        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True
        # Create the bloom filter client object
        self.bloom = RedisBloom(port=6378)
        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()
        # Initialize the bloom filter (if it doesn't already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)

    def __contains__(self, key: str) -> bool:
        """
        Check whether the key is in some layer of the cache. We start at
        layer 1 and walk through each layer until we find a result, updating
        previous layers on a cache miss. Returns True if the key was found at
        any layer, False otherwise.
        :param key:
        :return:
        """
        # Check layers 1 and 2
        if super(FullLayeredCache, self).__contains__(key):
            return True
        # Check the layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name, self._get_key(key))
        if exists_in_bloom == 1:
            # Unfortunately, we can't store the actual value in the bloom filter,
            # so we can't update previous layers with the value for this key.
            return True
        # All else has failed; we must now check dgraph. This is very slow.
        query = """query all($a: string) {
            all(func: eq(%s, $a)) {
                uid
            }
        }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return True
        # Cache miss
        return False

    def __getitem__(self, key: str) -> Union[str, None]:
        """
        Check each layer in turn for the given key. If we find the result at
        a given layer, we update the previous layers with it. If the result
        was not found, return None.
        :param key:
        :return:
        """
        # Check layers 1 and 2
        item = super(FullLayeredCache, self).__getitem__(key)
        if item is not None:
            return item
        # Check the layer 3 bloom filter. The filter proves membership only,
        # not the value, so a hit returns True as a sentinel.
        exists_in_bloom = self.bloom.bfExists(self.node_name, self._get_key(key))
        if exists_in_bloom == 1:
            return True
        # All else has failed; we must now check dgraph. This is very slow.
        query = """query all($a: string) {
            all(func: eq(%s, $a)) {
                uid
            }
        }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return thing["all"][0]["uid"]
        # Cache miss
        return None

    def close(self):
        """
        Close all outstanding connections.
        :return:
        """
        # Close the layer 2 redis connection
        super(FullLayeredCache, self).close()
        # Close the layer 3 bloom filter connection
        self.bloom.close()
        # Close the layer 4 dgraph connections
        self.stub.close()
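# A usage sketch, under the assumption that LayeredCache takes (node_name,
# lru_size) as shown above and that RedisBloom listens on port 6378; the
# node name 'xid' and the key are made up.
cache = FullLayeredCache('xid', lru_size=4096)
try:
    if 'user-42' in cache:          # walks layers 1-4 in order
        uid = cache['user-42']      # str uid, True (bloom-only hit), or None
        print('resolved to', uid)
finally:
    cache.close()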