from itertools import chain, compress
from typing import Callable, List, TypeVar

from more_itertools import chunked
from redisbloom.client import Client

# BloomFilterClient and DEFAULT_MAX_COMMAND_PARAMS are defined elsewhere
# in the surrounding module.
T = TypeVar('T')


class RedisBloomFilterClient(BloomFilterClient):
    """A RedisBloom-based bloom filter client."""

    def __init__(self, *, redis_client: Client = None, redis_host: str = None,
                 redis_port: int = None,
                 max_command_params: int = DEFAULT_MAX_COMMAND_PARAMS):
        if redis_client is not None:
            self.redis_client = redis_client
        else:
            self.redis_client = Client(host=redis_host, port=redis_port)
        self.__max_command_params = max_command_params

    def exists(self, key: str, objects: List[T],
               value_func: Callable[[T], str] = str) -> List[T]:
        # If the bloom filter key doesn't exist, all values should be returned.
        if not key or not objects or not self.redis_client.exists(key):
            return objects
        if not isinstance(objects, list):
            raise ValueError("The objects parameter must be a list")
        # Split the object list according to the maximum number of parameters
        # allowed per Redis command.
        if self.__max_command_params:
            commands = list(chunked(objects, self.__max_command_params))
        else:
            commands = [objects]
        # Pipeline the BF.MEXISTS calls so they are all sent to Redis at once.
        pipeline = self.redis_client.pipeline(transaction=False)
        for command in commands:
            pipeline.bfMExists(key, *map(value_func, command))
        # Execute and get results for all the pipelined commands.
        results = pipeline.execute()
        # Keep only the objects whose membership check came back positive.
        return list(chain(*[compress(*pairs) for pairs in zip(commands, results)]))
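# A minimal usage sketch for the client above, assuming a local Redis with the
# RedisBloom module loaded; the key name and URL values are illustrative.
client = RedisBloomFilterClient(redis_host='localhost', redis_port=6379)
urls = ['https://a.example/1', 'https://a.example/2']
maybe_seen = client.exists('seen:urls', urls)  # subset (probably) already in the filter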
from os import environ
import fileinput

import redis
from redisbloom.client import Client as RedisBloom             # assumed import alias
from redistimeseries.client import Client as RedisTimeseries   # assumed import alias


def load_data():
    redis_server = environ.get('REDIS_SERVER', 'localhost')
    redis_port = int(environ.get('REDIS_PORT', 6379))
    redis_password = environ.get('REDIS_PASSWORD', '')

    rdb = redis.Redis(host=redis_server, port=redis_port, password=redis_password)
    rb = RedisBloom(host=redis_server, port=redis_port, password=redis_password)
    rts = RedisTimeseries(host=redis_server, port=redis_port, password=redis_password)

    rdb.set("CONFIG", "YES")

    # Short-lived series feed the longer-lived ones through compaction rules.
    rts.create('s-unfiltered', retention_ms=60000)
    rts.create('s-filtered', retention_ms=60000)
    rts.create('unfiltered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.create('filtered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.createrule('s-unfiltered', 'unfiltered', 'last', 1000)
    rts.createrule('s-filtered', 'filtered', 'last', 1000)

    # Register the RedisGears dedup script.
    for gear in ['./dedup.py']:
        with open(gear, mode='r') as file:
            rdb.execute_command('RG.PYEXECUTE', file.read())

    # Optionally preload the Scrabble word list into a bloom filter.
    if environ.get('REDIS_SCRABBLE') is not None:
        for line in fileinput.input("2019_Collins_Scrabble_Words.txt"):
            rb.bfAdd("Scrabble-Bloom", line.rstrip())
import logging

from redisbloom.client import Client


class RedisManager():
    def __init__(self, settings, stats):
        self.logger = logging.getLogger(__name__)
        self.settings = settings
        self.stats = stats
        redis_host = self.settings.get('REDIS_HOST')
        redis_port = self.settings.get('REDIS_PORT')
        redis_password = self.settings.get('REDIS_PASSWORD')
        try:
            self.rb = Client(host=redis_host, port=redis_port,
                             password=redis_password)
            # Client() connects lazily, so ping() to verify the connection now;
            # otherwise this except branch can never fire.
            self.rb.ping()
            self.logger.info("Successfully connected to redis server")
        except Exception as e:
            self.logger.error(f"Unable to connect to redis server: {e}")

    def _bf_add_url_(self, url):
        try:
            bf_add = self.rb.bfAdd('bf_urls', url)
            if bf_add:
                self.stats.inc_value('redis/bloomfilter/added_urls')
                self.logger.info(f"Added '{url}' to bloomfilter.")
            else:
                self.logger.error(f"Couldn't add '{url}' to bloomfilter")
        except Exception as e:
            self.logger.error(e)

    def _bf_check_url_pres_(self, url):
        if self.rb.bfExists('bf_urls', url):
            self.logger.debug(f"Found '{url}' in bloomfilter")
            self.stats.inc_value('redis/bloomfilter/existing_urls')
            return True
        else:
            self.logger.debug(f"Couldn't find '{url}' in bloomfilter")
            self.stats.inc_value('redis/bloomfilter/not_existing_urls')
            return False


# if __name__ == '__main__':
#     rm = RedisManager()
#     rm._bf_add_url_("test1")
def __init__(self):
    host = os.getenv("REDIS_HOST")
    port = os.getenv("REDIS_PORT")
    if not host or not port:
        raise Exception(
            "No Redis host or port provided. Please provide them in the "
            "docker run command as environment variables."
        )
    port = int(port)
    self.redis_client = redis.Redis(host=host, port=port)
    self.bloom_client = Client(host=host, port=port)
class RedisBloomDupeFilter(RFPDupeFilter):
    """
    RedisBloom-based request duplicates filter for redis-spider.
    This class can also be used with Scrapy's default scheduler.
    """

    def __init__(self, server, key, debug=False, **kwargs):
        super().__init__(server, key, debug)
        spider_settings = kwargs.get('spider_settings')
        if not spider_settings:
            raise EnvironmentError(
                "Please ensure you are using 'scrapy_ddiy.utils.scheduler.SchedulerDdiy' as the SCHEDULER."
            )
        self.server = Client(host=spider_settings.get('REDIS_HOST'),
                             port=spider_settings.get('REDIS_PORT'),
                             **spider_settings.get('REDIS_PARAMS'))
        assert self.server.ping(), 'Redis failed to establish a connection, please check the settings'
        error_rate = spider_settings.getfloat('REDIS_BLOOM_ERROR_RATE')
        capacity = spider_settings.getint('REDIS_BLOOM_CAPACITY')
        assert capacity, "Please set the 'REDIS_BLOOM_CAPACITY' for the spider"
        assert error_rate, "Please set the 'REDIS_BLOOM_ERROR_RATE' for the spider"
        if not self.server.keys(self.key):
            try:
                # By default, the bloom filter is auto-scaling.
                self.server.bfCreate(self.key, error_rate, capacity)
            except redis.exceptions.ResponseError:
                raise EnvironmentError(
                    'Redis has not loaded the RedisBloom module. See the doc [ xx ]'
                )

    def request_seen(self, request):
        """Returns True if request was already seen"""
        fp = self.request_fingerprint(request)
        # bfAdd returns 1 if the item was newly added, 0 if it may already exist.
        added = self.server.bfAdd(self.key, fp)
        return added == 0
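# A hedged sketch of the Scrapy settings this filter expects; only the setting
# names come from the code above, the values are illustrative.
SCHEDULER = 'scrapy_ddiy.utils.scheduler.SchedulerDdiy'
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
REDIS_PARAMS = {}  # extra kwargs forwarded to redisbloom.client.Client
REDIS_BLOOM_ERROR_RATE = 0.0001
REDIS_BLOOM_CAPACITY = 10000000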
def setup_rebloom():
    # Create the client.
    client = Client()
    # Remove any old keys.
    client.delete('ufo_words', 'ufo_shapes')
    # Set up some Top-K action!
    client.topkReserve('ufo_words', k=10, width=400, depth=10, decay=0.9)
    client.topkReserve('ufo_shapes', k=10, width=20, depth=10, decay=0.9)
    # Return the client.
    return client
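# A short usage sketch for the setup above: feed observations into a Top-K
# structure and read back the current leaders (item values illustrative).
client = setup_rebloom()
client.topkAdd('ufo_shapes', 'circle', 'triangle', 'circle')
print(client.topkList('ufo_shapes'))  # e.g. ['circle', 'triangle', ...]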
@classmethod
def redisbloom_client(cls, host: str, port: int):
    """
    Returns a redisbloom Client, installing the redisbloom package if necessary.
    If/when a proper virtualenv setup is available on Glue and redisbloom can be
    pre-installed, this function will still serve to create as few instances of
    the redisbloom Client as possible (one per forked Python thread per Spark
    executor).

    NOTE: Intended ONLY for use on a Glue (PySpark) executor, likely as the
    first step of a foreachPartition statement.

    :param host: The Redis host URL
    :param port: The Redis port
    :return: An instance of redisbloom.client.Client
    """
    if (host, port) not in cls.__redisbloom_client:
        try:
            from redisbloom.client import Client
        except ImportError:
            # Only install the redisbloom package to fix the failed import.
            cls.__install_packages(['redisbloom==0.4.0'])
            from redisbloom.client import Client
        cls.__redisbloom_client[(host, port)] = Client(host=host, port=port)
    return cls.__redisbloom_client[(host, port)]
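# A hedged usage sketch: one Client per executor process, created inside a
# foreachPartition callback. 'GlueJob' stands in for whatever class hosts
# redisbloom_client(); the host, port, key and RDD names are all illustrative.
def mark_partition_seen(rows):
    client = GlueJob.redisbloom_client('redis.internal', 6379)
    for row in rows:
        client.bfAdd('seen:ids', str(row))

# rdd.foreachPartition(mark_partition_seen)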
from random import randint

from redisbloom.client import Client

ielements = 1024
offset = 123456789
t = 5 * 1048576
a = []
b = []
i = 0

# Test element
test_element = randint(ielements, offset - 1)
filter_name = str(test_element)

# Create the cuckoo filter. cfsize is not defined in the original snippet;
# 1048576 here is an assumed capacity.
cfsize = 1048576
r = Client()
r.cfCreate(filter_name, cfsize)

# Insert a fraction of the elements.
for x in range(1, ielements - 1):
    r.cfAdd(filter_name, str(x))

# Test a large number of elements that were never inserted.
for x in range(offset, t + offset):
    pos = r.cfExists(filter_name, str(x))
    # print(pos, x)
    if pos == 0:
        a.append(x)

# Print FPR and set size
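# A minimal completion sketch for the report the comment above promises:
# every probed element in [offset, t + offset) was absent, so anything not
# collected in 'a' was a false positive.
false_positives = t - len(a)
print('False-positive rate: {:.6f}'.format(false_positives / t))
print('Inserted elements: {}'.format(ielements - 2))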
from redisbloom.client import Client

# Redis runs inside Docker on a VM here, so use the VM's IP address
# and the exposed port.
rb = Client(host='node01', port=6379)

rb.bfAdd('urls', 'baidu')
rb.bfAdd('urls', 'google')
print(rb.bfExists('urls', 'baidu'))     # out: 1
print(rb.bfExists('urls', 'tencent2'))  # out: 0

rb.bfMAdd('urls', 'a', 'b')
print(rb.bfMExists('urls', 'google', 'baidu', 'tencent'))  # out: [1, 1, 0]
import random
import string

from redisbloom.client import Client


def randomword(length):
    # Assumed helper: the original snippet calls randomword() without defining it.
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))


def redis_attack():
    try:
        # Connect to the Redis server; 'target' is the item whose estimate we
        # want to inflate.
        target = 'ASDFGHJKLZXCVBNM'
        # rb = Client()
        # rb.cmsInitByDim('dim', 1000, 5)
        # rb.cmsIncrBy('dim', ['foo'], [5])
        # rb.cmsIncrBy('dim', ['foo', 'bar'], [5, 15])
        # msg = rb.cmsQuery('dim', 'foo')
        count = 0
        dbkey = 'cms4'
        rb = Client()
        rb.cmsInitByDim(dbkey, 4096, 4)
        finish = True
        rb.cmsIncrBy(dbkey, [target], [5])
        estimate = rb.cmsQuery(dbkey, target)[0]
        attack_set = []  # renamed from 'set' to avoid shadowing the builtin
        print(estimate)
        while finish:
            # Update the CMS with a random element and watch the target's estimate.
            random_string = randomword(16)
            estimate = rb.cmsQuery(dbkey, target)[0]
            rb.cmsIncrBy(dbkey, [random_string], [1])
            count = count + 1
            if estimate < rb.cmsQuery(dbkey, target)[0]:
                # The random element collided with the target in every sketch row.
                print('Element Found!')
                print(random_string)
                attack_set.append(random_string)
                finish = False
                for i in range(10):
                    estimate = rb.cmsQuery(dbkey, target)[0]
                    for x in attack_set:
                        rb.cmsIncrBy(dbkey, [x], [10])
                    if estimate == rb.cmsQuery(dbkey, target)[0]:
                        finish = True
                        print('False positive!')
                        break
                else:
                    print("Finally finished!\n")
                    print("Attack set found!")
                    for x in attack_set:
                        print(x)
        # print(msg)
        print("Test for the attack set... ")
        print("Target element estimate before attack -> " +
              str(rb.cmsQuery(dbkey, target)[0]))
        for x in attack_set:
            print("Inserting..." + x)
            rb.cmsIncrBy(dbkey, [x], [1])
        print("Target element estimate after attack -> " +
              str(rb.cmsQuery(dbkey, target)[0]))
        print("Operations needed -> " + str(count))
    except Exception as e:
        print(e)
import datetime

import redis
from redisbloom.client import Client

# BaseDb, InvalidFieldException and helper come from the surrounding project.


class Redis(BaseDb):
    """
    Proxies are stored as hashes under keys of the form Proxy:IP:port,
    with fields type, protocol, score and ctime.
    """
    __slots__ = ('_filter_name')

    @property
    def filter_name(self):
        return self._filter_name

    @filter_name.setter
    def filter_name(self, value):
        self._filter_name = value

    def __init__(self, host, pwd=None, port=6379, db=0):
        super().__init__()
        self.host = host
        self.pwd = pwd
        self.port = port
        self.db = db
        self._filter_name = ''

    def connect_to_redis(self):
        try:
            self.conn = Client(host=self.host, port=self.port, db=self.db,
                               password=self.pwd)
        except Exception as e:
            print(e)
            return False
        return True

    def gen_key_name(self, record):
        if 'ip' in record and 'port' in record:
            return 'Proxy:%s:%s' % (record['ip'], record['port'])
        else:
            return None

    def exists(self, key_name):
        """
        Check whether a key already exists via a plain Redis lookup. Kept only
        for comparison with the bloom filter; not used in practice.
        :param key_name:
        :return: 0 (False) / 1 (True)
        """
        return self.conn.exists(key_name)

    def delete(self, key_name):
        return self.conn.delete(key_name)

    def delete_all(self):
        return self.conn.flushdb()

    def hmset(self, record, validate_time):
        valid_fields = ['ip', 'port', 'proxy_type', 'protocol', 'score']
        for single_valid_field in valid_fields:
            if single_valid_field not in record:
                raise InvalidFieldException(single_valid_field)
        key_name = self.gen_key_name(record)
        field_value = {
            'proxy_type': record['proxy_type'],
            'protocol': record['protocol'],
            'score': record['score'],
            # 'ctime': record['ctime']
        }
        self.conn.hmset(key_name, field_value)
        self.conn.expire(key_name, validate_time)

    def multi_hmset(self, records, validate_time):
        for single_record in records:
            self.hmset(single_record, validate_time)

    def time_interval_in_seconds(self, old_date_time, new_date_time):
        """
        Compute the interval between old_date_time and new_date_time, in seconds.
        :param old_date_time:
        :param new_date_time:
        :return: int
        """
        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if helper.match_expect_type(old_date_time, 'str'):
                old_date_time = datetime.datetime.strptime(
                    old_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('old_date_time is not in a valid format')
        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if helper.match_expect_type(new_date_time, 'str'):
                new_date_time = datetime.datetime.strptime(
                    new_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('new_date_time is not in a valid format')
        return int((new_date_time - old_date_time).total_seconds())

    def expire(self, key_name, ttl):
        return self.conn.expire(key_name, ttl)

    def bf_create(self, fpp=0.001, capacity=1000, expansion=1):
        """
        Create a bloom filter.
        :param fpp: desired false-positive probability
        :param capacity: number of elements the filter is sized for
        :param expansion: when the filter fills up, the new sub-filter's
            capacity is this many times the current one (1 means the same size)
        :return: 0 (create failed) / 1 (create succeeded)
        """
        try:
            self.conn.bfCreate(key=self._filter_name, errorRate=fpp,
                               capacity=capacity, expansion=expansion)
        except redis.exceptions.ResponseError:
            # The filter already exists.
            return 0
        return 1

    def bf_madd(self, records):
        # Build a list of key names and add them in a single BF.MADD call
        # (the original concatenated them into one string, which is a bug).
        items = [self.gen_key_name(single_record) for single_record in records]
        self.conn.bfMAdd(self._filter_name, *items)

    def bf_add(self, record):
        item = self.gen_key_name(record)
        self.conn.bfAdd(self._filter_name, item)

    def bf_exists(self, item):
        return self.conn.bfExists(self._filter_name, item)

    def bf_mexists(self, items):
        """
        :param items: a list, unpacked with * into bfMExists
        :return:
        """
        return self.conn.bfMExists(self._filter_name, *items)
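# A hedged usage sketch for the class above (host and record values
# illustrative): dedupe proxy records through the bloom filter before
# writing them as hashes.
db = Redis(host='localhost')
if db.connect_to_redis():
    db.filter_name = 'proxy_bf'
    db.bf_create(fpp=0.001, capacity=100000)
    record = {'ip': '1.2.3.4', 'port': 8080,
              'proxy_type': 'anonymous', 'protocol': 'http', 'score': 10}
    if not db.bf_exists(db.gen_key_name(record)):
        db.bf_add(record)
        db.hmset(record, validate_time=3600)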
import os
import random
import sys
import traceback
from time import sleep, time

import requests
from lxml import etree
from tqdm import tqdm
from redisbloom.client import Client


class Follow(object):
    def __init__(self, config):
        """Initialize the Follow crawler."""
        self.rb = Client()
        self.filter_redis_key = 'uidfilter'
        self.validate_config(config)
        self.cookie = {'Cookie': config['cookie']}
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = self.get_user_list(user_id_list)
        self.user_id_list = user_id_list  # user_ids of the Weibo users to crawl
        self.user_id = ''
        self.follow_list = []  # uri and nickname of every followed user crawled
        self.fans_list = []  # uri and nickname of every fan crawled
        self.file_name = 'user_id_list' + str(time()) + '.txt'

    def validate_config(self, config):
        """Validate the configuration."""
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list, list)) and (not user_id_list.endswith('.txt')):
            sys.exit(u'user_id_list must be a list or the path of a txt file')
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                sys.exit(u'File %s does not exist' % user_id_list)

    def deal_html(self, url):
        """Fetch and parse the HTML of a page."""
        try:
            html = requests.get(url, cookies=self.cookie, verify=False).content
            selector = etree.HTML(html)
            return selector
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_page_num(self):
        """Get the number of pages in the follow list."""
        url = "https://weibo.cn/%s/follow" % self.user_id
        selector = self.deal_html(url)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = int(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_one_page(self, page):
        """Get the user_ids on the given follow page."""
        print(u'%sPage %d%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/follow?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if page == 1 and len(table_list) == 0:
            print(u'The cookie or the provided user_id is invalid')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # Dedupe through the bloom filter instead of scanning follow_list.
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.follow_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_follow_list(self):
        """Crawl the home page addresses of followed users."""
        page_num = self.get_page_num()
        print(u'Number of follow pages: ' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'Follow list progress'):
            self.get_one_page(page)
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        print(u'Finished crawling the follow list')

    def get_fans_page_num(self):
        """Get the number of pages in the fans list."""
        url = "https://weibo.cn/%s/fans" % self.user_id
        selector = self.deal_html(url)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = int(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_fans_one_page(self, page):
        """Get the user_ids on the given fans page."""
        print(u'%sPage %d%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/fans?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if page == 1 and len(table_list) == 0:
            print(u'The cookie or the provided user_id is invalid')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # Dedupe through the bloom filter instead of scanning fans_list.
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.fans_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_fans_list(self):
        """Crawl the home page addresses of fans."""
        page_num = self.get_fans_page_num()
        print(u'Number of fans pages: ' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'Fans list progress'):
            self.get_fans_one_page(page)
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        print(u'Finished crawling the fans list')

    def write_to_txt(self):
        with open(self.file_name, 'ab') as f:
            for user in self.follow_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))
            for user in self.fans_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))

    def get_user_list(self, file_name):
        """Read Weibo user ids from a file."""
        with open(file_name, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                sys.exit(u'%s should be utf-8 encoded; please convert it '
                         u'before running the program' % file_name)
        user_id_list = []
        for line in lines:
            info = line.split(' ')
            if len(info) > 0 and info[0].isdigit():
                user_id = info[0]
                if user_id not in user_id_list:
                    user_id_list.append(user_id)
        return user_id_list

    def initialize_info(self, user_id):
        """Reset per-user crawler state."""
        self.follow_list = []
        self.fans_list = []
        self.user_id = user_id

    def check_unique(self, user_id):
        """Check whether the user_id has already been saved."""

    def start(self):
        """Run the crawler."""
        for user_id in self.user_id_list:
            self.initialize_info(user_id)
            print(u'Start crawling: ' + user_id)
            print('*' * 100)
            try:
                self.get_follow_list()  # crawl the follow list
                self.get_fans_list()    # crawl the fans list
            except Exception as e:
                print('Error: ', e)
                traceback.print_exc()
                sleep(10)  # skip this user on error instead of exiting
            self.write_to_txt()
            print(u'Crawling finished')
            print('*' * 100)
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import ujson as json
from redis.exceptions import ResponseError
from rediscluster import RedisCluster
from redisbloom.client import Client

import config

rc_list = json.loads(config.config(section='rediscluster')['rediscluster'])
redisbloomclient = Client(host=config.config()['host'],
                          port=config.config()['port'])
rediscluster_client = RedisCluster(startup_nodes=rc_list, decode_responses=True)

n_cpus = os.cpu_count()
print(f'Number of CPUs: {n_cpus}')
executor = ThreadPoolExecutor(max_workers=n_cpus)

datapath = Path('../input')


def parse_json_body_text(json_filename):
    print("Processing ..", json_filename.stem)
    with open(json_filename) as json_data:
        data = json.load(json_data)
    for body_text in data['body_text']:
        para = body_text['text']
        yield para
import csv
from os import environ

import redis
from redisbloom.client import Client as RedisBloom             # assumed import alias
from redistimeseries.client import Client as RedisTimeseries   # assumed import alias


def load_data():
    redis_server = environ.get('REDIS_SERVER', 'localhost')
    redis_port = int(environ.get('REDIS_PORT', 6379))
    redis_password = environ.get('REDIS_PASSWORD', '')

    rdb = redis.Redis(host=redis_server, port=redis_port, password=redis_password)
    rb = RedisBloom(host=redis_server, port=redis_port, password=redis_password)
    rts = RedisTimeseries(host=redis_server, port=redis_port, password=redis_password)

    # Load users into hashes and a list.
    with open('./users.csv', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:
                rdb.hset("user:%s" % (row[0].replace(" ", '')),
                         mapping={
                             'Name': row[0],
                             'AgeDemo': row[1],
                             'IncomeDemo': row[2],
                             'Sex': row[3]
                         })
                rdb.lpush("USERLIST", row[0])
            line_count += 1

    # Load campaigns: a sorted set per campaign, plus a bloom filter,
    # a counter and a time series per ad.
    with open('./campaigns.csv', encoding='utf-8') as csv_file:
        rts.create('TOTALREVENUE')
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if line_count > 0:
                rdb.zadd("campaign:%s" % (row[0].replace(" ", '')),
                         {row[2]: row[1]})
                rb.bfCreate(row[2], 0.01, 1000)
                rb.set("counter:%s" % (row[2].replace(" ", '')), row[3])
                rts.create("ADVIEW:%s" % (row[2].replace(" ", '')))
                rb.sadd("AdStats", row[2])
            line_count += 1

    # Register the RedisGears scripts.
    for gear in ['./adgear.py', './adstats.py']:
        with open(gear, mode='r') as file:
            rdb.execute_command('RG.PYEXECUTE', file.read())
def setUp(self):
    global rb
    rb = RedisBloom(port=port)
    rb.flushdb()
import os
import sys

import redis
from redisbloom.client import Client
from dotenv import load_dotenv

load_dotenv()
redisClient = Client.from_url(os.getenv('REDIS_URL'), decode_responses=True)


def run(consumer, group='cdr_stats_worker', stream='events:cdr'):
    """ Subscribe to CDR events and write to hashes """
    print(f'Starting {group}/{consumer} consumer listening on {stream}')
    try:
        redisClient.xgroup_create(stream, group, id='0', mkstream=True)
    except redis.exceptions.ResponseError as error:
        print(error)
        if not str(error) == 'BUSYGROUP Consumer Group name already exists':
            raise error

    if not redisClient.exists('stats:callers:top50'):
        redisClient.topkReserve('stats:callers:top50', 50, 2000, 7, 0.925)

    while True:
        for offset in ['0', '>']:
            # The xreadgroup call is cut off in the original snippet;
            # count=10 here is an assumed argument.
            for _, entries in redisClient.xreadgroup(group, consumer,
                                                     {stream: offset}, count=10):
                # ... (the body of this loop is truncated in the original)
                pass
def get_item(key, item):
    """Check whether an item exists in the bloom filter."""
    rb = Client(connection_pool=pool)
    return rb.bfExists(key, item)
def add_item(key, item):
    """Add an item to the bloom filter."""
    rb = Client(connection_pool=pool)
    return rb.bfAdd(key, item)
""" 基于redis布隆过滤器的误判率的测试 """ import time from redisbloom.client import Client # pip install redisbloom rb = Client(host='node01', port=6379) def insert(size, key='book'): """插入数据""" # 一条条插入速度太慢了 # for i in range(size): # rb.bfAdd(key, f'book{i}') s = time.time() step = 1000 # 每次插入1000条数据 for start in range(0, size, step): stop = start + step if stop >= size: stop = size rb.bfMAdd(key, *range(start, stop)) print('插入结束... 花费时间: {:.4f}s'.format(time.time() - s)) def select(size, key='book'): """查询数据""" # 统计误判个数 count = 0 s = time.time()
def create_key(key, error, capacity):
    """Create a bloom filter with the given error rate and capacity."""
    rb = Client(connection_pool=pool)
    rb.bfCreate(key, errorRate=error, capacity=capacity)
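# A hedged usage sketch for the three pool-backed helpers above; the
# connection pool itself is not shown in the original, so this construction
# is an assumption.
import redis

pool = redis.ConnectionPool(host='localhost', port=6379)

create_key('urls', error=0.001, capacity=100000)
add_item('urls', 'https://example.com')
print(get_item('urls', 'https://example.com'))  # 1 if (probably) present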
from flask import Flask, jsonify, send_from_directory
from redisbloom.client import Client

client = Client()

# the Flask app
app = Flask(__name__, instance_relative_config=True)


# this route returns the TopK shapes as JSON
@app.route('/shapes')
def shapes():
    top_shapes = [{
        'shape': shape,
        'count': client.topkCount('ufo_shapes', shape)[0]
    } for shape in client.topkList('ufo_shapes')]
    return jsonify(top_shapes)


# this route returns the TopK words as JSON
@app.route('/words')
def words():
    top_words = [{
        'word': word,
        'count': client.topkCount('ufo_words', word)[0]
    } for word in client.topkList('ufo_words')]
    return jsonify(top_words)
import json
from typing import Union

from redis import exceptions
from redisbloom.client import Client as RedisBloom  # assumed import alias

# LayeredCache and get_client come from the surrounding module.


class FullLayeredCache(LayeredCache):
    """
    Multi-layered key value store with bloom filter and dgraph.

    Layer 1: In-memory LRU key value map
    Layer 2: Redis key value store
    Layer 3: Bloom filter
    Layer 4: DGraph

    The primary difference between this class and the LayeredCache class is
    that this one includes the bloom filter and DGraph.
    """

    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize the last two layers of the cache.
        :param node_name:
        :param lru_size:
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)
        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True
        # Create the bloom filter client object
        self.bloom = RedisBloom(port=6378)
        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()
        # Initialize the bloom filter (if it doesn't already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)

    def __contains__(self, key: str) -> bool:
        """
        Check to see if key is in a layer of the cache. We start at layer 1
        and walk through each layer until we find a result, updating earlier
        layers on a cache miss. Returns True if the key was found at any layer.
        :param key:
        :return:
        """
        # Check layers 1 and 2
        if super(FullLayeredCache, self).__contains__(key):
            return True
        # Check the layer 3 bloom filter
        exists_in_bloom = self.bloom.bfExists(self.node_name, self._get_key(key))
        if exists_in_bloom == 1:
            # The bloom filter can't store the actual value, so we can't
            # update earlier layers with the value for this key.
            return True
        # All else has failed; we must now check dgraph. This is very slow.
        query = """query all($a: string) {
            all(func: eq(%s, $a)) {
                uid
            }
        }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return True
        # Cache miss, return False
        return False

    def __getitem__(self, key: str) -> Union[str, None]:
        """
        Check each layer iteratively for the key specified. If we find the
        result at a given layer, update earlier layers with the result.
        Returns None if the key was not found.
        :param key:
        :return:
        """
        # Check layers 1 and 2
        item = super(FullLayeredCache, self).__getitem__(key)
        if item is not None:
            return item
        # Check the layer 3 bloom filter; it stores no values, so a hit can
        # only return True rather than the cached value.
        exists_in_bloom = self.bloom.bfExists(self.node_name, self._get_key(key))
        if exists_in_bloom == 1:
            return True
        # All else has failed; we must now check dgraph. This is very slow.
        query = """query all($a: string) {
            all(func: eq(%s, $a)) {
                uid
            }
        }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        thing = json.loads(dgraph_result.json)
        if len(thing["all"]) > 0:
            # Update previous layers
            self[key] = thing["all"][0]["uid"]
            return thing["all"][0]["uid"]
        # Cache miss, return None
        return None

    def close(self):
        """Close all outstanding connections."""
        # Close the layer 2 redis connection
        super(FullLayeredCache, self).close()
        # Close the layer 3 bloom filter connection
        self.bloom.close()
        # Close the layer 4 dgraph connections
        self.stub.close()
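# A hedged usage sketch for the layered cache above; 'domain' as the node
# name and the lookup key are illustrative, and Redis (with RedisBloom) plus
# dgraph must both be reachable for this to run.
cache = FullLayeredCache('domain', lru_size=4096)
if 'example.com' in cache:
    print(cache['example.com'])  # uid, or True on a bloom-only hit
cache.close()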