def __init__(self, usage, strategy='robin', length=10, fast_response=5,
             redis_args=None):
    """
    :param usage: one of SCORE_MAPS's keys, such as https
    :param length: if the total number of available proxies is less than
        length, the pool must be refreshed
    :param strategy: load-balancing strategy for proxy IPs, one of
        ['robin', 'greedy']
    :param fast_response: if you use the greedy strategy, this threshold is
        used to decide whether a proxy IP should continue to be used
    :param redis_args: redis connection args, a dict whose keys include
        host, port, db and password
    """
    self.score_queue = SCORE_MAPS.get(usage)
    self.ttl_queue = TTL_MAPS.get(usage)
    self.speed_queue = SPEED_MAPS.get(usage)
    self.strategy = strategy
    # pool is a FIFO queue
    self.pool = list()
    self.length = length
    self.fast_response = fast_response
    self.handlers = [RobinStrategy(), GreedyStrategy()]
    if isinstance(redis_args, dict):
        self.conn = get_redis_conn(**redis_args)
    else:
        self.conn = get_redis_conn()
def __init__(self, usage, strategy='robin', fast_response=5, redis_args=None):
    """
    :param usage: one of SCORE_MAPS's keys, such as https
    :param strategy: load-balancing strategy for proxy IPs, one of
        ['robin', 'greedy']
    :param fast_response: if you use the greedy strategy, this threshold is
        used to decide whether a proxy IP should continue to be used
    :param redis_args: redis connection args, a dict whose keys include
        host, port, db and password
    """
    # with multiple parent classes, super() resolves to the first parent
    # according to the MRO
    super().__init__(usage)
    self.strategy = strategy
    # pool is a FIFO queue
    self.pool = list()
    self.fast_response = fast_response
    self.handlers = [RobinStrategy(), GreedyStrategy()]
    if isinstance(redis_args, dict):
        self.conn = get_redis_conn(**redis_args)
    else:
        self.conn = get_redis_conn()
    t = threading.Thread(target=self._refresh_periodically)
    t.daemon = True
    t.start()
def __init__(self, usage, strategy='robin', fast_response=5,
             score_map=SCORE_MAPS, ttl_map=TTL_MAPS, speed_map=SPEED_MAPS,
             longest_response_time=LONGEST_RESPONSE_TIME,
             lowest_score=LOWEST_SCORE,
             ttl_validated_resource=TTL_VALIDATED_RESOURCE,
             min_pool_size=LOWEST_TOTAL_PROXIES, all_data=DATA_ALL,
             redis_args=None):
    """
    :param usage: one of SCORE_MAPS's keys, such as https
    :param strategy: load-balancing strategy for proxy IPs, one of
        ['robin', 'greedy']
    :param fast_response: if you use the greedy strategy, this threshold is
        used to decide whether a proxy IP should continue to be used
    :param score_map: score map of your project, default value is
        SCORE_MAPS in haipproxy.config.settings
    :param ttl_map: ttl map of your project, default value is TTL_MAPS in
        haipproxy.config.settings
    :param speed_map: speed map of your project, default value is
        SPEED_MAPS in haipproxy.config.settings
    :param longest_response_time: longest acceptable response time of a proxy
    :param lowest_score: lowest acceptable score of a proxy
    :param ttl_validated_resource: time of the latest validated proxies
    :param min_pool_size: minimum size of self.pool
    :param all_data: all proxies are stored in this set
    :param redis_args: redis connection args, a dict whose keys include
        host, port, db and password
    """
    # with multiple parent classes, super() resolves to the first parent
    # according to the MRO
    if usage not in score_map.keys():
        # client_logger.warning('task value is invalid, https task will be used')
        usage = 'https'
    score_queue = score_map.get(usage)
    ttl_queue = ttl_map.get(usage)
    speed_queue = speed_map.get(usage)
    super().__init__(score_queue, ttl_queue, speed_queue,
                     longest_response_time, lowest_score,
                     ttl_validated_resource, min_pool_size)
    self.strategy = strategy
    # pool is a FIFO queue
    self.pool = list()
    self.min_pool_size = min_pool_size
    self.fast_response = fast_response
    self.all_data = all_data
    self.handlers = [RobinStrategy(), GreedyStrategy()]
    if isinstance(redis_args, dict):
        self.conn = get_redis_conn(**redis_args)
    else:
        self.conn = get_redis_conn()
    t = threading.Thread(target=self._refresh_periodically)
    t.daemon = True
    t.start()
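# Usage sketch (an illustration, not part of the snippets above): constructing
# a ProxyFetcher as the docstring describes. The import path and get_proxy()
# follow haipproxy's documented client API; the Redis connection values are
# placeholder assumptions.
from client.py_cli import ProxyFetcher

args = dict(host='127.0.0.1', port=6379, password='', db=0)
fetcher = ProxyFetcher('https', strategy='greedy', redis_args=args)
proxy = fetcher.get_proxy()  # one usable proxy, e.g. 'https://ip:port'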
def schedule_task_with_lock(self, task):
    """Crawler scheduler filters tasks according to task type."""
    if not task.get('enable'):
        return None
    task_queue = task.get('task_queue')
    if task_queue not in self.task_queues:
        return None
    conn = get_redis_conn()
    task_name = task.get('name')
    internal = task.get('internal')
    urls = task.get('resource')
    lock_identifier = acquire_lock(conn, task_name)
    if not lock_identifier:
        return False
    pipe = conn.pipeline(True)
    try:
        now = int(time.time())
        pipe.hget(TIMER_RECORDER, task_name)
        r = pipe.execute()[0]
        if not r or (now - int(r.decode('utf-8'))) >= internal * 60:
            pipe.lpush(task_queue, *urls)
            pipe.hset(TIMER_RECORDER, task_name, now)
            pipe.execute()
            # scheduler_logger.info('crawler task {} has been stored into redis successfully'.format(task_name))
            return True
        else:
            return None
    finally:
        release_lock(conn, task_name, lock_identifier)
def schedule_task_with_lock(self, task):
    """Crawler scheduler filters tasks according to task type."""
    if not task.get('enable'):
        return None
    task_queue = task.get('task_queue')
    if task_queue not in self.task_queues:
        return None
    conn = get_redis_conn()
    task_name = task.get('name')
    interval = task.get('interval')
    urls = task.get('resource')
    lock_identifier = acquire_lock(conn, task_name)
    if not lock_identifier:
        return False
    pipe = conn.pipeline(True)
    try:
        now = int(time.time())
        pipe.hget(TIMER_RECORDER, task_name)
        r = pipe.execute()[0]
        if not r or (now - int(r.decode('utf-8'))) >= interval * 60:
            pipe.lpush(task_queue, *urls)
            pipe.hset(TIMER_RECORDER, task_name, now)
            pipe.execute()
            # scheduler_logger.info('crawler task {} has been stored into redis successfully'.format(task_name))
            return True
        else:
            return None
    finally:
        release_lock(conn, task_name, lock_identifier)
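# A hypothetical task entry matching the keys the crawler scheduler reads via
# task.get(). The queue constant and the URL are illustrative assumptions, not
# the project's actual configuration.
task = {
    'name': 'common',                  # lock name and TIMER_RECORDER field
    'enable': 1,                       # tasks with enable == 0 are skipped
    'task_queue': SPIDER_COMMON_TASK,  # must appear in self.task_queues
    'interval': 10,                    # minutes between two runs
    'resource': ['http://www.example.com/proxy-list'],  # urls pushed to task_queue
}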
def save_old_sales_data():
    r = utils.get_redis_conn(domain)
    sales_result = r.get('data:sales')
    sales = JSONDecoder().decode(bytes.decode(sales_result))
    sub_branch_result = r.get('data:sub_branch')
    sub_branch = JSONDecoder().decode(bytes.decode(sub_branch_result))
    headers = utils.login(domain, login_body)
    sub_map = {}
    branch_map = {}
    for branch in sub_branch:
        sub_name = branch['subsidiary']
        if sub_name not in sub_map:
            sub_result = utils.call_request(domain, 'reference-data-service',
                                            'refSubsidiaryCreate',
                                            {'subsidiaryName': sub_name},
                                            headers)['result']
            sub_map[sub_name] = sub_result['subsidiaryId']
        branch_name = branch['branch']
        branch_result = utils.call_request(
            domain, 'reference-data-service', 'refBranchCreate', {
                'subsidiaryId': sub_map[sub_name],
                'branchName': branch_name
            }, headers)['result']
        branch_map[branch_name] = branch_result['branchId']
    for sale in sales:
        sale_name = sale['salesName']
        branch_id = branch_map[sale['branch']]
        utils.call_request(domain, 'reference-data-service', 'refSalesCreate', {
            'branchId': branch_id,
            'salesName': sale_name
        }, headers)
def get_old_sales_data():
    headers = utils.login(domain, login_body)
    sales = utils.call_request(domain, 'reference-data-service',
                               'refSalesList', {}, headers)['result']
    sub_branch = utils.call_request(domain, 'reference-data-service',
                                    'refSubsidiaryBranchList', {},
                                    headers)['result']
    print(sub_branch)
    print(sales)
    r = utils.get_redis_conn(domain)
    sales_result = JSONEncoder().encode(sales)
    sub_branch_result = JSONEncoder().encode(sub_branch)
    r.set('data:sales', str(sales_result))
    r.set('data:sub_branch', str(sub_branch_result))
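# The two migration helpers above round-trip reference data through redis
# using JSONEncoder/JSONDecoder plus manual bytes decoding. For clarity, an
# equivalent sketch with the plain json module (utils, domain and the
# 'data:sales' key come from the snippets above; this is an illustration, not
# project code):
import json

r = utils.get_redis_conn(domain)
r.set('data:sales', json.dumps(sales))                   # store: objects -> JSON str
sales = json.loads(r.get('data:sales').decode('utf-8'))  # load: bytes -> objects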
def __init__(self, usage, strategy='robin', length=10, fast_response=5):
    """
    :param usage: one of SCORE_MAPS's keys, such as https
    :param length: if the total number of available proxies is less than
        length, the pool must be refreshed
    :param strategy: load-balancing strategy for proxy IPs, one of
        ['robin', 'greedy']
    :param fast_response: if you use the greedy strategy, this threshold is
        used to decide whether a proxy IP should continue to be used
    """
    self.score_queue = SCORE_MAPS.get(usage)
    self.ttl_queue = TTL_MAPS.get(usage)
    self.speed_queue = SPEED_MAPS.get(usage)
    self.strategy = strategy
    # pool is a FIFO queue
    self.pool = list()
    self.length = length
    self.fast_response = fast_response
    self.handlers = [RobinStrategy(), GreedyStrategy()]
    self.conn = get_redis_conn()
def update_conf(self):
    conn = get_redis_conn()
    start_time = int(time.time()) - TTL_VALIDATED_RESOURCE * 60
    pipe = conn.pipeline(False)
    pipe.zrevrangebyscore(self.score_queue, '+inf', LOWEST_SCORE)
    pipe.zrevrangebyscore(self.ttl_queue, '+inf', start_time)
    pipe.zrangebyscore(self.speed_queue, 0, 1000 * LONGEST_RESPONSE_TIME)
    scored_proxies, ttl_proxies, speed_proxies = pipe.execute()
    proxies = scored_proxies and ttl_proxies and speed_proxies
    if not proxies:
        proxies = scored_proxies and ttl_proxies
    if not proxies:
        proxies = ttl_proxies
    proxies = decode_all(proxies)
    conts = list()
    with open(self.template_path, 'r') as fr, open(self.conf_path, 'w') as fw:
        original_conf = fr.read()
        if not proxies:
            fw.write(original_conf)
            client_logger.info('no proxies got at this turn')
        else:
            conts.append(original_conf)
            # if two proxies use the same ip with different ports and no name
            # is assigned, a cache_peer error will be raised
            for index, proxy in enumerate(proxies):
                _, ip_port = proxy.split('://')
                ip, port = ip_port.split(':')
                conts.append(self.default_conf_detail.format(ip, port, index))
            conts.extend(self.other_confs)
            conf = '\n'.join(conts)
            fw.write(conf)
    # executing with shell=True fails in docker
    subprocess.call([self.squid_path, '-k', 'reconfigure'], shell=False)
    client_logger.info('update squid conf successfully')
def update_conf(self):
    conn = get_redis_conn()
    proxies = self.get_available_proxies(conn)
    conts = list()
    with open(self.template_path, 'r') as fr, open(self.conf_path, 'w') as fw:
        original_conf = fr.read()
        if not proxies:
            fw.write(original_conf)
            # client_logger.info('no proxies got at this turn')
        else:
            conts.append(original_conf)
            # if two proxies use the same ip with different ports and no name
            # is assigned, a cache_peer error will be raised
            for index, proxy in enumerate(proxies):
                _, ip_port = proxy.split('://')
                ip, port = ip_port.split(':')
                conts.append(self.default_conf_detail.format(ip, port, index))
            conts.extend(self.other_confs)
            conf = '\n'.join(conts)
            fw.write(conf)
    # executing with shell=True fails in docker
    subprocess.call([self.squid_path, '-k', 'reconfigure'], shell=False)
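# get_available_proxies() is not shown in this refactored version; a sketch
# reconstructed from the inline pipeline logic of the earlier update_conf
# above (the constants and decode_all come from that snippet, so the real
# implementation may differ in detail):
def get_available_proxies(self, conn):
    start_time = int(time.time()) - TTL_VALIDATED_RESOURCE * 60
    pipe = conn.pipeline(False)
    pipe.zrevrangebyscore(self.score_queue, '+inf', LOWEST_SCORE)
    pipe.zrevrangebyscore(self.ttl_queue, '+inf', start_time)
    pipe.zrangebyscore(self.speed_queue, 0, 1000 * LONGEST_RESPONSE_TIME)
    scored_proxies, ttl_proxies, speed_proxies = pipe.execute()
    # `and` keeps the last list when all three filters yield results, then
    # falls back to looser filters, exactly as in the earlier version
    proxies = scored_proxies and ttl_proxies and speed_proxies
    if not proxies:
        proxies = scored_proxies and ttl_proxies
    if not proxies:
        proxies = ttl_proxies
    return decode_all(proxies)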
def schedule_task_with_lock(self, task):
    """Validator scheduler filters tasks according to task name,
    since the task name stands for the task type."""
    if not task.get('enable'):
        return None
    task_queue = task.get('task_queue')
    if task_queue not in self.task_queues:
        return None
    conn = get_redis_conn()
    internal = task.get('internal')
    task_name = task.get('name')
    resource_queue = task.get('resource')
    lock_identifier = acquire_lock(conn, task_name)
    if not lock_identifier:
        return False
    pipe = conn.pipeline(True)
    try:
        now = int(time.time())
        pipe.hget(TIMER_RECORDER, task_name)
        pipe.zrevrangebyscore(resource_queue, '+inf', '-inf')
        r, proxies = pipe.execute()
        if not r or (now - int(r.decode('utf-8'))) >= internal * 60:
            if not proxies:
                print('fetched no proxies from task {}'.format(task_name))
                return None
            pipe.sadd(task_queue, *proxies)
            pipe.hset(TIMER_RECORDER, task_name, now)
            pipe.execute()
            print('validator task {} has been stored into redis successfully'
                  .format(task_name))
            return True
        else:
            return None
    finally:
        release_lock(conn, task_name, lock_identifier)
def update_conf(self):
    conn = get_redis_conn()
    proxies = self.get_available_proxies(conn)
    conts = list()
    with open(self.template_path, 'r') as fr, open(self.conf_path, 'w') as fw:
        original_conf = fr.read()
        if not proxies:
            fw.write(original_conf)
            # client_logger.info('no proxies got at this turn')
        else:
            conts.append(original_conf)
            # if two proxies use the same ip with different ports and no name
            # is assigned, a cache_peer error will be raised
            for index, proxy in enumerate(proxies):
                _, ip_port = proxy.split('://')
                ip, port = ip_port.split(':')
                conts.append(self.default_conf_detail.format(ip, port, index))
            conts.extend(self.other_confs)
            conf = '\n'.join(conts)
            fw.write(conf)
    # executing with shell=True fails in docker
    subprocess.call([self.squid_path, '-k', 'reconfigure'], shell=False)
    # client_logger.info('update squid conf successfully')
def schedule_task_with_lock(self, task):
    """Validator scheduler filters tasks according to task name,
    since the task name stands for the task type."""
    if not task.get('enable'):
        return None
    task_queue = task.get('task_queue')
    if task_queue not in self.task_queues:
        return None
    conn = get_redis_conn()
    interval = task.get('interval')
    task_name = task.get('name')
    resource_queue = task.get('resource')
    lock_identifier = acquire_lock(conn, task_name)
    if not lock_identifier:
        return False
    pipe = conn.pipeline(True)
    try:
        now = int(time.time())
        pipe.hget(TIMER_RECORDER, task_name)
        pipe.zrevrangebyscore(resource_queue, '+inf', '-inf')
        r, proxies = pipe.execute()
        if not r or (now - int(r.decode('utf-8'))) >= interval * 60:
            if not proxies:
                # scheduler_logger.warning('fetched no proxies from task {}'.format(task_name))
                print('fetched no proxies from task {}'.format(task_name))
                return None
            pipe.sadd(task_queue, *proxies)
            pipe.hset(TIMER_RECORDER, task_name, now)
            pipe.execute()
            # scheduler_logger.info('validator task {} has been stored into redis successfully'.format(task_name))
            return True
        else:
            return None
    finally:
        release_lock(conn, task_name, lock_identifier)
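# acquire_lock()/release_lock() are used by every scheduler above but never
# defined in these snippets. A minimal sketch assuming the classic Redis
# SET-NX lock pattern (the key prefix and timeouts are illustrative; the
# project's real helpers may differ):
import time
import uuid

import redis


def acquire_lock(conn, lock_name, acquire_timeout=10, lock_timeout=10):
    identifier = str(uuid.uuid4())
    lock_key = 'lock:' + lock_name
    end = time.time() + acquire_timeout
    while time.time() < end:
        # SET NX only succeeds if nobody holds the lock; EX guards against
        # a crashed holder never releasing it
        if conn.set(lock_key, identifier, ex=lock_timeout, nx=True):
            return identifier
        time.sleep(0.01)
    return False


def release_lock(conn, lock_name, identifier):
    lock_key = 'lock:' + lock_name
    pipe = conn.pipeline(True)
    while True:
        try:
            pipe.watch(lock_key)
            # delete the key only if we still own the lock
            if pipe.get(lock_key) == identifier.encode('utf-8'):
                pipe.multi()
                pipe.delete(lock_key)
                pipe.execute()
                return True
            pipe.unwatch()
            break
        except redis.exceptions.WatchError:
            continue
    return False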
def open_spider(self, spider):
    self.redis_con = get_redis_conn(db=META_DATA_DB)
def init_db():
    redis_client = get_redis_conn(**redis_args)
    return redis_client
def init_db():
    redis_client = get_redis_conn(db=1)
    return redis_client
def __init__(self, retries=5):
    self.retries = retries
    self.fetcher = ProxyFetcher('zhihu', strategy='greedy')
    self.conn = get_redis_conn(db=1)
    self.scheme = 'https'
def __init__(self, retries=5):
    self.retries = retries
    self.fetcher = ProxyFetcher('zhihu', **self.client_configs)
    self.conn = get_redis_conn(**self.redis_args)
    self.scheme = 'https'
def setup_redis(self, crawler):
    """Send signals when the spider is free."""
    self.redis_batch_size = SPIDER_FEED_SIZE
    self.redis_con = get_redis_conn()
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
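# The connected spider_idle handler is not shown. In the scrapy-redis style
# this pattern usually refeeds the spider from Redis and raises
# DontCloseSpider so the crawl never ends. A hedged sketch (the queue name is
# a placeholder, and engine.crawl(request, spider) is the pre-2.10 Scrapy
# API; the project's real handler may differ):
from scrapy import Request
from scrapy.exceptions import DontCloseSpider


def spider_idle(self):
    for _ in range(self.redis_batch_size):
        raw_url = self.redis_con.lpop('task_queue_placeholder')
        if not raw_url:
            break
        self.crawler.engine.crawl(Request(raw_url.decode('utf-8')), spider=self)
    # keep the spider alive so it can be fed again later
    raise DontCloseSpider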
    # fragment of a Flask search view: the enclosing def and the code that
    # builds result_dict and keyword_list are truncated in the source
    url_list = [
        i.decode()
        for i in redis_conn.lrange(word + '_url_list', 0, -1)
    ]
    tfidf_list = [
        float(i.decode())
        for i in redis_conn.lrange(word + '_tfidf_list', 0, -1)
    ]
    for url in url_list:
        if url in result_dict:
            result_dict[url] += (1 + tfidf_list[url_list.index(url)])
        else:
            result_dict[url] = tfidf_list[url_list.index(url)]
    # keep the 20 best-scoring urls, each paired with its title from redis
    sorted_url_list = [
        [url_tuple[0], redis_conn.hget('url2title', url_tuple[0]).decode()]
        for url_tuple in sorted(
            result_dict.items(), key=lambda d: d[1], reverse=True)[:20]
    ]
    for url in sorted_url_list:
        cur.execute("""select content from url_hash where url_hash=%s""",
                    (mymd5(url[0]), ))
        url.append(find_keyword(cur.fetchone()[0], keyword_list))
    return jsonify(
        dict(lengths=len(sorted_url_list),
             urls=sorted_url_list,
             keyword=keyword,
             keyword_list=keyword_list)), 200


if __name__ == '__main__':
    redis_conn = get_redis_conn(sys.argv[1])
    mysql_conn = get_MySQL_conn(sys.argv[1])
    all_word_set = redis_conn.smembers('all_word_list')
    app.run(debug=True)
# encoding=utf-8
import sys
sys.path.append("/root/audit")
from utils import get_redis_conn
from config import *

################################################
#
# Used to add order data manually; the data is taken
# from the nginx+lua access log.
# Log file: '/var/log/nginx/charging_resource.log'
#
################################################
redis_con = get_redis_conn()

data = 'POST++instance++5b26203117ff41cbae690248106b96cd++{"server": {"security_groups": [{"name": "default"}], "OS-DCF:diskConfig": "MANUAL", "id": "d6bd688f-f77c-4a06-adb8-5ea8728723cf", "links": [{"href": "http://nova/v2.1/daa89f2bdee1431abd2794fd38598da9/servers/d6bd688f-f77c-4a06-adb8-5ea8728723cf", "rel": "self"}, {"href": "http://nova/daa89f2bdee1431abd2794fd38598da9/servers/d6bd688f-f77c-4a06-adb8-5ea8728723cf", "rel": "bookmark"}], "adminPass": "******"}}'


def add_data2redis(data):
    redis_con.sadd(charging_data, data)


if __name__ == '__main__':
    add_data2redis(data=data)
def __init__(self, proxy_mode=1, retries=5):
    self.proxy_mode = proxy_mode
    self.retries = retries
    self.fetcher = ProxyFetcher('zhihu', strategy='greedy', length=5)
    self.conn = get_redis_conn(db=1)
def __init__(self):
    self.redis_con = get_redis_conn(**redis_args)