class KeywordsController:
    """Fetch keywords awaiting search."""

    def __init__(self):
        self.sleep_time = 0
        self.task_list = [
            ZhiDaoGenerator().crawl_zhidao_words,
            BaiduHotGenerator().crawl_hot_words
        ]
        self.logger = LogHandler('keywords_controller')

    def speed_control(self, kw):
        """Check the backlog of new keywords once and keep it within 500-1000."""
        nums = redis_cli.scard('zhTemporaryWords')
        if nums > 1000:
            # A surplus is acceptable, so increase the sleep time slowly
            self.sleep_time += 10
        elif nums < 500:
            # Too few keywords: cut the sleep time quickly to avoid starvation
            self.sleep_time = int(self.sleep_time / 2)
        self.logger.info('keywords crawler sleep time:%d' % self.sleep_time)
        time.sleep(self.sleep_time)

    def save_keyword(self, kw):
        """Store each new keyword awaiting search in the zhTemporaryWords set."""
        if redis_cli.sadd('zhKeyWords', kw):
            # The add did not hit an existing member, so queue this keyword
            # in zhTemporaryWords to be searched
            self.logger.info(str(kw))
            redis_cli.sadd('zhTemporaryWords', kw)

    def kw_run(self):
        """Store all new keywords in zhTemporaryWords to await searching."""
        while True:
            # Re-adjust the crawl speed once every 5 rounds; every action,
            # including speed_control, takes save_keyword so the call
            # signature stays uniform
            for action in self.task_list * 5 + [self.speed_control]:
                action(self.save_keyword)
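# A minimal, self-contained sketch of the throttling rule that speed_control
# applies: additive increase of the sleep time when the backlog exceeds 1000,
# halving it when the backlog drops below 500. The helper name and the
# backlog figures below are illustrative, not part of the project.

def next_sleep_time(sleep_time, backlog):
    """Mirror of the arithmetic in KeywordsController.speed_control."""
    if backlog > 1000:
        sleep_time += 10               # backlog too large: slow down gently
    elif backlog < 500:
        sleep_time = sleep_time // 2   # backlog too small: speed up quickly
    return sleep_time

if __name__ == '__main__':
    t = 0
    for backlog in (1200, 1300, 900, 400, 300):
        t = next_sleep_time(t, backlog)
        print('backlog=%d -> sleep=%d' % (backlog, t))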
class ProxyCheck(ProxyManager, Thread):
    def __init__(self, queue, item_dict):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        # Multiple threads writing the same log file causes problems,
        # so disable the file handler here
        self.log = LogHandler('proxy_check', file=False)
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        self.db.changeTable(self.useful_proxy_queue)
        while self.queue.qsize():
            proxy = self.queue.get()
            count = self.item_dict[proxy]
            if validUsefulProxy(proxy):
                # Validation passed: decrement the failure counter
                if count and int(count) > 0:
                    self.db.put(proxy, num=int(count) - 1)
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if count and int(count) + 1 >= FAIL_COUNT:
                    self.log.info(
                        'ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    # Guard against a missing counter before incrementing
                    self.db.put(proxy, num=int(count) + 1 if count else 1)
            self.queue.task_done()
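# A hedged sketch of how ProxyCheck workers might be fanned out: load every
# proxy and its failure counter into a shared queue and dict, start N worker
# threads, and wait for the queue to drain. The launcher function and the
# thread count are assumptions for illustration; ProxyManager, the db layer,
# and FAIL_COUNT come from the surrounding project.

from queue import Queue

def run_proxy_check(thread_num=10):
    manager = ProxyManager()
    manager.db.changeTable(manager.useful_proxy_queue)
    item_dict = manager.db.getAll()        # assumed shape: {proxy: fail_count}
    queue = Queue()
    for proxy in item_dict:
        queue.put(proxy)
    threads = [ProxyCheck(queue, item_dict) for _ in range(thread_num)]
    for t in threads:
        t.daemon = True
        t.start()
    queue.join()   # returns once every proxy has been task_done()'d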
class ProxyRefreshSchedule(ProxyManager):
    """Refresh the proxy pool on a schedule."""

    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and move the usable ones
        into useful_proxy_queue.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy_item = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
        # Snapshot the proxies already accepted, to avoid re-validating duplicates
        remaining_proxies = self.getAll()
        while raw_proxy_item:
            raw_proxy = raw_proxy_item.get('proxy')
            if isinstance(raw_proxy, bytes):
                # Py3 compatibility: decode the bytes returned by the store
                raw_proxy = raw_proxy.decode('utf8')
            if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(raw_proxy)
                self.log.info('ProxyRefreshSchedule: %s validation pass' % raw_proxy)
            else:
                self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy)
            self.db.changeTable(self.raw_proxy_queue)
            raw_proxy_item = self.db.pop()
            remaining_proxies = self.getAll()
        self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
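# A hedged sketch of wiring the refresh cycle together: pull new raw proxies
# with the inherited refresh(), then promote the ones that validate. The loop
# shape and the 600-second interval are illustrative choices, not project
# settings.

import time

def refresh_pool_forever(interval_sec=600):
    schedule = ProxyRefreshSchedule()
    while True:
        schedule.refresh()      # inherited from ProxyManager: fetch raw proxies
        schedule.validProxy()   # move proxies that pass into useful_proxy_queue
        time.sleep(interval_sec)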
class ProxyManager(object):
    """ProxyManager"""

    def __init__(self):
        self.db = ProxyDBClient('proxy')
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        Fetch proxies into the DB via ProxyGetter/getFreeProxy.py.
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in conf.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # Store the proxy directly; no in-code deduplication is
                    # needed, the hash structure deduplicates by itself
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception:
                self.log.warn(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                continue

    def get(self):
        """
        Return a useful proxy.
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            if EnvUtil.PY3:
                return random.choice(list(item_dict.keys()))
            return random.choice(item_dict.keys())
        return None

    def delete(self, proxy):
        """
        Delete a proxy from the pool.
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        Get all proxies from the pool as a list.
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
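# A minimal usage sketch of the ProxyManager API above, as a crawler might
# call it: borrow a random proxy, use it for one request, and evict it from
# the pool on failure. The function name and target URL are hypothetical, and
# the "host:port" proxy string format is an assumption based on how the pool
# stores entries.

import requests

def fetch_with_pool(url):
    manager = ProxyManager()
    proxy = manager.get()                  # random useful proxy, or None
    if proxy is None:
        raise RuntimeError('proxy pool is empty')
    try:
        return requests.get(url,
                            proxies={'http': 'http://' + proxy,
                                     'https': 'https://' + proxy},
                            timeout=3)
    except requests.RequestException:
        manager.delete(proxy)              # drop the bad proxy, then re-raise
        raise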
class ZhihuTopicGenerator:
    """Two phases: topic-id acquisition, then expansion."""

    def __init__(self):
        # Proxy IPs
        self.p_receiver = ProxiesReceiver()
        # Build a session; configure retry count and backoff for requests
        self.session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.logger = LogHandler('topics_generator')
        logging.getLogger("urllib3").setLevel(logging.ERROR)

    def __get_topic_message(self, tid):
        """
        Fetch topic details. Results live in the zhTopicMessage hash:
        key = topic id, value = {'name': name, 'introduction': introduction,
        'questions_count': question count, 'best_answers_count': featured
        answer count, 'followers_count': follower count,
        'best_answerers_count': top answerer count}.
        """
        try:
            j_rst = self.session.get(url=topic_message_url % tid,
                                     headers=headers,
                                     proxies=self.p_receiver.one_random,
                                     timeout=3).json()
            if redis_cli.hset(
                    'zhTopicMessage', tid,
                    str({
                        "name": j_rst.get("name"),
                        'introduction': j_rst.get("introduction"),
                        "questions_count": j_rst.get("questions_count"),
                        "best_answers_count": j_rst.get("best_answers_count"),
                        'followers_count': j_rst.get("followers_count"),
                        "best_answerers_count": j_rst.get("best_answerers_count")
                    })):
                # Newly seen topic: queue it for further detail collection
                redis_cli.sadd('zhNewTopicID', tid)
                self.logger.info("zhNewTopicID:%d", tid)
                return True
        except RequestException as re:
            self.logger.warn(re)
        return False

    def __get_hot_topics(self):
        """
        Search zhTemporaryWords keywords and extract related topic ids and
        names from the results.
        """
        tw = redis_cli.block_pop('zhTemporaryWords').decode('utf-8')  # pop
        # Page through to the end, capped at 1000 results
        for offset in range(0, 1000, 10):
            try:
                url = zh_search_url % (tw, offset)
                j_topics = self.session.get(url=url, headers=headers,
                                            proxies=self.p_receiver.one_random,
                                            timeout=3).json()
                topics = j_topics.get('data', None) if j_topics else None
                if not topics:
                    # Reached the last page
                    return
                # Fetch detailed information for every topic on this page
                for t in topics:
                    if t.get('object') and t.get('object').get('id'):
                        try:
                            tid = int(t['object']['id'])
                        except ValueError as ve:
                            self.logger.warning('%s: %s', ve, t['object']['id'])
                            continue
                        if self.__get_topic_message(tid):
                            yield tid
                    else:
                        break
            except RequestException as re:
                # ReadTimeout is a RequestException subclass, so it lands here too
                self.logger.warn((re, url))
            except KeyError as ke:
                self.logger.warn((ke, url))

    @staticmethod
    def __save_to_dag(child_topic_id, parent_topic_id):
        """Persist the parent-child structure as a directed acyclic graph."""
        ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
        if not ids or ids.decode() == "None":
            redis_cli.hset('zhTopicDAG', parent_topic_id, str({child_topic_id}))
        else:
            new_ids = eval(ids)
            new_ids.add(child_topic_id)
            redis_cli.hset('zhTopicDAG', parent_topic_id, str(new_ids))

    def __add_topics(self, url, topic_id, func):
        try:
            req = self.session.get(url=url % int(topic_id), headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            if not req:
                # Parent/child topics may not exist; an error response is falsy
                return
            for p in req.json()['data']:
                expand_topic_id = int(p['id'])
                func(topic_id, expand_topic_id)
                self.__get_topic_message(expand_topic_id)
        except RequestException as re:
            self.logger.warn(re)

    def __expand_topics(self, tid):
        """Expand a topic in both directions: toward parents and children."""
        self.__add_topics(parent_url, tid,
                          lambda a, b: self.__save_to_dag(a, b))
        self.__add_topics(child_url, tid,
                          lambda a, b: self.__save_to_dag(b, a))

    @timethis
    def process(self):
        for tid in self.__get_hot_topics():
            self.__expand_topics(tid)
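# A hedged sketch of reading back the graph that __save_to_dag persists: each
# field of the zhTopicDAG hash maps a parent topic id to the str() form of a
# Python set of child ids, so ast.literal_eval decodes it without the risks
# of eval. The traversal helpers below are illustrative, not project code.

import ast

def children_of(topic_id):
    raw = redis_cli.hget('zhTopicDAG', topic_id)
    if not raw or raw.decode() == 'None':
        return set()
    return ast.literal_eval(raw.decode())   # e.g. "{123, 456}" -> {123, 456}

def walk(root_id, depth=0, max_depth=3):
    """Depth-first print of the topic tree rooted at root_id."""
    if depth > max_depth:
        return
    print('  ' * depth + str(root_id))
    for child in children_of(root_id):
        walk(child, depth + 1, max_depth)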