class ZhiDaoGenerator:
    """Crawl keywords from Baidu Zhidao.

    Two kinds of keywords are collected for every listed question:

    1. the tags the question is filed under, and
    2. the keywords found on the question's own page.
    """

    def __init__(self):
        """Create the logger and the headers used for the listing request.

        Any failure while fetching the cookie (network error, missing
        ``BAIDUID`` cookie) propagates to the caller.
        """
        self.logger = LogHandler('zhidao_crawl')
        # Zhidao returns no data unless the request carries a BAIDUID cookie,
        # so obtain one from the browse page first.
        baidu_id = requests.get('https://zhidao.baidu.com/browse/',
                                timeout=3).cookies['BAIDUID']
        self.zhidao_headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Cookie': 'BAIDUID=%s' % baidu_id
        }

    def __zd_question_keywords(self, url):
        """Return the keywords listed on a single question page.

        :param url: address of the question page
        :return: list of keyword strings; empty when the request fails
        """
        try:
            resp = requests.get(url, timeout=3)
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, "html.parser")
            keywords = []
            for item in soup.find_all(
                    'li',
                    class_=lambda class_: class_ and ('word grid' in class_)):
                label = item.find(class_="word-text")
                # ``.string`` is None when the tag does not hold exactly one
                # text child; skip those instead of collecting None.
                if label is not None and label.string is not None:
                    keywords.append(label.string)
            return keywords
        except RequestException as req_err:
            self.logger.warning(req_err)
            return []

    @timethis
    def crawl_zhidao_words(self, save_keyword):
        """Crawl the Zhidao question list and hand every keyword to a callback.

        :param save_keyword: callable invoked once per keyword string
        :return: None; request failures are logged and end the crawl early
        """
        try:
            resp = requests.get(
                url='https://zhidao.baidu.com/list?_pjax=%23j-question-list-pjax-container',
                headers=self.zhidao_headers,
                timeout=3)
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, "html.parser")
            for section in soup.find_all('div',
                                         class_='question-title-section'):
                # Keywords of the category the question belongs to.
                for tag in section.find_all('a', class_='tag-item'):
                    if tag.string is not None:
                        save_keyword(tag.string.replace('\n', ''))
                # Keywords contained in the question page itself. A section
                # without a link has no page to visit.
                link = section.a
                href = link.get('href') if link is not None else None
                if not href:
                    continue
                for keyword in self.__zd_question_keywords(href):
                    save_keyword(str(keyword))
        except RequestException as req_err:
            self.logger.warning(req_err)
class ProxyManager(object):
    """Manage the proxy pool stored behind ``ProxyDBClient``.

    ``refresh`` writes freshly fetched proxies to the ``raw_proxy`` table;
    ``get``, ``getAll`` and ``delete`` operate on the ``useful_proxy`` table.
    """

    def __init__(self):
        self.db = ProxyDBClient('proxy')
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """Fetch proxies from every configured getter into the raw table.

        Each name in ``conf.proxy_getter_functions`` is looked up on
        ``GetFreeProxy`` and called. A getter that raises is logged (with the
        error) and skipped so the remaining getters still run.

        :return: None
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in conf.proxy_getter_functions:
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # Proxies are stored directly; per the original note the
                    # backing hash structure takes care of de-duplication.
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                # Best effort: one broken getter must not stop the others,
                # but keep the cause in the log instead of dropping it.
                self.log.warning("{func}: fetch proxy fail: {err!r}".format(
                    func=proxyGetter, err=e))

    def get(self):
        """Return one randomly chosen useful proxy.

        :return: a proxy key, or None when the useful table is empty
        """
        proxies = self.getAll()
        return random.choice(proxies) if proxies else None

    def delete(self, proxy):
        """Delete a proxy from the useful table.

        :param proxy: key of the proxy to remove
        :return: None
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """Return every proxy in the useful table as a list.

        :return: list of proxy keys; empty list when there are none
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        # list() over keys() yields a real list on both Python 2 and 3.
        return list(item_dict.keys()) if item_dict else list()

    def getNumber(self):
        """Return the entry counts of the raw and useful tables.

        :return: dict with the keys ``raw_proxy`` and ``useful_proxy``
        """
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
class QuestionsGenerator:
    """Fetch the essence and discussion feeds of a topic."""

    # Discussion feed.
    top_activity_url = "https://www.zhihu.com/api/v4/topics/%u/feeds/top_activity?limit=10&offset=%u"
    # Essence feed.
    essence_url = "https://www.zhihu.com/api/v4/topics/%u/feeds/essence?limit=10&offset=%u"

    def __init__(self):
        self.p_receiver = ProxiesReceiver()
        self.essence_dict = dict()
        self.activity_dict = dict()
        self.session = requests.Session()
        # Queue every topic id that has no output file yet.
        QuestionsGenerator.save_new_topic_id()
        self.logger = LogHandler("question_generator")

    @staticmethod
    def badge_parse(badge):
        """Reduce a list of badge dicts to their type and description.

        :param badge: iterable of dicts with ``type`` and ``description`` keys
        :return: list of ``{'author_type', 'author_description'}`` dicts
        """
        return [{
            'author_type': b['type'],
            'author_description': b['description']
        } for b in badge]

    @staticmethod
    def _author_fields(author):
        """Build the author-related entries shared by answers and articles."""
        return {
            'author_name': author['name'],
            'author_gender': author['gender'],
            'author_badge': QuestionsGenerator.badge_parse(author['badge']),
            'author_headline': author['headline'],
            'author_url_token': author['url_token']
        }

    def topic_sticky_parse(self, topic_sticky):
        """Flatten the entries of a pinned ("sticky") module.

        Entries whose target carries a ``title`` are read with the
        ``updated``/``created`` field names; all others are treated as answers
        and take their title from the nested ``question``.

        :param topic_sticky: iterable of feed entries, each with a ``target``
        :return: list of flattened dicts; parsing stops at the first entry
            that misses an expected key, and what was parsed so far is returned
        """
        res_array = []
        try:
            for t in topic_sticky:
                target = t['target']
                author = target['author']
                if target.get('title'):
                    res_array.append({
                        'id': target['id'],
                        'title': target['title'],
                        'author_name': author['name'],
                        'author_gender': author['gender'],
                        'author_headline': author['headline'],
                        'author_url_token': author['url_token'],
                        'excerpt': target['excerpt'],
                        'updated_time': target['updated'],
                        'contents': target['comment_count'],
                        'likes': target['voteup_count'],
                        'created_time': target['created']
                    })
                else:
                    # Popular answer: the title lives on the question.
                    res_array.append({
                        'id': target['id'],
                        'title': target['question']['title'],
                        'author_name': author['name'],
                        'author_gender': author['gender'],
                        'author_headline': author['headline'],
                        'author_url_token': author['url_token'],
                        'excerpt': target['excerpt'],
                        'updated_time': target['updated_time'],
                        'created': target['question']['created'],
                        'created_time': target['created_time']
                    })
        except KeyError as ke:
            # Log as one tuple: passing the payload as a second positional
            # argument would make logging try to %-format the message with it.
            self.logger.error((ke, topic_sticky))
        return res_array

    @timethis
    def crawl_topic_message(self, tid, q_url, num, message_dict):
        """Fetch the questions/articles of a topic feed into ``message_dict``.

        ``message_dict`` is filled through ``addtwodimdict`` with the entry
        type (``answer``, ``article``, ``topic_sticky_module``) as first key
        and an id as second key. Answers are keyed by the string form of
        ``(question id, answer id)``, articles by their id, sticky modules by
        a running counter. Each value holds the title, author data, excerpt,
        creation/update times, comment count and like count of the entry.

        :param tid: topic id
        :param q_url: URL template taking ``(tid, offset)``
        :param num: number of entries to fetch (pages of 10)
        :param message_dict: dict that receives the results
        :return: None
        """
        sticky_num = 0
        # Close the session (and its pooled connections) when done.
        with requests.session() as session:
            for offset in range(0, num, 10):
                url = q_url % (tid, offset)
                try:
                    ques = session.get(url=url,
                                       headers=headers,
                                       proxies=self.p_receiver.one_random,
                                       timeout=3)
                except Exception as req_err:
                    self.logger.warning((req_err, url))
                    continue
                try:
                    q_json = ques.json() if ques else {}
                except JSONDecodeError as je:
                    self.logger.error((je, url, ques))
                    continue
                for q in q_json.get('data', []):
                    target = q.get('target', {})
                    question_type = str(target.get('type',
                                                   'none_type')).lower()
                    if question_type == 'none_type':
                        continue
                    elif question_type == 'answer':
                        # An answer has two ids; use the tuple's string form.
                        record = {'question': target['question']['title']}
                        record.update(self._author_fields(target['author']))
                        record.update({
                            'excerpt': target['excerpt'],
                            'created_time': target['created_time'],
                            'updated_time': target['updated_time'],
                            'contents': target['comment_count'],
                            'likes': target['voteup_count']
                        })
                        addtwodimdict(
                            message_dict, 'answer',
                            str((target['question']['id'], target['id'])),
                            record)
                    elif question_type == 'article':
                        # An article has a single id.
                        record = {'title': target['title']}
                        record.update(self._author_fields(target['author']))
                        record.update({
                            'excerpt': target['excerpt'],
                            'created_time': target['created'],
                            'updated_time': target['updated'],
                            'contents': target['comment_count'],
                            'likes': target['voteup_count']
                        })
                        addtwodimdict(message_dict, 'article', target['id'],
                                      record)
                    elif question_type == 'question':
                        # Per the original note these are questions without
                        # answers; they are filtered out.
                        pass
                    elif question_type == 'topic_sticky_module':
                        # Pinned module.
                        addtwodimdict(
                            message_dict, 'topic_sticky_module', sticky_num, {
                                'title': target['title'],
                                'data':
                                self.topic_sticky_parse(target['data'])
                            })
                        sticky_num += 1
                    else:
                        self.logger.error(
                            "There was a new type:{}!\n".format(question_type))
                # Stop paging once the API reports the last page.
                if str(q_json.get('paging', {}).get('is_end',
                                                    'none')).lower() == 'true':
                    return

    def process(self):
        """Crawl one queued topic and dump its feeds to ``<tid>.json``.

        The topic id is popped from ``zhNewTopicID``. The essence feed goes
        to ``essence_dict`` and the discussion feed to ``activity_dict``;
        both are written to a JSON file under ``OUTPUT_PATH``.

        :return: None
        """
        tid = int(redis_cli.block_pop('zhNewTopicID'))
        # Start from empty results so the file written below holds only this
        # topic, not records left over from earlier process() calls.
        self.essence_dict = dict()
        self.activity_dict = dict()
        self.crawl_topic_message(tid, QuestionsGenerator.essence_url,
                                 conf.essence_nums, self.essence_dict)
        self.crawl_topic_message(tid, QuestionsGenerator.top_activity_url,
                                 conf.top_activity_nums, self.activity_dict)
        out_path = os.path.join(OUTPUT_PATH, "{}.json".format(tid))
        with open(out_path, "w", encoding="GB18030") as f:
            json.dump(
                {
                    'essence': self.essence_dict,
                    'top_activity': self.activity_dict
                },
                f,
                indent=4,
                ensure_ascii=False)

    @staticmethod
    def save_new_topic_id():
        """Queue every known topic id that has no output file yet.

        File names in ``OUTPUT_PATH`` (up to the first dot) are taken as the
        ids already processed, so do not place other files in that directory.

        :return: None
        """
        done = {name.split('.')[0] for name in os.listdir(OUTPUT_PATH)}
        for raw_id in set(redis_cli.hkeys('zhTopicMessage')):
            # NOTE(review): redis clients commonly return bytes keys, which
            # never compare equal to the str file names; normalise to str so
            # processed ids are really skipped. No-op if keys are already str.
            nid = raw_id.decode() if isinstance(raw_id, bytes) else str(raw_id)
            if nid not in done:
                redis_cli.sadd('zhNewTopicID', nid)
class ZhihuTopicGenerator:
    """Collect Zhihu topics in two stages: id discovery and expansion."""

    def __init__(self):
        # Proxy source.
        self.p_receiver = ProxiesReceiver()
        # Session with connection retries and a back-off between attempts.
        self.session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.logger = LogHandler('topics_generator')
        logging.getLogger("urllib3").setLevel(logging.ERROR)

    def __get_topic_message(self, tid):
        """Fetch a topic's details and store them in ``zhTopicMessage``.

        The hash value is the string form of a dict with the keys ``name``,
        ``introduction``, ``questions_count``, ``best_answers_count``,
        ``followers_count`` and ``best_answerers_count``. When ``hset``
        returns a truthy value the id is also added to ``zhNewTopicID``.

        :param tid: topic id
        :return: True when ``hset`` returned a truthy value, else False
            (including when the request failed)
        """
        try:
            j_rst = self.session.get(url=topic_message_url % tid,
                                     headers=headers,
                                     proxies=self.p_receiver.one_random,
                                     timeout=3).json()
            message = str({
                "name": j_rst.get("name"),
                'introduction': j_rst.get("introduction"),
                "questions_count": j_rst.get("questions_count"),
                "best_answers_count": j_rst.get("best_answers_count"),
                'followers_count': j_rst.get("followers_count"),
                "best_answerers_count": j_rst.get("best_answerers_count")
            })
            if redis_cli.hset('zhTopicMessage', tid, message):
                # Mark the topic as waiting for its questions to be fetched.
                redis_cli.sadd('zhNewTopicID', tid)
                self.logger.info("zhNewTopicID:%d", tid)
                return True
        except RequestException as req_err:
            self.logger.warning(req_err)
        return False

    def __get_hot_topics(self):
        """Search one keyword from ``zhTemporaryWords`` and yield topic ids.

        Result pages are walked in steps of 10 up to offset 1000, or until a
        page comes back without data. Only ids for which
        ``__get_topic_message`` returned True are yielded.
        """
        tw = redis_cli.block_pop('zhTemporaryWords').decode('utf-8')
        for offset in range(0, 1000, 10):
            url = zh_search_url % (tw, offset)
            try:
                j_topics = self.session.get(
                    url=url,
                    headers=headers,
                    proxies=self.p_receiver.one_random,
                    timeout=3).json()
                topics = j_topics.get('data', None) if j_topics else None
                if not topics:
                    # Reached the last page.
                    return
                for t in topics:
                    raw_id = (t.get('object') or {}).get('id')
                    if not raw_id:
                        continue
                    try:
                        tid = int(raw_id)
                    except ValueError as ve:
                        # One tuple, not (msg, arg): logging would otherwise
                        # try to %-format the message with the id.
                        self.logger.warning((ve, raw_id))
                        continue
                    if not self.__get_topic_message(tid):
                        # NOTE(review): leaves the rest of this page as soon
                        # as a topic was not newly stored - confirm that this
                        # is intended rather than a `continue`.
                        break
                    yield tid
            except RequestException as req_err:
                # ReadTimeout is a RequestException subclass, so it is
                # handled here as well.
                self.logger.warning((req_err, url))
            except KeyError as ke:
                self.logger.warning((ke, url))

    @staticmethod
    def __save_to_dag(child_topic_id, parent_topic_id):
        """Record a parent -> child edge in the ``zhTopicDAG`` hash.

        The hash value is the string form of the set of child ids.
        """
        import ast  # local import keeps this block self-contained

        stored = redis_cli.hget('zhTopicDAG', parent_topic_id)
        text = stored.decode() if isinstance(stored, bytes) else stored
        if not text or text in ("None", "set()"):
            children = set()
        else:
            # SECURITY: this used to eval() the value read from redis, which
            # would execute arbitrary code if the store were tampered with.
            # literal_eval parses the same set literal without executing it.
            children = ast.literal_eval(text)
        children.add(child_topic_id)
        redis_cli.hset('zhTopicDAG', parent_topic_id, str(children))

    def __add_topics(self, url, topic_id, func):
        """Fetch the topics related to ``topic_id`` and register each one.

        :param url: URL template taking the topic id
        :param topic_id: id of the topic being expanded
        :param func: callable ``func(topic_id, related_id)`` storing the edge
        """
        try:
            req = self.session.get(url=url % int(topic_id),
                                   headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            if not req:
                # Parent/child topics may simply not exist.
                return
            for p in req.json()['data']:
                expand_topic_id = int(p['id'])
                func(topic_id, expand_topic_id)
                self.__get_topic_message(expand_topic_id)
        except RequestException as req_err:
            # Also covers ReadTimeout, which subclasses RequestException.
            self.logger.warning(req_err)

    def __expand_topics(self, tid):
        """Expand a topic towards both its parents and its children."""
        # Parent lookup: tid is the child of every topic returned.
        self.__add_topics(parent_url, tid,
                          lambda a, b: self.__save_to_dag(a, b))
        # Child lookup: tid is the parent of every topic returned.
        self.__add_topics(child_url, tid,
                          lambda a, b: self.__save_to_dag(b, a))

    @timethis
    def process(self):
        """Discover topics for one keyword and expand each of them."""
        for tid in self.__get_hot_topics():
            self.__expand_topics(tid)