def __init__(self): logger.info("TopicSpider init...") super().__init__() # url_manager方法已经内置,只需要使用id_manager传入ID参数即可 self.id_manager = SpiderFrame.UrlManager( db_set_name=config.TOPIC_ID_SET, use_redis=config.USE_REDIS) self.exit_code = 1
def __init__(self, get_detail):
    super().__init__()
    self.get_detail = get_detail
    # Add discovered question IDs to the question_id set
    if get_detail:
        self.url_manager = SpiderFrame.UrlManager(
            use_redis=config.USE_REDIS, db_set_name=config.QUESTION_ID_SET)
def __init__(self): logger.info("CommentSpider init...") super().__init__() self.exit_code = 1 self.flag = True # url_manager方法已经内置,只需要使用id_manager传入ID参数即可 self.id_manager = SpiderFrame.UrlManager(db_set_name=config.ANSWER_ID_SET, use_redis=config.USE_REDIS)
def get_hot_list(get_detail=False):
    html_downloader = SpiderFrame.HtmlDownloader()
    html_parser = HTMLParser(get_detail)
    data_saver = SpiderFrame.DataSaver(db_name="知乎", set_name="热榜")
    url = "https://www.zhihu.com/api/v3/feed/topstory/hot-lists/total?limit=50&desktop=true"
    result = {"HotListUpdated": strftime("%Y-%m-%d", localtime()), "data": []}
    logger.info("Getting Hot List....")
    while True:
        res = html_downloader.download(url)
        res = json_lds(res)
        for data in res['data']:
            html_parser.parse(data)
            result['data'].append(data)
        if res['paging']['is_end']:
            logger.info("Paging is end, exit")
            break
        url = res['paging']['next']
    html_downloader.proxies.__exit__()
    logger.info("Saving Data To MongoDB")
    data_saver.mongo_insert(result)
    logger.info('Complete!')
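

# A minimal usage sketch (an assumption, not part of the original module): running the
# file directly performs one hot-list crawl; get_detail=True additionally queues the
# question IDs found on the hot list, as the HTMLParser(get_detail) call above suggests.
if __name__ == "__main__":
    get_hot_list(get_detail=True)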
@desc Topic spider: many questions belong to a given topic; crawl those questions'
      info (based on the waiting-for-answers list)
@main_function spider(question_id: str)
"""
from frame import SpiderFrame
from bs4 import BeautifulSoup
from time import sleep
from redis import Redis
from re import findall
from requests import exceptions
import pymongo.errors
import config
import json

logger = SpiderFrame.logger
html_downloader = SpiderFrame.HtmlDownloader()
data_saver = SpiderFrame.DataSaver(db_name=config.DB_NAME, set_name=config.TOPIC_SET)
url_manager = SpiderFrame.UrlManager(db_set_name=config.TOPIC_SET, use_redis=config.USE_REDIS)
redis = Redis(host=config.REDIS_HOST, port=config.REDIS_PORT, password=config.REDIS_PASSWORD)


def parse_base_topic_info(html: str):
    soup = BeautifulSoup(html, "lxml")
    try:
        title = soup.find("h2", {"class": "ContentItem-title"}).text
    except Exception as e:
        logger.error("Get Topic title failed, Exception: {0}".format(e))
def __init__(self):
    super().__init__()
    self.url_manager = SpiderFrame.UrlManager(
        use_redis=config.USE_REDIS, db_set_name=config.QUESTION_SET)
        follower = int(
            soup.find_all("strong", {"class": "NumberBoard-itemValue"})[0].text.strip().replace(",", ""))
    except:
        follower = 0
    try:
        watched = int(
            soup.find_all("strong", {"class": "NumberBoard-itemValue"})[1].text.strip().replace(",", ""))
    except:
        watched = 0
    return title, question, tag_list, follower, watched


html_parser = HtmlParser()
html_downloader = SpiderFrame.HtmlDownloader()
data_saver = SpiderFrame.DataSaver(db_name=config.DB_NAME, set_name=config.QUESTION_SET)


def _init_url_(_question_id: str):
    base_url_start = "https://www.zhihu.com/api/v4/questions/"
    base_url_end = "/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed" \
                   "%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by" \
                   "%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count" \
                   "%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info" \
                   "%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting" \
                   "%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B" \
                   "%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics" \
                   "&limit=5&offset=0"
    return base_url_start + _question_id + base_url_end
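

# A small illustrative check (the question ID below is hypothetical, used only for this
# sketch): _init_url_ splices the question ID between the fixed API prefix and the long
# URL-encoded include/limit/offset suffix, yielding the first page of the answers API.
if __name__ == "__main__":
    demo_url = _init_url_("19550225")  # hypothetical question ID
    assert demo_url.startswith("https://www.zhihu.com/api/v4/questions/19550225/answers?")
    print(demo_url)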
def search(keyword):
    global KWD
    KWD = keyword
    logger.info("Getting Keywords:{0}, detail={1}".format(
        keyword, config.KEY_WORDS_SEARCH_ADD_QUESTION_ID))
    base_url = 'https://api.zhihu.com/search_v3'
    html_downloader = SpiderFrame.HtmlDownloader()
    data_saver = SpiderFrame.DataSaver(db_name=config.DB_NAME,
                                       set_name=config.KEY_WORDS_SEARCH_SET)
    html_downloader.headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
                  "application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "cache-control": "no-cache",
        "cookie": "d_c0=\"AADWzpXyDxKPTkP_r53qvH9ipDf4dAG7XE4=|1603087870\"; "
                  "_zap=b47b3886-7c4a-4101-9ee5-4c803bcf6cd8; _xsrf=LRWrd8I0FyQr3hxZ49tYEABlJI0MFizY; "
                  "capsion_ticket=\"2|1:0|10:1603262862|14:capsion_ticket|44"
                  ":N2UxNWE4YzExZWYxNDUwYWFkZjM4MjQ4MDhjNWExNjY"
                  "=|fa44c3793ac9cf5fac96aab9dc9d8faadba2d384e00351c9c9642028ceace6ad\"; "
                  "r_cap_id=\"YmY4MWY5YzA0OWRlNDk0Yjk2MTEyYWEzZDU5MjZmMmM=|1603262864"
                  "|9dbd3b9caeccd1669c26ee92e5b543543a611713\"; "
                  "cap_id=\"OGVlYjJjOTQ2YTgyNGMzZTlmODk4NDUzMzQ0ZTkyNjA=|1603262864"
                  "|5e52e69215700dd4539d66e5a0833dd4a0c4c1fe\"; "
                  "l_cap_id=\"ODViMjY0YmExNWNlNGVmYWJmMGY5MGUyNTUzMjQxMzM=|1603262864"
                  "|8a107e67c1f9223cd88f066cda42b6ce2102b632\"; "
                  "z_c0=Mi4xQnNEUkNBQUFBQUFBQU5iT2xmSVBFaGNBQUFCaEFsVk5saWQ5WUFERVEzVUJpOVdzZHRZcnloaE9OZWVXVDZwTlhR"
                  "|1603262870|42b123d5ae8b1fb74a8815b13eae8cb34f92508c; tst=r; "
                  "q_c1=582f701a20454c59be03f2470d62b194|1603326280000|1603326280000; "
                  "Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1603653130,1603680022,1603682173,1603683176; "
                  "Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1603683176; "
                  "KLBRSID=af132c66e9ed2b57686ff5c489976b91|1603684342|1603684271",
        "pragma": "no-cache",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "******",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51 "
    }
    prams = {
        "advert_count": "0",
        "correction": "1",
        "lc_idx": "0",
        "limit": "20",
        "offset": "0",
        "q": keyword,
        "search_hash_id": "1e3c9a021028e71019c7977637948651",
        "show_all_topics": "0",
        "t": "general",
        "vertical_info": "0,1,0,0,0,0,0,0,0,2"
    }
    html_parser = HTMLParser(config.KEY_WORDS_SEARCH_ADD_QUESTION_ID)
    res = html_downloader.download(url=base_url, params=prams)
    if not data_saver.mg_data_db.find_one({"KeyWord": KWD}):
        data_saver.mongo_insert({"KeyWord": KWD, "data": []})
    while True:
        res = json.loads(res)
        logger.info("Saving Data To MongoDB")
        for data in res['data']:
            data_saver.mg_data_db.update_one({"KeyWord": KWD},
                                             {'$addToSet': {"data": data}})
            html_parser.parse(data)
        if res['paging']['is_end']:
            logger.info("Paging is end, exit")
            break
        next_url = res['paging']['next']
        res = html_downloader.download(next_url)
    # exit
    logger.info("Exit All Thread")
    html_downloader.proxies.__exit__()
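

# A minimal usage sketch (assumed entry point, not in the original source): search() pages
# through the search_v3 API for one keyword and accumulates every result document under a
# single {"KeyWord": ...} record in MongoDB.
if __name__ == "__main__":
    search("Python")  # example keyword chosen for illustration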
def __init__(self, get_detail=False):
    super().__init__()
    self.get_detail = get_detail
    if get_detail:
        self.url_manager = SpiderFrame.UrlManager(db_set_name='知乎@HotList')
@creat_date 2020/10/06
@update_data 2020/10/06
@desc Comment spider: given an answer's ID, crawl all comments under that answer
      <incremental crawling implemented>
@info No other links are generated, so no URL queue is needed
@main_function spider(answer_id: str)
"""
from frame import SpiderFrame
from requests import exceptions
from json import loads as json_lds
from time import sleep
from redis import Redis
import config

logger = SpiderFrame.logger
html_downloader = SpiderFrame.HtmlDownloader()
url_manager = SpiderFrame.UrlManager(use_redis=config.USE_REDIS, db_set_name=config.COMMENT_SET)
data_saver = SpiderFrame.DataSaver(db_name=config.DB_NAME, set_name=config.COMMENT_SET)
redis = Redis(host=config.REDIS_HOST, port=config.REDIS_PORT, password=config.REDIS_PASSWORD)


def spider(answer_id: str) -> None:
    # Crawl comments incrementally
    offset = config.MONGO_DOC_LIMIT
    logger.info("Get comments for answer id: {0}".format(answer_id))
    url = "https://www.zhihu.com/api/v4/answers/{}/root_comments?limit=10&offset=0&order=normal&status=open" \