def crawl_topic_page(self, max_page):
    """Download the topic's answer pages one page at a time."""
    if not max_page or max_page == 0:
        # Page count unknown: fall back to fetching the entry page only.
        url = crawl_topic_url.format(self.topic_id)
        content = crawler.crawl(url)
        if not content:
            return
        file_name = '{}/data/topics/{}/top-answers_page_1.html'.format(
            os.path.abspath('.'), self.topic_id)
        self.webpage_save(file_name, content)
        self.extract_question_id(content)
    else:
        for i in range(1, max_page + 1):
            # Crawling every answer page is expensive: the "life" topic
            # alone has close to 100,000 answers, and most of the
            # underlying questions have few followers.
            # file_name = '{}/data/topics/{}/page_{}.html'.format(os.path.abspath('.'), self.topic_id, i)
            # Crawl top answers instead: at most 50 pages of 20 answers
            # each, i.e. at most 1,000 top answers, with some questions
            # appearing more than once.
            file_name = '{}/data/topics/{}/top-answers_page_{}.html'.format(
                os.path.abspath('.'), self.topic_id, i)
            if os.path.exists(file_name):
                # Reuse the locally cached page if it was downloaded before.
                with open(file_name, 'rb') as _r:
                    content = _r.read()
            else:
                url = page_topic_url.format(self.topic_id, i)
                content = crawler.crawl(url)
                time.sleep(0.5)
                if not content:
                    continue
                self.webpage_save(file_name, content)
            # Parse question ids from both freshly crawled and cached pages.
            self.extract_question_id(content)
def crawl_topic_page(self, max_page):
    """Download the topic's answer pages one page at a time
    (simplified variant without the local file cache; webpage_save
    here takes the page number rather than a file path)."""
    if not max_page or max_page == 0:
        url = crawl_topic_url.format(self.topic_id)
        content = crawler.crawl(url)
        if not content:
            return
        self.webpage_save(1, content)
        self.extract_question_id(content)
    else:
        for i in range(1, max_page + 1):
            url = page_topic_url.format(self.topic_id, i)
            content = crawler.crawl(url)
            if not content:
                continue
            self.webpage_save(i, content)
            self.extract_question_id(content)
            time.sleep(0.5)
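# The two crawl_topic_page variants above rely on module-level names defined
# elsewhere in the repo: the URL templates crawl_topic_url / page_topic_url
# and a shared `crawler` object whose crawl(url) returns the page body or
# None on failure. A minimal sketch of those pieces, assuming Zhihu's
# top-answers URL shape (the exact templates are an assumption, not the
# repo's actual definitions):

import requests

# Hypothetical templates; the "?page=N" suffix matches the regex used in
# get_topic_max_page below.
crawl_topic_url = 'https://www.zhihu.com/topic/{}/top-answers'
page_topic_url = 'https://www.zhihu.com/topic/{}/top-answers?page={}'


class Crawler(object):
    """Minimal stand-in for the shared crawler: returns the response body
    on HTTP 200, None on any failure."""

    def crawl(self, url):
        try:
            resp = requests.get(url, timeout=10)
            return resp.content if resp.status_code == 200 else None
        except requests.RequestException:
            return None


crawler = Crawler()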
def crawl_question(self):
    """Crawl the question page at question_url."""
    self.question_url = self.get_question_url()
    content = crawler.crawl(self.question_url)
    # Debug path: load a previously saved page instead of crawling.
    # with open('./data/questions/38589246.html', 'rb') as _r:
    #     content = _r.read()
    if not content:
        return
    self.webpage_save(content)
    return self.parse_question(content)
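# get_question_url is not shown in this section; given that Zhihu question
# pages live at /question/<id>, it presumably just formats a stored id into
# a fixed template. A minimal sketch of that assumption (the template and
# attribute names are illustrative):

question_url_tpl = 'https://www.zhihu.com/question/{}'

def get_question_url(self):
    # Assumed helper: build the question page URL from self.question_id.
    return question_url_tpl.format(self.question_id)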
def get_topic_max_page(self):
    """Find the total number of pages under this topic from its entry page."""
    url = crawl_topic_url.format(self.topic_id)
    content = crawler.crawl(url)
    max_page = 0
    try:
        # Pagination links look like "?page=N"; the largest N is the page count.
        page_num = re.findall(r'\?page=(\d+)', content)
        if page_num:
            max_page = max(int(i) for i in page_num)
    except Exception as e:
        log.error('topic_topic: get_topic_max_page except={}'.format(e))
    # Return the count (0 when the probe failed); the original computed
    # max_page but never returned it.
    return max_page
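# Taken together, a topic crawl first probes the page count and then walks
# the pages; a sketch of the assumed call sequence (the class name and
# constructor are hypothetical):
#
#     topic = TopicCrawler(topic_id='19551147')
#     max_page = topic.get_topic_max_page()  # 0 if the probe failed
#     topic.crawl_topic_page(max_page)       # falls back to page 1 when 0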
def call_questionAPI(self):
    """Fetch answer data from the question API, page by page."""
    while True:
        url_api = self.get_questionAPI_url()
        # print(url_api)
        content = crawler.crawl(url_api)
        time.sleep(0.2)
        if not content:
            log.error(
                'answer_call_questionAPI: url_api={}'.format(url_api))
            break
        try:
            content_json = json.loads(content)
        except Exception as e:
            log.error(
                'answer_call_questionAPI: except={} url_api={}'.format(
                    str(e), url_api))
            # Without this break, content_json would be unbound below.
            break
        # An empty or missing "data" field means there are no more pages.
        if not content_json or not content_json.get('data'):
            break
        self.parse_answer(content_json)
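# get_questionAPI_url is not shown either; since the loop above only stops
# when "data" comes back empty, the builder presumably advances an offset on
# every call. A sketch of that assumption against Zhihu's v4 answers
# endpoint (the limit/offset handling here is illustrative, not the repo's
# actual code):

question_api_url = ('https://www.zhihu.com/api/v4/questions/{}/answers'
                    '?limit={}&offset={}')

def get_questionAPI_url(self):
    # Assumed helper: return the current page of the answer API, then
    # advance the stored offset so the next call fetches the next page.
    url = question_api_url.format(self.question_id, self.limit, self.offset)
    self.offset += self.limit
    return url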