def parse_answer(self, response):  # fixed misspelled parameter "reponse"; scrapy passes it positionally
    """Parse one page of a question's answers (JSON API) and yield items.

    Follows the API's own pagination: keeps requesting ``paging.next``
    until ``paging.is_end`` is true.
    """
    ans_json = json.loads(response.text)
    is_end = ans_json["paging"]["is_end"]
    next_url = ans_json["paging"]["next"]
    # Extract the concrete fields of each answer on this page.
    for answer in ans_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # Anonymous authors carry no "id" key.
        answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
        answer_item["content"] = answer["content"] if "content" in answer else None
        # "parise_num" [sic] — key must match the field declared on ZhihuAnswerItem.
        answer_item["parise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = answer["created_time"]
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        yield answer_item
    # Request the next page of answers, if any remain.
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
def parse_answer(self, response):
    """Parse the answers JSON API response and yield answer items.

    Records failure statistics when the response status is one of the
    handled error codes, then emits one item per answer and follows
    the ``paging.next`` URL until ``paging.is_end`` is true.
    """
    if response.status in self.handle_httpstatus_list:
        self.failed_urls.append(response.url)
        # Stats collection: bump "failed_url" when status is 403/404/500.
        self.crawler.stats.inc_value("failed_url")
    answer_dict = json.loads(response.text)  # renamed from misspelled "answer_dcit"
    is_end = answer_dict['paging']['is_end']
    next_url = answer_dict['paging']['next']
    for answer in answer_dict['data']:
        answer_item = ZhihuAnswerItem()
        answer_item["answer_id"] = answer['id']
        answer_item["question_id"] = answer['question']['id']
        answer_item["answer_url"] = answer['url']
        # Anonymous authors carry no 'id' key.
        answer_item["author_id"] = answer['author']['id'] if 'id' in answer['author'] else ''
        # NOTE(review): direct access — raises KeyError if the API omits
        # 'content'; sibling parsers guard this. Confirm the API contract.
        answer_item["content"] = answer['content']
        answer_item["praise_nums"] = answer['voteup_count']
        answer_item["comment_nums"] = answer['comment_count']
        answer_item["create_time"] = answer['created_time']
        answer_item["update_time"] = answer['updated_time']
        answer_item["crawl_time"] = datetime.datetime.now()
        # Removed dead locals question_create_time/question_update_time:
        # they were read from the payload but never used.
        yield answer_item
    if not is_end:
        yield scrapy.Request(next_url, callback=self.parse_answer)
def parse_answer(self, reponse):  # parameter name kept as-is ("reponse" [sic])
    """Handle one page of a question's answers and keep paginating."""
    payload = json.loads(reponse.text)
    paging = payload["paging"]
    for raw in payload["data"]:
        item = ZhihuAnswerItem()  # one item per answer record
        item["zhihu_id"] = raw["id"]
        item["url"] = raw["url"]
        item["question_id"] = raw["question"]["id"]
        # anonymous users carry no "id" key
        item["author_id"] = raw["author"].get("id")
        item["content"] = raw.get("content")
        item["parise_num"] = raw["voteup_count"]
        item["comments_num"] = raw["comment_count"]
        item["create_time"] = raw["created_time"]
        item["update_time"] = raw["updated_time"]
        item["crawl_time"] = datetime.datetime.now()  # crawl timestamp
        yield item  # hand off to the pipelines
    # keep requesting pages until the API reports the final one
    if not paging["is_end"]:
        yield scrapy.Request(paging["next"], headers=self.headers, callback=self.parse_answer)
def parse_answer(self, response):
    """The answer endpoint returns JSON: decode it, emit items, follow paging."""
    payload = json.loads(response.text)
    paging = payload['paging']
    for raw in payload['data']:
        item = ZhihuAnswerItem()
        item['zhihu_id'] = raw['id']
        item['url'] = raw['url']
        item['question_id'] = raw['question']['id']
        # anonymous authors have no 'id' field
        item['author_id'] = raw['author'].get('id')
        # fall back to the excerpt when full content is absent
        if 'content' in raw:
            item['content'] = raw['content']
        else:
            item['content'] = raw['excerpt']
        item['praise_num'] = raw['voteup_count']
        item['comments_num'] = raw['comment_count']
        item['create_time'] = raw['created_time']
        item['update_time'] = raw['updated_time']
        # stored as epoch seconds rather than a datetime object
        item['crawl_time'] = time.time()
        yield item
    # the JSON tells us whether more pages remain and where to fetch them
    if not paging['is_end']:
        yield scrapy.Request(paging['next'], headers=self.header, callback=self.parse_answer)
def parse_answer(self, response):
    """Yield one item per answer in the JSON page, then follow pagination.

    The follow-up request forwards the cookiejar from ``response.meta``
    so the session is preserved across pages.
    """
    payload = json.loads(response.text)
    paging = payload['paging']
    for raw in payload['data']:
        item = ZhihuAnswerItem()
        item['id'] = raw['id']
        item['url'] = raw['url']
        item['question_id'] = raw['question']['id']
        # author fields are absent for anonymous users
        item['author_id'] = raw['author'].get('id')
        item['author_name'] = raw['author'].get('name')
        item['content'] = raw.get('content')
        item['approve_num'] = raw['voteup_count']
        item['comments_num'] = raw['comment_count']
        item['create_time'] = raw['created_time']
        item['update_time'] = raw['updated_time']
        item['crawl_time'] = int(time.time())  # epoch seconds, truncated
        yield item
    if not paging['is_end']:
        yield scrapy.Request(
            paging['next'],
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_answer)
def parse_answer(self, response):
    """Parse one page of answers from the JSON API and yield items.

    Bug fixed: the original passed
    ``format(response.meta.get("question_id", ""), 20, 0)`` as a positional
    argument to ``scrapy.Request`` — ``format()`` accepts at most two
    arguments (so this raised TypeError), and even if it hadn't, the value
    would have been bound to the ``callback`` parameter. The stray argument
    is removed; pagination now simply re-requests ``next_url``.
    """
    answer_json = json.loads(response.text)
    is_end = answer_json["paging"]["is_end"]
    next_url = answer_json["paging"]["next"]
    # (Removed unused local "totals" that read answer_json["paging"]["totals"].)
    # Extract the concrete data for each answer.
    for answer in answer_json["data"]:
        answer_item = ZhihuAnswerItem()  # renamed from misspelled "answer_itme"
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # Anonymous authors carry no "id" key.
        answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
        answer_item["content"] = answer["content"] if "content" in answer else None
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = answer["created_time"]
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.datetime.now()
        yield answer_item
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
def parse_answer(self, response):
    """Process a question's answers (JSON API) and yield answer items.

    Bug fixed: ``next_url`` was read from ``paging["is_end"]`` (a boolean)
    instead of ``paging["next"]``, so pagination requested a boolean as a
    URL. It now correctly follows ``paging.next``.
    """
    ans_json = json.loads(response.text)
    is_end = ans_json["paging"]["is_end"]  # whether more pages remain
    next_url = ans_json["paging"]["next"]  # was: ans_json["paging"]["is_end"]
    # Extract the concrete fields of each answer.
    for answer in ans_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item['zhihu_id'] = answer["id"]
        answer_item['url'] = answer["url"]
        answer_item['question_id'] = answer["question"]["id"]
        # Anonymous askers/authors currently have no "id" (or id=0).
        answer_item['author_id'] = answer["author"]["id"] if "id" in answer["author"] else None
        answer_item['content'] = answer["content"] if "content" in answer else None
        answer_item['praise_num'] = answer["voteup_count"]
        answer_item['comments_num'] = answer["comment_count"]
        answer_item['create_time'] = answer["created_time"]
        answer_item['update_time'] = answer["updated_time"]
        answer_item['crawl_time'] = datetime.datetime.now()  # current time
        yield answer_item  # hand off to the pipeline
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
def parse_answer(self, response):
    """Emit one item per answer from the JSON page, then follow pagination.

    The answer body ('content') is deliberately not collected here.
    """
    payload = json.loads(response.text)
    paging = payload['paging']
    for raw in payload['data']:
        item = ZhihuAnswerItem()
        item['zhihu_id'] = raw['id']
        item['url'] = raw['url']
        item['question_id'] = raw['question']['id']
        # anonymous authors have no 'id' field
        item['author_id'] = raw['author'].get('id')
        item['praise_num'] = raw['voteup_count']
        item['comments_num'] = raw['comment_count']
        # NOTE: this variant stores timestamps under 'updated_time'/'created_time'
        item['updated_time'] = raw['updated_time']
        item['created_time'] = raw['created_time']
        item['crawl_time'] = datetime.datetime.now()
        yield item
    # keep requesting pages until the API reports the final one
    if not paging['is_end']:
        yield scrapy.Request(paging['next'], callback=self.parse_answer)
def parse_answer(self, response):
    """Extract every answer on this API page, then auto-fetch more.

    1. Yields one ZhihuAnswerItem per answer record.
    2. Follows ``paging.next`` until ``paging.is_end`` is true.
    """
    payload = json.loads(response.text)
    paging = payload["paging"]
    for raw in payload["data"]:
        item = ZhihuAnswerItem()
        item['answer_id'] = raw['id']
        item['url'] = raw['url']
        item['question_id'] = raw['question']['id']
        # author fields are absent for anonymous users
        item['author_id'] = raw["author"].get('id')
        item['author_name'] = raw['author'].get('name')
        item['content'] = raw['content']
        item['agree_num'] = raw['voteup_count']
        item['comment_num'] = raw['comment_count']
        item['create_time'] = raw['created_time']
        item['update_time'] = raw['updated_time']
        yield item
    if not paging["is_end"]:
        yield scrapy.Request(paging["next"], headers=self.headers, callback=self.parse_answer)
def parse_answer(self, response):
    """Parse the JSON returned by the answers API and extract items.

    Bug fixed: the original called ``json.load(response.text)`` —
    ``json.load`` expects a file-like object and raises AttributeError on a
    string. ``json.loads`` is the correct call for ``response.text``.
    """
    answer_json = json.loads(response.text)  # was: json.load(response.text)
    is_end = answer_json["paging"]["is_end"]
    next_url = answer_json["paging"]["next"]
    for answer in answer_json["data"]:
        answer_item = ZhihuAnswerItem()
        answer_item["zhihu_id"] = answer["id"]
        answer_item["url"] = answer["url"]
        answer_item["question_id"] = answer["question"]["id"]
        # The author may be anonymous, in which case the "id" key is absent.
        answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
        answer_item["content"] = answer["content"] if "content" in answer else None
        answer_item["praise_num"] = answer["voteup_count"]
        answer_item["comments_num"] = answer["comment_count"]
        answer_item["create_time"] = answer["created_time"]
        answer_item["update_time"] = answer["updated_time"]
        answer_item["crawl_time"] = datetime.now()
        yield answer_item
    # Not the last page: keep fetching answers via next_url.
    if not is_end:
        yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
def parse_answer(self, response):
    """The answer list comes back as JSON: decode, yield items, paginate."""
    payload = json.loads(response.text)
    paging = payload["paging"]  # holds is_end flag and next-page URL
    for raw in payload["data"]:
        item = ZhihuAnswerItem()
        item["zhihu_id"] = raw["id"]
        item["url"] = raw["url"]
        item["question_id"] = raw["question"]["id"]
        # anonymous authors carry no "id" key
        item["author_id"] = raw["author"].get("id")
        item["content"] = raw.get("content")
        item["praise_num"] = raw["voteup_count"]
        item["comments_num"] = raw["comment_count"]
        item["create_time"] = raw["created_time"]
        item["update_time"] = raw["updated_time"]
        yield item  # process the item downstream
    # not the last page yet: request the next one
    if not paging["is_end"]:
        yield scrapy.Request(paging["next"], headers=self.headers, callback=self.parse_answer)