def parse_question(self, response):
    """Parse a question page into a ZhihuQuestionItem.

    Handles both the new page layout (detected via the "QuestionHeader-title"
    marker in the HTML) and the legacy layout, then schedules the first
    answers-API request for the question.
    """
    # The question id is embedded in the URL for both layouts; bail out
    # early if the URL does not look like a question page (previously a
    # failed match left question_id/question_item unbound and crashed at
    # the yields below).
    # BUG FIX: in the new-layout branch this assignment was commented out
    # while "if match_obj:" still referenced it, raising NameError on
    # every new-layout page.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy page layout.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
    question_item = item_loader.load_item()
    # Debug print removed. Start the answers API at offset 0, page size 20.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Process a question page (new "QuestionHeader" layout only) and
    extract the concrete question item, then kick off its answers request."""
    if "QuestionHeader-title" not in response.text:
        return
    url_match = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not url_match:
        return
    q_id = int(url_match.group(2))
    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", ".QuestionHeader-detail")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", q_id)
    loader.add_css("answer_num", ".List-headerText span::text")
    loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    loader.add_css("watch_user_num", ".NumberBoard-value::text")
    loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = loader.load_item()
    # First answers-API page: 20 answers starting at offset 0.
    yield scrapy.Request(self.start_answer_url.format(q_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
def parse_questions(self, response):
    """Build a ZhihuQuestionItem from a question page, then request its answers."""
    q_id = response.meta['question_id']
    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_value('q_id', q_id)
    loader.add_value('q_url', response.url)
    loader.add_css('q_title', '.QuestionHeader-tags+h1::text')
    loader.add_css(
        'q_content', '.QuestionRichText.QuestionRichText--collapsed span::text')
    loader.add_css('q_topic', '.Tag.QuestionTopic .Popover div::text')
    loader.add_css('q_answers_num', '.List-headerText span::text')
    # Follower / watcher counts share the same NumberBoard container.
    loader.add_xpath(
        'q_follower',
        '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/button//strong/text()')
    loader.add_xpath(
        'q_watcher',
        '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/div//strong/text()')
    loader.add_value('crawl_time', datetime.datetime.now())
    question_item = loader.load_item()
    # First answers page: 15 answers starting at offset 0.
    yield Request(url=self.temp_answers_url[0].format(q_id, 15, 0),
                  callback=self.parse_answers,
                  headers=self.headers,
                  meta={'q_id': q_id})
    yield question_item
def parse_question(self, response):
    """Parse question details from ``<meta itemprop=...>`` tags, then fetch a
    bounded range of answers.

    Yields the question item first, then one answers-API request per page of
    20 answers, starting at ``self.answer_offset`` and capped at
    ``self.answer_count``.
    """
    def _meta(prop):
        # First content value of the matching <meta itemprop="..."> tag.
        return response.xpath('//meta[@itemprop="%s"]/@content' % prop).extract()[0]

    item = ZhihuQuestionItem()
    item['name'] = _meta("name")
    item['url'] = _meta("url")
    item['keywords'] = _meta("keywords")
    item['answer_count'] = _meta("answerCount")
    item['comment_count'] = _meta("commentCount")
    item['flower_count'] = _meta("zhihu:followerCount")
    # dateCreated is ISO-8601; keep the first 19 chars ("YYYY-MM-DDTHH:MM:SS")
    # and replace the "T" separator with a space.
    item['date_created'] = _meta("dateCreated")[0:19].replace('T', ' ')
    count_answer = int(item['answer_count'])
    question_id = int(
        re.match(r'https://www.zhihu.com/question/(\d+)', response.url).group(1))
    item['question_id'] = question_id
    yield item
    # Fetch answers from the configured offset, never past the configured cap.
    if count_answer > self.answer_count:
        count_answer = self.answer_count
    n = self.answer_offset
    while n + 20 <= count_answer:
        # BUG FIX: the request window used to be (n, n + 10) while the loop
        # strides by 20, silently skipping half of every page; the sibling
        # implementation uses (n, n + 20), matching the stride.
        yield scrapy.Request(self.more_answer_url.format(question_id, n, n + 20),
                             headers=self.headers, cookies=self.cookies,
                             callback=self.parse_answer)
        n += 20
def parse_question(self, response):
    """Load a ZhihuQuestionItem from the question page and yield it."""
    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_xpath('title', '//h1[@class="QuestionHeader-title"]/text()')
    loader.add_xpath('content', '//span[@class="RichText"]/text()')
    # The question id travels in the request meta set by the scheduler.
    loader.add_value('zhihu_id', response.meta.get('question_id', ''))
    loader.add_value('url', response.url)
    loader.add_css('answer_num', '.List-headerText span::text')
    loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    loader.add_css('watch_user_num', '.NumberBoard-value::text')
    loader.add_xpath('topics', '//div[@class="Popover"]/div/text()')
    yield loader.load_item()
def get_question_url(self, response):
    """Turn one JSON page of the questions API into question items plus
    one follow-up answers request per question."""
    payload = json.loads(response.text)
    for entry in payload.get('data'):
        question_id = entry.get('id')
        question = ZhihuQuestionItem()
        question['id'] = question_id
        for field in ('title', 'answer_count', 'follower_count'):
            question[field] = entry.get(field)
        yield question
        yield Request(url=self.anwser_url.format(question_id), callback=self.parse)
def parse_ans_and_que(self, response):
    # Parse answer and question data from the answers-API response, and
    # build the authors' personal-page URLs to visit next.
    response_json = json.loads(response.text)
    if response_json["paging"]["is_start"] == True:
        # Only take the question data from the first record of the API's
        # first page (each record repeats the same question).
        question_item = ZhihuQuestionItem()
        data = response_json["data"][0]
        question_item['question_id'] = data["question"]["id"]
        # The fields read from response.meta were scraped earlier on the
        # question page and passed along by the scheduling callback.
        question_item['question_url'] = response.meta.get("question_url", "")
        question_item['question'] = data["question"]["title"]
        question_item['content'] = response.meta.get("content", "")
        question_item['comments_num'] = response.meta.get("comments_num", "")
        question_item['followers_num'] = response.meta.get("followers_num", "")
        question_item['watch_user_num'] = response.meta.get("watch_user_num", "")
        question_item['question_topics'] = response.meta.get("question_topics", "")
        question_item['question_created_time'] = data["question"]["created"]
        question_item['question_updated_time'] = data["question"]["updated_time"]
        question_item['answer_num'] = response.meta.get("answer_num", "")
        yield question_item
    # Get each author's url_token, build the user profile URL, and parse
    # the answer data itself.
    for answer_data in response_json["data"]:
        answer_item = ZhihuAnswerItem()
        author_id = answer_data["author"]["url_token"]
        if author_id is not None:
            author_url = "https://www.zhihu.com/people/" + author_id + "/answers"
            yield Request(url=author_url, callback=self.parse_user, meta={"author_id": author_id})
        else:
            # Author has no url_token (anonymous): skip this answer entirely.
            continue
        answer_item['question'] = answer_data["question"]["title"]
        answer_item['question_id'] = answer_data["question"]["id"]
        answer_item['answer_id'] = answer_data["id"]
        answer_item['answer_url'] = answer_data["url"]
        answer_item['content'] = answer_data["content"]
        answer_item['author_url_token'] = author_id
        answer_item['vote_num'] = answer_data["voteup_count"]
        answer_item['comments_num'] = answer_data["comment_count"]
        answer_item['answer_created_time'] = answer_data["created_time"]
        answer_item['answer_updated_time'] = answer_data["updated_time"]
        yield answer_item
    if response_json["paging"]["is_end"] == False:
        # The API reports a next page: request it with this same callback.
        next_url = response_json["paging"]["next"]
        if next_url is not None:
            yield Request(url=next_url, callback=self.parse_ans_and_que)
def parse_question(self, response):
    """Parse a question page, yield the question item, then request answers.

    Expects ``response.meta`` to carry "create_time" (epoch seconds) and
    "question_id" from the scheduling callback.
    """
    create_time = response.meta.get("create_time", "")
    question_id = response.meta.get("question_id", "")
    headers = {
        "HOST": "www.zhihu.com",
        # BUG FIX: Referer previously pointed at "https://www.zhizhu.com"
        # (typo for zhihu.com).
        "Referer": "https://www.zhihu.com",
        # BUG FIX: the value used to start with a duplicated "User-Agent: "
        # prefix, producing an invalid UA string.
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0"
    }
    item_loader = ZhihuItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail .RichText.ztext::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("question_id", question_id)
    item_loader.add_css("topics", ".QuestionHeader-topics #null-toggle::text")
    item_loader.add_xpath(
        "attention_num",
        "//div[contains(text(),'关注者')]/following-sibling::strong/text()")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    item_loader.add_xpath("answer_num", "//*[@class='List-headerText']//span/text()[1]")
    item_loader.add_xpath(
        "click_num",
        "//div[contains(text(),'被浏览')]/following-sibling::strong/text()")
    # NOTE(review): time.localtime() raises TypeError if create_time is the
    # "" default — callers appear to always supply an epoch value; confirm.
    item_loader.add_value(
        "create_time", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(create_time)))
    item_loader.add_value("crawl_time", str(datetime.datetime.now()))
    question_item = item_loader.load_item()
    yield question_item
    yield scrapy.Request(
        self.start_answer_url.format(question_id=question_id),
        headers=headers, callback=self.parse_answer)
def parse_question(self, response):
    """Parse question details and schedule a bounded range of answer pages."""
    page = response.text
    item = ZhihuQuestionItem()
    # Each field lives in a <meta itemprop=...> tag in the page head.
    extractors = (
        ('name', r'<meta itemprop="name" content="(.*?)"'),
        ('url', r'<meta itemprop="url" content="(.*?)"'),
        ('keywords', r'<meta itemprop="keywords" content="(.*?)"'),
        ('answer_count', r'<meta itemprop="answerCount" content="(.*?)"'),
        ('comment_count', r'<meta itemprop="commentCount" content="(.*?)"'),
        ('flower_count', r'<meta itemprop="zhihu:followerCount" content="(.*?)"'),
        ('date_created', r'<meta itemprop="dateCreated" content="(.*?)"'),
    )
    for field, pattern in extractors:
        item[field] = re.findall(pattern, page)[0]
    total = int(item['answer_count'])
    yield item
    question_id = int(
        re.match(r'https://www.zhihu.com/question/(\d+)', response.url).group(1))
    # Fetch answers in pages of 20, from the configured offset up to the
    # configured cap (range stop total-19 is equivalent to n + 20 <= total).
    if total > self.answer_count:
        total = self.answer_count
    for offset in range(self.answer_offset, total - 19, 20):
        yield scrapy.Request(self.more_answer_url.format(question_id, offset, offset + 20),
                             headers=self.headers,
                             callback=self.parse_answer)