def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Both the new page layout (detected by the ``QuestionHeader-title`` marker
    in the response body) and the legacy layout are handled.

    Yields:
        A ``scrapy.Request`` for the first answers page (callback
        ``self.parse_answer``), then the loaded ``ZhihuQuestionItem``.
        Nothing is yielded when the URL does not contain a question id.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Without an id neither the item nor the answers URL can be built.
        # Previously this path ended in an unbound-name error.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        # An earlier revision used ".QuestionHeader-actions button::text".
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        # An earlier revision used ".NumberBoard-value::text".
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy page layout.
        # The title text may sit in either an <a> or a <span>, hence XPath.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num",
            "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Handles the new layout (``QuestionHeader-title`` marker present in the
    body) and the legacy layout.

    Yields:
        A ``scrapy.Request`` for the first answers page (callback
        ``self.parse_answer``), then the loaded ``ZhihuQuestionItem``.
        Nothing is yielded when the URL does not contain a question id.
    """
    # The previous pattern used "/d+" (a literal slash and 'd') instead of
    # "\d+", so it could never capture a numeric id.
    match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*', response.url)
    if match_obj is None:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    # NOTE(review): this block uses the field name 'comment_nums' and the
    # attribute 'self.header'; confirm both match the item/spider definitions.
    if 'QuestionHeader-title' in response.text:
        # New page layout.
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comment_nums',
                            '.QuestionHeader-actions button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')
    else:
        # Legacy page layout.
        item_loader.add_xpath(
            'title',
            '//*[@id="zh-question-title"]/h2/span/text()|//*[@id="zh-question-title"]/h2/a/text()'
        )
        item_loader.add_css('content', '#zh-question-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '#zh-question-answer-num::text')
        item_loader.add_css(
            'comment_nums',
            '#zh-question-meta-wrap a[name="addcomment"]::text')
        # The id value must be quoted in XPath, otherwise it is read as a
        # child-element name. The class name typo ("fllowers") is also fixed.
        item_loader.add_xpath(
            'watch_user_num',
            '//*[@id="zh-question-side-header-wrap"]/text()|//*[@class="zh-question-followers-sidebar"]/div/a/strong/text()'
        )
        item_loader.add_css('topics', '.zm-tag-editor-labels a::text')
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.header,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Build a question item from the page and schedule the answers request.

    The question id is read from ``response.meta['question_id']``. The page
    layout (new vs. old) is chosen by looking for ``QuestionHeader-title``
    in the response body.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    qid = int(response.meta.get('question_id'))
    new_layout = "QuestionHeader-title" in response.text

    if new_layout:
        print("This is the new version of Zhihu")
    else:
        print("This is the old version of Zhihu")

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('zhihu_id', qid)

    if new_layout:
        # New layout: every field is a plain CSS lookup.
        css_fields = (
            ('title', "h1.QuestionHeader-title::text"),
            ('content', "div.QuestionHeader-detail"),
            ('answer_num', "h4.List-headerText span::text"),
            ('comments_num', "div.QuestionHeader-Comment button::text"),
            ('watch_user_num', "div.NumberBoard-value::text"),
            ('topics', "div.QuestionHeader-topics .Popover div::text"),
        )
        for field, selector in css_fields:
            loader.add_css(field, selector)
    else:
        # Old layout: title and follower count need XPath unions.
        loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        loader.add_css("content", "#zh-question-detail")
        loader.add_css("answer_num", "#zh-question-answer-num::text")
        loader.add_css("comments_num",
                       "#zh-question-meta-wrap a[name='addcomment']::text")
        loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        loader.add_css("topics", ".zm-tag-editor-labels a::text")

    item = loader.load_item()

    answers_url = self.start_answer_url.format(qid, 20, 0)
    yield scrapy.Request(answers_url,
                         headers=self.header,
                         callback=self.parse_answer)
    yield item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Handles the new layout (``QuestionHeader-title`` marker present in the
    body) and the legacy layout.

    Yields:
        A ``scrapy.Request`` for the first answers page (callback
        ``self.parse_answer``), then the loaded ``ZhihuQuestionItem``.
        Nothing is yielded when the URL does not contain a question id.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Previously this fell through and failed on an unbound question_id.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy page layout. Selector typos fixed: "zh-question-title" is
        # an id, "meat-wrap" -> "meta-wrap", "header-warp" -> "header-wrap".
        item_loader.add_css("title", "#zh-question-title h2 a::text")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num",
            "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_css("watch_user_num",
                            "#zh-question-side-header-wrap::text")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a new-layout Zhihu question page.

    Only pages containing the ``QuestionHeader-title`` marker whose URL
    carries a numeric question id are processed.

    Yields:
        The loaded ``ZhihuQuestionItem``. Nothing is yielded for other pages;
        previously those ended in an unbound-name error.
    """
    if "QuestionHeader-title" not in response.text:
        return
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    # watch_user_num is not collected here (its selector was disabled).
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    # The follow-up answers request is disabled in this variant.
    yield question_item
def parse_question(self, response):
    """Load a question item from the page and request its answers.

    The question id comes from ``response.meta['question_id']``.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    qid = int(response.meta.get("question_id", ""))

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", "div.QuestionHeader-detail")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", qid)
    for field, selector in (
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeaderActions button::text"),
        ("watch_user_num", "strong.NumberBoard-itemValue::attr(title)"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    ):
        loader.add_css(field, selector)
    loader.add_value("crawl_time", datetime.datetime.now())
    item = loader.load_item()

    # NOTE(review): this writes into the shared self.headers dict, so the
    # Referer persists for later requests that reuse it.
    referer = "https://www.zhihu.com/question/" + str(qid)
    self.headers["Referer"] = referer

    answers_url = self.start_answer_url.format(qid, 0, 20)
    yield scrapy.Request(answers_url,
                         headers=self.headers,
                         callback=self.parse_answer)
    yield item
def parse_question(self, response):
    """Extract a question item from the page and request its answers.

    The question id is read from ``response.meta['question_id']``. The item
    is only built when the body contains the ``QuestionHeader-title`` marker.

    :type response: HtmlResponse
    :param response: the downloaded question page
    :return: generator yielding the answers ``scrapy.Request`` and, when the
        page could be parsed, the loaded ``ZhihuQuestionItem``
    """
    question_id = int(response.meta.get('question_id'))

    # Stays None for pages without the expected marker, so the final yield
    # cannot hit an unbound name.
    question_item = None
    if "QuestionHeader-title" in response.text:
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        # The answer count may appear in either of two places.
        item_loader.add_xpath(
            'answer_num',
            '//*[@id="root"]/div/main/div/div[2]/div[1]/div[1]/a/text()|//*[@id="QuestionAnswers-answers"]/div/div/div[1]/h4/span/text()'
        )
        item_loader.add_css('comments_num',
                            '.QuestionHeaderActions button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')
        item_loader.add_css('topics',
                            '.QuestionHeader-topics .Popover div::text')
        question_item = item_loader.load_item()

    # Request the answers of this question.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         callback=self.parse_answer,
                         headers=self.headers)
    if question_item is not None:
        yield question_item
def parse_question(self, response):
    """Extract a question item from the page and request its answers.

    The question id is read from ``response.meta['zhihu_id']``.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    zhihu_id = response.meta.get('zhihu_id', '')

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css('content', '.QuestionHeader-detail')
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', zhihu_id)
    item_loader.add_css('answer_num', 'h4.List-headerText span::text')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', 'strong.NumberBoard-itemValue::text')
    item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    # The item used to be loaded and then dropped. Emit it so the work above
    # is not discarded.
    yield question_item
def parse_question(self, response):
    """Extract a question item from a new-layout Zhihu question page.

    Pages without the ``QuestionHeader-title`` marker are reported as the
    old version and skipped.

    Yields:
        The loaded ``ZhihuQuestionItem``. Nothing is yielded for old-layout
        pages or URLs without a question id; previously both cases ended in
        an unbound-name error.
    """
    if "QuestionHeader-title" not in response.text:
        print("old version , please switch to latest version")
        return

    print("latest version")
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover::text")
    question_item = item_loader.load_item()

    print("question_item: ")
    print(question_item)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``. Nothing is yielded when the
        URL does not contain a question id.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Previously this fell through and failed on an unbound question_id.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail span::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    if question_item.get("content") is None:
        # No question body on the page: store the placeholder string "None"
        # and reload so the item carries a content value.
        item_loader.add_value("content", "None")
        question_item = item_loader.load_item()
        print("sth")

    # Fetch answers. At most 20 can be requested at a time.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    # Yielded items are routed to the pipelines.
    yield question_item
def parse_detail(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``. Nothing is yielded when the
        URL does not contain a question id.
    """
    match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*', response.url)
    if match_obj is None:
        # Previously .group() was called on None, raising AttributeError.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    # Alternative considered: ".QuestionHeader-detail span.RichText::text".
    item_loader.add_css('content', '.QuestionHeader-detail')
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('answer_num', '.List-headerText span::text')
    item_loader.add_css(
        'comments_num', '.QuestionHeader-Comment button.Button--plain::text')
    item_loader.add_css(
        'watch_user_num',
        '.NumberBoard-itemInner strong.NumberBoard-itemValue::attr("title")'
    )
    item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_urls.format(question_id, 3, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Load a question item from the page and request its answers.

    The id is taken from ``response.meta['request_id']``.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    request_id = response.meta.get("request_id")

    # (field, CSS selector) pairs, applied in this order.
    css_fields = (
        ("title", "h1.QuestionHeader-title::text"),
        ('content', ".QuestionHeader-detail div span::text"),
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-Comment button::text"),
        ("watch_user_num",
         ".QuestionFollowStatus-counts button div strong::text"),
        ("click_num", ".QuestionFollowStatus-counts div div strong::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    )

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    for field, selector in css_fields:
        loader.add_css(field, selector)
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", request_id)
    loader.add_value("crawl_time", datetime.datetime.now())
    item = loader.load_item()

    answers_url = self.start_answer_url.format(request_id, 20, 1)
    yield scrapy.Request(answers_url,
                         headers=self.headers,
                         callback=self.parse_answer)
    yield item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Yields:
        A ``scrapy.Request`` for the answers (routed to the downloader, with
        callback ``self.parse_answer``), then the loaded
        ``ZhihuQuestionItem`` (routed to the pipelines). Nothing is yielded
        when the URL does not contain a question id.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Previously this fell through and failed on an unbound question_id.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css(
        "title",
        ".QuestionHeader .QuestionHeader-content .QuestionHeader-main h1.QuestionHeader-title::text"
    )
    item_loader.add_css("content", "div.QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", "h4.List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 5, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    The question id is parsed from the response URL and falls back to ``0``
    when the URL does not match.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    # NOTE(review): with the 0 fallback a non-question URL still triggers an
    # answers request for id 0. Confirm that is intended.
    question_id = int(match_obj.group(2)) if match_obj else 0

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Load a question item from the page and request its answers.

    The question id comes from ``response.meta['question_id']``, defaulting
    to an empty string.

    Yields:
        A ``scrapy.Request`` for the answers API (callback
        ``self.parse_answer``), then the loaded ``ZhihuQuestionItem``.
    """
    qid = response.meta.get('question_id', '')

    loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", ".QuestionHeader-detail")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", qid)
    for field, selector in (
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-Comment span::text"),
        ("watch_user_num",
         ".NumberBoard:first-child .NumberBoard-itemValue::text"),
        ("click_num", ".NumberBoard:last-child .NumberBoard-itemValue::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    ):
        loader.add_css(field, selector)
    item = loader.load_item()

    # Request the answers API.
    yield scrapy.Request(self.start_answer_url.format(qid, 5, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield item
def parse_question(self, response):
    """Extract the question item from a question page and request its answers.

    URLs without ``answer`` in them are treated as the new layout and also
    collect ``comments_num``. All other fields use the same selectors for
    both layouts. The question id is read from
    ``response.meta['question_id']``.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``,
        ``dont_filter=True``), then the loaded ``ZhihuQuestionItem``.
    """
    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    is_new_layout = "answer" not in response.url

    loader.add_css("question_title",
                   ".QuestionHeader .QuestionHeader-title::text")
    # A single ".RichText::text" lookup may miss part of the body, so both
    # the span text and the text of nested elements are collected.
    loader.add_xpath(
        "question_content",
        "//div[@class='QuestionHeader-detail']//span/text() | //div[@class='QuestionHeader-detail']//span//*/text()")
    loader.add_value("question_url", response.url)
    loader.add_value("question_id", int(response.meta.get("question_id", "")))
    loader.add_css("answer_num", ".QuestionMainAction::text")
    if is_new_layout:
        loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    # Both counters read the same selector.
    loader.add_css("attentioned_num", ".NumberBoard-value::text")
    loader.add_css("scanned_num", ".NumberBoard-value::text")
    loader.add_css("question_topics", ".Tag-content .Popover div::text")
    loader.add_value("crawl_time", datetime.datetime.now())

    item = loader.load_item()

    answers_url = self.start_answer_url.format(
        response.meta.get("question_id"), 0, 20)
    yield scrapy.Request(answers_url,
                         headers=self.headers,
                         callback=self.parse_answer,
                         dont_filter=True)
    yield item
def parse_question(self, response):
    """Extract a question item from a new-layout Zhihu question page.

    Only pages containing the ``QuestionHeader-title`` marker whose URL
    carries a numeric question id are processed.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``. Nothing is yielded for other
        pages; previously those ended in an unbound-name error.
    """
    if "QuestionHeader-title" not in response.text:
        return
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_xpath(
        "comments_num", "//*[@class='QuestionHeader-Comment']/button/text()")
    item_loader.add_xpath(
        "watch_user_num",
        "//*[@class='QuestionHeader-follow-status']/div/div/button/div/strong/text()"
    )
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 3, 0),
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``. Nothing is yielded when the
        URL does not contain a question id.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Previously this fell through and failed on an unbound question_id.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    # Title.
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    # Question body.
    item_loader.add_css("content", ".QuestionHeader-detail")
    # Question URL and id.
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    # Number of answers.
    item_loader.add_xpath("answer_num",
                          "//h4[@class='List-headerText']/span//text()")
    # Number of comments.
    item_loader.add_css("comment_num", ".QuestionHeader-Comment button::text")
    # Followers and views (both match this selector).
    item_loader.add_xpath(
        "watch_user_num", "//strong[@class='NumberBoard-itemValue']/text()")
    # Topics the question belongs to.
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(str(question_id), 20, 0),
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract the question item from a question page.

    The question id comes from ``response.meta['question_id']``, defaulting
    to an empty string.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    qid = response.meta.get('question_id', '')

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css('title', 'h1.QuestionHeader-title:nth-child(2)::text')
    loader.add_css('content', '.QuestionHeader-detail')
    loader.add_value('url', response.url)
    loader.add_value('zhihu_id', qid)
    for field, selector in (
        ('answer_num', '.List-headerText > span:nth-child(1)::text'),
        ('comments_num',
         '.QuestionHeader-Comment > button:nth-child(1)::text'),
        ('watch_user_num', '.NumberBoard-itemValue::text'),
        ('topics', '.QuestionHeader-topics .Popover > div::text'),
    ):
        loader.add_css(field, selector)
    item = loader.load_item()

    answers_url = self.start_answer_urls.format(qid, 0)
    yield scrapy.Request(url=answers_url,
                         headers=self.headers,
                         callback=self.parse_answer)
    yield item
def parse_question(self, response):
    """Parse a question detail page and extract the question item.

    Setup note kept from the original: start Chrome manually from its
    install directory with ``chrome.exe --remote-debugging-port=9222``.

    Only the new page layout (``QuestionHeader-title`` marker present) is
    supported. Handling of the old layout has been dropped.

    :param response: the downloaded question page
    :return: generator yielding the answers ``scrapy.Request`` followed by
        the loaded ``ZhihuQuestionItem``; nothing for old-layout pages or
        URLs without a question id (previously an unbound-name error)
    """
    if "QuestionHeader-title" not in response.text:
        # Old layout is no longer handled.
        return
    # Captures the question URL (group 1) and the question id (group 2).
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".ContentItem-actions button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         callback=self.parse_answer)
    # Hand the question data on for storage.
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    The question id is parsed from the response URL and falls back to ``0``
    when the URL does not match.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+)).*", response.url)
    question_id = int(match_obj.group(2)) if match_obj else 0

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", ".QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("question_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText>span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment>button::text")
    item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    # The ".NumberBoard-itemValue" selector feeds both counters into
    # watch_user_num; split them into separate fields.
    # NOTE(review): assumes the loaded value is a list ordered
    # [followers, views]. Confirm against the item's processors.
    counts = question_item.get("watch_user_num") or []
    if len(counts) >= 2:
        question_item["click_num"] = counts[1]
        question_item["watch_user_num"] = counts[0]
    elif counts:
        # Only one counter on the page: keep it as the follower count
        # instead of failing on the missing second entry.
        question_item["watch_user_num"] = counts[0]

    yield scrapy.Request(url=self.answer_start_url.format(question_id, 5, 0),
                         headers=self.headers,
                         cookies=self.cookies1,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Pages containing ``QuestionMainAction`` are treated as the old layout
    (title, content, url and id only). All other pages are treated as the
    new layout.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem`` for the pipelines. Nothing is
        yielded when the URL does not contain a question id.
    """
    # Captures the question URL (group 1) and the question id (group 2).
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Previously this fell through and failed on an unbound question_id.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    # url and zhihu_id are literal values, not selectors. The old-layout
    # branch used to pass them to add_css.
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    if "QuestionMainAction" not in response.text:
        # New layout: the counters and topics are available as well.
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()

    # Parse the answers of this question.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    URLs can also be followed from here, as in ``parse``.

    Yields:
        The loaded ``ZhihuQuestionItem``, then a ``scrapy.Request`` for the
        answers (callback ``self.parse_answer``). Nothing is yielded when the
        URL does not contain a question id.
    """
    # Matches a bare question URL or one followed by "/answer...".
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Previously this fell through and failed on an unbound question_id.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    # title and topics both come from the same JSON attribute. Intended
    # post-processing, with d being the attribute value:
    #   json.loads(d).get('title')
    #   ",".join(t.get('name') for t in json.loads(d)['topics'])
    item_loader.add_css("title",
                        "div[data-zop-question]::attr(data-zop-question)")
    item_loader.add_css("topics",
                        "div[data-zop-question]::attr(data-zop-question)")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    # Intended post-processing: take the first run of digits/commas and
    # strip the commas before converting to int.
    item_loader.add_xpath(
        "answer_num",
        "//*[@class='List-headerText']/span/text()|//*[@class='QuestionMainAction']/text()"
    )
    # Intended post-processing: take the first run of digits as an int.
    item_loader.add_css("comments_num", ".QuestionHeader-Comment .Button::text")
    # Intended post-processing: strip the thousands separators.
    item_loader.add_css("watch_user_num",
                        ".QuestionFollowStatus-counts strong::text")
    # Same selector as watch_user_num; the first match must not be used.
    item_loader.add_css("click_num",
                        ".QuestionFollowStatus-counts strong::text")
    question_item = item_loader.load_item()

    yield question_item
    yield scrapy.Request(
        url=self.start_answer_url.format(question_id, 0, 3),
        headers=self.headers,
        callback=self.parse_answer,
    )
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    The question id is read from ``response.meta['question_id']``. The new
    layout is detected by the ``QuestionHeader-title`` marker in the body.
    Anything else is parsed as the old layout.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    question_id = response.meta.get("question_id")
    item_load = ItemLoader(item=ZhihuQuestionItem(), response=response)

    if "QuestionHeader-title" in response.text:
        # New layout. The title/content selectors were missing the leading
        # "." and so looked for elements named after the class.
        item_load.add_css("title", '.QuestionHeader-title::text')
        item_load.add_css("content", '.QuestionHeader-detail')
        item_load.add_value("url", response.url)
        item_load.add_value("zhihu_id", question_id)
        item_load.add_css("answer_num", '.List-headerText span::text')
        item_load.add_css("comments_num",
                          '.QuestionHeader-actions button::text')
        item_load.add_css("watch_user_num", '.NumberBoard-value::text')
        item_load.add_css("topics",
                          '.QuestionHeader-topics .Popover div::text')
    else:
        # Old layout.
        item_load.add_value("zhihu_id", question_id)
        item_load.add_value("url", response.url)
        # The title may be in a <span> or an <a>; CSS cannot express that
        # alternative, so XPath is used.
        item_load.add_xpath(
            "title",
            '//*[@id="zh-question-title"]/h2/a/text()|//*[@id="zh-question-title"]/h2/span/text()'
        )
        item_load.add_css("content", '#zh-question-detail')
        item_load.add_css("answer_num", '#zh-question-answer-num::text')
        item_load.add_css("comments_num",
                          '#zh-question-meta-wrap a[name="addcomment"]::text')
        # The XPath union already covers "#zh-question-side-header-wrap".
        # A separate CSS lookup of that node collected the value twice.
        item_load.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_load.add_css("topics", '.zm-tag-editor-labels a::text')

    question_item = item_load.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract a question item from the page and request its answers.

    The question id comes from ``response.meta['question_id']``, defaulting
    to an empty string.

    Yields:
        A ``scrapy.Request`` for the answers (callback ``self.parse_answer``),
        then the loaded ``ZhihuQuestionItem``.
    """
    question_id = response.meta.get('question_id', '')

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title',
                        '.QuestionHeader h1.QuestionHeader-title::text')
    item_loader.add_css('topics', '.QuestionTopic .Popover div::text')
    item_loader.add_css('content', '.QuestionHeader-detail span::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_css('answer_num', '.List-headerText span::text ')
    item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    item_loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
    question_item = item_loader.load_item()

    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
    # The trailing no-op statements (a bare ``scrapy.FormRequest`` reference
    # and ``pass``) were removed.
def parse_question(self, response):
    """Extract a question item from a new-layout Zhihu question page.

    Only pages whose HTML contains the ``QuestionHeader-title`` marker and
    whose URL contains ``zhihu.com/question/<digits>`` are processed. Any
    other response yields nothing instead of failing on unbound names.

    Args:
        response: The question page response; ``response.text`` and
            ``response.url`` are read.

    Yields:
        A ``scrapy.Request`` built from ``self.start_answer_url`` formatted
        with ``(question_id, 20, 0)`` and handled by ``self.parse_answer``,
        followed by the loaded ``ZhihuQuestionItem``.
    """
    if 'QuestionHeader-title' not in response.text:
        # Not the new page layout; there are no selectors for it here.
        return

    match_obj = re.match(r'(.*zhihu.com/question/(\d+))(/|$).*', response.url)
    if match_obj is None:
        # Without a numeric id neither the item nor the answer URL can be built.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_xpath(
        'title', '//main//h1[@class="QuestionHeader-title"]//text()')
    item_loader.add_xpath('content', '//div[@class="QuestionHeader-detail"]')
    item_loader.add_value('url', response.url)
    item_loader.add_value('zhihu_id', question_id)
    item_loader.add_xpath('answer_num', '//div[@class="List"]/div/h4/span/text()')
    item_loader.add_xpath(
        'comments_num', '//div[@class="QuestionHeader-Comment"]//button//text()')
    item_loader.add_xpath(
        'watch_user_num', '//strong[@class="NumberBoard-itemValue"]//text()')
    item_loader.add_xpath(
        'topics',
        '//div[@class="QuestionHeader-topics"]//div[@class="Popover"]//text()')
    question_item = item_loader.load_item()

    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item
def parse_question(self, response):
    """Load a ZhihuQuestionItem from the question page and request its answers.

    Args:
        response: The question page response; ``response.meta["id"]`` is
            read for the question id.

    Yields:
        A ``scrapy.Request`` for ``self.start_answer_url`` formatted with the
        question id (callback ``self.parse_answer``), then the loaded item.
    """
    loader = mArticleItemLoader(item=ZhihuQuestionItem(), response=response)

    # (item field, CSS selector) pairs, applied in this order.
    css_rules = (
        ("title", ".QuestionHeader-title::text"),
        ("detail", ".QuestionHeader-detail"),
        ("comment_nums", ".QuestionHeader-Comment button::text"),
        ("attention_nums", ".Button.NumberBoard-item .NumberBoard-value::text"),
        ("watch_nums", "div.NumberBoard-item > div:nth-child(2)::text"),
        ("tags", ".Popover div::text"),
    )
    for field_name, selector in css_rules:
        loader.add_css(field_name, selector)

    # Values that come from the request/response rather than the markup.
    crawl_day = datetime.now().strftime("%Y/%m/%d")
    loader.add_value("crawl_time", crawl_day)
    page_url = response.url
    loader.add_value("url", page_url)
    loader.add_value("object_id", get_md5(response.url))
    loader.add_value("question_id", response.meta.get("id"))

    question = loader.load_item()

    answers_request = scrapy.Request(
        url=self.start_answer_url.format(response.meta.get("id")),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield answers_request
    yield question
def parse(self, response):
    """Begin loading a ZhihuQuestionItem from the response (unfinished).

    Only the ``title`` extraction is defined. The loader is populated but the
    item is neither loaded nor returned.

    Args:
        response: The page response; its text is printed for debugging.
    """
    print(response.text)  # debug dump of the fetched page
    item_loader = CustnomItemLoader(item=ZhihuQuestionItem(), response=response)
    # TODO: add extraction rules for 'zhihu_id' and the remaining item fields.
    # The earlier placeholder add_xpath() calls passed no XPath expression and
    # would raise TypeError (assuming CustnomItemLoader keeps Scrapy's
    # ItemLoader.add_xpath(field_name, xpath) signature - confirm).
    item_loader.add_xpath('title', '//h1[@class="QuestionHeader-title"]')
def parse_question(self, response):
    """Extract a question item from a new-layout Zhihu question page.

    Only pages whose HTML contains the ``QuestionHeader-title`` marker and
    whose URL contains ``zhihu.com/question/<digits>`` are processed. The old
    page layout is not implemented, so such responses yield nothing instead
    of failing on unbound names.

    Args:
        response: The question page response; ``response.text`` and
            ``response.url`` are read.

    Yields:
        A ``scrapy.Request`` built from ``self.start_answer_url`` formatted
        with ``(question_id, 0, 20)`` and handled by ``self.parse_answer``,
        followed by the loaded ``ZhihuQuestionItem``.
    """
    if "QuestionHeader-title" not in response.text:
        # Old page layout: item extraction is not implemented for it.
        return

    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Without a numeric id neither the item nor the answer URL can be built.
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_xpath(
        "title",
        '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[1]/div[1]/h1/text()'
        '|//*[@id="root"]/div/main/article/div[1]/div[3]/div[1]/div/h1/text()',
    )
    # The first alternative previously ended in "text" (a child element named
    # <text>) rather than the text() node test.
    item_loader.add_xpath(
        "content",
        '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[1]/div[1]/div[2]/div/div/div/span/p/text()'
        '|//*[@id="root"]/div/main/article/div[1]/div[3]/div[1]/p/text()',
    )
    # The URL is a literal value, not an XPath expression.
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", "#zh-question-answer-num::text")
    item_loader.add_xpath(
        "comments_num",
        '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[2]/div/div/div[2]/div[1]/button/text()'
        '|//*[@id="root"]/div/main/article/div[1]/div[4]/button[1]/text()',
    )
    item_loader.add_xpath(
        "watch_user_num",
        '//*[@id="root"]/div/main/div/div[1]/div[2]/div/div[1]/div[2]/div/div/div/button/div/strong',
    )
    # This is a CSS selector, so it must go through add_css.
    item_loader.add_css("topics", '.QuestionHeader-topics .Popover::text')
    question_item = item_loader.load_item()

    # NOTE(review): the placeholder order (0, 20) is kept as written; confirm
    # it matches the placeholders of self.start_answer_url.
    yield scrapy.Request(
        self.start_answer_url.format(question_id, 0, 20),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item
def parse_question(self, response):
    """Extract a question item from a Zhihu question page and request its answers.

    Args:
        response: The question page response. ``response.meta["question_id"]``
            is converted with ``int``.

    Yields:
        A ``scrapy.Request`` built from ``self.start_answer_url`` formatted
        with ``(question_id, 5, 0)`` and handled by ``self.parse_answer``,
        followed by the loaded ``ZhihuQuestionItem``.

    Raises:
        ValueError: If ``question_id`` is missing from ``response.meta`` (the
            ``''`` default cannot be converted) or is not numeric.
    """
    question_id = int(response.meta.get('question_id', ''))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    # Field name and selector must be separate arguments; adjacent string
    # literals without a comma are concatenated into a single argument.
    item_loader.add_css('content', 'div.QuestionHeader-detail')
    item_loader.add_value('url', response.url)
    item_loader.add_css('answer_num', 'h4.List-headerText span::text')
    item_loader.add_css('comment_num', '.QuestionHeader-Comment button::text')
    item_loader.add_value('question_id', question_id)
    # Same missing-comma fix; the stray '"' inside the selector is dropped.
    item_loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
    item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    question_item = item_loader.load_item()

    # NOTE(review): this reads self.header (singular) - confirm the spider
    # defines that attribute.
    yield scrapy.Request(
        self.start_answer_url.format(question_id, 5, 0),
        headers=self.header,
        callback=self.parse_answer,
    )
    yield question_item