def parse_question(self, response):
    """Parse a question page and yield a populated ZhihuQuestionItem.

    Handles both the new page layout (identified by the
    "QuestionHeader-title" marker in the body) and the legacy layout.
    """
    # Guard the regex: calling .group() on a failed match raises
    # AttributeError.  Raw string so "\d" is a regex token.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    # The question id column in the database is an int, so convert here.
    question_id = int(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # The title may sit in an <a> or a <span>; the xpath "|" covers both.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # The follower count appears in one of two places depending on the
        # page; the xpath union matches whichever exists.
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    yield question_item
def parse_question(self, response):
    """Parse a question page (new or legacy layout) into a ZhihuQuestionItem.

    Further link extraction could also be done here, but is omitted to keep
    the logic clear.  A yielded item goes to the pipeline; a yielded Request
    is downloaded and handed to its callback.
    """
    question_id = ''
    # Raw string so "\d" is a regex token, not a string escape.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj:
        question_id = int(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # The title may sit in an <a> or a <span>; the xpath "|" covers both.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # The follower count appears in one of two places; the union matches
        # whichever exists.
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    # Only request the answers when an id was actually extracted; formatting
    # the answers-API URL with '' would produce a broken request.
    if question_id != '':
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Parse a question page (new or legacy layout) and request its answers.

    The URL is matched once up front: the original matched it separately in
    each branch, which could leave ``question_id`` unbound at the trailing
    yields when the URL did not look like a question URL.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return  # not a question URL; nothing to extract
    # question_id is an int column in the database, so convert before use.
    question_id = int(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        # Keep the raw html for the content (no ::text pseudo-selector).
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        # Returns a list (follower count plus view count).
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        # Space = descendant combinator, so the div is found at any depth.
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout -- rarely seen any more, kept as a safety net.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", ".zh-question-title h2 a::text")
        # "#" selects by id in css.
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_css(
            "watch_user_num", "#zh-question-side-header-wrap::text")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    # First answers request: (question id, page size, offset).
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Parse a question page (new or legacy layout) and request its answers."""
    # Match the URL once up front; the original re-matched inside each
    # branch and could leave question_id unbound at the final yields.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        # Fixed: the selector lacked its leading "." (and the class-name
        # capitalization), so it could never match the class.
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        # NOTE(review): sibling spiders use ".Popover div::text" here --
        # confirm whether the topic text sits in a nested div.
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # The title may sit in an <a> or a <span>; the xpath "|" covers both.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # Fixed: the attribute values in these xpath predicates were
        # unquoted, which is invalid xpath and would fail to evaluate.
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Parse a question page (new or legacy layout) and yield the item.

    Fixes over the original: ``ad_value`` -> ``add_value`` (AttributeError
    at runtime), the comments selector was missing its leading ".", and the
    loaded item was silently discarded (the function ended with ``pass``).
    """
    # Match the URL once up front to avoid an unbound question_id.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        # NOTE(review): sibling spiders use ".Popover div::text" here --
        # confirm whether the topic text sits in a nested div.
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", ".zh-question-title h2 a::text")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_css("watch_user_num",
                            "#zh-question-side-header-wrap::text")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    # Hand the item to the pipeline (the original built it and dropped it).
    yield question_item
def parse_question(self, response):
    """Extract the question fields from a zhihu question page.

    The question id arrives via the request meta; after loading the item,
    the first answers-API request is issued.
    """
    question_id = int(response.meta.get("question_id", ""))

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", ".QuestionHeader-content h1::text")
    loader.add_css("content", ".QuestionHeader-detail span::text")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", question_id)
    # NOTE(review): watcher_num and click_num share the same selector, so
    # both receive the same value list -- confirm post-processing splits it.
    counts_css = ".NumberBoard.QuestionFollowStatus-counts .NumberBoard-value::text"
    loader.add_css("watcher_num", counts_css)
    loader.add_css("answer_num", ".List-headerText span::text")
    loader.add_css("click_num", counts_css)
    loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = loader.load_item()

    # Request the answers: (question id, page size, first offset).
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Parse a new-layout question page and request its answers.

    Uses separate selectors for the follower count and the view count
    (both live under .QuestionFollowStatus-counts).
    """
    # Raw string so "\d" is a regex token, not a string escape.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    question_id = ''
    if match_obj:
        question_id = int(match_obj.group(2))
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    item_loader.add_css(
        "watch_user_num", ".QuestionFollowStatus-counts button div strong::text")
    item_loader.add_css(
        "follow_user_num", ".QuestionFollowStatus-counts div div strong::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()
    # Only request the answers when an id was actually extracted; formatting
    # the answers-API URL with '' would produce a broken request.
    if question_id != '':
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             cookies=self.cookies,
                             headers=zhihuLogin.headers,
                             callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Build a ZhihuQuestionItem from a question page using xpath selectors."""
    question_id = response.meta.get('question_id')

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_xpath(
        'title',
        "//div[@class='QuestionHeader']//h1[@class='QuestionHeader-title']/text()"
    )
    # Content extraction is intentionally disabled for now.
    loader.add_value('url', response.url)
    loader.add_value('zhihu_id', question_id)
    loader.add_xpath('answer_num', "//h4[@class='List-headerText']/span/text()")
    loader.add_xpath('comments_num',
                     "//div[@class='QuestionHeader-Comment']/button/text()")
    loader.add_xpath('watch_user_num',
                     "//div[@class='NumberBoard-value']/text()")
    loader.add_xpath('topics', "//a[@class='TopicLink']/div/div/text()")
    question_item = loader.load_item()

    # First answers request: (question id, offset).
    yield scrapy.Request(self.start_answer_url.format(question_id, 0),
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Parse a new-layout question page and follow up with its answers.

    Pages without the "QuestionHeader-title" marker (legacy layout) are
    skipped entirely; nothing is yielded for them.
    """
    if "QuestionHeader-title" in response.text:
        # Raw string so "\d" is a regex token, not a string escape.
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                     response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")
            question_item = item_loader.load_item()
            # First answers request: (question id, page size, offset).
            yield scrapy.Request(
                self.start_answer_url.format(question_id, 20, 0),
                headers=self.headers,
                callback=self.parse_answer)
            yield question_item
def parse_question(self, response):
    """Parse a question page into a ZhihuQuestionItem and fetch its answers.

    Everything is guarded by the URL match so ``question_id`` can never be
    referenced unbound; if the URL is not a question URL nothing is yielded.
    """
    # Raw string so "\d" is a regex token, not a string escape.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj:
        question_id = int(match_obj.group(2))
        item_loader = ZhihuQuestionItemLoader(item=ZhihuQuestionItem(),
                                              response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value('url', response.url)
        item_loader.add_value("id", question_id)
        item_loader.add_css('answer_num', ".List-headerText span::text")
        item_loader.add_css('comments_num',
                            ".QuestionHeader-Comment button::text")
        item_loader.add_css('follow_num', ".NumberBoard-itemValue::text")
        item_loader.add_css('view_num',
                            ".NumberBoard-itemValue:nth-child(2)::text")
        item_loader.add_css('topics',
                            ".QuestionHeader-topics .Popover div::text")
        item_loader.add_value(
            'crawl_time',
            datetime.datetime.now().strftime(SQL_DATETIME_FORMAT))
        question_item = item_loader.load_item()

        # Reuse the logged-in session cookies via the cookiejar meta key.
        yield scrapy.Request(
            self.start_answer_url.format(question_id, 20, 0),
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_answer)
        yield question_item
def parse_question(self, response):
    """Parse a new-layout question page; the legacy layout is unsupported.

    Further url extraction and following (as in ``self.parse``) could be
    done here too, but is deliberately omitted.
    """
    question_id = int(response.meta.get("question_id", ""))
    if "QuestionHeader-title" in response.text:
        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # Title as text; content keeps the raw html (no ::text).
        loader.add_css("title", "h1.QuestionHeader-title::text")
        loader.add_css("content", ".QuestionHeader-detail")
        loader.add_value("url", response.url)
        loader.add_value("zhihu_id", question_id)
        loader.add_css("answer_num", ".List-headerText span::text")
        loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        # One selector captures the follower count and the view count
        # together as a list.
        loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        # css space = descendant (">" would mean direct child); note the
        # extra div nested under .Popover.
        loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        question_item = loader.load_item()
    else:
        # Legacy layout: intentionally not handled.
        raise NotImplementedError
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Extract the question detail fields, then request the answers.

    Guarded: the original called ``.group(2)`` directly on the match, which
    raises AttributeError when the URL is not a question URL.
    """
    # Raw string so "\d" is a regex token, not a string escape.
    match_obj = re.match(r"(.*zhihu.com/question/?(\d+)).*", response.url)
    if not match_obj:
        return
    question_id = match_obj.group(2)

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value('question_id', question_id)
    item_loader.add_value('url', response.url)
    item_loader.add_css('title', 'h1.QuestionHeader-title::text')
    item_loader.add_css(
        'content', '.QuestionRichText.QuestionRichText--expandable span')
    item_loader.add_css(
        'topics', '.QuestionHeader-topics .Tag-content .Popover div::text')
    item_loader.add_css('answer_num', '.List-headerText span::text')
    # NOTE(review): follow_num and browse_num share one selector, so both
    # get the same value list -- confirm the output processors split them.
    item_loader.add_css(
        'follow_num',
        '.QuestionFollowStatus-counts .NumberBoard-itemValue::text')
    item_loader.add_css(
        'browse_num',
        '.QuestionFollowStatus-counts .NumberBoard-itemValue::text')
    item_loader.add_css('comment_num', ".QuestionHeader-Comment button::text")
    question_item = item_loader.load_item()

    yield question_item
    # First answers request: (question id, page size, offset).
    yield scrapy.Request(self.answer_start_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
def parse_question(self, response):
    """Parse a question page (new or legacy layout) into a ZhihuQuestionItem.

    The item fields are declared in items.py.  After loading the item, the
    first answers-API request is issued (20 answers from offset 0), and the
    item itself is sent on to the pipeline.
    """
    # Extract zhihu_id from the URL once, up front (it could also be passed
    # via meta).  The database column is an int, hence int().  Without a
    # match we can neither fill the item nor build the answers URL, so bail
    # out instead of crashing later on an unbound name.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        # A plain space means "any descendant"; writing
        # ".QuestionHeader-topics > .Popover" would restrict the search to
        # direct children.
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout.
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        # The title may sit in an <a> or in a <span>; xpath "|" covers both.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        # In css, "#" selects by id and "." by class.
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # The follower count appears in one of two places; the xpath union
        # matches whichever exists (this supersedes the old
        # "#zh-question-side-header-wrap::text" css selector).
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    # Request 20 answers starting from offset 0.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    # Items yielded here are routed to pipelines.py.
    yield question_item
def parse_questions(self, response):
    """Parse a question page, fill a ZhihuQuestionItem, and request answers.

    Numeric fields extracted with ``.re(...)`` are guarded: the original
    indexed ``[0]`` / ``[1]`` directly, which raises IndexError whenever the
    element is missing from the page; a missing counter now yields 0.
    """
    zhihu_question_item = ZhihuQuestionItem()

    # Pull the raw values out of the response.
    url = response.url
    zhihu_id = response.meta.get('zhihu_id')
    topic = ','.join(
        response.css('.QuestionHeader-topics .Popover div::text').extract())
    title = response.css(
        '.QuestionHeader h1.QuestionHeader-title::text').extract_first('')
    content = response.css('.QuestionRichText span::text').extract_first('')
    answer_num = int(
        response.css('.List-headerText span::text').extract_first(0))
    comment_matches = response.css(
        '.QuestionHeader-Comment button::text').re(r'(\d+)')
    comments_num = int(comment_matches[0]) if comment_matches else 0
    # Follower count and view count share the NumberBoard selector: the
    # first number is followers, the second is views.
    board_matches = response.css('.NumberBoard-value::text').re(r'(\d+)')
    watch_user_num = int(board_matches[0]) if board_matches else 0
    click_num = int(board_matches[1]) if len(board_matches) > 1 else 0

    # Fill the item with the extracted values.
    zhihu_question_item['url'] = url
    zhihu_question_item['zhihu_id'] = zhihu_id
    zhihu_question_item['topic'] = topic
    zhihu_question_item['title'] = title
    zhihu_question_item['content'] = content
    zhihu_question_item['answer_num'] = answer_num
    zhihu_question_item['comments_num'] = comments_num
    zhihu_question_item['watch_user_num'] = watch_user_num
    zhihu_question_item['click_num'] = click_num

    # Trigger the first answers request for this question, offset 0.
    yield scrapy.Request(url=self.start_answers_url.format(zhihu_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answers)
    # Send the item to the pipeline.
    yield zhihu_question_item
def parse_question(self, response):
    """Build a ZhihuQuestionItem from a question page, then fetch answers."""
    question_id = response.meta.get('question_id')

    loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css('title', 'h1.QuestionHeader-title::text')
    loader.add_css('content', '.QuestionHeader-detail')
    loader.add_value('url', response.url)
    loader.add_value('zhihu_id', question_id)
    loader.add_css('answer_num', '.List-headerText span::text')
    loader.add_css(
        'comments_num',
        '.QuestionHeaderActions .QuestionHeader-Comment button::text')
    loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
    loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    question_item = loader.load_item()

    # First answers request: (question id, page size, offset).
    yield scrapy.Request(url=self.start_answer_url.format(question_id, 20, 0),
                         headers=self.header,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Load the question fields into a ZhihuQuestionItem and emit it."""
    loader = ZhihuItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css('title', '.QuestionHeader-main h1::text')
    loader.add_css('content', '.QuestionRichText span::text')
    loader.add_value('url', response.url)
    # The question id was stashed in the request meta by the caller.
    loader.add_value('zhihu_id', response.meta.get('question_id'))
    loader.add_css('answer_num', '.List-headerText span::text')
    loader.add_css('comments_num', '.QuestionHeader-Comment .Button::text')
    loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    loader.add_css('watch_user_num',
                   '.QuestionFollowStatus .Button .NumberBoard-value::text')
    loader.add_css('click_num',
                   '.QuestionFollowStatus .NumberBoard-item .NumberBoard-value::text')
    loader.add_value('crawl_time', datetime.now())

    yield loader.load_item()
def parse_question(self, response):
    """Parse a question page into a ZhihuQuestionItem, tracking failed URLs."""
    # Stats collection: when the status is one of the explicitly handled
    # codes (403/404/500), remember the URL and bump the failed_url counter.
    if response.status in self.handle_httpstatus_list:
        self.failed_urls.append(response.url)
        self.crawler.stats.inc_value("failed_url")

    question_id = int(response.meta.get("question_id"))
    title = response.css('.QuestionHeader-title::text').extract_first('')
    topics = '/'.join(
        response.css('meta[itemprop="keywords"]::attr(content)').extract())
    content = response.css(
        '.QuestionRichText--collapsed div span::text').extract_first('')
    answer_nums = extract_nums(
        response.css('.List-headerText span::text').extract_first(''))
    comment_nums = extract_nums(
        response.css('.QuestionHeader-Comment button::text').extract_first(''))
    watch_user_nums = extract_nums(
        response.css('.NumberBoard-itemValue::text').extract_first(''))
    # NOTE(review): indexing [1] raises IndexError when fewer than two
    # NumberBoard values are present -- confirm the page always has both.
    click_nums = extract_nums(
        response.css('.NumberBoard-itemValue::text').extract()[1])

    question_item = ZhihuQuestionItem()
    question_item["question_id"] = question_id
    question_item["topics"] = topics
    question_item["question_url"] = response.url
    question_item["title"] = title
    question_item["content"] = content
    question_item["answer_nums"] = answer_nums
    question_item["comment_nums"] = comment_nums
    question_item["watch_user_nums"] = watch_user_nums
    question_item["click_nums"] = click_nums
    question_item["crawl_time"] = datetime.datetime.now()

    yield question_item
    # First answers request: (question id, page size 5, offset 0).
    yield scrapy.Request(self.start_answer_url.format(question_id, 5, 0),
                         callback=self.parse_answer)
def parse_question(self, response):
    """Fill a ZhihuQuestionItem; only fetch answers when the question has any."""
    zhihu_id = response.meta.get('question_id', '')

    loader = ItemLoader(ZhihuQuestionItem(), response=response)
    loader.add_value('zhihu_id', zhihu_id)
    loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
    loader.add_value('url', response.url)
    loader.add_css('title', '.QuestionHeader-title::text')
    loader.add_css('content', '.QuestionHeader-detail')
    # The answer count lives in one of two page styles; the xpath "|"
    # union matches whichever markup is present.
    loader.add_xpath(
        'answer_num',
        '//*[@class="List-headerText"]/span/text()|//a[@class="QuestionMainAction"]/text()'
    )
    loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
    loader.add_css('watch_user_num',
                   '.QuestionFollowStatus .NumberBoard-itemValue::text')
    loader.add_css('click_num',
                   '.QuestionFollowStatus .NumberBoard-itemValue::text')
    question_item = loader.load_item()

    if 'answer_num' in question_item:
        # The question has answers: request the answer listing.
        yield scrapy.Request(self.answer_url.format(zhihu_id, 0),
                             headers=self.header,
                             callback=self.parse_answer)
    else:
        # No answers at all: record an explicit zero.
        question_item['answer_num'] = ['0']
    # Scrapy routes yielded items to the pipeline automatically and
    # schedules yielded Requests for download.
    yield question_item
def parse_question(self, response):
    """Load a ZhihuQuestionItem from a question page and fetch its answers."""
    zhihu_id = response.meta.get("question_id", "")

    loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", ".QuestionHeader-detail")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", zhihu_id)
    loader.add_css("answer_num", ".List-headerText span::text")
    loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
    loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
    loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = loader.load_item()

    # First answers request: (question id, page size, offset).
    yield scrapy.Request(self.start_answer_url.format(zhihu_id, 20, 0),
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_question(self, response):
    """Parse a question page (new or legacy layout) and request its answers.

    Further <a>-tag extraction and following could optionally be done here
    as well.  Fixes over the original: the URL is matched once up front
    (the new-layout branch could leave ``question_id`` unbound when the URL
    did not match), and a no-op ``question_item.get("title")`` statement
    was removed.
    """
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = str(match_obj.group(2))
    if "QuestionHeader-title" in response.text:
        # New page layout.
        item_loader = XiaojianrenItemLoader(item=ZhihuQuestionItem(),
                                            response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content",
                            '.QuestionRichText--collapsed div span::text')
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        # One call pulls out the whole column of counter values.
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()
    else:
        # Legacy page layout -- probably gone by now, kept as a safety net.
        item_loader = XiaojianrenItemLoader(item=ZhihuQuestionItem(),
                                            response=response)
        # The title may sit in an <a> or a <span>; the xpath "|" covers both.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css(
            "comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        # The follower count appears in one of two places; the xpath union
        # matches whichever exists.
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()
    # Request the backend answers API for this question: (id, size, offset);
    # pass the title/url along in meta for the answer items.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers,
                         meta={
                             "title": question_item.get("title"),
                             "question_url": question_item.get("url")
                         },
                         callback=self.parse_answer)
    yield question_item