Example #1
0
    def parse_question(self, response):
        """
        Parse a question page into a ZhihuQuestionItem.

        Supports both the new layout (detected via the "QuestionHeader-title"
        marker in the page body) and the legacy layout, then schedules the
        first answers-API request for this question.
        """
        # The question id lives in the URL for both layouts, so extract it
        # once up front (raw string avoids the invalid \d escape warning).
        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        # Guard against an unexpected URL: previously question_id could be
        # referenced while unbound below, raising NameError.
        question_id = int(match_obj.group(2)) if match_obj else None

        item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                 response=response)
        if "QuestionHeader-title" in response.text:
            # New page layout.
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num",
                                ".QuestionHeader-Comment button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics",
                                ".QuestionHeader-topics .Popover div::text")
        else:
            # Legacy page layout.
            item_loader.add_xpath(
                "title",
                "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()"
            )
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css(
                "comments_num",
                "#zh-question-meta-wrap a[name='addcomment']::text")
            item_loader.add_xpath(
                "watch_user_num",
                "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()"
            )
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
Example #2
0
    def parse_question(self, response):
        """Parse a question page (new layout) into a ZhihuQuestionItem and
        schedule the first answers-API request."""
        if "QuestionHeader-title" not in response.text:
            # Not the expected layout. The original fell through to
            # reference question_id/question_item while unbound and crashed
            # with NameError; skip the page instead.
            return

        match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*",
                             response.url)
        # Guard: without a match, question_id was previously unbound.
        question_id = int(match_obj.group(2)) if match_obj else None

        item_loader = ItemLoader(item=ZhihuQuestionItem(),
                                 response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics",
                            ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                             headers=self.headers,
                             callback=self.parse_answer)
        yield question_item
Example #3
0
    def parse_questions(self, response):
        """Build a ZhihuQuestionItem from a question page, then request the
        first page of its answers."""
        question_id = response.meta['question_id']

        loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        loader.add_value('q_id', question_id)
        loader.add_value('q_url', response.url)
        loader.add_css('q_title', '.QuestionHeader-tags+h1::text')
        loader.add_css(
            'q_content',
            '.QuestionRichText.QuestionRichText--collapsed span::text')
        loader.add_css('q_topic', '.Tag.QuestionTopic .Popover div::text')
        loader.add_css('q_answers_num', '.List-headerText span::text')
        loader.add_xpath(
            'q_follower',
            '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/button//strong/text()'
        )
        loader.add_xpath(
            'q_watcher',
            '//div[@class="NumberBoard QuestionFollowStatus-counts NumberBoard--divider"]/div//strong/text()'
        )
        loader.add_value('crawl_time', datetime.datetime.now())
        question = loader.load_item()

        # First answers page: 15 answers per page, starting at offset 0.
        yield Request(url=self.temp_answers_url[0].format(question_id, 15, 0),
                      callback=self.parse_answers,
                      headers=self.headers,
                      meta={'q_id': question_id})

        yield question
Example #4
0
    def parse_question(self, response):
        """Parse question meta details and schedule a bounded range of
        answer-page requests."""
        item = ZhihuQuestionItem()

        # Fields come from the page's <meta itemprop="..."> tags;
        # extract()[0] raises IndexError if a tag is missing — assumed
        # always present on a question page (TODO confirm).
        item['name'] = response.xpath('//meta[@itemprop="name"]/@content').extract()[0]
        item['url'] = response.xpath('//meta[@itemprop="url"]/@content').extract()[0]
        item['keywords'] = response.xpath('//meta[@itemprop="keywords"]/@content').extract()[0]
        item['answer_count'] = response.xpath('//meta[@itemprop="answerCount"]/@content').extract()[0]
        item['comment_count'] = response.xpath('//meta[@itemprop="commentCount"]/@content').extract()[0]
        item['flower_count'] = response.xpath('//meta[@itemprop="zhihu:followerCount"]/@content').extract()[0]
        # Keep only "YYYY-MM-DDTHH:MM:SS" and normalize to a space-separated
        # timestamp.
        item['date_created'] = response.xpath('//meta[@itemprop="dateCreated"]/@content').extract()[0][0:19].replace('T', ' ')

        count_answer = int(item['answer_count'])

        question_id = int(re.match(r'https://www.zhihu.com/question/(\d+)', response.url).group(1))

        item['question_id'] = question_id

        yield item

        # Fetch answers in pages of 20, starting at answer_offset and
        # capped at answer_count.
        if count_answer > self.answer_count:
            count_answer = self.answer_count
        n = self.answer_offset
        while n + 20 <= count_answer:
            # BUG FIX: the range end passed to the API was n + 10 while n
            # advanced by 20, silently skipping half of every page.
            yield scrapy.Request(self.more_answer_url.format(question_id, n, n + 20),
                                 headers=self.headers,
                                 cookies=self.cookies,
                                 callback=self.parse_answer)
            n += 20
Example #5
0
 def parse_question(self, response):
     """Collect question fields from the page into a ZhihuQuestionItem."""
     loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
     loader.add_value('zhihu_id', response.meta.get('question_id', ''))
     loader.add_value('url', response.url)
     loader.add_xpath('title', '//h1[@class="QuestionHeader-title"]/text()')
     loader.add_xpath('content', '//span[@class="RichText"]/text()')
     loader.add_css('answer_num', '.List-headerText span::text')
     loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
     loader.add_css('watch_user_num', '.NumberBoard-value::text')
     loader.add_xpath('topics', '//div[@class="Popover"]/div/text()')
     yield loader.load_item()
Example #6
0
    def get_question_url(self, response):
        """Emit a ZhihuQuestionItem for every entry in the API payload and
        follow each question's answers URL."""
        payload = json.loads(response.text)
        for entry in payload.get('data'):
            qid = entry.get('id')

            question = ZhihuQuestionItem()
            question['id'] = qid
            question['title'] = entry.get('title')
            question['answer_count'] = entry.get('answer_count')
            question['follower_count'] = entry.get('follower_count')
            yield question

            yield Request(url=self.anwser_url.format(qid),
                          callback=self.parse)
Example #7
0
    def parse_ans_and_que(self, response):
        """
        Parse one page of the answers API: on the first page, emit the
        question item; for every answer, emit an answer item and a request
        for the author's profile page; finally follow pagination.
        """
        response_json = json.loads(response.text)

        if response_json["paging"]["is_start"]:
            # Only the first API page carries the question data we need,
            # taken from its first answer entry.
            question_item = ZhihuQuestionItem()
            data = response_json["data"][0]
            question_item['question_id'] = data["question"]["id"]
            question_item['question_url'] = response.meta.get("question_url", "")
            question_item['question'] = data["question"]["title"]
            question_item['content'] = response.meta.get("content", "")
            question_item['comments_num'] = response.meta.get("comments_num", "")
            question_item['followers_num'] = response.meta.get("followers_num", "")
            question_item['watch_user_num'] = response.meta.get("watch_user_num", "")
            question_item['question_topics'] = response.meta.get("question_topics", "")
            question_item['question_created_time'] = data["question"]["created"]
            question_item['question_updated_time'] = data["question"]["updated_time"]
            question_item['answer_num'] = response.meta.get("answer_num", "")
            yield question_item

        # Build one answer item per entry and schedule the author's
        # profile page.
        for answer_data in response_json["data"]:
            author_id = answer_data["author"]["url_token"]
            if author_id is None:
                # Anonymous author: no profile URL can be built, and the
                # answer item is skipped (matching prior behavior).
                continue
            author_url = "https://www.zhihu.com/people/" + author_id + "/answers"
            yield Request(url=author_url, callback=self.parse_user,
                          meta={"author_id": author_id})

            # Allocate the item only for answers we actually emit.
            answer_item = ZhihuAnswerItem()
            answer_item['question'] = answer_data["question"]["title"]
            answer_item['question_id'] = answer_data["question"]["id"]
            answer_item['answer_id'] = answer_data["id"]
            answer_item['answer_url'] = answer_data["url"]
            answer_item['content'] = answer_data["content"]
            answer_item['author_url_token'] = author_id
            answer_item['vote_num'] = answer_data["voteup_count"]
            answer_item['comments_num'] = answer_data["comment_count"]
            answer_item['answer_created_time'] = answer_data["created_time"]
            answer_item['answer_updated_time'] = answer_data["updated_time"]
            yield answer_item

        if not response_json["paging"]["is_end"]:
            # Follow the pagination cursor to the next API page.
            next_url = response_json["paging"]["next"]
            if next_url is not None:
                yield Request(url=next_url, callback=self.parse_ans_and_que)
Example #8
0
    def parse_question(self, response):
        """Parse a question page into a ZhihuQuestionItem, then request its
        answers via the API."""
        create_time = response.meta.get("create_time", "")
        question_id = response.meta.get("question_id", "")
        headers = {
            "HOST": "www.zhihu.com",
            # BUG FIX: was "https://www.zhizhu.com" — a typo for zhihu.com.
            "Referer": "https://www.zhihu.com",
            # BUG FIX: the value previously embedded a duplicate
            # "User-Agent: " prefix, producing a malformed header value.
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0"
        }
        item_loader = ZhihuItemLoader(item=ZhihuQuestionItem(),
                                      response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content",
                            ".QuestionHeader-detail .RichText.ztext::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("question_id", question_id)
        item_loader.add_css("topics",
                            ".QuestionHeader-topics #null-toggle::text")
        item_loader.add_xpath(
            "attention_num",
            "//div[contains(text(),'关注者')]/following-sibling::strong/text()")
        item_loader.add_css("comments_num",
                            ".QuestionHeader-Comment button::text")
        item_loader.add_xpath("answer_num",
                              "//*[@class='List-headerText']//span/text()[1]")
        item_loader.add_xpath(
            "click_num",
            "//div[contains(text(),'被浏览')]/following-sibling::strong/text()")
        # NOTE(review): create_time defaults to "" but time.localtime("")
        # raises TypeError — looks like callers always supply a numeric
        # epoch via meta; confirm.
        item_loader.add_value(
            "create_time",
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(create_time)))
        item_loader.add_value("crawl_time", str(datetime.datetime.now()))
        question_item = item_loader.load_item()

        yield question_item
        yield scrapy.Request(
            self.start_answer_url.format(question_id=question_id),
            headers=headers,
            callback=self.parse_answer)
Example #9
0
    def parse_question(self, response):
        """Parse question details from the page's meta tags and fetch
        answers in the configured range."""
        page = response.text
        item = ZhihuQuestionItem()

        # Every field is pulled from a <meta itemprop="..."> tag; [0]
        # raises IndexError if the tag is absent, matching prior behavior.
        meta_props = (
            ('name', 'name'),
            ('url', 'url'),
            ('keywords', 'keywords'),
            ('answer_count', 'answerCount'),
            ('comment_count', 'commentCount'),
            ('flower_count', 'zhihu:followerCount'),
            ('date_created', 'dateCreated'),
        )
        for field, prop in meta_props:
            pattern = r'<meta itemprop="' + prop + r'" content="(.*?)"'
            item[field] = re.findall(pattern, page)[0]

        total_answers = int(item['answer_count'])
        yield item

        question_id = int(
            re.match(r'https://www.zhihu.com/question/(\d+)',
                     response.url).group(1))

        # Cap at the configured maximum, then request full pages of 20
        # answers starting from the configured offset.
        if total_answers > self.answer_count:
            total_answers = self.answer_count
        offset = self.answer_offset
        while offset + 20 <= total_answers:
            yield scrapy.Request(self.more_answer_url.format(
                question_id, offset, offset + 20),
                                 headers=self.headers,
                                 callback=self.parse_answer)
            offset += 20