Ejemplo n.º 1
0
    def parse_answer(self, reponse):
        """Parse a question's answer-list JSON and yield one item per answer."""
        payload = json.loads(reponse.text)
        paging = payload["paging"]

        # Pull the concrete fields out of every answer record.
        for raw in payload["data"]:
            item = ZhihuAnswerItem()
            item["zhihu_id"] = raw["id"]
            item["url"] = raw["url"]
            item["question_id"] = raw["question"]["id"]
            # Anonymous authors have no "id" key.
            item["author_id"] = raw["author"].get("id")
            item["content"] = raw.get("content")
            item["parise_num"] = raw["voteup_count"]
            item["comments_num"] = raw["comment_count"]
            item["create_time"] = raw["created_time"]
            item["update_time"] = raw["updated_time"]
            item["crawl_time"] = datetime.datetime.now()

            yield item

        # Follow the pagination link until the API marks the last page.
        if not paging["is_end"]:
            yield scrapy.Request(paging["next"],
                                 headers=self.headers,
                                 callback=self.parse_answer)
Ejemplo n.º 2
0
    def parse_answer(self, response):
        """Parse the answers JSON of a question and yield one item per answer.

        Also records URLs that came back with an error status for stats
        collection before parsing the body.
        """
        if response.status in self.handle_httpstatus_list:
            self.failed_urls.append(response.url)
            # Stats collection: bump failed_url when status is 403/404/500.
            self.crawler.stats.inc_value("failed_url")

        # Fixed: local was misspelled "answer_dcit".
        answer_dict = json.loads(response.text)
        is_end = answer_dict['paging']['is_end']
        next_url = answer_dict['paging']['next']

        for answer in answer_dict['data']:
            answer_item = ZhihuAnswerItem()
            answer_item["answer_id"] = answer['id']
            answer_item["question_id"] = answer['question']['id']
            answer_item["answer_url"] = answer['url']
            # Anonymous authors carry no "id"; store an empty string then.
            answer_item["author_id"] = answer['author'][
                'id'] if 'id' in answer['author'] else ''
            answer_item["content"] = answer['content']
            answer_item["praise_nums"] = answer['voteup_count']
            answer_item["comment_nums"] = answer['comment_count']
            answer_item["create_time"] = answer['created_time']
            answer_item["update_time"] = answer['updated_time']
            answer_item["crawl_time"] = datetime.datetime.now()
            # (Removed two dead locals that read the question's created/updated
            # timestamps but were never used.)
            yield answer_item

        # Keep paginating until the API reports the last page.
        if not is_end:
            yield scrapy.Request(next_url, callback=self.parse_answer)
Ejemplo n.º 3
0
    def parse_answer(self, reponse):
        """Handle a question's answer JSON: yield items, then chase pagination."""
        data = json.loads(reponse.text)
        more_pages = not data["paging"]["is_end"]  # True while pages remain
        next_page = data["paging"]["next"]

        # Extract the concrete fields of each answer.
        for entry in data["data"]:
            item = ZhihuAnswerItem()  # one item instance per answer
            item["zhihu_id"] = entry["id"]
            item["url"] = entry["url"]
            item["question_id"] = entry["question"]["id"]
            # Possibly an anonymous user, i.e. no "id" key on the author.
            item["author_id"] = entry["author"].get("id")
            item["content"] = entry.get("content")
            item["parise_num"] = entry["voteup_count"]
            item["comments_num"] = entry["comment_count"]
            item["create_time"] = entry["created_time"]
            item["update_time"] = entry["updated_time"]
            item["crawl_time"] = datetime.datetime.now()  # time of the crawl

            yield item  # hand over to pipelines.py

        if more_pages:  # keep requesting while pages remain
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Ejemplo n.º 4
0
    def parse_answer(self, response):
        """Parse the answers API response (JSON) and yield the extracted items."""
        feed = json.loads(response.text)
        paging = feed['paging']

        for raw in feed['data']:
            item = ZhihuAnswerItem()
            item['zhihu_id'] = raw['id']
            item['url'] = raw['url']
            item['question_id'] = raw['question']['id']
            # Anonymous authors expose no 'id' key.
            item['author_id'] = raw['author'].get('id')
            # Fall back to the excerpt when the full content is absent.
            if 'content' in raw:
                item['content'] = raw['content']
            else:
                item['content'] = raw['excerpt']
            item['praise_num'] = raw['voteup_count']
            item['comments_num'] = raw['comment_count']
            item['create_time'] = raw['created_time']
            item['update_time'] = raw['updated_time']
            item['crawl_time'] = time.time()

            yield item

        # The paging block says whether a follow-up request is needed;
        # next_url comes straight from the JSON.
        if not paging['is_end']:
            yield scrapy.Request(paging['next'],
                                 headers=self.header,
                                 callback=self.parse_answer)
Ejemplo n.º 5
0
    def parse_answer(self, response):
        """Yield a ZhihuAnswerItem per answer on this page, then follow pagination."""
        body = json.loads(response.text)
        paging = body['paging']

        for record in body['data']:
            item = ZhihuAnswerItem()
            item['id'] = record['id']
            item['url'] = record['url']
            item['question_id'] = record['question']['id']
            author = record['author']
            # Anonymous users expose neither 'id' nor 'name'.
            item['author_id'] = author.get('id')
            item['author_name'] = author.get('name')
            item['content'] = record.get('content')
            item['approve_num'] = record['voteup_count']
            item['comments_num'] = record['comment_count']
            item['create_time'] = record['created_time']
            item['update_time'] = record['updated_time']
            item['crawl_time'] = int(time.time())
            yield item

        if not paging['is_end']:
            # Propagate the cookie jar so the session survives across pages.
            yield scrapy.Request(
                paging['next'],
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.parse_answer)
Ejemplo n.º 6
0
    def parse_answer(self, response):
        """Parse the JSON answer list and yield one ZhihuAnswerItem per answer."""
        answer_json = json.loads(response.text)
        is_end = answer_json["paging"]["is_end"]
        next_url = answer_json["paging"]["next"]
        # (Removed unused local that read paging["totals"].)

        # Extract the concrete answer data.
        for answer in answer_json["data"]:
            # Fixed: local was misspelled "answer_itme" throughout.
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # Anonymous authors have no "id" key.
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else None
            answer_item[
                "content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()

            yield answer_item

        if not is_end:
            # Fixed: the original passed format(response.meta.get(...), 20, 0)
            # as a second positional argument — format() takes at most two
            # arguments, so this raised TypeError (and would have occupied the
            # callback slot anyway).
            yield scrapy.Request(next_url,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Ejemplo n.º 7
0
    def parse_answer(self, response):
        """Process a question's answers JSON: yield items and paginate."""
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]  # whether more pages remain
        # Fixed: the follow-up URL lives under "next"; the original re-read
        # "is_end" here, so pagination requested a boolean instead of a URL.
        next_url = ans_json["paging"]["next"]

        # Extract the concrete fields of every answer.
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item['zhihu_id'] = answer["id"]
            answer_item['url'] = answer["url"]
            answer_item['question_id'] = answer["question"]["id"]
            # Anonymous askers now come through with id=0; a missing "id"
            # key still maps to None.
            answer_item['author_id'] = answer["author"][
                "id"] if "id" in answer["author"] else None
            answer_item[
                'content'] = answer["content"] if "content" in answer else None
            answer_item['praise_num'] = answer["voteup_count"]
            answer_item['comments_num'] = answer["comment_count"]
            answer_item['create_time'] = answer["created_time"]
            answer_item['update_time'] = answer["updated_time"]
            answer_item['crawl_time'] = datetime.datetime.now()  # current time

            yield answer_item  # hand the item over to the pipeline

        if not is_end:
            yield scrapy.Request(next_url,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Ejemplo n.º 8
0
    def parse_answer(self, response):
        """Parse the answer JSON of a question and yield the extracted items."""
        parsed = json.loads(response.text)
        paging = parsed['paging']

        # Pull the fields of interest out of every answer record.
        for record in parsed['data']:
            item = ZhihuAnswerItem()

            item['zhihu_id'] = record['id']
            item['url'] = record['url']
            item['question_id'] = record['question']['id']
            # Anonymous authors have no 'id' key.
            item['author_id'] = record['author'].get('id')
            # NOTE: the answer body ('content') is deliberately not stored here.
            item['praise_num'] = record['voteup_count']
            item['comments_num'] = record['comment_count']
            item['updated_time'] = record['updated_time']
            item['created_time'] = record['created_time']
            item['crawl_time'] = datetime.datetime.now()

            yield item

        # Fetch the next page until the API flags the end of the list.
        if not paging['is_end']:
            yield scrapy.Request(paging['next'], callback=self.parse_answer)
Ejemplo n.º 9
0
    def parse_answer(self, response):
        """Collect every answer on this page, then request more via the API.

        1. Build one ZhihuAnswerItem per answer record.
        2. Follow the API's "next" link until the list is exhausted.
        """
        page = json.loads(response.text)
        paging = page["paging"]

        for record in page["data"]:
            item = ZhihuAnswerItem()
            item['answer_id'] = record['id']
            item['url'] = record['url']
            item['question_id'] = record['question']['id']
            author = record['author']
            # Anonymous authors expose neither 'id' nor 'name'.
            item['author_id'] = author.get("id")
            item['author_name'] = author.get('name')
            item['content'] = record['content']
            item['agree_num'] = record['voteup_count']
            item['comment_num'] = record['comment_count']
            item['create_time'] = record['created_time']
            item['update_time'] = record['updated_time']

            yield item

        if not paging["is_end"]:
            yield scrapy.Request(paging["next"],
                                 headers=self.headers,
                                 callback=self.parse_answer)
Ejemplo n.º 10
0
    def parse_answer(self, response):
        """Parse the JSON returned by the answers API and extract each answer.

        Fixed: ``json.load`` expects a file-like object, so calling it on the
        ``str`` in ``response.text`` raised AttributeError; ``json.loads`` is
        the correct call.
        """
        answer_json = json.loads(response.text)
        is_end = answer_json["paging"]["is_end"]
        next_url = answer_json["paging"]["next"]

        for answer in answer_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            # The author may be anonymous, in which case there is no id field,
            # so check before reading it.
            answer_item["author_id"] = answer["author"][
                "id"] if "id" in answer["author"] else None
            answer_item[
                "content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            # NOTE(review): assumes `from datetime import datetime` at file
            # top — confirm against the file's imports.
            answer_item["crawl_time"] = datetime.now()
            yield answer_item

        # Keep requesting next_url until the answer list is exhausted.
        if not is_end:
            yield scrapy.Request(next_url,
                                 headers=self.headers,
                                 callback=self.parse_answer)
Ejemplo n.º 11
0
    def parse_answer(self, response):
        """Deserialize the answer-list JSON and yield an item per answer."""
        # The answer list comes back as a JSON string, so decode it first.
        payload = json.loads(response.text)
        paging = payload["paging"]
        last_page = paging["is_end"]  # is this the final page?
        following = paging["next"]    # URL of the next page

        # Extract the answer fields into items.
        for entry in payload["data"]:
            item = ZhihuAnswerItem()
            item["zhihu_id"] = entry["id"]
            item["url"] = entry["url"]
            item["question_id"] = entry["question"]["id"]
            # Anonymous authors lack an "id" key.
            item["author_id"] = entry["author"].get("id")
            item["content"] = entry.get("content")
            item["praise_num"] = entry["voteup_count"]
            item["comments_num"] = entry["comment_count"]
            item["create_time"] = entry["created_time"]
            item["update_time"] = entry["updated_time"]
            # Hand the item to the pipeline.
            yield item

        # Not the last page: schedule the next one.
        if not last_page:
            yield scrapy.Request(following,
                                 headers=self.headers,
                                 callback=self.parse_answer)