Esempio n. 1
0
    def parse_answer(self, response):
        """Turn one answers-API JSON response into ZhihuAnswerItem objects.

        The response body is decoded as JSON and one item is yielded for
        every entry found under its ``"data"`` key.  Timestamps are rendered
        as local-time strings.

        Pagination is deliberately not followed here.  The original author
        noted that a single question can have too many answers and that
        crawling them all slows down question crawling, so only a handful
        are taken (presumably limited by the request URL -- not visible in
        this method).
        """
        timestamp_format = "%Y-%m-%d %H:%M:%S"
        payload = json.loads(response.text)

        for entry in payload["data"]:
            item = ZhihuAnswerItem()

            question = entry["question"]
            item["title"] = question["title"]
            item["answer_id"] = entry["id"]
            item["url"] = entry["url"]
            item["question_id"] = question["id"]
            item["author_id"] = entry["author"]["id"]
            # Strip markup from the answer body before storing it.
            item["content"] = remove_tags(entry["content"])
            item["praise_num"] = entry["voteup_count"]
            item["comments_num"] = entry["comment_count"]

            # Epoch values from the API are converted to local time.
            created = time.localtime(entry["created_time"])
            item["create_time"] = time.strftime(timestamp_format, created)
            updated = time.localtime(entry["updated_time"])
            item["update_time"] = time.strftime(timestamp_format, updated)

            item["crawl_time"] = str(datetime.datetime.now())

            yield item
Esempio n. 2
0
    def parse_ans(self, response):
        """Parse a user's answer-list page and yield a ZhihuAnswerItem per answer.

        Each ``div`` under the ``zh-profile-answer-list`` element is treated
        as one answer record.  The username is taken from the second-to-last
        path segment of ``response.url``.

        If anything goes wrong while extracting records, the raw page body is
        dumped to ``error_pages/answers_<username>.html`` and the error is
        printed; the exception is not re-raised (best-effort scraping).
        """
        selector = Selector(response)
        print(response.url)
        username = response.url.split('/')[-2]
        try:
            for record in selector.xpath(r"id('zh-profile-answer-list')/div"):
                ask_title = ''.join(record.xpath(r"h2/a/text()").extract())
                # Absolute URL of the answer itself.
                url = host + ''.join(record.xpath("h2/a/@href").extract())
                # Everything before '/answer' is the URL of the question.
                ask_url = url.split('/answer')[0]

                agree_num = ''.join(
                    record.xpath('div/div[2]/a/text()').extract())
                summary = ''.join(
                    record.xpath(r"div/div[4]/div/text()").extract()
                ).replace("\n", "").strip()  # TODO
                content = ''.join(
                    record.xpath(r"div/div[4]/textarea/text()").extract()
                ).replace("\n", "").strip()

                # The link text is either the "add comment" label or
                # something like "3 <comments label>".
                comment_num = ''.join(
                    record.xpath(r"div/div[5]/div/a[2]/text()").extract())
                # Keep only the leading token (the number, when present).
                comment_num = comment_num.split(' ')[0]
                if comment_num.startswith(u'\n添加评论'):
                    # No comments yet: the link only shows the "add" label.
                    comment_num = '0'

                yield ZhihuAnswerItem(
                    _id=url, username=username, url=url,
                    ask_title=ask_title, ask_url=ask_url,
                    agree_num=agree_num, summary=summary,
                    content=content, comment_num=comment_num)
        except Exception as e:
            # Save the offending page for later inspection.  The file is
            # opened in binary mode because response.body is raw bytes, and
            # the context manager guarantees the handle is closed.
            dump_path = 'error_pages/answers_' + username + '.html'
            with open(dump_path, 'wb') as dump_file:
                dump_file.write(response.body)
            print('=' * 10 + str(e))
Esempio n. 3
0
    def parse_answer(self, reponse):
        """Handle one page of a question's answers returned as JSON.

        Yields a ZhihuAnswerItem for every entry under ``"data"`` and, unless
        the paging block reports ``is_end``, a follow-up request for the
        ``next`` page that is handled by this same callback.
        """
        payload = json.loads(reponse.text)
        paging = payload["paging"]
        reached_end = paging["is_end"]
        following_page = paging["next"]

        # Extract the individual fields of each answer.
        for entry in payload["data"]:
            item = ZhihuAnswerItem()
            item["zhihu_id"] = entry["id"]
            item["url"] = entry["url"]
            item["question_id"] = entry["question"]["id"]

            # The author block may lack an "id"; store None in that case.
            if "id" in entry["author"]:
                item["author_id"] = entry["author"]["id"]
            else:
                item["author_id"] = None

            # Likewise, "content" is optional on the answer entry.
            if "content" in entry:
                item["content"] = entry["content"]
            else:
                item["content"] = None

            item["parise_num"] = entry["voteup_count"]
            item["comments_num"] = entry["comment_count"]
            item["create_time"] = entry["created_time"]
            item["update_time"] = entry["updated_time"]
            item["crawl_time"] = datetime.datetime.now()

            yield item

        if reached_end:
            return

        yield scrapy.Request(following_page,
                             headers=self.headers,
                             callback=self.parse_answer)
Esempio n. 4
0
    def parse_ans_and_que(self, response):
        """Parse one page of the answers API into question, answer and user work.

        Yields, in order:

        * on the first page only (``paging.is_start``), one ZhihuQuestionItem
          built from the first answer's ``question`` block plus values carried
          in ``response.meta``;
        * for every answer whose author has a ``url_token``, a request for
          that author's profile page (handled by ``self.parse_user``) followed
          by a ZhihuAnswerItem;
        * a request for the next page when ``paging.is_end`` is false and a
          ``next`` URL is present.

        Answers whose author ``url_token`` is ``None`` are skipped entirely.
        """
        response_json = json.loads(response.text)
        paging = response_json["paging"]
        answers = response_json["data"]

        # Question data is taken only from the first entry of the first page.
        # The extra ``answers`` check avoids an IndexError when that page
        # carries no answers at all.
        if paging["is_start"] and answers:
            question_item = ZhihuQuestionItem()
            first_question = answers[0]["question"]
            meta = response.meta
            question_item['question_id'] = first_question["id"]
            question_item['question_url'] = meta.get("question_url", "")
            question_item['question'] = first_question["title"]
            question_item['content'] = meta.get("content", "")
            question_item['comments_num'] = meta.get("comments_num", "")
            question_item['followers_num'] = meta.get("followers_num", "")
            question_item['watch_user_num'] = meta.get("watch_user_num", "")
            question_item['question_topics'] = meta.get("question_topics", "")
            question_item['question_created_time'] = first_question["created"]
            question_item['question_updated_time'] = first_question["updated_time"]
            question_item['answer_num'] = meta.get("answer_num", "")
            yield question_item

        # Build each author's profile URL from the url_token and emit the
        # answer data alongside it.
        for answer_data in answers:
            author_id = answer_data["author"]["url_token"]
            if author_id is None:
                # Without a token no profile URL can be built; skip the answer.
                # NOTE(review): an empty-string token still passes this check
                # -- confirm whether such authors should be skipped too.
                continue

            author_url = "https://www.zhihu.com/people/" + author_id + "/answers"
            yield Request(url=author_url, callback=self.parse_user,
                          meta={"author_id": author_id})

            answer_item = ZhihuAnswerItem()
            answer_item['question'] = answer_data["question"]["title"]
            answer_item['question_id'] = answer_data["question"]["id"]
            answer_item['answer_id'] = answer_data["id"]
            answer_item['answer_url'] = answer_data["url"]
            answer_item['content'] = answer_data["content"]
            answer_item['author_url_token'] = author_id
            answer_item['vote_num'] = answer_data["voteup_count"]
            answer_item['comments_num'] = answer_data["comment_count"]
            answer_item['answer_created_time'] = answer_data["created_time"]
            answer_item['answer_updated_time'] = answer_data["updated_time"]
            yield answer_item

        # Follow pagination while the API reports more pages.
        if not paging["is_end"]:
            next_url = paging["next"]
            if next_url is not None:
                yield Request(url=next_url, callback=self.parse_ans_and_que)
Esempio n. 5
0
    def parse_answer(self, response):
        """Parse the requested range of answers and yield a ZhihuAnswerItem for each.

        The question id is extracted from the numeric tail of the answer's
        ``question.url`` API address.  Both ``http`` and ``https`` addresses
        are accepted and the dots in the host name are matched literally.
        If the URL does not have the expected shape, the ``.group`` call on
        the failed match raises ``AttributeError``.
        """
        # Compiled once per response rather than looked up for every answer.
        question_url_pattern = re.compile(
            r'https?://www\.zhihu\.com/api/v4/questions/(\d+)')
        answers = json.loads(response.text)

        for ans in answers['data']:
            item = ZhihuAnswerItem()
            item['question_id'] = question_url_pattern.match(
                ans['question']['url']).group(1)
            item['author'] = ans['author']['name']
            item['ans_url'] = ans['url']
            item['comment_count'] = ans['comment_count']
            item['upvote_count'] = ans['voteup_count']
            item['excerpt'] = ans['excerpt']

            yield item
Esempio n. 6
0
    def parse_answer(self, response):
        """Parse the requested range of answers and yield a ZhihuAnswerItem for each.

        The public answer URL is rebuilt from the question and answer ids,
        e.g. ``https://www.zhihu.com/question/266730428/answer/314851560``.

        Answers with more upvotes than ``self.setting['MIN_UPVOTE_COUNT']``
        have their content passed through ``self.parse_content``; all other
        answers keep the raw ``content`` value from the API.
        """
        answers = json.loads(response.text)

        for ans in answers['data']:
            item = ZhihuAnswerItem()
            item['answer_id'] = ans['id']
            item['question_id'] = ans['question']['id']
            item['author'] = ans['author']['name']
            item['ans_url'] = ('https://www.zhihu.com/question/'
                               + str(item['question_id'])
                               + '/answer/' + str(item['answer_id']))
            item['comment_count'] = ans['comment_count']
            item['upvote_count'] = ans['voteup_count']
            item['excerpt'] = ans['excerpt']
            if item['upvote_count'] > self.setting['MIN_UPVOTE_COUNT']:
                item['content'] = self.parse_content(ans['content'])
            else:
                # Previously this assignment ran unconditionally and threw
                # away the parsed content computed in the branch above.
                item['content'] = ans['content']
            yield item
Esempio n. 7
0
    def parse(self, response):
        """Convert one JSON page of answers into ZhihuAnswerItem objects.

        Yields one item per entry under ``data``.  When the paging block's
        ``is_end`` equals ``False``, a request for the ``next`` page is
        yielded with this method as its callback.
        """
        payload = json.loads(response.text)
        paging = payload.get('paging')
        is_end = paging.get('is_end')
        next_url = paging.get('next')

        for entry in payload.get('data'):
            item = ZhihuAnswerItem()
            item['id'] = entry.get('id')
            item['question'] = entry.get('question').get('title')

            # The author's headline, when non-empty, is appended to the
            # name in parentheses.
            author = entry.get('author')
            headline = author.get('headline')
            if len(headline) > 0:
                suffix = '(' + headline + ')'
            else:
                suffix = ''
            item['author'] = author.get('name') + suffix

            item['answer'] = entry.get('content')
            item['voteup_count'] = entry.get('voteup_count')

            # NOTE(review): the 'create_at' field is filled from the
            # 'updated_time' value -- confirm this is intended.
            updated = time.localtime(entry.get('updated_time'))
            item['create_at'] = time.strftime('%Y-%m-%d', updated)
            yield item

        # Explicit comparison kept on purpose: a missing (None) 'is_end'
        # must not trigger a follow-up request.
        if is_end == False:
            yield Request(next_url, callback=self.parse)
Esempio n. 8
0
                    for record in selector.xpath(r"id('zh-profile-answer-list')/div"):
                        ask_title = ''.join(record.xpath(r"h2/a/text()").extract())
                        url = host + ''.join(record.xpath("h2/a/@href").extract()) # answer_url
                        ask_url = url.split('/answer')[0]

                        agree_num = ''.join(record.xpath('div/div[2]/a/text()').extract())
                        summary = ''.join(record.xpath(r"div/div[4]/div/text()").extract()).replace("\n","").strip()  #TODO
                        content = ''.join(record.xpath(r"div/div[4]/textarea/text()").extract()).replace("\n","").strip()

                        comment_num = record.xpath(r"div/div[5]/div/a[2]/text()")[1].extract() #'添加评论'或者'3 条评论'
                        comment_num = comment_num.split(' ')[0] #取数字
                        if comment_num.startswith(u'添加评论'):
                            comment_num = '0'

                        yield ZhihuAnswerItem(_id=url,username = username,url = url, ask_title = ask_title, \
                                              ask_url = ask_url, agree_num = agree_num, summary = summary
                                              , content = content, comment_num = comment_num)
                except Exception, e:
                    open('error_pages/answers_' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                    print '='*10 + str(e)

            elif typeinfo.startswith('asks'):
                username = response.url.split('/')[-2]
                try:
                    for record in selector.xpath(r"id('zh-profile-ask-list')/div"):
                        view_num = record.xpath(r'span/div[1]/text()')[0].extract()
                        title = record.xpath(r"div/h2/a/text()")[0].extract()
                        answer_num = record.xpath(r"div/div/span[1]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n','')
                        follower_num = record.xpath(r"div/div/span[2]/following-sibling::text()")[0].extract().split(' ')[0].replace('\n','')
                        url = host+record.xpath(r"div/h2/a/@href")[0].extract()