Example #1
0
    def user_parse(self, response):
        """Parse a user's profile JSON and emit the item plus follow-up requests.

        Yields:
            ZhihuItem: the raw profile dict stored under the ``user`` key.
            scrapy.Request: requests for the user's followees and followers,
                issued only when the respective counts are positive.
        """
        infos = json.loads(response.body)
        url_token = infos['url_token']

        user_item = ZhihuItem()
        user_item['user'] = infos
        yield user_item

        # Request the people this user follows (higher priority than followers).
        following_count = infos['following_count']
        if following_count > 0:
            yield scrapy.Request(self.user_followees_url.format(url_token),
                                 headers=self.headers,
                                 priority=10,
                                 callback=self.following_parse,
                                 meta={'children_url_token': url_token})

        # Request this user's followers.
        follower_count = infos['follower_count']
        if follower_count > 0:
            yield scrapy.Request(self.user_followers_url.format(url_token),
                                 headers=self.headers,
                                 priority=5,
                                 callback=self.followers_parse,
                                 meta={
                                     'parent_url_token': url_token,
                                 })
Example #2
0
 def parse(self, response):
     """Parse one page of the member-list API.

     Builds a partially filled ZhihuItem per member and requests that
     member's "following questions" page, carrying the item in the
     request meta for the next callback.
     """
     print(response.url)
     # Decode the JSON payload once and reuse it; the original parsed the
     # same response body twice.
     payload = json.loads(response.text)
     data = payload['data']
     is_end = payload['paging']['is_end']  # NOTE(review): currently unused here
     for each in data:
         item = ZhihuItem()
         item['id'] = each['id']
         item['name'] = each['name']
         item['headline'] = each['headline']
         item['gender'] = each['gender']
         item['url_token'] = each['url_token']
         yield scrapy.Request(url=self.quesUrl + item['url_token'] + '/following/questions',
                              meta={'item': item, 'download_timeout': 10},
                              callback=self.getQuesHtml)
Example #3
0
 def parse_user(self, response):
     """Parse the user-detail API response.

     Copies every field declared on ZhihuItem out of the JSON payload,
     yields the item, then requests the user's followers and following
     listings.
     """
     items = ZhihuItem()
     user_info = json.loads(response.text)
     if user_info:
         # Copy only the fields the item schema declares.
         for field in items.fields:
             if field in user_info.keys():
                 items[field] = user_info.get(field)
         yield items
         # BUG FIX: the API key is 'url_token' (underscore); the previous
         # 'url-token' key always returned None, producing malformed URLs.
         yield Request(self.follows_url.format(user=user_info.get('url_token'), include=self.follows_query, offset=0, limit=20), callback=self.parse_follower)
         yield Request(self.following_url.format(user=user_info.get('url_token'), include=self.following_query, offset=0, limit=20), callback=self.parse_following)
Example #4
0
 def parse_user(self, response):
     """Build a ZhihuItem from the profile JSON, then crawl the user's graph."""
     profile = json.loads(response.text)
     user_item = ZhihuItem()
     # Transfer every declared field that the API response provides.
     for key in user_item.fields:
         if key in profile.keys():
             user_item[key] = profile.get(key)
     yield user_item
     # Walk both sides of the social graph through this same callback.
     token = profile.get('url_token')
     yield Request(self.follows_url.format(user=token,
                                           include=self.follows_query,
                                           offset=0,
                                           limit=20),
                   callback=self.parse_user)
     yield Request(self.followers_url.format(user=token,
                                             include=self.followers_query,
                                             offset=0,
                                             limit=20),
                   callback=self.parse_user)
Example #5
0
 def answer_parse(self, response):
     """Parse one page of search-result answers into level-1 items.

     Yields one ZhihuItem per answer; for each answer with comments it
     pages through the comment API by mutating
     ``self.comment_parmas['offset']`` until ``self.comment_meet_end``
     becomes true.

     NOTE(review): the while-loop depends on ``self.comment_meet_end``
     being updated by a *different* callback (``comment_parse``) and on
     ``time.sleep`` for pacing — confirm this cross-callback coupling
     behaves as intended under Scrapy's concurrent scheduling.
     """
     jsonBody = json.loads(response.body)
     self.anserws_meet_end = jsonBody['paging']['is_end']
     if not self.anserws_meet_end:
         for item in jsonBody['data']:
             pipleitem = ZhihuItem()
             pipleitem['id'] = item['id']
             pipleitem[
                 'url'] = 'https://www.zhihu.com/question/{q_id}/answer/{a_id}'.format(
                     q_id=item['question']['id'], a_id=item['id'])
             pipleitem['platform'] = '知乎'
             pipleitem['viewType'] = '问答'
             pipleitem['searchWord'] = response.meta['kw']
             pipleitem['Title'] = item['question']['title']
             pipleitem['crawlTime'] = self.get_localtime()
             pipleitem['publishTime'] = self.get_createtime(
                 item['created_time'])
             pipleitem['level'] = 1
             pipleitem['commentID'] = 1
             pipleitem['comment_count'] = item['comment_count']
             pipleitem['like'] = item['voteup_count']
             pipleitem['authorName'] = item['author']['name']
             pipleitem['authorID'] = item['author']['id']
             pipleitem['Content'] = item['excerpt']
             yield pipleitem
             if item['comment_count'] > 0:
                 self.comment_meet_end = False
                 self.comment_parmas['offset'] = 0
                 while not self.comment_meet_end:
                     paramas = urllib.parse.urlencode(self.comment_parmas)
                     url = 'https://www.zhihu.com/api/v4/answers/{id}/comments?{paramters}'.format(
                         id=item['id'], paramters=paramas)
                     yield Request(url=url,
                                   callback=self.comment_parse,
                                   meta={
                                       'answerid': item['id'],
                                       'kw': response.meta['kw'],
                                       'title': item['question']['title']
                                   })
                     self.comment_parmas['offset'] += self.comment_parmas[
                         'limit']
                     time.sleep(5)  #--*-- this value must exceed the download-delay setting --*--
     else:
         return
Example #6
0
    def parse(self, response):
        """Scrape engagement counters for a monitored Zhihu page.

        Extracts the comment and view counts from the inline JSON embedded
        in the HTML, attaches the metadata forwarded in the request meta,
        and yields a single item.

        NOTE(review): ``re.findall(...)[0]`` raises IndexError when the page
        layout changes and the pattern stops matching — confirm upstream
        error handling before relying on this.
        """
        item = ZhihuItem()
        item['comment_num'] = re.findall(r'"commentCount":(\d+)',
                                         response.text)[0]
        item['read_num'] = re.findall(r'"visitCount":(\d+)', response.text)[0]
        item['url'] = response.meta.get('url')
        item['MonitorName'] = response.meta.get('MonitorName')
        # These fields are not available on this page type; emit empty
        # strings so downstream pipelines see a consistent schema.
        item['zhuanfa'] = ''
        item['up_num'] = ''
        print(item['read_num'], item['comment_num'], item['url'])
        yield item
Example #7
0
 def artical_parse(self, response):
     """Parse a Zhihu column article page into a ZhihuItem.

     Pulls the title, author, and body from the rendered HTML and the
     publish timestamp from the inline ``"created"`` JSON field.

     Returns:
         ZhihuItem: a fully populated level-1 article item.
     """
     pipleitem = ZhihuItem()
     pipleitem['viewType'] = '文章'
     pipleitem['id'] = response.meta['id']
     pipleitem['url'] = response.url
     pipleitem['platform'] = '知乎'
     pipleitem['searchWord'] = response.meta['kw']
     pipleitem['Title'] = response.css('.Post-Header .Post-Title').xpath(
         'string(.)').extract_first()
     pipleitem['crawlTime'] = self.get_localtime()
     # Raw string fixes the invalid "\d" escape warning on modern Python;
     # the pattern itself is unchanged.
     created_secs = int(
         re.findall(r'"created":(\d*)', response.body.decode())[0])
     pipleitem['publishTime'] = self.get_createtime(secs=created_secs)
     pipleitem['level'] = 1
     pipleitem['authorName'] = response.css(
         '.AuthorInfo-name .UserLink-link').xpath('text()').extract_first()
     pipleitem['authorID'] = response.css(
         '.AuthorInfo-name .UserLink-link').xpath('@href').extract_first()
     pipleitem['commentID'] = 1
     pipleitem['Content'] = response.css('#root .Post-RichText').xpath(
         'string(.)').extract_first()
     return pipleitem
Example #8
0
    def parse_user(self, response):
        """Yield the user's profile item, then queue both sides of the graph."""
        profile = json.loads(response.text)
        item = ZhihuItem()

        # Keep only the fields the item schema declares.
        for name in item.fields:
            if name in profile.keys():
                item[name] = profile.get(name)

        yield item

        # Queue the followees ("follows") listing for this user.
        yield Request(
            self.follows_url.format(user=profile.get('url_token'),
                                    include=self.follow_include,
                                    limit=20,
                                    offset=0), self.parse_follows)

        # Queue the followers listing for this user.
        yield Request(
            self.followers_url.format(user=profile.get('url_token'),
                                      include=self.followers_include,
                                      limit=20,
                                      offset=0), self.parse_followers)
Example #9
0
 def comment_parse(self, response):
     """Parse one page of comments on an answer into level-2 ZhihuItems.

     Updates ``self.comment_meet_end`` from the paging flag so the
     pagination loop in the answer callback knows when to stop.
     """
     body = json.loads(response.body)
     self.comment_meet_end = body['paging']['is_end']
     # Guard clause: nothing to emit once the last page is reached.
     if self.comment_meet_end:
         return
     for comment in body['data']:
         entry = ZhihuItem()
         entry['id'] = response.meta['answerid']
         entry['commentID'] = comment['id']
         entry['url'] = comment['url']
         entry['platform'] = '知乎'
         entry['viewType'] = '问答'
         entry['searchWord'] = response.meta['kw']
         entry['Title'] = response.meta['title']
         entry['crawlTime'] = self.get_localtime()
         entry['publishTime'] = self.get_createtime(comment['created_time'])
         entry['level'] = 2
         entry['like'] = comment['vote_count']
         entry['authorName'] = comment['author']['member']['name']
         entry['authorID'] = comment['author']['member']['id']
         entry['Content'] = comment['content']
         yield entry
Example #10
0
    def parse_detail(self, response):
            """Parse a user's profile-header page into a ZhihuItem.

            The ``ProfileHeader-detailValue`` nodes are shared between several
            labelled fields, so the scraped label list is used to decide which
            value is which (industry vs. introduction, job vs. education).

            NOTE(review): ``business`` is collected but never stored on the
            item, and can stay unassigned when exactly one value is present
            under an unexpected label — harmless today because it is unused.
            """
            list_Label = response.css(".ProfileHeader-detailLabel::text").extract()
            # Industry and personal introduction share the same CSS class.
            business_introduce = response.css(".ProfileHeader-detailValue::text").extract()
            introduce=''
            if len(business_introduce) == 2:
                business = business_introduce[0]
                introduce = business_introduce[1]
                list_Label.remove('所在行业')
                list_Label.remove('个人简介')
            elif len(business_introduce) == 1:
                # Only one value present: use the labels to tell which it is.
                for i in list_Label:
                    if i == '所在行业':
                        business = business_introduce[0]
                        introduce = ''
                        list_Label.remove('所在行业')
                    elif i == '个人简介':
                        business = ''
                        introduce = business_introduce[0]
                        list_Label.remove('个人简介')
                        break
            else:
                business = ''
                introduce = ''
            # Current residence.
            place = response.css("div.ProfileHeader-detailValue span:nth-child(1)::text").extract()
            list_data = response.css(
                "div.ProfileHeader-detailValue div.ProfileHeader-field:nth-child(1)::text").extract()
            # Job title.
            job = ''
            # School / education.
            edu = ''
            if len(list_data) == 2:
                job = list_data[0]
                edu = list_data[1]
            elif len(list_data) == 1:
                # One value: the surviving labels decide job vs. education.
                for i in list_Label:
                    if i == '职业经历':
                        job = list_data[0]
                    elif i == '教育经历':
                        edu = list_data[0]
            elif len(list_data) == 0:
                job = ''
                edu = ''
            else:
                print("异常")
            # Data cleaning: strip the "现居" ("currently lives in") prefix.
            if place:
                place = "".join(place)
                place = place.replace("现居", "")
            else:
                place = "".join(place)
            # Gender: 0 = male, 1 = female, -1 = unknown.
            gender = response.css("meta[itemprop*='gender']::attr(content)").extract()
            if gender:
                if gender[0] == 'Male':
                    gender = 0  # 0 -> male
                else:
                    gender = 1  # 1 -> female
            else:
                gender = -1  # -1 -> unknown
            # Display name.
            u_name = response.css("span.ProfileHeader-name::text").extract_first()

            # Answer count.
            answerCount = response.css("li[aria-controls*='Profile-answers'] a span::text").extract_first()
            # Follower count.
            followerCount = response.css("meta[itemprop*='followerCount']::attr(content)").extract_first()
            # Question (ask) count.
            asks = response.css("li[aria-controls*='Profile-asks'] a span::text").extract_first()
            # Following count, scraped from the raw HTML via regex.
            content = str(response.body)
            re_match = re.match(".*?<.*?title=\"(\d*)\">.*</strong>", content)
            if re_match:
                follwingCount = re_match.group(1)
            else:
                follwingCount = 0
            url_token = response.meta['url_token']
            # Instantiate and fill the item.
            Zhihu_item = ZhihuItem()
            Zhihu_item["u_name"] = u_name  # 1 user name
            Zhihu_item["follwingCount"] = follwingCount  # 2 people the user follows
            Zhihu_item["followerCount"] = followerCount  # 3 people following the user
            Zhihu_item["gender"] = gender  # 4 gender
            Zhihu_item["edu"] = edu  # 5 education history
            Zhihu_item["job"] = job # 6 job history
            Zhihu_item["place"] = place  # 7 residence
            Zhihu_item["url_token"] = [url_token] # 8 key token
            Zhihu_item["asks"] = asks  # 9 question count
            Zhihu_item["answerCount"] = answerCount  # 10 answer count
            Zhihu_item["introduce"] = introduce # 11 personal introduction
            yield Zhihu_item