Beispiel #1
0
 def parse_user(self, response):
     """Parse a user-detail JSON response into a ``UserItem``.

     Every field declared on ``UserItem`` that also appears as a key in
     the decoded payload is copied onto the item.

     :param response: response whose ``text`` is a JSON document
         (assumed to decode to a JSON object).
     :return: generator yielding the populated ``UserItem``.
     """
     result = json.loads(response.text)
     item = UserItem()
     for field in item.fields:
         if field in result:
             item[field] = result[field]
     # Yield the populated item itself; yielding the loop variable would
     # emit only the last field name (a plain string) instead of the item.
     yield item
Beispiel #2
0
    def parse_user(self, response):
        """
        Handle a user-detail response.

        1. Parse the user's detail information into a ``UserItem``.
        2. Request the user's followee list for the next recursive step.
        3. Request the user's follower list for the next recursive step.

        :param response: response whose ``text`` holds the user's JSON data.
        :return: generator yielding the item followed by two requests.
        """
        # Debugging aid: dump the raw response to a file for inspection.
        # with open('data/user.html', 'w') as f:
        #     f.write(response.text)

        payload = json.loads(response.text)
        user = UserItem()
        for name in user.fields:
            if name not in payload.keys():
                continue
            user[name] = payload.get(name)
        yield user

        followers_page = self.fllowers_url.format(
            user=payload.get('url_token'),
            include=self.fllowers_query,
            offset=self.offset,
            limit=self.limit,
        )
        yield Request(followers_page, callback=self.parse_follows)

        # NOTE(review): this request reuses ``fllowers_query`` and the
        # ``parse_follows`` callback; confirm that is intended for followees.
        followees_page = self.followees_url.format(
            user=payload.get('url_token'),
            include=self.fllowers_query,
            offset=self.offset,
            limit=self.limit,
        )
        yield Request(followees_page, callback=self.parse_follows)
Beispiel #3
0
    def parse(self, response):
        """Parse a user-detail response and schedule the follow-list requests.

        The JSON payload is copied into a ``UserItem`` (only keys that the
        item declares as fields), the item is yielded, and then one request
        is issued for the list of users this user follows and one for the
        user's follower list.

        :param response: response whose ``text`` is the user's JSON document.
        :return: generator yielding the item and then two requests.
        """
        result = json.loads(response.text)
        item = UserItem()
        # Copy over only the keys that UserItem declares as fields.
        for field in item.fields:
            if field in result:
                item[field] = result[field]
        yield item

        # The list URLs must be built from this user's own token. Passing the
        # literal string 'url_token' would query a user with that literal
        # name for every response instead of the user just parsed.
        url_token = result.get('url_token')

        # Request the list of users this user follows; ``follows_user``
        # handles the response.
        yield Request(url=self.follows_url.format(user=url_token,
                                                  include=self.follows_query,
                                                  limit=20,
                                                  offset=0),
                      callback=self.follows_user)
        # Request this user's follower list.
        yield Request(url=self.followers_url.format(user=url_token,
                                                    include=self.followers_query,
                                                    limit=20,
                                                    offset=0),
                      callback=self.followers_user)
Beispiel #4
0
    def parse_user(self, response):
        """
        Parse a user's detail information, then request that user's
        followee list and follower list.

        :param response: response whose ``text`` is the user's JSON document.
        :return: generator yielding the item followed by two requests.
        """
        # Turn the JSON string from the response into a dictionary.
        data = json.loads(response.text)
        user = UserItem()
        shared = [name for name in user.fields if name in data.keys()]
        for name in shared:
            user[name] = data[name]
        yield user

        token = data.get('url_token')

        # Request this user's followee list.
        followees_page = self.followees_url.format(
            user=token, include=self.followees_query, offset=0, limit=20)
        yield Request(followees_page, callback=self.parse_followees)

        # Request this user's follower list.
        followers_page = self.followers_url.format(
            user=token, include=self.followers_query, offset=0, limit=20)
        yield Request(followers_page, callback=self.parse_followers)
Beispiel #5
0
    def parse_user(self, response):
        """Build a ``UserItem`` from the JSON response and queue list requests.

        Yields the item, then a request built from ``follows_url`` (handled
        by ``parse_follows``) and one built from ``followers_url`` (handled
        by ``parse_followers``).
        """
        payload = json.loads(response.text)
        user = UserItem()
        for key in user.fields:
            if key not in payload.keys():
                continue
            user[key] = payload.get(key)
        yield user

        token = payload.get('url_token')
        follows_page = self.follows_url.format(
            user=token, include=self.follows_query, limit=20, offset=0)
        yield Request(follows_page, callback=self.parse_follows)

        followers_page = self.followers_url.format(
            user=token, include=self.followers_query, limit=20, offset=0)
        yield Request(followers_page, callback=self.parse_followers)
Beispiel #6
0
 def parse_user(self, response):
     """Decode the JSON response and yield it as a ``UserItem``.

     Only keys that ``UserItem`` declares as fields are copied.
     """
     # The response text is JSON; uncomment to inspect it.
     # print(response.text)
     payload = json.loads(response.text)
     user = UserItem()
     for key in (name for name in user.fields if name in payload.keys()):
         user[key] = payload.get(key)
     # Hand the dict-like record on so it can be stored.
     yield user
Beispiel #7
0
 def parse_user(self, response):
     """Log entry, decode the JSON response and yield a ``UserItem``."""
     print('parse_user')
     # Deserialize the JSON body.
     payload = json.loads(response.text)
     user = UserItem()
     # ``user.fields`` lists every attribute declared on the item.
     for attr in user.fields:
         # Skip fields for which nothing was actually scraped.
         if attr not in payload.keys():
             continue
         user[attr] = payload.get(attr)
     yield user
Beispiel #8
0
 def parse_user(self, response):
     """Decode a user response, bump the running counter and yield the item.

     ``self.count`` is incremented and logged once per parsed response.
     """
     payload = json.loads(response.text)
     user = UserItem()
     self.count += 1
     self.logger.info(self.count)
     # ``user.fields`` gives every field declared on the item.
     matching = [name for name in user.fields if name in payload.keys()]
     for name in matching:
         user[name] = payload[name]
     yield user
Beispiel #9
0
 def parse_user(self, response):
     """Scrape the user's basic information from the JSON response.

     Prints the populated ``UserItem`` and then yields it.
     """
     payload = json.loads(response.text)
     user = UserItem()
     # Walk every field declared on the item.
     for key in user.fields:
         if key not in payload.keys():
             continue
         user[key] = payload.get(key)
     print(user)
     yield user
Beispiel #10
0
    def parse_user(self, response):
        """Yield the user's ``UserItem`` and then two follow-list requests.

        Both requests are sent with the ``headers`` object defined outside
        this method.
        """
        # The response text is JSON; uncomment to inspect it.
        # print(response.text)
        payload = json.loads(response.text)
        user = UserItem()
        for key in user.fields:
            if key not in payload.keys():
                continue
            user[key] = payload.get(key)
        # Emit each user's information as a dict-like record to be saved.
        yield user

        # Step four: keep crawling recursively through the follow lists of
        # the people each user follows and the follower lists of their
        # followers (this is what keeps the crawl going).
        token = payload.get('url_token')
        follows_page = self.follows_url.format(
            user=token, include=self.follows_query, offset=0, limit=20)
        yield Request(follows_page, callback=self.parse_follows, headers=headers)

        followers_page = self.followers_url.format(
            user=token, include=self.followers_query, offset=0, limit=20)
        yield Request(followers_page, callback=self.parse_followers, headers=headers)
Beispiel #11
0
 def parse_user(self, response):
     """Callback for a user's profile: yield the item, then a follows request."""
     # Decode the JSON body.
     payload = json.loads(response.text)
     user = UserItem()
     # Copy each declared item field that is present in the JSON result.
     present = [name for name in user.fields if name in payload.keys()]
     for name in present:
         user[name] = payload[name]
     yield user

     follows_page = self.follows_url.format(
         user=payload.get('url_token'),
         include=self.follows_query,
         limit=20,
         offset=0,
     )
     yield scrapy.Request(follows_page, callback=self.parse_follows)
Beispiel #12
0
 def parse_user(self, response):
     """Yield the parsed ``UserItem`` and a follow-up request for the user.

     The follow-up request is built from ``user_url`` and handled by
     ``parse_followee``.
     """
     payload = json.loads(response.text)
     user = UserItem()
     for key in user.fields:
         if key not in payload.keys():
             continue
         user[key] = payload.get(key)
     yield user

     # This recursive step goes on to request each follower in turn; per the
     # original author's note, without it the crawl would only request all
     # followers of the starting user.
     next_page = self.user_url.format(
         user=payload.get('url_token'),
         include=self.use_query,
         offset=0,
     )
     yield scrapy.Request(next_page, callback=self.parse_followee)
Beispiel #13
0
 def parse_user(self, response):
     """Parse one person's details, then recurse into their two lists."""
     # Decode this single person's detail record.
     payload = json.loads(response.text)
     user = UserItem()
     for key in (name for name in user.fields if name in payload.keys()):
         user[key] = payload.get(key)
     yield user

     token = payload.get('url_token')

     # Continue recursively with this person's follow list, so the details
     # of the people they follow get crawled.
     follow_page = self.follow_url.format(
         user=token, include=self.follow_query, offset=0, limit=20)
     yield Request(url=follow_page, callback=self.parse_follow)

     # Continue recursively with this person's follower list, so the
     # details of their followers get crawled.
     follower_page = self.follower_url.format(
         user=token, include=self.follower_query, offset=0, limit=20)
     yield Request(url=follower_page, callback=self.parse_follower)
Beispiel #14
0
    def parse_user(self, response):
        """Yield the parsed ``UserItem`` and a list request for that user.

        The request URL uses the ``url_token`` stored on the item, so a
        ``KeyError`` is raised if that field was not populated.
        """
        payload = json.loads(response.text)
        user = UserItem()
        for key in user.fields:
            if key not in payload.keys():
                continue
            user[key] = payload[key]

        yield user

        # Request each user's list. NOTE(review): the original comment calls
        # this the follow list, but the URL and query are the ``followers``
        # ones; confirm which list is intended.
        list_page = self.followers_url.format(
            user=user['url_token'],
            include=self.followers_query,
            offset=0,
            limit=20,
        )
        yield Request(list_page, callback=self.parse_follows)
Beispiel #15
0
 def parse_user(self, response):
     """Pause briefly, parse the user JSON, yield the item and a follows request."""
     # NOTE(review): ``time.sleep`` blocks while it runs; confirm this is
     # preferred over a crawler-level download delay.
     pause = random.randrange(0, 3)
     time.sleep(pause)

     payload = json.loads(response.text)
     user = UserItem()
     present = [name for name in user.fields if name in payload.keys()]
     for name in present:
         user[name] = payload[name]
     yield user

     # NOTE(review): the offset starts at 20 here; confirm the first page is
     # meant to be skipped.
     follows_page = self.follows_url.format(
         user=payload.get('url_token'),
         include=self.follows_query,
         offset=20,
         limit=20,
     )
     yield Request(follows_page, callback=self.parse_follows)
Beispiel #16
0
 def parse_user(self, response):
     """Yield the parsed ``UserItem`` and a request for the user's follow list."""
     payload = json.loads(response.text)
     user = UserItem()
     for key in user.fields:
         if key not in payload.keys():
             continue
         user[key] = payload.get(key)
     yield user

     # Fetch the current user's follow list so the users in it can be
     # crawled recursively in turn.
     follow_page = self.follows_url.format(
         follow_user=payload.get('url_token'),
         follow_include=self.follow_query,
         offset=0,
         limit=20,
     )
     yield Request(follow_page, callback=self.parse_follow)
Beispiel #17
0
 def parse_user(self, response):
     """Print and yield the parsed ``UserItem``, then request the follows list.

     The request URL uses the ``url_token`` stored on the item, so a
     ``KeyError`` is raised if that field was not populated.
     """
     payload = json.loads(response.text)
     user = UserItem()
     for key in (name for name in user.fields if name in payload.keys()):
         user[key] = payload.get(key)
     print(user)
     yield user

     follows_page = self.follows_url.format(
         user=user["url_token"],
         follows_query=self.follows_query,
         limit=20,
         offset=0,
     )
     yield Request(url=follows_page, callback=self.parse_follows)
Beispiel #18
0
    def parse_user(self, response):
        """Return the user's information and queue both list requests."""
        # The response body is a JSON object.
        payload = json.loads(response.text)
        user = UserItem()
        # ``user.fields`` holds the names of all declared ``Field()``s.
        for key in user.fields:
            # Only assign attributes that the returned result contains.
            if key not in payload.keys():
                continue
            user[key] = payload.get(key)
        yield user

        token = payload.get('url_token')
        follows_page = self.follows_url.format(
            user=token, include=self.follows_query, limit=20, offset=0)
        yield Request(follows_page, callback=self.parse_follows)

        followers_page = self.followers_url.format(
            user=token, include=self.followers_query, limit=20, offset=0)
        yield Request(followers_page, callback=self.parse_followers)
Beispiel #19
0
    def parse_user(self, response):
        """Extract the user's information and request their follows list."""
        # The user-info page is JSON, hence ``json.loads``.
        payload = json.loads(response.text)
        # The item class declares the fields used to hold user information.
        user = UserItem()
        # Copy every declared field that the user-info result contains.
        present = [name for name in user.fields if name in payload.keys()]
        for name in present:
            user[name] = payload[name]
        # Emit the scraped item.
        yield user

        follows_page = self.follows_url.format(
            user=payload.get('url_token'),
            include=self.follows_query,
            offset=0,
            limit=20,
        )
        yield Request(follows_page, callback=self.parse_follows)
Beispiel #20
0
 def parse_user(self, response):
     """Yield the parsed ``UserItem`` and unfiltered list requests.

     The ``url`` field has ``'/api/v4/'`` replaced by ``'/'`` before it is
     stored. Both follow-up requests are sent with ``dont_filter=True``.
     """
     payload = json.loads(response.text)
     user = UserItem()
     for key in user.fields:
         if key not in payload.keys():
             continue
         value = payload.get(key)
         if key == 'url':
             # Strip the API prefix from the stored URL.
             value = value.replace('/api/v4/', '/')
         user[key] = value
     yield user

     token = payload.get('url_token')
     followees_page = self.followees_url.format(
         user=token, include=self.followees_query, offset=0, limit=20)
     yield scrapy.Request(
         url=followees_page, callback=self.parse_followees, dont_filter=True)

     followers_page = self.followers_url.format(
         user=token, include=self.followers_query, offset=0, limit=20)
     yield scrapy.Request(
         url=followers_page, callback=self.parse_followers, dont_filter=True)
Beispiel #21
0
 def parse_user_info(self, response):
     """Yield the parsed ``UserItem`` plus followee and follower requests.

     A ``KeyError`` is raised after the item is yielded if the payload has
     no ``url_token`` key.
     """
     # Decode the JSON text of the response into a Python object.
     payload = json.loads(response.text)
     # The item instance carries the extracted information onward.
     user = UserItem()
     # Iterating ``user.fields`` and checking membership copies only the
     # data that was declared on the item; undeclared keys are left out.
     for key in user.fields:
         if key not in payload.keys():
             continue
         user[key] = payload.get(key)
     yield user

     # Pass the url_token on to the followee-list request, so followee lists
     # of followees are crawled layer by layer.
     followees_page = self.followees_url.format(
         user=payload['url_token'],
         include=self.followees_query,
         offset=0,
         limit=20,
     )
     yield Request(followees_page, callback=self.parse_followees)

     # Pass the url_token on to the follower-list request.
     followers_page = self.followers_url.format(
         user=payload['url_token'],
         include=self.followers_query,
         offset=0,
         limit=20,
     )
     yield Request(followers_page, callback=self.parse_followers)
Beispiel #22
0
 def parse_user(self, response):
     """Yield the parsed ``UserItem`` and requests for both of the user's lists."""
     payload = json.loads(response.text)
     user = UserItem()
     # ``user.fields`` gives every name declared for the item.
     for key in (name for name in user.fields if name in payload.keys()):
         # ``dict.get`` fetches the value for the key.
         user[key] = payload.get(key)
     yield user

     token = payload.get('url_token')

     # Each person then requests their own follow list.
     follows_page = self.follows_url.format(
         user=token, include=self.follows_query, limit=20, offset=0)
     yield Request(follows_page, callback=self.parse_follows)

     # Request the follower list.
     followers_page = self.followers_url.format(
         user=token, include=self.followers_query, limit=20, offset=0)
     yield Request(followers_page, callback=self.parse_followers)
Beispiel #23
0
    def parse_user(self, response):
        """Yield the user's item, pause, then request both follow lists.

        Each request carries the originating user's token in
        ``meta['latter']``.
        """
        payload = json.loads(response.text)
        user = UserItem()
        present = [name for name in user.fields if name in payload.keys()]
        for name in present:
            user[name] = payload[name]
        # The user's information.
        yield user

        # NOTE(review): ``time.sleep`` blocks while it runs; confirm this is
        # preferred over a crawler-level download delay.
        pause = random.uniform(0.5, 1)
        time.sleep(pause)

        follows_page = self.follows_url.format(
            user=payload.get('url_token'),
            include=self.follows_query,
            limit=20,
            offset=0,
        )
        yield Request(
            follows_page,
            callback=self.get_next_follow,
            meta={'latter': payload.get('url_token')},
        )

        followers_page = self.followers_url.format(
            user=payload.get('url_token'),
            include=self.followers_query,
            limit=20,
            offset=0,
        )
        # ``meta`` passes along the token the request originated from, which
        # makes it easier to assemble the network of users.
        yield Request(
            followers_page,
            callback=self.get_next_follower,
            meta={'latter': payload.get('url_token')},
        )
Beispiel #24
0
 def parse_user(self, response):
     """Yield the parsed ``UserItem`` and requests for follows and followers."""
     payload = json.loads(response.text)
     user = UserItem()
     for key in user.fields:
         if key not in payload.keys():
             continue
         user[key] = payload.get(key)
     yield user

     token = payload.get('url_token')

     # After the user, go on to the people the user follows.
     follows_page = self.follows_url.format(
         user=token, include=self.follows_query, offset=0)
     yield Request(follows_page, callback=self.parse_follows)

     # After the user, go on to the user's followers.
     followers_page = self.follower_url.format(
         user=token, include=self.follower_query, offset=0)
     yield Request(followers_page, callback=self.parse_followers)
Beispiel #25
0
    def parse_user(self, response):
        """Yield a ``UserItem`` built from fixed payload keys, then a follow request.

        Every listed key is required: a missing one raises ``KeyError``.
        """
        payload = json.loads(response.text)
        user = UserItem()
        # (item field, payload key) pairs, copied in this order.
        field_map = (
            ('id', 'id'),
            ('name', 'name'),
            ('avatar_url', 'avatar_url'),          # avatar
            ('follower_count', 'follower_count'),  # number of followers
            ('headline', 'headline'),              # signature line
            ('user_url', 'url'),                   # home page
            ('gender', 'gender'),
            ('url_token', 'url_token'),
        )
        for item_key, payload_key in field_map:
            user[item_key] = payload[payload_key]
        yield user

        follow_page = self.follow_url.format(
            user=payload.get('url_token'),
            include=self.follow_query,
            offset=0,
        )
        yield Request(follow_page, callback=self.parse_follow)
Beispiel #26
0
 def parse_user(self, response):
     """Yield the parsed ``UserItem`` and a request for the user's follows list.

     A ``KeyError`` is raised after the item is yielded if the payload has
     no ``url_token`` key.
     """
     payload = json.loads(response.text)
     user = UserItem()
     for key in (name for name in user.fields if name in payload.keys()):
         user[key] = payload.get(key)
     yield user

     # Debugging aids: uncomment to inspect the item and the raw payload.
     # print(user)
     # print(payload)
     # print(payload['url_token'])
     follows_page = self.follows_url.format(
         user=payload['url_token'],
         include=self.follows_query,
         limit=20,
         offset=0,
     )
     yield scrapy.Request(follows_page, callback=self.parse_follows)
Beispiel #27
0
    def parseUser(self, response):
        """Yield the parsed ``UserItem`` and requests for follows and followers."""
        payload = json.loads(response.text)
        user = UserItem()

        present = [name for name in user.fields if name in payload.keys()]
        for name in present:
            user[name] = payload[name]
        yield user

        # The callbacks crawl the details of followed users and followers,
        # so the crawl proceeds layer by layer.
        token = payload.get('url_token')
        follows_page = self.follows_url.format(
            user=token, include=self.follows_query, offset=0, limit=20)
        yield Request(follows_page, callback=self.parseFollows)

        followers_page = self.followers_url.format(
            user=token, include=self.followers_query, offset=0, limit=20)
        yield Request(followers_page, callback=self.parseFollowers)
Beispiel #28
0
 def parse(self, response):
     """Yield one ``UserItem`` per entry of the page and follow the paging.

     The body is decoded once. Each entry under ``data`` is copied into a
     ``UserItem``; if ``paging.is_end`` is false, ``self.offset`` is advanced
     by 20 and the next page is requested with this method as callback.

     :param response: response whose ``body`` is a JSON document with
         ``data`` and ``paging`` keys.
     :return: generator yielding the items and, if any, the next request.
     :raises KeyError: if an expected key is missing from the payload.
     """
     # ``json.loads`` accepts bytes directly; its ``encoding`` keyword was
     # removed in Python 3.9 and raises TypeError there, so it is not passed.
     payload = json.loads(response.body)
     items = payload['data']
     print(len(items))

     # (item field, payload key) pairs, copied in this order.
     field_map = (
         ('user_type', 'user_type'),
         ('answer_count', 'answer_count'),
         ('url_token', 'url_token'),
         ('uid', 'id'),
         ('articles_count', 'articles_count'),
         ('name', 'name'),
         ('headline', 'headline'),
         ('gender', 'gender'),
         ('follower_count', 'follower_count'),
     )
     for item in items:
         user = UserItem()
         for item_key, payload_key in field_map:
             user[item_key] = item[payload_key]
         yield user

     # Reuse the already-decoded payload rather than parsing the body again.
     if not payload['paging']['is_end']:
         self.offset += 20
         next_url = self.BASE_URL.format(limit=self.LIMIT, offset=self.offset)
         yield scrapy.Request(url=next_url, callback=self.parse)
Beispiel #29
0
    def parse_user(self, response):
        """Build a ``UserItem`` from a user-detail JSON response.

        Nothing is yielded unless the response status is 200. Nested values
        (location, business, company, job, school) are extracted on a
        best-effort basis and fall back to ``''`` when the expected
        structure is absent.

        :param response: response whose ``text`` is the user's JSON document.
        :return: generator yielding at most one ``UserItem``.
        """
        if response.status != 200:
            return

        # Errors raised when a nested list/dict is missing, empty or of an
        # unexpected shape. Catching only these keeps the best-effort
        # fallback without hiding unrelated failures.
        missing = (AttributeError, IndexError, KeyError, TypeError)

        result = json.loads(response.text)
        item = UserItem()
        item['url_token'] = result.get('url_token')
        item['name'] = result.get('name')

        # location
        try:
            item['location'] = result.get('locations')[0].get('name')
        except missing:
            item['location'] = ''

        item['gender'] = result.get('gender')

        # Guard against a missing avatar URL instead of calling ``replace``
        # on ``None``.
        avatar_url = result.get('avatar_url')
        if avatar_url:
            avatar_url = avatar_url.replace('is', 'xl')
        item['avatar_url'] = avatar_url

        # business
        try:
            item['business'] = result.get('business').get('name')
        except missing:
            item['business'] = ''

        # company and job: both are cleared if either lookup fails
        try:
            employment = result.get('employments')[0]
            item['company'] = employment.get('company').get('name')
            item['job'] = employment.get('job').get('name')
        except missing:
            item['company'] = ''
            item['job'] = ''

        item['headline'] = result.get('headline', '')

        # school
        try:
            item['school'] = result.get('educations')[0].get('school').get('name')
        except missing:
            item['school'] = ''

        item['voteup_count'] = result.get('voteup_count')
        item['follower_count'] = result.get('follower_count')

        yield item
Beispiel #30
0
 def parse_user(self, response):
     """Parse the user-detail page, then request both of the user's lists."""
     # print(response.text)
     # Decode the JSON text into a regular Python object.
     payload = json.loads(response.text)
     user = UserItem()
     for key in user.fields:
         if key not in payload.keys():
             continue
         user[key] = payload.get(key)
     yield user

     token = payload.get('url_token')

     # The follows list is parsed by a separate callback.
     follows_page = self.follows_url.format(
         user=token, include=self.follows_query, limit=20, offset=0)
     yield Request(follows_page, callback=self.parse_follows)

     followers_page = self.followers_url.format(
         user=token, include=self.followers_query, limit=20, offset=0)
     yield Request(followers_page, callback=self.parse_followers)