def parse_user(self, response):
    """Parse a user-detail JSON response into a ``UserItem``.

    Every field declared on ``UserItem`` that is also a key of the decoded
    response is copied onto the item, and the populated item is yielded.

    :param response: response whose ``text`` attribute holds a JSON object.
    :return: generator yielding the populated ``UserItem``.
    """
    result = json.loads(response.text)
    item = UserItem()
    for field in item.fields:
        if field in result:
            item[field] = result[field]
    # Yield the item, not the field name: a bare string is not a valid
    # callback output.
    yield item
def parse_user(self, response):
    """Handle a user-detail response.

    1. Build a ``UserItem`` from the JSON body and yield it.
    2. Yield a request built from ``self.fllowers_url``.
    3. Yield a request built from ``self.followees_url``.

    Both requests use ``self.fllowers_query`` as the ``include`` value and
    ``self.parse_follows`` as the callback.

    :param response: response whose ``text`` attribute holds a JSON object.
    :return: generator yielding one item followed by two requests.
    """
    profile = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in profile.keys():
            continue
        item[name] = profile.get(name)
    yield item

    first_link = self.fllowers_url.format(
        user=profile.get('url_token'),
        include=self.fllowers_query,
        offset=self.offset,
        limit=self.limit,
    )
    yield Request(first_link, callback=self.parse_follows)

    second_link = self.followees_url.format(
        user=profile.get('url_token'),
        include=self.fllowers_query,
        offset=self.offset,
        limit=self.limit,
    )
    yield Request(second_link, callback=self.parse_follows)
def parse(self, response):
    """Parse a user-detail response and fan out to the user's lists.

    Copies every ``UserItem`` field present in the decoded JSON onto a new
    item and yields it, then yields one request built from
    ``self.follows_url`` and one built from ``self.followers_url``.

    :param response: response whose ``text`` attribute holds a JSON object.
    :return: generator yielding one item followed by two requests.
    """
    result = json.loads(response.text)
    item = UserItem()
    for field in item.fields:
        if field in result:
            item[field] = result[field]
    yield item

    # Use the token from the response; formatting the URL with the literal
    # string 'url_token' would request a user of that name every time.
    url_token = result.get('url_token')

    # Request built from follows_url; handled by follows_user.
    yield Request(
        url=self.follows_url.format(
            user=url_token, include=self.follows_query, limit=20, offset=0),
        callback=self.follows_user)
    # Request built from followers_url; handled by followers_user.
    yield Request(
        url=self.followers_url.format(
            user=url_token, include=self.followers_query, limit=20, offset=0),
        callback=self.followers_user)
def parse_user(self, response):
    """Parse user details, then request the followee and follower lists.

    :param response: response whose ``text`` attribute holds a JSON object.
    :return: generator yielding a ``UserItem`` and two ``Request`` objects.
    """
    # Decode the JSON body into a dictionary.
    data = json.loads(response.text)
    item = UserItem()
    present = [name for name in item.fields if name in data.keys()]
    for name in present:
        item[name] = data.get(name)
    yield item

    # Request built from followees_url.
    followees_link = self.followees_url.format(
        user=data.get('url_token'),
        include=self.followees_query,
        offset=0,
        limit=20,
    )
    yield Request(followees_link, callback=self.parse_followees)

    # Request built from followers_url.
    followers_link = self.followers_url.format(
        user=data.get('url_token'),
        include=self.followers_query,
        offset=0,
        limit=20,
    )
    yield Request(followers_link, callback=self.parse_followers)
def parse_user(self, response):
    """Yield a ``UserItem`` for the response plus two follow-up requests.

    The requests are built from ``self.follows_url`` and
    ``self.followers_url``, each with ``limit=20`` and ``offset=0``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for key in item.fields:
        if key not in payload.keys():
            continue
        item[key] = payload.get(key)
    yield item

    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield Request(follows_link, self.parse_follows)

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        limit=20,
        offset=0,
    )
    yield Request(followers_link, self.parse_followers)
def parse_user(self, response):
    """Decode the JSON response body and yield it as a ``UserItem``.

    Only fields declared on ``UserItem`` that also appear in the decoded
    response are copied.
    """
    # The response body is JSON text.
    decoded = json.loads(response.text)
    item = UserItem()
    matching = [name for name in item.fields if name in decoded.keys()]
    for name in matching:
        item[name] = decoded.get(name)
    # Emit the collected user data as an item.
    yield item
def parse_user(self, response):
    """Print a trace marker, then yield the response as a ``UserItem``."""
    print('parse_user')
    # Deserialize the JSON body.
    payload = json.loads(response.text)
    item = UserItem()
    # item.fields lists every attribute declared on the item; copy only the
    # ones that the response actually contains.
    for attr in item.fields:
        if attr not in payload.keys():
            continue
        item[attr] = payload.get(attr)
    yield item
def parse_user(self, response):
    """Count and log the response, then yield it as a ``UserItem``.

    ``self.count`` is incremented by one and its new value is logged at
    INFO level before the fields are copied.
    """
    payload = json.loads(response.text)
    item = UserItem()
    self.count += 1
    self.logger.info(self.count)
    # Copy each declared item field that the response provides.
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item
def parse_user(self, response):
    """Extract a user's basic information into a ``UserItem``.

    The populated item is printed and then yielded.
    """
    info = json.loads(response.text)
    item = UserItem()
    # Walk every field declared on the item and keep those the response has.
    available = [name for name in item.fields if name in info.keys()]
    for name in available:
        item[name] = info.get(name)
    print(item)
    yield item
def parse_user(self, response):
    """Yield a ``UserItem`` and recurse into the user's two lists.

    After the item, one request built from ``self.follows_url`` and one
    built from ``self.followers_url`` are yielded. Both pass the
    ``headers`` name from the enclosing scope.
    """
    # The response body is JSON text.
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    # Emit this user's data as an item.
    yield item

    # Keep crawling: these requests are what make the crawl continue from
    # each discovered user.
    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        offset=0,
        limit=20,
    )
    yield Request(follows_link, self.parse_follows, headers=headers)

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        offset=0,
        limit=20,
    )
    yield Request(followers_link, self.parse_followers, headers=headers)
def parse_user(self, response):
    """Callback for a user-profile response.

    Yields the profile as a ``UserItem`` and then a request built from
    ``self.follows_url``, handled by ``self.parse_follows``.
    """
    profile = json.loads(response.text)
    item = UserItem()
    # Keep only the item fields that occur in the JSON result.
    shared = [name for name in item.fields if name in profile.keys()]
    for name in shared:
        item[name] = profile.get(name)
    yield item

    follows_link = self.follows_url.format(
        user=profile.get('url_token'),
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield scrapy.Request(follows_link, self.parse_follows)
def parse_user(self, response):
    """Yield a ``UserItem`` and a follow-up request for the same user.

    The follow-up request is built from ``self.user_url`` with
    ``self.use_query`` and handled by ``self.parse_followee``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    # This recursive step is what extends the crawl beyond the starting
    # user; without it only the first user's list would be fetched.
    next_link = self.user_url.format(
        user=payload.get('url_token'),
        include=self.use_query,
        offset=0,
    )
    yield scrapy.Request(next_link, self.parse_followee)
def parse_user(self, response):
    """Yield one user's details and requests for that user's two lists.

    The requests are built from ``self.follow_url`` and
    ``self.follower_url``, each with ``offset=0`` and ``limit=20``.
    """
    details = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in details.keys():
            continue
        item[name] = details.get(name)
    yield item

    # Recurse via follow_url; handled by parse_follow.
    follow_link = self.follow_url.format(
        user=details.get('url_token'),
        include=self.follow_query,
        offset=0,
        limit=20,
    )
    yield Request(url=follow_link, callback=self.parse_follow)

    # Recurse via follower_url; handled by parse_follower.
    follower_link = self.follower_url.format(
        user=details.get('url_token'),
        include=self.follower_query,
        offset=0,
        limit=20,
    )
    yield Request(url=follower_link, callback=self.parse_follower)
def parse_user(self, response):
    """Yield a ``UserItem`` and one request built from ``followers_url``.

    The request reads the token back from ``item['url_token']`` and is
    handled by ``self.parse_follows``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload[name]
    yield item

    # Follow-up list request for this user.
    list_link = self.followers_url.format(
        user=item['url_token'],
        include=self.followers_query,
        offset=0,
        limit=20,
    )
    yield Request(list_link, callback=self.parse_follows)
def parse_user(self, response):
    """Sleep briefly, then yield a ``UserItem`` and a follows request.

    The sleep lasts ``random.randrange(0, 3)`` seconds. The follows request
    is built with ``offset=20`` and ``limit=20``.
    """
    # NOTE(review): time.sleep blocks the calling thread for the whole
    # pause - confirm this is the intended way to throttle.
    pause = random.randrange(0, 3)
    time.sleep(pause)

    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        offset=20,
        limit=20,
    )
    yield Request(follows_link, self.parse_follows)
def parse_user(self, response):
    """Yield a ``UserItem`` and a request built from ``self.follows_url``.

    The URL template is filled through its ``follow_user`` and
    ``follow_include`` placeholders; the request is handled by
    ``self.parse_follow``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    shared = [name for name in item.fields if name in payload.keys()]
    for name in shared:
        item[name] = payload.get(name)
    yield item

    # Recurse into this user's list.
    follows_link = self.follows_url.format(
        follow_user=payload.get('url_token'),
        follow_include=self.follow_query,
        offset=0,
        limit=20,
    )
    yield Request(follows_link, callback=self.parse_follow)
def parse_user(self, response):
    """Print and yield a ``UserItem``, then request the follows list.

    The token for the follow-up URL is read back from
    ``item["url_token"]``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    print(item)
    yield item

    follows_link = self.follows_url.format(
        user=item["url_token"],
        follows_query=self.follows_query,
        limit=20,
        offset=0,
    )
    yield Request(url=follows_link, callback=self.parse_follows)
def parse_user(self, response):
    """Return the user's information and requests for both lists.

    Yields a ``UserItem``, then one request built from ``self.follows_url``
    and one built from ``self.followers_url``.
    """
    # Decoded JSON object.
    payload = json.loads(response.text)
    item = UserItem()
    # item.fields holds every declared field name; assign only the ones
    # that the response contains.
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield Request(follows_link, self.parse_follows)

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        limit=20,
        offset=0,
    )
    yield Request(followers_link, self.parse_followers)
def parse_user(self, response):
    """Turn a user-info JSON response into a ``UserItem`` and recurse.

    After the item, a request built from ``self.follows_url`` is yielded
    with ``self.parse_follows`` as its callback.
    """
    # The page body is JSON, so it is decoded with json.loads.
    info = json.loads(response.text)
    # The item class declares the fields to extract.
    item = UserItem()
    # Copy every declared field that the response actually contains.
    found = [name for name in item.fields if name in info.keys()]
    for name in found:
        item[name] = info.get(name)
    # Emit the populated item.
    yield item

    follows_link = self.follows_url.format(
        user=info.get('url_token'),
        include=self.follows_query,
        offset=0,
        limit=20,
    )
    yield Request(follows_link, self.parse_follows)
def parse_user(self, response):
    """Yield a ``UserItem`` and unfiltered requests for both lists.

    The ``url`` field has ``'/api/v4/'`` replaced with ``'/'`` after it is
    copied. Both follow-up requests are created with ``dont_filter=True``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
        if name == 'url':
            # Drop the API path segment from the stored URL.
            item[name] = item[name].replace('/api/v4/', '/')
    yield item

    followees_link = self.followees_url.format(
        user=payload.get('url_token'),
        include=self.followees_query,
        offset=0,
        limit=20,
    )
    yield scrapy.Request(
        url=followees_link,
        callback=self.parse_followees,
        dont_filter=True,
    )

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        offset=0,
        limit=20,
    )
    yield scrapy.Request(
        url=followers_link,
        callback=self.parse_followers,
        dont_filter=True,
    )
def parse_user_info(self, response):
    """Yield a ``UserItem`` and requests for the followee/follower lists.

    ``url_token`` is read with ``result['url_token']``, so a response
    without that key raises ``KeyError`` after the item has been yielded.
    """
    # Decode the JSON text into a Python object.
    payload = json.loads(response.text)
    # The item carries the extracted data onward.
    item = UserItem()
    # Only fields declared on the item are copied, so undeclared response
    # keys never reach the item.
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    # Pass url_token on to the followee-list request so the crawl recurses.
    followees_link = self.followees_url.format(
        user=payload['url_token'],
        include=self.followees_query,
        offset=0,
        limit=20,
    )
    yield Request(followees_link, callback=self.parse_followees)

    # Pass url_token on to the follower-list request.
    followers_link = self.followers_url.format(
        user=payload['url_token'],
        include=self.followers_query,
        offset=0,
        limit=20,
    )
    yield Request(followers_link, callback=self.parse_followers)
def parse_user(self, response):
    """Yield a ``UserItem`` and requests for the follows/followers lists."""
    payload = json.loads(response.text)
    item = UserItem()
    # item.fields exposes every name declared in the item definition.
    shared = [name for name in item.fields if name in payload.keys()]
    for name in shared:
        # dict.get fetches the value.
        item[name] = payload.get(name)
    yield item

    # Each user then requests their own follows list.
    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield Request(follows_link, callback=self.parse_follows)

    # Request the followers list.
    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        limit=20,
        offset=0,
    )
    yield Request(followers_link, callback=self.parse_followers)
def parse_user(self, response):
    """Yield a ``UserItem``, pause, then request both lists.

    After the item is yielded the generator sleeps for
    ``random.uniform(0.5, 1)`` seconds. Each follow-up request carries the
    originating ``url_token`` in ``meta['latter']``.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    # User information.
    yield item

    pause = random.uniform(0.5, 1)
    time.sleep(pause)

    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield Request(
        follows_link,
        self.get_next_follow,
        meta={'latter': payload.get('url_token')},
    )

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        limit=20,
        offset=0,
    )
    # meta passes the originating token along so the relationship network
    # can be assembled later.
    yield Request(
        followers_link,
        self.get_next_follower,
        meta={'latter': payload.get('url_token')},
    )
def parse_user(self, response):
    """Yield a ``UserItem`` and requests for the follows/follower lists.

    Both URL templates are filled with ``user``, ``include`` and
    ``offset=0`` only.
    """
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    # Continue with the request built from follows_url.
    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        offset=0,
    )
    yield Request(follows_link, callback=self.parse_follows)

    # Continue with the request built from follower_url.
    follower_link = self.follower_url.format(
        user=payload.get('url_token'),
        include=self.follower_query,
        offset=0,
    )
    yield Request(follower_link, callback=self.parse_followers)
def parse_user(self, response):
    """Copy a fixed set of keys into a ``UserItem`` and request follows.

    Every listed key is read with subscript access, so a missing key raises
    ``KeyError`` before anything is yielded.
    """
    payload = json.loads(response.text)
    item = UserItem()
    # (item field, response key) pairs, in assignment order.
    field_map = (
        ('id', 'id'),
        ('name', 'name'),
        ('avatar_url', 'avatar_url'),          # avatar
        ('follower_count', 'follower_count'),  # number of followers
        ('headline', 'headline'),              # signature line
        ('user_url', 'url'),                   # home page
        ('gender', 'gender'),
        ('url_token', 'url_token'),
    )
    for target, source in field_map:
        item[target] = payload[source]
    yield item

    follow_link = self.follow_url.format(
        user=payload.get('url_token'),
        include=self.follow_query,
        offset=0,
    )
    yield Request(follow_link, callback=self.parse_follow)
def parse_user(self, response):
    """Yield a ``UserItem`` and a request built from ``self.follows_url``.

    ``url_token`` is read with subscript access, so a response without it
    raises ``KeyError`` after the item has been yielded.
    """
    payload = json.loads(response.text)
    item = UserItem()
    shared = [name for name in item.fields if name in payload.keys()]
    for name in shared:
        item[name] = payload.get(name)
    yield item

    follows_link = self.follows_url.format(
        user=payload['url_token'],
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield scrapy.Request(follows_link, self.parse_follows)
def parseUser(self, response):
    """Yield a ``UserItem`` and requests for the follows/followers lists."""
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    # The callbacks below process the list responses, so the crawl keeps
    # expanding from each discovered user.
    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        offset=0,
        limit=20,
    )
    yield Request(follows_link, callback=self.parseFollows)

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        offset=0,
        limit=20,
    )
    yield Request(followers_link, callback=self.parseFollowers)
def parse(self, response):
    """Parse one page of a user list and schedule the next page.

    Yields a ``UserItem`` for every entry under the ``data`` key of the
    JSON body. When ``paging.is_end`` is false, ``self.offset`` is advanced
    by 20 and a request for the next page is yielded with this method as
    its callback.

    :param response: response whose ``body`` holds a JSON document.
    :return: generator yielding items and at most one ``Request``.
    :raises KeyError: if an expected key is missing from the payload.
    """
    # Decode once and reuse. json.loads no longer accepts the ``encoding``
    # keyword (removed in Python 3.9) and detects the encoding of a bytes
    # body itself.
    payload = json.loads(response.body)
    items = payload['data']
    print(len(items))
    for item in items:
        user = UserItem()
        user['user_type'] = item['user_type']
        user['answer_count'] = item['answer_count']
        user['url_token'] = item['url_token']
        user['uid'] = item['id']
        user['articles_count'] = item['articles_count']
        user['name'] = item['name']
        user['headline'] = item['headline']
        user['gender'] = item['gender']
        user['follower_count'] = item['follower_count']
        yield user

    if not payload['paging']['is_end']:
        self.offset += 20
        next_url = self.BASE_URL.format(limit=self.LIMIT, offset=self.offset)
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse_user(self, response):
    """Build a flattened ``UserItem`` from a user-detail response.

    Nothing is yielded unless ``response.status`` is 200. Nested values
    (location, business, company, job, school) are read from the first
    entry of their list; when the structure is missing or malformed the
    field falls back to an empty string.

    :param response: response whose ``text`` attribute holds a JSON object.
    :return: generator yielding at most one ``UserItem``.
    """
    if response.status != 200:
        return

    # Failures expected while walking optional nested structures: a missing
    # key yields None (TypeError/AttributeError), an empty list raises
    # IndexError, an unexpected container type raises KeyError. Anything
    # else is a real error and must propagate.
    lookup_errors = (AttributeError, IndexError, KeyError, TypeError)

    result = json.loads(response.text)
    item = UserItem()
    item['url_token'] = result.get('url_token')
    item['name'] = result.get('name')

    # location
    try:
        item['location'] = result.get('locations')[0].get('name')
    except lookup_errors:
        item['location'] = ''

    item['gender'] = result.get('gender')

    # Rewrite the avatar URL by swapping 'is' for 'xl'; a missing avatar
    # is stored as-is instead of aborting the whole item.
    avatar_url = result.get('avatar_url')
    item['avatar_url'] = (
        avatar_url.replace('is', 'xl') if avatar_url else avatar_url
    )

    # business
    try:
        item['business'] = result.get('business').get('name')
    except lookup_errors:
        item['business'] = ''

    # company and job: if either lookup fails, both are blanked.
    try:
        item['company'] = result.get('employments')[0].get('company').get('name')
        item['job'] = result.get('employments')[0].get('job').get('name')
    except lookup_errors:
        item['company'] = ''
        item['job'] = ''

    item['headline'] = result.get('headline', '')

    # school
    try:
        item['school'] = result.get('educations')[0].get('school').get('name')
    except lookup_errors:
        item['school'] = ''

    item['voteup_count'] = result.get('voteup_count')
    item['follower_count'] = result.get('follower_count')
    yield item
def parse_user(self, response):
    """Parse a user-detail response and request both of the user's lists.

    Yields a ``UserItem``, then one request built from ``self.follows_url``
    and one built from ``self.followers_url``.
    """
    # Decode the JSON body into a Python object.
    payload = json.loads(response.text)
    item = UserItem()
    for name in item.fields:
        if name not in payload.keys():
            continue
        item[name] = payload.get(name)
    yield item

    # parse_follows handles the response to this list request.
    follows_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        limit=20,
        offset=0,
    )
    yield Request(follows_link, self.parse_follows)

    followers_link = self.followers_url.format(
        user=payload.get('url_token'),
        include=self.followers_query,
        limit=20,
        offset=0,
    )
    yield Request(followers_link, self.parse_followers)