def resp2item_v2(resp, base_weibo=None, base_user=None):
    """Flatten a nested API response dict into a flat list of items.

    A response that contains ``'mid'`` is converted to a ``WeiboItem``;
    otherwise one that contains ``'name'`` is converted to a ``UserItem``.
    Nested ``'user'``, ``'retweeted_status'`` and ``'status'`` payloads are
    converted recursively and appended after their parent.

    :param resp: response mapping, or None.
    :param base_weibo: parent weibo item; when truthy, the newly built item is
        attached to it (as ``'retweeted_status'`` for a weibo, ``'user'`` for
        a user).
    :param base_user: user item used as the weibo's ``'user'`` when the
        response did not supply one.
    :return: list of items, parent first; empty when ``resp`` is None, is
        marked ``'deleted'``, or has neither ``'mid'`` nor ``'name'``.
    """
    if resp is None or 'deleted' in resp:
        return []
    is_weibo = 'mid' in resp
    if not is_weibo and 'name' not in resp:
        return []

    if not is_weibo:
        # User payload: every declared key is required to be present.
        user_item = UserItem()
        for field in UserItem.RESP_ITER_KEYS:
            user_item[field] = resp[field]
        if base_weibo:
            base_weibo['user'] = user_item
        collected = [user_item]
        collected.extend(resp2item_v2(resp.get('status'), base_user=user_item))
        return collected

    # Weibo payload: copy only the declared keys that are actually present.
    weibo_item = WeiboItem()
    for field in WeiboItem.RESP_ITER_KEYS:
        if field in resp:
            weibo_item[field] = resp[field]
    if 'user' not in weibo_item:
        weibo_item['user'] = base_user
    weibo_item['timestamp'] = local2unix(weibo_item['created_at'])
    if base_weibo:
        base_weibo['retweeted_status'] = weibo_item

    collected = [weibo_item]
    for nested_key in ('user', 'retweeted_status'):
        collected.extend(resp2item_v2(resp.get(nested_key), base_weibo=weibo_item))
    return collected
def handle_user(self, res):
    """Build a ``UserItem`` from an intercepted user-API response and store it.

    The profile is read from ``res['user']`` and inserted into ``'User_table'``
    through ``self.db.insert``. Nothing is built or stored unless
    ``res['status_code']`` equals 0.

    :param res: decoded response mapping.
    :return: None
    """
    print('拦截user接口')
    if res.get('status_code') != 0:
        return

    # ``or {}`` covers an explicit null as well as a missing key.
    data = res.get('user') or {}
    user = UserItem()

    # (item field, payload key) pairs that are copied without transformation.
    copied_fields = (
        ('user_id', 'uid'),
        ('nickname', 'nickname'),
        ('unique_id', 'unique_id'),
        ('gender', 'gender'),
        ('birthday', 'birthday'),
        ('signature', 'signature'),
        ('school_name', 'school_name'),
        ('aweme_count', 'aweme_count'),
        ('total_favorited', 'total_favorited'),
        # Previously read 'follower_count', duplicating the fans figure below.
        ('following_count', 'following_count'),
        ('aweme_fans', 'follower_count'),
        ('mplatform_followers_count', 'mplatform_followers_count'),
        ('country', 'country'),
        ('province', 'province'),
        ('city', 'city'),
        ('district', 'district'),
        ('custom_verify', 'custom_verify'),
        ('with_fusion_shop_entry', 'with_fusion_shop_entry'),
        ('with_commerce_entry', 'with_commerce_entry'),
    )
    for field, key in copied_fields:
        user[field] = data.get(key)
    user['location'] = data.get('location', '')

    # Per-app fan counts: only these two apps are recorded.
    fans_field_by_app = {
        'news_article': 'news_article_fans',
        'live_stream': 'live_stream_fans',
    }
    for detail in data.get('followers_detail') or []:
        fans_field = fans_field_by_app.get(detail.get('app_name', ''))
        if fans_field:
            user[fans_field] = detail.get('fans_count')

    # Fall back to '' when the avatar block or its URL list is missing/empty.
    avatar_urls = (data.get('avatar_medium') or {}).get('url_list') or ['']
    user['avatar'] = avatar_urls[0]
    user['share_url'] = (data.get('share_info') or {}).get('share_url')

    self.db.insert([user], 'User_table')
def parse(self, response):
    """Parse a user's info page into a ``UserItem`` and schedule a follow-up.

    The text nodes under ``body/div[@class="c"]`` are joined with ``;`` and
    each profile field is pulled out with a label-based regular expression.
    Fields whose label is absent (or whose value is empty) are left unset.

    :param response: response whose URL contains ``<digits>/info``.
    :return: generator yielding one ``Request`` to ``base_url + '/u/<id>'``
        that carries the partially filled item in ``meta['item']``.
    :raises IndexError: if the URL has no ``<digits>/info`` segment.
    """
    def strip_nbsp(text):
        return text.replace(u"\xa0", "")

    def first_value(label, terminator=';'):
        """Return the first value captured after *label*, or '' if absent."""
        found = re.findall(label + ';?:?(.*?)' + terminator, user_info_text)
        return found[0] if found else ''

    user_item = UserItem()
    user_item['crawl_time'] = int(time.time())
    selector = Selector(response)
    user_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
    user_info_text = ";".join(
        selector.xpath('body/div[@class="c"]//text()').extract())

    # Fields stored as-is apart from removing non-breaking spaces.
    simple_fields = (
        ('nick_name', '昵称'),
        ('gender', '性别'),
        ('brief_introduction', '简介'),
        ('sentiment', '感情状况'),
        ('vip_level', '会员等级'),
        ('authentication', '认证'),
    )
    for field, label in simple_fields:
        raw_value = first_value(label)
        if raw_value:
            user_item[field] = strip_nbsp(raw_value)

    # Region is "<province> <city>"; the city part is optional.
    place = first_value('地区')
    if place:
        place_parts = strip_nbsp(place).split(" ")
        user_item["province"] = place_parts[0]
        if len(place_parts) > 1:
            user_item["city"] = place_parts[1]

    birthday = first_value('生日')
    if birthday:
        user_item['birthday'] = birthday

    # Orientation is derived by comparing against the user's own gender, so
    # it can only be decided when a gender was found. Both sides are cleaned
    # the same way before comparing.
    sex_orientation = first_value('性取向')
    gender = strip_nbsp(first_value('性别'))
    if sex_orientation and gender:
        if strip_nbsp(sex_orientation) == gender:
            user_item["sex_orientation"] = "同性恋"
        else:
            user_item["sex_orientation"] = "异性恋"

    # Labels are separated by non-breaking spaces and end at the "more" link.
    labels = first_value('标签', terminator='更多>>')
    if labels:
        user_item["labels"] = labels.replace(u"\xa0", ",").replace(';', '').strip(',')

    request_meta = response.meta
    request_meta['item'] = user_item
    yield Request(self.base_url + '/u/{}'.format(user_item['_id']),
                  callback=self.parse_further_information,
                  meta=request_meta, dont_filter=True, priority=1)
def db_add_user(self, password, username, email, recent_reads_title=None,
                recent_reads_url=None, tags=None, stocks=None):
    """Create a ``UserItem`` with a hashed password and insert it.

    The password is hashed with ``django_pbkdf2_sha256.hash`` before being
    stored. Each optional list argument defaults to a fresh empty list, so
    items created by separate calls never share the same list object.

    :param password: plain-text password to hash.
    :param username: value stored under ``'username'``.
    :param email: value stored under ``'email'``.
    :param recent_reads_title: optional list stored under ``'recent_reads_title'``.
    :param recent_reads_url: optional list stored under ``'recent_reads_url'``.
    :param tags: optional list stored under ``'tags'``.
    :param stocks: optional list stored under ``'stocks'``.
    :return: whatever ``self.db_insert_user`` returns.
    """
    user = UserItem()
    user['password'] = django_pbkdf2_sha256.hash(password)
    user['username'] = username
    user['email'] = email
    user['recent_reads_title'] = [] if recent_reads_title is None else recent_reads_title
    user['recent_reads_url'] = [] if recent_reads_url is None else recent_reads_url
    user['tags'] = [] if tags is None else tags
    user['stocks'] = [] if stocks is None else stocks
    return self.db_insert_user(user)
def parse_follows(self, response):
    """Parse one page of a user's follow list.

    Reads the ``card_group`` of the last card in ``data.cards``. For every
    entry that carries a ``user`` it requests that user's profile, then
    yields a ``UserItem`` holding the follow relation for ``meta['uid']``,
    and finally requests the next page. Yields nothing when the payload has
    no cards or the last card has no ``card_group``.

    :param response: Response object whose body is JSON and whose ``meta``
        carries ``uid`` and ``page``.
    :return: generator of ``Request`` and ``UserItem`` objects.
    """
    result = json.loads(response.text)
    cards = (result.get('data') or {}).get('cards') or []
    card_group = cards[-1].get('card_group') if cards else None
    if not card_group:
        return

    # Entries without a 'user' are skipped for both the profile requests and
    # the relation list.
    followed_users = [entry.get('user') for entry in card_group if entry.get('user')]
    for followed in followed_users:
        yield Request(self.user_url.format(uid=followed.get('id')),
                      callback=self.parse_user)

    uid = response.meta.get('uid')
    user_relation_item = UserItem()
    user_relation_item['id'] = uid
    user_relation_item['follows'] = [
        {'id': followed.get('id'), 'name': followed.get('screen_name')}
        for followed in followed_users
    ]
    user_relation_item['fans'] = []
    yield user_relation_item

    # Paging state travels in the request meta (an attribute, not a mapping
    # key, of the response).
    page = response.meta.get('page') + 1
    yield Request(self.follow_url.format(uid=uid, page=page),
                  callback=self.parse_follows,
                  meta={'page': page, 'uid': uid})
def parse_user(self, response):
    """Parse a user-info response and schedule the user's list crawls.

    When ``data.userInfo`` is present, yields a ``UserItem`` filled from it,
    followed by page-1 requests for the user's follows, fans and weibos.
    Yields nothing when ``userInfo`` is missing or empty.

    :param response: Response object whose body is JSON.
    :return: generator of one ``UserItem`` and three ``Request`` objects.
    """
    result = json.loads(response.text)
    user_info = (result.get('data') or {}).get('userInfo')
    if not user_info:
        return

    # item field -> key looked up in userInfo
    field_map = {
        'id': 'id',
        'name': 'screen_name',
        'avatar': 'profile_image_url',  # avatar image
        # NOTE(review): this value is an image URL rather than a userInfo key
        # name, so the lookup will not match a normal payload key and 'cover'
        # ends up None. Presumably a key name was intended; confirm against
        # the API payload.
        'cover': 'https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg',
        'gender': 'gender',
        'description': 'description',
        'fans_count': 'followers_count',
        'follows_count': 'follow_count',
        'weibos_count': 'statuses_count',
        'verified': 'verified',
        'verified_reason': 'verified_reason',
        'verified_type': 'verified_type',
    }
    user_item = UserItem()
    # .items() is required: iterating the dict itself yields only the keys.
    for field, attr in field_map.items():
        user_item[field] = user_info.get(attr)
    yield user_item

    uid = user_info.get('id')
    # follows
    yield Request(self.follow_url.format(uid=uid, page=1),
                  callback=self.parse_follows,
                  meta={'page': 1, 'uid': uid})
    # fans
    yield Request(self.fan_url.format(uid=uid, page=1),
                  callback=self.parse_fans,
                  meta={'page': 1, 'uid': uid})
    # weibos
    yield Request(self.weibo_url.format(uid=uid, page=1),
                  callback=self.parse_weibos,
                  meta={'page': 1, 'uid': uid})
def parse_user(main_page_url, cookies=None, xsrf=None):
    """Parse user information from a user's main page.

    Fetches ``main_page_url`` with ``requests.get`` and, on a 200 response,
    fills a ``UserItem`` through ``_parse_profile``, ``_parse_home`` and
    ``_parse_follow``.

    :type main_page_url: str
    :param main_page_url: URL of the user's main page; also used to build the
        ``Referer`` header (``main_page_url + "/answers"``).
    :param cookies: cookies sent with the request; required.
    :param xsrf: required to be non-None, but not otherwise used here.
    :return: None when ``cookies`` or ``xsrf`` is None; otherwise the
        ``UserItem``, which is left unpopulated if the response status is
        not 200.
    """
    if cookies is None or xsrf is None:
        return None

    # Work on a copy so the per-call Referer does not leak into the shared
    # module-level header dict.
    headers_ = dict(headers_base)
    headers_['Referer'] = main_page_url + "/answers"

    item = UserItem()
    response = requests.get(main_page_url, headers=headers_, cookies=cookies)
    if response.status_code == 200:
        doc = content_to_html(response.content)
        _parse_profile(html_doc=doc, item=item)
        _parse_home(html_doc=doc, main_page_url=main_page_url, item=item)
        _parse_follow(html_doc=doc, item=item)
    return item