Example #1
0
def resp2item_v2(resp, base_weibo=None, base_user=None):
    """Flatten a nested API response dict into a list of items.

    A response containing ``'mid'`` is treated as a weibo; any other
    response containing ``'name'`` is treated as a user. Nested ``'user'``,
    ``'retweeted_status'`` and ``'status'`` entries are converted
    recursively and appended after their parent.

    :param resp: response dict (or None) to convert.
    :param base_weibo: parent weibo item; when truthy, the item built here
        is attached to it (as ``'retweeted_status'`` for a weibo, as
        ``'user'`` for a user).
    :param base_user: user stored on a weibo whose response carried no
        ``'user'`` entry.
    :return: list of items, parent first; empty when ``resp`` is None,
        contains ``'deleted'``, or has neither ``'mid'`` nor ``'name'``.
    """
    if resp is None or 'deleted' in resp:
        return []

    is_weibo = 'mid' in resp
    if not is_weibo and 'name' not in resp:
        return []

    if not is_weibo:
        # User response: every key listed by the item class is copied.
        user = UserItem()
        for field in UserItem.RESP_ITER_KEYS:
            user[field] = resp[field]

        if base_weibo:
            base_weibo['user'] = user

        collected = [user]
        collected.extend(resp2item_v2(resp.get('status'), base_user=user))
        return collected

    # Weibo response: only the keys actually present are copied.
    weibo = WeiboItem()
    for field in WeiboItem.RESP_ITER_KEYS:
        if field in resp:
            weibo[field] = resp[field]
    if 'user' not in weibo:
        weibo['user'] = base_user
    weibo['timestamp'] = local2unix(weibo['created_at'])

    if base_weibo:
        base_weibo['retweeted_status'] = weibo

    collected = [weibo]
    # The author is converted before the retweeted status, so the output
    # order is: weibo, its user, then the retweeted chain.
    for nested_key in ('user', 'retweeted_status'):
        collected.extend(resp2item_v2(resp.get(nested_key), base_weibo=weibo))
    return collected
Example #2
0
 def handle_user(self, res):
     """Store the user profile carried by an intercepted user-API response.

     Nothing is stored unless ``res['status_code']`` equals 0. Otherwise
     the profile fields found under ``res['user']`` are copied into a
     ``UserItem`` and inserted into ``'User_table'`` through ``self.db``.

     :param res: decoded response payload, read with ``.get``.
     :return: None
     """
     print('拦截user接口')
     if res.get('status_code') != 0:
         return

     # ``or {}`` also covers a key that is present but null.
     data = res.get('user') or {}
     user = UserItem()
     user['user_id'] = data.get('uid')
     user['nickname'] = data.get('nickname')
     user['unique_id'] = data.get('unique_id')
     user['gender'] = data.get('gender')
     user['birthday'] = data.get('birthday')
     user['signature'] = data.get('signature')
     user['school_name'] = data.get('school_name')
     user['aweme_count'] = data.get('aweme_count')
     user['total_favorited'] = data.get('total_favorited')
     # Previously read from 'follower_count', which duplicated aweme_fans
     # and never recorded how many accounts this user follows.
     user['following_count'] = data.get('following_count')
     user['aweme_fans'] = data.get('follower_count')

     # Per-app fan counts; only these two apps are recorded.
     fan_fields = {
         'news_article': 'news_article_fans',
         'live_stream': 'live_stream_fans',
     }
     for detail in data.get('followers_detail') or []:
         target = fan_fields.get(detail.get('app_name', ''))
         if target:
             user[target] = detail.get('fans_count')

     user['mplatform_followers_count'] = data.get('mplatform_followers_count')
     user['country'] = data.get('country')
     user['province'] = data.get('province')
     user['city'] = data.get('city')
     user['location'] = data.get('location', '')
     user['district'] = data.get('district')
     user['custom_verify'] = data.get('custom_verify')
     user['with_fusion_shop_entry'] = data.get('with_fusion_shop_entry')
     user['with_commerce_entry'] = data.get('with_commerce_entry')

     # First avatar URL, or '' when the list is missing, null or empty.
     avatar_urls = (data.get('avatar_medium') or {}).get('url_list') or ['']
     user['avatar'] = avatar_urls[0]
     user['share_url'] = (data.get('share_info') or {}).get('share_url')
     self.db.insert([user], 'User_table')
Example #3
0
 def parse(self, response):
     """Parse a user "info" page into a ``UserItem`` and request more data.

     The user id is taken from the ``<digits>/info`` part of the response
     URL. The text nodes under ``body/div[@class="c"]`` are joined with
     ``;`` and each profile field is pulled out of that string by its
     label. A field is set only when its label matched with a non-empty
     value.

     :param response: the downloaded info page.
     :return: generator yielding one ``Request`` to
         ``self.base_url + '/u/<id>'``, handled by
         ``self.parse_further_information``, with the partially filled
         item under ``meta['item']``.
     :raises IndexError: if the URL contains no ``<digits>/info`` segment.
     """
     user_item = UserItem()
     user_item['crawl_time'] = int(time.time())
     selector = Selector(response)
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     user_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
     user_info_text = ";".join(
         selector.xpath('body/div[@class="c"]//text()').extract())

     def first_match(pattern):
         """Return the first capture of *pattern* in the page text, or ''."""
         found = re.findall(pattern, user_info_text)
         return found[0] if found else ''

     def strip_nbsp(text):
         """Drop the non-breaking spaces found in the page text."""
         return text.replace(u"\xa0", "")

     nick_name = first_match('昵称;?:?(.*?);')  # nickname
     gender = first_match('性别;?:?(.*?);')  # gender
     place = first_match('地区;?:?(.*?);')  # region
     brief_introduction = first_match('简介;?:?(.*?);')  # introduction
     birthday = first_match('生日;?:?(.*?);')  # birthday
     sex_orientation = first_match('性取向;?:?(.*?);')  # sexual orientation
     sentiment = first_match('感情状况;?:?(.*?);')  # relationship status
     vip_level = first_match('会员等级;?:?(.*?);')  # membership level
     authentication = first_match('认证;?:?(.*?);')  # verification
     labels = first_match('标签;?:?(.*?)更多>>')  # tags, up to "more>>"

     if nick_name:
         user_item["nick_name"] = strip_nbsp(nick_name)
     if gender:
         user_item["gender"] = strip_nbsp(gender)
     if place:
         # "<province> <city>"; the city part is optional.
         place_parts = strip_nbsp(place).split(" ")
         user_item["province"] = place_parts[0]
         if len(place_parts) > 1:
             user_item["city"] = place_parts[1]
     if brief_introduction:
         user_item["brief_introduction"] = strip_nbsp(brief_introduction)
     if birthday:
         user_item['birthday'] = birthday
     # The label is derived by comparing orientation with gender, so it is
     # only set when gender is known (previously a missing gender raised
     # IndexError). Both sides are normalised before comparing.
     if sex_orientation and gender:
         if strip_nbsp(sex_orientation) == strip_nbsp(gender):
             user_item["sex_orientation"] = "同性恋"
         else:
             user_item["sex_orientation"] = "异性恋"
     if sentiment:
         user_item["sentiment"] = strip_nbsp(sentiment)
     if vip_level:
         user_item["vip_level"] = strip_nbsp(vip_level)
     if authentication:
         user_item["authentication"] = strip_nbsp(authentication)
     if labels:
         # Non-breaking spaces become commas; the ';' separators added by
         # the join above are removed.
         user_item["labels"] = labels.replace(
             u"\xa0", ",").replace(';', '').strip(',')

     request_meta = response.meta
     request_meta['item'] = user_item
     yield Request(self.base_url + '/u/{}'.format(user_item['_id']),
                   callback=self.parse_further_information,
                   meta=request_meta,
                   dont_filter=True,
                   priority=1)
Example #4
0
 def db_add_user(self,
                 password,
                 username,
                 email,
                 recent_reads_title=None,
                 recent_reads_url=None,
                 tags=None,
                 stocks=None):
     """Build a ``UserItem`` and insert it through ``self.db_insert_user``.

     The password is stored as the result of
     ``django_pbkdf2_sha256.hash(password)``, not as given.

     :param password: password to hash and store.
     :param username: stored under ``'username'``.
     :param email: stored under ``'email'``.
     :param recent_reads_title: list stored as is; a new empty list when
         omitted.
     :param recent_reads_url: list stored as is; a new empty list when
         omitted.
     :param tags: list stored as is; a new empty list when omitted.
     :param stocks: list stored as is; a new empty list when omitted.
     :return: whatever ``self.db_insert_user`` returns.
     """
     user = UserItem()
     user['password'] = django_pbkdf2_sha256.hash(password)
     user['username'] = username
     user['email'] = email
     # A fresh list per call: a ``[]`` default is created once and would
     # be shared by every user item built without these arguments.
     user['recent_reads_title'] = (
         [] if recent_reads_title is None else recent_reads_title)
     user['recent_reads_url'] = (
         [] if recent_reads_url is None else recent_reads_url)
     user['tags'] = [] if tags is None else tags
     user['stocks'] = [] if stocks is None else stocks
     return self.db_insert_user(user)
Example #5
0
 def parse_follows(self,response):
     '''
     Parse one page of the accounts a user follows.

     Reads ``card_group`` from the last entry of ``data.cards`` in the
     JSON body. When it is missing or empty nothing is yielded, which
     also ends the pagination.

     :param response: Response object; ``response.meta`` must carry
         ``'uid'`` and ``'page'``.
     :return: generator yielding, in order: one profile Request per
         followed user, one UserItem holding the follow list, and the
         Request for the next page.
     '''
     result = json.loads(response.text)
     # Tolerate a missing/null "data" object and an empty "cards" list.
     cards = (result.get('data') or {}).get('cards') or []
     follows = cards[-1].get('card_group') if cards else None
     if not follows:
         return

     # Entries without a "user" object are skipped everywhere, not only in
     # the request loop.
     followed_users = [follow.get('user') for follow in follows
                       if follow.get('user')]
     for followed in followed_users:
         yield Request(self.user_url.format(uid=followed.get('id')),
                       callback=self.parse_user)

     uid = response.meta.get('uid')
     user_relation_item = UserItem()
     user_relation_item['id'] = uid
     user_relation_item['follows'] = [
         {'id': followed.get('id'), 'name': followed.get('screen_name')}
         for followed in followed_users
     ]
     user_relation_item['fans'] = []
     yield user_relation_item

     # The page counter lives in response.meta; Response has no .get().
     page = response.meta.get('page') + 1
     yield Request(self.follow_url.format(uid=uid,page=page),
                   callback=self.parse_follows,
                   meta={'page':page,'uid':uid})
Example #6
0
 def parse_user(self,response):
     '''
     Parse a user-profile JSON response.

     Nothing is yielded when the body has no ``data.userInfo``.

     :param response: Response object whose text is the JSON body.
     :return: generator yielding the filled UserItem, then the first-page
         Requests for the user's follows, fans and weibos.
     '''
     result = json.loads(response.text)
     # Tolerate a missing/null "data" object.
     user_info = (result.get('data') or {}).get('userInfo')
     if not user_info:
         return

     user_item = UserItem()
     # item field -> key looked up in userInfo
     field_map = {
         'id':'id',
         'name':'screen_name',
         'avatar':'profile_image_url',  # avatar
         # NOTE(review): this value is an image URL, not a userInfo key, so
         # the lookup below presumably always yields None - confirm intent.
         'cover':'https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg',
         'gender':'gender',
         'description':'description',
         'fans_count':'followers_count',
         'follows_count':'follow_count',
         'weibos_count':'statuses_count',
         'verified':'verified',
         'verified_reason':'verified_reason',
         'verified_type':'verified_type'
     }
     # .items() is required: iterating the dict itself yields only keys,
     # and unpacking a key string into two names raises ValueError.
     for field, attr in field_map.items():
         user_item[field] = user_info.get(attr)
     yield user_item

     uid = user_info.get('id')
     # follows
     yield Request(self.follow_url.format(uid=uid,page=1),callback=self.parse_follows,
                   meta={'page':1,'uid':uid})
     # fans
     yield Request(self.fan_url.format(uid=uid,page=1),callback=self.parse_fans,
                   meta={'page':1,'uid':uid})
     # weibos
     yield Request(self.weibo_url.format(uid=uid,page=1),callback=self.parse_weibos,
                   meta={'page':1,'uid':uid})
Example #7
0
def parse_user(main_page_url, cookies=None, xsrf=None):
    """
    Parse user information from a user's main page.

    :type main_page_url: str
    :param main_page_url: URL of the main page; also used to build the
        ``Referer`` header (``main_page_url + "/answers"``).
    :param cookies: cookies sent with the request; required.
    :param xsrf: required to be non-None, but not otherwise used here.
    :return: None when ``cookies`` or ``xsrf`` is missing; otherwise a
        ``UserItem``, which is left unfilled when the response status is
        not 200.
    """
    if cookies is None or xsrf is None:
        return None

    # Work on a copy: assigning Referer on headers_base itself would change
    # the shared headers for every later request that uses them.
    headers_ = dict(headers_base)
    headers_['Referer'] = main_page_url + "/answers"
    item = UserItem()
    response = requests.get(main_page_url,
                            headers=headers_,
                            cookies=cookies)
    if response.status_code == 200:
        doc = content_to_html(response.content)
        # Each helper fills its own part of the same item in place.
        _parse_profile(html_doc=doc, item=item)
        _parse_home(html_doc=doc, main_page_url=main_page_url, item=item)
        _parse_follow(html_doc=doc, item=item)
    return item