Ejemplo n.º 1
0
def stat_user(search_time=None, force_update=False):
    """
    统计用户信息
    """
    if not search_time:
        search_time = datetime.datetime.now()
        search_time = "%s-%s-%s" % (now_time.year, now_time.month,
                                    now_time.day)
    content = Content.get(search_time)
    if not content:
        return
    #遍历所有用户
    for types, ids in content.weibo.iteritems():
        for id, context in ids.iteritems():
            user = Weibo.get(id)
            #如果还没有今天的统计就统计一下
            if not user.stat_info.get(search_time) or force_update:
                create_at = []
                for tmp_content in context:
                    if not tmp_content.get('created_at'): continue
                    print types, tmp_content['created_at']
                    create_at.append(
                        tmp_content['created_at'].split(' ')[3].split(':')[0])
                user.stat_info[search_time] = {}
                user.stat_info[search_time] = {
                    'send_count': len(context),
                    'create_at': create_at
                }
                #print ,create_atuser.stat_info[search_time]
                user.put()
Ejemplo n.º 2
0
def stat_user(search_time=None,force_update=False):
    """
    统计用户信息
    """
    if not search_time:
        search_time = datetime.datetime.now()
        search_time = "%s-%s-%s"%(now_time.year,now_time.month,now_time.day)
    content = Content.get(search_time)
    if not content:
        return
    #遍历所有用户
    for types,ids in content.weibo.iteritems():
        for id,context in ids.iteritems():
            user = Weibo.get(id)
            #如果还没有今天的统计就统计一下
            if not user.stat_info.get(search_time) or force_update:
                create_at = []
                for tmp_content in context:
                    if not tmp_content.get('created_at'):continue
                    print types,tmp_content['created_at']
                    create_at.append(tmp_content['created_at'].split(' ')[3].split(':')[0])
                user.stat_info[search_time] = {}
                user.stat_info[search_time] = {'send_count':len(context),'create_at':create_at}
                #print ,create_atuser.stat_info[search_time]
                user.put()
Ejemplo n.º 3
0
#-*- coding: utf-8 -*-
Ejemplo n.º 4
0
#-*- coding: utf-8 -*-
Ejemplo n.º 5
0
def get_content(weibo_type,
                user_id,
                debug=False,
                count=200,
                force_update=False,
                content_type=0):
    """
    抓取微博内容
    weibo_type:微博类型  注意需要是已经存在的类别
    user_id:微博的id 注意这里不是微博名字 是微博id 
    debug:调试模式 不插入数据库
    force_update:强制更新  删除所有 重新获取
    content_type:0全部,1原创,2图片,3视频,4音乐
    """
    content_dict = {}
    #ty:时尚 美图 旅游 搞笑.....
    #用户id
    result = client.statuses.user_timeline.get(uid=user_id,
                                               count=count,
                                               feature=content_type)
    contents = dict(result)
    #遍历所有发的帖子 前100条
    for s_item in contents['statuses']:

        #可能是转帖 所以需要再取一次
        if not s_item.get('original_pic'):
            if s_item.get('retweeted_status', {}).get('original_pic'):
                s_item['original_pic'] = s_item['retweeted_status'][
                    'original_pic']
            else:
                #如果没有图片 就pass掉
                continue

        #filter列表包含这些内容不保存 可能是广告数据
        if "http://" in s_item['text'] or "包邮" in s_item['text']\
        or "去评论中找链接哦" in s_item['text']\
         or "www." in s_item['text'] or re.findall('[0-9]元',s_item['text'])\
         or s_item['text'].count(" ★") >= 3 or s_item['text'].count("(") >= 3\
         or s_item['text'].count(":") > 5 or s_item['text'].count("【") > 2\
         or s_item['text'].count("、") > 5 or '@' in s_item['text']\
         or '#' in s_item['text']:
            continue

        #gif图片单独存放
        if '.gif' in s_item.get('original_pic', ''):
            response = urllib.urlopen(url=s_item['original_pic'])
            response_dict = dict(response.headers)
            file_size = response_dict.get('content-length')
            if file_size:
                #计算他是多少M的大小
                file_size = float(file_size) / 1000.0 / 1000.0
                file_size = decimal.Decimal(file_size).quantize(
                    decimal.Decimal('0.0'))
                s_item['file_size'] = file_size

        #如果是检查视频微博 判断视频长度
        if content_type in [3, '3']:
            if 'http://' in s_item['text']:
                video_url = s_item['text']
            elif 'http://' in s_item['retweeted_status']['text']:
                video_url = s_item['retweeted_status']['text']
            video_index = b.index('http')
            #视频地址
            #视频片段有多少个
            s_item['video_url'] = video_url[video_index:].split(' ')[0]
            video_count = utils.get_video_count(s_item['video_url'])
            s_item['video_count'] = video_count
            print s_item['video_url'], video_count

        #判断字数小于5个字过滤
        if len(s_item['text'].decode('utf-8')) <= 5:
            continue


#        #计算图片的大小
#        if s_item.get('original_pic'):
#            response = urllib.urlopen(url=s_item['original_pic'])
#            img_data = response.read()
#            io = cStringIO.StringIO(img_data)
#            s_item['width'],s_item['height'] = Image.open(io).size

#格式化时间  按照时间分开存放内容
        created_at = s_item['created_at'].split(' ')
        time_str = created_at[len(created_at) - 1] + "-" + str(
            time_dict[created_at[1]]) + '-' + created_at[2]
        if time_str not in content_dict:
            content_dict[time_str] = {}

        #[时间][搞笑][周杰伦的微博的id]  注意是id哦~
        if user_id not in content_dict[time_str]:
            content_dict[time_str][user_id] = []
        need_data = {
            'id': s_item['id'],
            'screen_name': weibo_user.userids[int(user_id)],
            'type': weibo_type,
            'text': s_item['text'],
            'bmiddle_pic': s_item.get('bmiddle_pic'),
            'original_pic': s_item.get('original_pic'),
            'thumbnail_pic': s_item.get('thumbnail_pic'),
            'reposts_count': s_item.get('reposts_count'),
            'comments_count': s_item.get('comments_count'),
            'attitudes_count': s_item.get('attitudes_count'),
            'mlevel': s_item.get('mlevel'),
            'width': s_item.get('width'),
            'height': s_item.get('height'),
            'text_size': len(s_item['text'].decode('utf-8')),
            'created_at': s_item['created_at'],
            'file_size': s_item.get('file_size'),
            'video_url': s_item.get('video_url'),
            'avatar_large': s_item.get('user', {}).get('avatar_large'),
            'profile_image_url': s_item.get('user',
                                            {}).get('profile_image_url'),
        }
        #[时间][用户id] = [微博,微博,微博]
        content_dict[time_str][user_id].append(need_data)

    #按照时间分开存储 k:时间 :{用户id:[]}
    for k, v in content_dict.iteritems():
        cont_obj = Content.get(k)
        if not cont_obj:
            cont_obj = Content._install(k)
        #新添加类别
        if weibo_type not in cont_obj.weibo:
            cont_obj.weibo[weibo_type] = v
        else:
            #有可能内容已经存在 u_id:用户id item_value:帖子集合[]
            for u_id, item_value in v.iteritems():
                #如果没用该用户的信息 创建
                if u_id not in cont_obj.weibo[weibo_type] or force_update:
                    cont_obj.weibo[weibo_type][u_id] = []
                    cont_obj.weibo[weibo_type][u_id] = item_value
                else:

                    #如果有该用户信息 需要判断是否有重复内容
                    now_ids = [
                        va['id'] for va in cont_obj.weibo[weibo_type][u_id]
                    ]
                    for cont in item_value:
                        if cont['id'] not in now_ids:
                            cont_obj.weibo[weibo_type][u_id].append(cont)
        if not debug:
            a = time.time()
            cont_obj.put()
            print 'result', time.time() - a
Ejemplo n.º 6
0
def get_content(weibo_type,user_id,debug=False,count=200,force_update=False,content_type=0):
    """
    抓取微博内容
    weibo_type:微博类型  注意需要是已经存在的类别
    user_id:微博的id 注意这里不是微博名字 是微博id 
    debug:调试模式 不插入数据库
    force_update:强制更新  删除所有 重新获取
    content_type:0全部,1原创,2图片,3视频,4音乐
    """
    content_dict = {}
    #ty:时尚 美图 旅游 搞笑.....
    #用户id
    result = client.statuses.user_timeline.get(uid=user_id,count=count,feature=content_type)
    contents = dict(result)
    #遍历所有发的帖子 前100条
    for s_item in contents['statuses']:
        
        #可能是转帖 所以需要再取一次
        if not s_item.get('original_pic'):
            if s_item.get('retweeted_status',{}).get('original_pic'):
                s_item['original_pic'] = s_item['retweeted_status']['original_pic']
            else:
                #如果没有图片 就pass掉
                continue
            
        #filter列表包含这些内容不保存 可能是广告数据
        if "http://" in s_item['text'] or "包邮" in s_item['text']\
        or "去评论中找链接哦" in s_item['text']\
         or "www." in s_item['text'] or re.findall('[0-9]元',s_item['text'])\
         or s_item['text'].count(" ★") >= 3 or s_item['text'].count("(") >= 3\
         or s_item['text'].count(":") > 5 or s_item['text'].count("【") > 2\
         or s_item['text'].count("、") > 5 or '@' in s_item['text']\
         or '#' in s_item['text']:
            continue
        
        #gif图片单独存放
        if '.gif' in s_item.get('original_pic',''):
            response = urllib.urlopen(url=s_item['original_pic'])
            response_dict = dict(response.headers)
            file_size = response_dict.get('content-length')
            if file_size:
                #计算他是多少M的大小
                file_size = float(file_size) / 1000.0 / 1000.0
                file_size = decimal.Decimal(file_size).quantize(decimal.Decimal('0.0'))
                s_item['file_size'] = file_size
                
        #如果是检查视频微博 判断视频长度
        if content_type in [3,'3']:
            if 'http://' in s_item['text']:
                video_url = s_item['text']
            elif 'http://' in s_item['retweeted_status']['text']:
                video_url = s_item['retweeted_status']['text']
            video_index = b.index('http')
            #视频地址
            #视频片段有多少个
            s_item['video_url'] = video_url[video_index:].split(' ')[0]
            video_count = utils.get_video_count(s_item['video_url'])
            s_item['video_count'] = video_count
            print s_item['video_url'],video_count
            
        #判断字数小于5个字过滤
        if len(s_item['text'].decode('utf-8'))<= 5:
            continue
        
#        #计算图片的大小
#        if s_item.get('original_pic'):
#            response = urllib.urlopen(url=s_item['original_pic'])
#            img_data = response.read()
#            io = cStringIO.StringIO(img_data)
#            s_item['width'],s_item['height'] = Image.open(io).size
            
        #格式化时间  按照时间分开存放内容
        created_at = s_item['created_at'].split(' ')
        time_str = created_at[len(created_at)-1] + "-" + str(time_dict[created_at[1]]) + '-' + created_at[2]
        if time_str not in content_dict:
            content_dict[time_str] = {}
            
        #[时间][搞笑][周杰伦的微博的id]  注意是id哦~
        if user_id not in content_dict[time_str]:
            content_dict[time_str][user_id] = []
        need_data = {
                     'id':s_item['id'],
                     'screen_name':weibo_user.userids[int(user_id)],
                     'type':weibo_type,
                     'text':s_item['text'],
                     'bmiddle_pic':s_item.get('bmiddle_pic'),
                     'original_pic':s_item.get('original_pic'),
                     'thumbnail_pic':s_item.get('thumbnail_pic'),
                     'reposts_count':s_item.get('reposts_count'),
                     'comments_count':s_item.get('comments_count'),
                     'attitudes_count':s_item.get('attitudes_count'),
                     'mlevel':s_item.get('mlevel'),
                     'width':s_item.get('width'),
                     'height':s_item.get('height'),
                     'text_size':len(s_item['text'].decode('utf-8')),
                     'created_at':s_item['created_at'],
                     'file_size':s_item.get('file_size'),
                     'video_url':s_item.get('video_url'),
                     
                     'avatar_large':s_item.get('user',{}).get('avatar_large'),
                     'profile_image_url':s_item.get('user',{}).get('profile_image_url'),
                     
                     }
        #[时间][用户id] = [微博,微博,微博]
        content_dict[time_str][user_id].append(need_data)
        
    #按照时间分开存储 k:时间 :{用户id:[]}
    for k,v in content_dict.iteritems():
        cont_obj = Content.get(k)
        if not cont_obj:
            cont_obj = Content._install(k)
        #新添加类别 
        if weibo_type not in cont_obj.weibo:
            cont_obj.weibo[weibo_type] = v
        else:
            #有可能内容已经存在 u_id:用户id item_value:帖子集合[]
            for u_id,item_value in v.iteritems():
                #如果没用该用户的信息 创建
                if u_id not in cont_obj.weibo[weibo_type] or force_update:
                    cont_obj.weibo[weibo_type][u_id] = []
                    cont_obj.weibo[weibo_type][u_id] = item_value
                else:
                    
                #如果有该用户信息 需要判断是否有重复内容
                    now_ids = [va['id'] for va in cont_obj.weibo[weibo_type][u_id]]
                    for cont in item_value:
                        if cont['id'] not in now_ids:
                            cont_obj.weibo[weibo_type][u_id].append(cont)
        if not debug:
            a = time.time()
            cont_obj.put()
            print 'result',time.time()-a