コード例 #1
0
ファイル: whole.py プロジェクト: huxiaoqian/project
def followers_rank(top_n, date, window_size):
    """Return user ids ordered by descending follower count.

    Queries ``user_search`` for users whose ``followers_count`` is greater
    than ``FOLLOWERS_MIN_SUPPORT``, sorted by ``-followers_count`` and capped
    with ``max_offset=top_n``. The number of hits reported by the search is
    printed before the results are read.

    ``date`` and ``window_size`` are accepted but not used by this function.
    """
    # ``user_search`` is expected to exist at module level, e.g.:
    # XapianSearch(path='/opt/xapian_weibo/data/',
    #              name='master_timeline_user', schema_version=1)
    hit_total, fetch_hits = user_search.search(
        query={'followers_count': {'$gt': FOLLOWERS_MIN_SUPPORT}},
        sort_by=['-followers_count'],
        fields=['_id'],
        max_offset=top_n)
    print(hit_total)
    return [hit['_id'] for hit in fetch_hits()]
コード例 #2
0
def make(date):
    """Compute per-user importance for one day and store it in LevelDB.

    The day is the 24 hours ending at ``datetime2ts(date)``. Every status in
    that window with more than 500 reposts contributes
    ``0.9 * reposts_count + 0.1 * followers_count`` to its author's running
    total, where ``followers_count`` is 0 unless the user search returns
    exactly one record for the author. After each status the author's
    current total is queued as ``str(uid) -> str(total)``; the whole batch is
    written synchronously at the end.

    Progress and each ``uid``/total pair are printed to stdout.
    """
    end_ts = datetime2ts(date)
    start_ts = end_ts - 24 * 60 * 60

    # NOTE(review): 'impotant' looks like a typo, but it is passed to
    # get_leveldb() and presumably ends up in the database name, so it is
    # left untouched to keep pointing at the same data.
    db_name = get_leveldb('impotant', end_ts)

    # 8 * (2 << 25) bytes == 512 MiB for both the block cache and the write
    # buffer.
    daily_user_important_bucket = leveldb.LevelDB(
        os.path.join(LEVELDBPATH, db_name),
        block_cache_size=8 * (2 << 25),
        write_buffer_size=8 * (2 << 25))

    batch = leveldb.WriteBatch()

    query_dict = {
        'timestamp': {'$gt': start_ts, '$lt': end_ts},
        'reposts_count': {'$gt': 500},
    }

    # The keyword is ``fields`` (as in the other search calls in this
    # project); the previous ``field`` spelling did not select the fields.
    statuses_count, get_statuses_results = statuses_search.search(
        query=query_dict, fields=['user', '_id', 'reposts_count'])

    print('total statuses: %s' % statuses_count)
    print('writing to levelDB %s...' % db_name)

    uid_important = {}
    for count, status in enumerate(get_statuses_results()):
        if count % 10000 == 0:
            print('current count: %s' % count)

        uid = status['user']
        reposts_count = status['reposts_count']

        # Only trust the follower count when the lookup is unambiguous.
        followers_count = 0
        user_count, get_user_results = user_search.search(query={'_id': uid})
        if user_count == 1:
            for user in get_user_results():
                followers_count = user['followers_count']

        # Accumulate across all of this author's qualifying statuses.
        important = (0.9 * reposts_count + 0.1 * followers_count
                     + uid_important.get(uid, 0))
        uid_important[uid] = important
        print('%s %s' % (uid, important))

        # Later Puts for the same uid overwrite earlier ones in the batch, so
        # the stored value ends up being the final running total.
        batch.Put(str(uid), str(important))

    daily_user_important_bucket.Write(batch, sync=True)

    print('done.')
コード例 #3
0
def get_superior_userid(weibo):
    """Return the id of the user this weibo was directly reposted from.

    ``weibo`` must provide ``'text'`` and ``'retweeted_uid'``.

    * If the text contains no ``//@name:`` repost marker, the direct superior
      is the author of the original post, so ``weibo['retweeted_uid']`` is
      returned (or ``None`` when it is 0 / empty).
    * Otherwise the first ``//@name:`` marker names the direct superior; the
      name is looked up through ``user_search`` and the matching ``_id`` is
      returned. When several records match, the last one iterated wins.

    Returns ``None`` when no superior can be determined.
    """
    text = weibo['text']
    # ``bytes`` is the same type as ``str`` on Python 2, so this still decodes
    # byte strings there while never calling ``.decode`` on unicode text.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):',
                    re.UNICODE)
    repost_chains = RE.findall(text)

    # Read unconditionally so a missing key fails the same way on every path.
    retweeted_uid = weibo['retweeted_uid']

    # When the direct superior is the source of the repost chain, the text
    # contains nothing that matches the pattern.
    if not repost_chains:
        return retweeted_uid or None

    direct_superior_name = repost_chains[0]
    count, results = user_search.search(
        query={'name': direct_superior_name}, fields=['_id', 'name'])

    # Initialised up front so an empty result iterator cannot leave the
    # variable unbound even if ``count`` is non-zero.
    direct_superior_id = None
    if count != 0:
        for result in results():
            direct_superior_id = result['_id']

    # No fallback lookup is attempted for unknown names; falsy ids become None.
    return direct_superior_id or None
コード例 #4
0
def get_superior_userid(weibo):
    """Return the id of the user this weibo was directly reposted from.

    ``weibo`` must provide ``'text'`` and ``'retweeted_uid'``.

    * If the text contains no ``//@name:`` repost marker, the direct superior
      is the author of the original post, so ``weibo['retweeted_uid']`` is
      returned (or ``None`` when it is 0 / empty).
    * Otherwise the first ``//@name:`` marker names the direct superior; the
      name is looked up through ``user_search`` and the matching ``_id`` is
      returned. When several records match, the last one iterated wins.

    Returns ``None`` when no superior can be determined.
    """
    text = weibo['text']
    # ``bytes`` is the same type as ``str`` on Python 2, so this still decodes
    # byte strings there while never calling ``.decode`` on unicode text.
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
    repost_chains = RE.findall(text)

    # Read unconditionally so a missing key fails the same way on every path.
    retweeted_uid = weibo['retweeted_uid']

    # When the direct superior is the source of the repost chain, the text
    # contains nothing that matches the pattern.
    if not repost_chains:
        return retweeted_uid or None

    direct_superior_name = repost_chains[0]
    count, results = user_search.search(query={'name': direct_superior_name}, fields=['_id', 'name'])

    # Initialised up front so an empty result iterator cannot leave the
    # variable unbound even if ``count`` is non-zero.
    direct_superior_id = None
    if count != 0:
        for result in results():
            direct_superior_id = result['_id']

    # No fallback lookup is attempted for unknown names; falsy ids become None.
    return direct_superior_id or None
コード例 #5
0
ファイル: autocalculate.py プロジェクト: huxiaoqian/project
def get_user(uid):
    """Look up ``uid`` through ``user_search`` and return a profile dict.

    Only the first search result is used. The numeric counters
    (``friends_count``, ``followers_count``, ``statuses_count``) fall back to
    0 when the stored value is falsy, and ``name`` falls back to a fixed
    placeholder. ``bi_followers_count`` and ``verified_reason`` are always
    the string ``'None'``.

    Returns ``None`` when the search yields no record.
    """
    _hit_total, fetch_hits = user_search.search(query={'_id': uid})
    for record in fetch_hits():
        # The first record is the answer; the rest are ignored.
        return {
            'id': record['_id'],
            'province': record['province'],
            'bi_followers_count': 'None',
            'verified': record['verified'],
            'description': record['description'],
            'friends_count': record['friends_count'] or 0,
            'city': record['city'],
            'gender': record['gender'],
            'profile_image_url': record['profile_image_url'],
            'verified_reason': 'None',
            'followers_count': record['followers_count'] or 0,
            'location': record['location'],
            'statuses_count': record['statuses_count'] or 0,
            'name': record['name'] or u'未知用户',
        }
    return None