Exemple #1
0
def update_follower2leveldb():
    # 从leveldb更新leveldb的用户粉丝数数据
    # test 0.15 seconds per 10000 users, total 22670000 users, 0.09 h
    users = xapian_search_user.iter_all_docs(fields=['user', 'followers_count'])
    
    count = 0
    ts = te = time.time()
    for k, v in user_followers_count_leveldb.RangeIter():
        uid = int(k)
        follower = int(v)
        
        try:
            active, important, _follower, domain = daily_identify_aifd_bucket.Get(str(uid)).split('_')
        except KeyError:
            active = 0
            important = 0
            domain = 20

        daily_identify_aifd_bucket.Put(str(uid), str(active) + '_' + str(important) + '_' + \
                                       str(follower) + '_' + str(domain))

        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts), ' identify person follower', now_datestr
            ts = te
        count += 1
def iter_userbasic2leveldb():
    users = xapian_search_user.iter_all_docs(fields=xapian_user_fields)

    count = 0
    batch = leveldb.WriteBatch()
    ts = te = time.time()
    for user in users:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_person_basic_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            print count, '%s sec' % (te - ts), 'xapian2leveldb person basic'
            ts = te
        # extraction and transfer
        try:
            userId = int(user['_id'])
        except:
            count += 1
            continue
        province = user['province']
        city = user['city']
        verified = user['verified']
        name = _utf_encode(user['name'])
        friendsCount = user['friends_count']
        gender = user['gender']
        profileImageUrl = user['profile_image_url']
        verifiedType = user['verified_type']
        followersCount = user['followers_count']
        location = _utf_encode(user['location'])
        statusesCount = user['statuses_count']
        description = _utf_encode(user['description'])
        domain = userLeveldb2DomainZh(userId)
        
        try:
            created_at = int(user['created_at'])
        except:
            count += 1
            continue

        date = batch_date_1
        #Load
        key = str(userId)
        value = '_\/'.join([str(province), str(city), str(verified), \
                            str(name), str(friendsCount), str(gender), \
                            str(profileImageUrl), str(verifiedType), \
                            str(followersCount), str(location), \
                            str(statusesCount), str(description), \
                            str(created_at), str(domain)])
        batch.Put(key, value)

        count += 1
def user_name_uid_xapian2redis():
    ''' test 2 secondes per 10000
    '''
    count = 0
    ts = te = time.time()
    users = xapian_search_user.iter_all_docs(fields=['name', '_id'])
    for user in users:
        name = user['name']
        uid = user['_id']
        global_r0.hset(USER_NAME_UID, name, int(uid))

        count += 1
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts)
            ts = te
def follower_leveldb():
    # test 10 thousand per second
    get_results = xapian_search_user.iter_all_docs(fields=['_id', 'followers_count'])

    count = 0
    ts = te = time.time()
    for result in get_results:
        field_daily_active_count_bucket.Put(str(result['_id']), str(result['followers_count']))
        
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts), 'identify followers_count to leveldb '
            ts = te

        count = count + 1

    return 'Done'
def batch_handle_domain_basic():
    count = 0
    ts = te = time.time()
    users = xapian_search_user.iter_all_docs(fields=['_id', 'verified', 'location']) 
    for user in users:
        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts), ' %s daily domain basic' % batch_date_1
            ts = te

        domainid = userLeveldb2Domain(user['_id'])
        verified = user['verified']
        province_str = user['location'].split(' ')[0]
        
        try:
            verified_count, unverified_count, province_dict = daily_profile_domain_basic_db.Get(str(domainid)).split('_\/')
            verified_count = int(verified_count)
            unverified_count = int(unverified_count)
            province_dict = json.loads(province_dict)
        except KeyError:
            verified_count = unverified_count = 0
            province_dict = {}

        if verified:
            verified_count += 1
        else:
            unverified_count += 1

        try:
            province_dict[province_str] += 1
        except KeyError:
            province_dict[province_str] = 1

        key = str(domainid)
        value = '_\/'.join([str(verified_count), str(unverified_count), json.dumps(province_dict)])
        daily_profile_domain_basic_db.Put(key, value)

        count += 1
Exemple #6
0
def calFieldByFriends():
    protousers = readProtoUser()

    iter_count = 0
    ts = te = time.time()
    users = xapian_search_user.iter_all_docs(fields=['friends'])
    for user in users:
        area_dict = {}
        friends = user['friends']

        for fri in friends:
            try:
                area = protousers[fri]
                area_dict[area] += 1
            except KeyError:
                pass

        if area_dict != {}:
            area_counts = sorted(area_dict.iteritems(), key=itemgetter(1), reverse=True)
            if len(area_counts) == 1:
                areas = area_counts[0][0]
            else:
                areas = area_counts[0][0] + ',' + area_counts[1][0]
            
            try:             
                e_areas = global_user_field_bucket.Get(str(uid) + '_' + update_datestr)
                areas = ','.join(e_areas.split(',') + areas.split(','))
            except KeyError:
                pass

            global_user_field_bucket.Put(str(uid) + '_' + update_datestr, areas)                

        if iter_count % 10000 == 0:
            te = time.time()
            print iter_count, '%s sec' % (te - ts)
            ts = te

        iter_count += 1
# -*- coding: utf-8 -*-

import os
import time
import leveldb
from config import xapian_search_user, LEVELDBPATH

user_name_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'linhao_user_name'),
                                                block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
users = xapian_search_user.iter_all_docs(fields=['user', 'name'])

count = 0
ts = te = time.time()
for user in users:
    uid = user['user']
    name = user['name']
    user_name_bucket.Put(str(name.encode('utf-8')), str(uid))
    
    if count % 10000 == 0:
        te = time.time()
        print count, '%s sec' % (te - ts), ' user name to leveldb'
        ts = te
    count += 1