Esempio n. 1
0
def update_domain2leveldb():
    # 从leveldb更新leveldb的用户领域所属数据
    # test 0.15 seconds per 10000 users, total 22670000 users, 0.09 h
    count = 0
    ts = te = time.time()
    for k, v in domain_leveldb.RangeIter():
        uid, datestr = k.split('_')
        domainid = DOMAIN_LIST.index(v)

        try:
            active, important, follower, _domain = daily_identify_aifd_bucket.Get(str(uid)).split('_')
        except KeyError:
            active = 0
            important = 0
            follower = 0
        
        domain = domainid
        daily_identify_aifd_bucket.Put(str(uid), str(active) + '_' + str(important) + '_' + \
                                       str(follower) + '_' + str(domain))

        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts), ' identify person domain', now_datestr
            ts = te
        count += 1
Esempio n. 2
0
def user2domainFromLeveldb(uid, updatetime='20131220'):
    """Return the ``DOMAIN_LIST`` index of a user's stored domain, or -1.

    The domain string is read from ``user_domain_bucket`` under the key
    ``'<uid>_<updatetime>'`` and then located in ``DOMAIN_LIST``.

    Args:
        uid: user id; converted with ``str`` to build the key.
        updatetime: date part of the key.

    Returns:
        The index of the stored domain string in ``DOMAIN_LIST``, or -1 when
        the lookup fails (for example the key is missing or the stored
        string is not in ``DOMAIN_LIST``).
    """
    try:
        domainstr = user_domain_bucket.Get(str(uid) + '_' + str(updatetime))
        return DOMAIN_LIST.index(domainstr)
    except Exception:
        # Deliberately best-effort: any lookup error maps to -1.  Catching
        # Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        return -1
Esempio n. 3
0
def user2domain(uid, updatetime="20131220"):
    """Return the ``DOMAIN_LIST`` index of the domain stored for a user.

    The domain string is read from ``domain_leveldb`` under the key
    ``"<uid>_<updatetime>"``.

    Args:
        uid: user id; converted with ``str`` to build the key.
        updatetime: date part of the key.

    Returns:
        The index of the stored domain string in ``DOMAIN_LIST``, or 20 when
        a ``KeyError`` is raised during the lookup.
    """
    try:
        stored_domain = domain_leveldb.Get("_".join([str(uid), str(updatetime)]))
        return DOMAIN_LIST.index(stored_domain)
    except KeyError:
        # NOTE(review): 20 is presumably the index of a catch-all domain in
        # DOMAIN_LIST -- confirm against its definition.
        return 20
Esempio n. 4
0
def userLeveldb2Domain(uid, updatetime='20131220'):
    """Look up a user's domain in ``spieduser_bucket`` and return its index.

    Args:
        uid: user id; converted with ``str`` to build the key.
        updatetime: date part of the key ``'<uid>_<updatetime>'``.

    Returns:
        The position of the stored domain string in ``DOMAIN_LIST``, or -1
        when a ``KeyError`` is raised during the lookup.
    """
    not_found = -1
    try:
        domain_name = spieduser_bucket.Get('_'.join((str(uid), str(updatetime))))
        return DOMAIN_LIST.index(domain_name)
    except KeyError:
        return not_found
Esempio n. 5
0
def sentiment_field(domain, xapian_search_weibo, start_ts, over_ts, sort_field='reposts_count', save_fields=RESP_ITER_KEYS, during=Hour, w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    if domain_uids != []:
        start_ts = int(start_ts)
        over_ts = int(over_ts)

        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during

        for i in range(interval, 0, -1):
            emotions_count = {}
            emotions_kcount = {}
            emotions_weibo = {}

            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            print begin_ts, end_ts, 'domain %s starts calculate' % domain

            query_dict = {
                'timestamp': {'$gt': begin_ts, '$lt': end_ts},
                '$or': []
            }

            for uid in domain_uids:
                query_dict['$or'].append({'user': uid})

            for k, v in emotions_kv.iteritems():
                query_dict['sentiment'] = v
                scount = xapian_search_weibo.search(query=query_dict, count_only=True)
                mset = xapian_search_weibo.search(query=query_dict, sort_by=[sort_field], \
                                                  max_offset=w_limit, mset_direct=True)
                kcount = top_keywords(gen_mset_iter(xapian_search_weibo, mset, fields=['terms']), top=k_limit)
                top_ws = top_weibos(gen_mset_iter(xapian_search_weibo, mset, fields=save_fields), top=w_limit)

                emotions_count[v] = [end_ts, scount]
                emotions_kcount[v] = [end_ts, kcount]
                emotions_weibo[v] = [end_ts, top_ws]

                print k, v, ', emotions count: ', emotions_count, ', emotion keywords length: ', len(kcount), ', emotion weibos length: ', len(top_ws)

            print domain, date, ' %s %s saved emotions counts, keywords and weibos' % (begin_ts, end_ts)
            save_count_results(DOMAIN_LIST.index(domain), emotions_count, during)
            save_kcount_results(DOMAIN_LIST.index(domain), emotions_kcount, during, TOP_KEYWORDS_LIMIT)
            save_weibos_results(DOMAIN_LIST.index(domain), emotions_weibo, during, TOP_WEIBOS_LIMIT)
def _add_domain_usersFromLeveldb(updatetime='20131220'):
    """Add every user of the 'spiedusers_4' LevelDB to its domain's set.

    Opens the database under ``LEVELDBPATH``, walks all ``(key, value)``
    pairs, splits each key on ``'_'`` into a user id and a date part, and
    calls ``r.sadd(DOMAIN_USERS % <domain index>, <uid as int>)`` where the
    domain index is the position of the value in ``DOMAIN_LIST``.

    Args:
        updatetime: accepted for interface compatibility but not used; all
            records are processed whatever the date part of their key.
            NOTE(review): the original loop overwrote this parameter, so it
            never had an effect -- confirm whether filtering by date was
            intended.

    Returns:
        None.  If the database cannot be opened a message is printed and
        nothing is written.
    """
    try:
        # 8 * (2 << 25) == 536870912 (presumably bytes, i.e. 512 MiB --
        # confirm against the py-leveldb documentation).
        spieduser_bucket = leveldb.LevelDB(os.path.join(LEVELDBPATH, 'spiedusers_4'),
                                           block_cache_size=8 * (2 << 25), write_buffer_size=8 * (2 << 25))
    except Exception:
        print('spieduser_bucket leveldb not available now')
        # Without the database there is nothing to iterate.  Falling through
        # would fail with UnboundLocalError on `spieduser_bucket`.
        return

    for k, v in spieduser_bucket.RangeIter():
        # Separate local name so the `updatetime` parameter is not clobbered.
        uid, _record_date = k.split('_')
        domainid = DOMAIN_LIST.index(str(v))
        # NOTE(review): `r` is not a parameter here (unlike in
        # _add_all_user_domain); it is read from the enclosing module.
        r.sadd(DOMAIN_USERS % domainid, int(uid))
def _add_all_user_domain(r):
    '''test 10000 users per second
    '''
    count = 0
    ts = te = time.time()
    for k, v in spieduser_bucket.RangeIter():
        uid, updatetime = k.split('_')
        uid = int(uid)
        domainid = DOMAIN_LIST.index(v)
        r.hset(USER_DOMAIN, uid, domainid)

        if count % 10000 == 0:
            te = time.time()
            print count, '%s sec' % (te - ts)
            ts = te
        count += 1