def co_search(es, user_list, bulk_action, count_n, tb):
    search_list = []
    for item in user_list:
        uid = item.get('uid', '0')  # obtain uid; the field may be "uid" or "user"
        search_list.append(uid)

    search_result = es.mget(index=index_destination,
                            doc_type=index_destination_doctype,
                            body={"ids": search_list},
                            _source=False)["docs"]
    search_list = []

    for item in search_result:
        if not item['found']:
            user_info = {}
            user_info['uid'] = item['_id']
            user_info['low_number'] = 0
            xdata = expand_index_action(user_info)
            bulk_action.extend([xdata[0], xdata[1]])
            count_n += 1
            if count_n % 1000 == 0:
                es.bulk(bulk_action,
                        index=index_destination,
                        doc_type=index_destination_doctype,
                        timeout=30)
                bulk_action = []
                print count_n

            if count_n % 10000 == 0:
                ts = time.time()
                print "count_n %s  per  %s  second" % (count_n, ts - tb)
                print "count %s " % count  # 'count' is a module-level total in the original script
                tb = ts

    return bulk_action, count_n, tb
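
# co_search assumes a helper expand_index_action(user_info) that returns the
# (action line, document body) pair consumed by es.bulk. The original helper is
# not shown on this page; a minimal sketch, inferred from the index/doc pairs
# built elsewhere in these examples:
def expand_index_action(user_info):
    #an 'index' action addressed by uid, followed by the document itself
    action = {'index': {'_id': user_info['uid']}}
    return [action, user_info]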
def save_dg_pr_results(sorted_uids, es_num, flag):
    index_name = "user_portrait_network"
    index_type = "network"
    bulk_action = []
    for uid, rank in sorted_uids:
        if uid == 'global':
            continue
        user_results = {}
        user_results['uid'] = uid
        user_results[flag + '_' + str(es_num)] = rank
        if es_num == 0:
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, user_results])
        else:
            try:
                item_exist = es_user_portrait.get(index=index_name,
                                                  doc_type=index_type,
                                                  id=uid)['_source']
                action = {'update': {'_id': uid}}
                try:
                    pr_last = item_exist[flag + '_' + str(es_num - 1)]
                except:
                    pr_last = 0
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                bulk_action.extend([action, {'doc': user_results}])
            except:
                action = {'index': {'_id': uid}}
                pr_last = 0
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                bulk_action.extend([action, user_results])

    #print bulk_action
    es_user_portrait.bulk(bulk_action, index=index_name, doc_type=index_type)
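
# Usage sketch for save_dg_pr_results (all names illustrative): sorted_uids is
# a list of (uid, score) pairs sorted by score; es_num == 0 writes fresh
# 'index' actions, later rounds issue 'update' actions plus a diff field
# against the previous round.
pr_result = {'123456': 0.83, '654321': 0.17, 'global': 1.0}  # hypothetical scores
sorted_uids = sorted(pr_result.items(), key=lambda x: x[1], reverse=True)
#save_dg_pr_results(sorted_uids, 0, 'pr')  # first round: writes pr_0
#save_dg_pr_results(sorted_uids, 1, 'pr')  # later round: writes pr_1 and pr_diff_1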
def save_user_results(bulk_action):
    print 'save utils bulk action len:', len(bulk_action)
    #print 'bulk action:', bulk_action
    es.bulk(bulk_action,
            index='sensitive_user_portrait',
            doc_type=index_type,
            timeout=60)
    return True
def save_user_results(bulk_action):
    #print 'save utils bulk action len:', len(bulk_action)
    #test
    #print 'bulk_action:', bulk_action
    #portrait_index_name = 'user_portrait_0303'
    #portrait_index_type = 'user'
    es.bulk(bulk_action,
            index=portrait_index_name,
            doc_type=portrait_index_type,
            timeout=600)
    return True
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(
            uid_list)
    else:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(
            uid_list)
    #compute attribute--keywords, topic, online_pattern
    #get user topic results by bulk action
    topic_results_dict, topic_results_label = topic_classfiy(
        uid_list, user_keywords_dict)
    #get bulk action
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user topic attribute
        user_topic_dict = topic_results_dict[uid]
        user_label_dict = topic_results_label[uid]
        results['topic'] = json.dumps(user_topic_dict)
        results['topic_string'] = topic_en2ch(user_label_dict)
        #add user keywords attribute
        keywords_dict = user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results['keywords'] = json.dumps(keywords_top50)
        results['keywords_string'] = keywords_top50_string
        #add online_pattern
        user_online_pattern = online_pattern_dict[uid]
        results['online_pattern'] = json.dumps(user_online_pattern)
        try:
            results['online_pattern_aggs'] = '&'.join(
                user_online_pattern.keys())
        except:
            results['online_pattern_aggs'] = ''
        #add user importance
        user_domain = user_info_list[uid]['domain'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(user_domain,
                                               results['topic_string'],
                                               user_fansnum, fansnum_max)
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action,
                          index=portrait_index_name,
                          doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    #print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
    #log_should_delete
    start_ts = end_ts
def split_bulk_action(bulk_action, index_name):
    # re-send the bulk payload pair by pair: each even index holds the action
    # line and the following odd index holds its document body
    for i in range(0, len(bulk_action), 2):
        pair = [bulk_action[i], bulk_action[i + 1]]
        try:
            es.bulk(pair, index=index_name, doc_type='user')
        except:
            print 'cron/flow3/scan_redis2es_comment.py&error-1&'
def split_bulk_action(bulk_action, index_name):
    # same pair-by-pair fallback, but log failing uids to a module-level
    # error file handle (error_f is opened elsewhere in the original script)
    for i in range(0, len(bulk_action), 2):
        pair = [bulk_action[i], bulk_action[i + 1]]
        #print 'pair:', pair
        try:
            es.bulk(pair, index=index_name, doc_type='user')
        except:
            error_f.writelines([pair[0]['index']['_id'], '\n'])
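
# The scan loops below use split_bulk_action as a fallback: try the whole
# payload first, and only on failure re-send it pair by pair so one bad
# document cannot sink the batch. A sketch of that calling pattern (index
# name illustrative):
def bulk_with_fallback(bulk_action, index_name):
    try:
        es.bulk(bulk_action, index=index_name, doc_type='user')
    except:
        split_bulk_action(bulk_action, index_name)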
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text_sentiment(
            uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text(
            uid_list)
    #compute attribute--domain, character, importance
    #get user domain
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    #get user character
    character_start_time = ts2datetime(character_start_ts)
    character_end_time = ts2datetime(character_start_ts +
                                     DAY * CHARACTER_TIME_GAP - DAY)
    character_sentiment_result_dict = classify_sentiment(
        uid_list, user_weibo_dict, character_start_time, character_end_time,
        WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user domain attribute
        user_domain_dict = domain_results_dict[uid]
        user_label_dict = domain_results_label[uid]
        results['domain_v3'] = json.dumps(user_domain_dict)
        results['domain'] = domain_en2ch(user_label_dict)

        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[uid]
        results['character_sentiment'] = character_sentiment

        #add user character_text attribute
        character_text = character_text_result_dict[uid]
        results['character_text'] = character_text
        #get user importance
        user_topic_string = user_info_list[uid]['topic_string'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        #'importnace' kept as spelled in the original (likely the index's actual field name)
        results['importnace'] = get_importance(results['domain'],
                                               user_topic_string, user_fansnum,
                                               fansnum_max)
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action,
                          index=portrait_index_name,
                          doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
def save_bulk_action(uid_list, hashtag_results, geo_results, activeness_results, influence_results):
    bulk_action = []
    for uid in uid_list:
        # dict(a, **b) returns a new dict with b's keys overriding a's;
        # chaining it merges the per-attribute result dicts for this uid
        user_results = {}
        user_results = dict(user_results, **hashtag_results[uid])
        user_results = dict(user_results, **geo_results[uid])
        user_results = dict(user_results, **activeness_results[uid])
        user_results = dict(user_results, **influence_results[uid])
        action = {'update':{'_id': uid}}
        bulk_action.extend([action, {'doc': user_results}])

    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
def save_bulk_action(uid_list, hashtag_results, geo_results, activeness_results, influence_results, sensitive_results, profile_results):
    bulk_action = []
    for uid in uid_list:
        user_results = {}
        user_results = dict(user_results, **hashtag_results[uid])
        user_results = dict(user_results, **geo_results[uid])
        user_results = dict(user_results, **activeness_results[uid])
        user_results = dict(user_results, **influence_results[uid])
        user_results = dict(user_results, **sensitive_results[uid])
        user_results = dict(user_results, **profile_results[uid])
        #print 'user_results_sensitive:', user_results['sensitive']
        action = {'update':{'_id': uid}}
        bulk_action.extend([action, {'doc': user_results}])

    #print 'bulk_action:', bulk_action
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--domain, character, importance
    #get user domain
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    #get user character
    character_end_time = ts2datetime(character_start_ts)
    character_start_time = ts2datetime(character_start_ts - DAY * CHARACTER_TIME_GAP)
    character_sentiment_result_dict = classify_sentiment(uid_list, user_weibo_dict, character_start_time, character_end_time, WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user domain attribute
        user_domain_dict = domain_results_dict[uid]
        user_label_dict = domain_results_label[uid]
        results['domain_v3'] = json.dumps(user_domain_dict)
        results['domain'] = domain_en2ch(user_label_dict)

        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[uid]
        results['character_sentiment'] = character_sentiment

        #add user character_text attribute
        character_text = character_text_result_dict[uid]
        results['character_text'] = character_text
        #get user importance
        user_topic_string = user_info_list[uid]['topic_string'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        #'importnace' kept as spelled in the original
        results['importnace'] = get_importance(results['domain'], user_topic_string, user_fansnum, fansnum_max)
        #bulk action
        action = {'update':{'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
def save_dg_pr_results(sorted_uids, es_num, flag):
    index_name = "user_portrait_network"
    index_type = "network"
    bulk_action = []
    count = 0
    for uid, rank in sorted_uids:
        if uid == 'global':
            continue
        count += 1
        user_results = {}
        user_results['uid'] = uid
        user_results[flag + '_' + str(es_num)] = rank
        user_results['rank_' + flag + '_' + str(es_num)] = count  #rank
        if es_num == 0:
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, user_results])
        else:
            try:
                item_exist = es_user_portrait.get(index=index_name,
                                                  doc_type=index_type,
                                                  id=uid)['_source']
                action = {'update': {'_id': uid}}
                try:
                    pr_last = item_exist[flag + '_' + str(es_num - 1)]
                    rank_last = item_exist['rank_' + flag + '_' +
                                           str(es_num - 1)]
                except:
                    pr_last = 0
                    rank_last = 101  # default when no previous rank exists (presumably just outside a top-100 window)
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                user_results['rank_' + flag + '_diff_' +
                             str(es_num)] = abs(count - rank_last)
                bulk_action.extend([action, {'doc': user_results}])
            except:
                action = {'index': {'_id': uid}}
                pr_last = 0
                rank_last = 101
                user_results[flag + '_diff_' + str(es_num)] = rank - pr_last
                user_results['rank_' + flag + '_diff_' +
                             str(es_num)] = abs(count - rank_last)
                bulk_action.extend([action, user_results])

    #print bulk_action
    es_user_portrait.bulk(bulk_action, index=index_name, doc_type=index_type)
def main():
    ts = time.time()

    bulk_action = []
    copy_bulk_action = []
    count = 0

    delete_key = ts2datetime(ts - 7 * 86400)  # each day, delete data from 7 days ago
    temp_list = recommend_redis.hget("decide_delete_list", delete_key)
    if temp_list:
        delete_list = json.loads(temp_list)  # list of uids pending deletion
        recommend_redis.hdel("decide_delete_list", delete_key)  # the original called a bare hdel()

        for uid in delete_list:
            del_data = expand_delete_action(uid, portrait_index_name,
                                            portrait_index_type)
            copy_del_data = expand_delete_action(uid, copy_portrait_index_name,
                                                 copy_portrait_index_type)
            bulk_action.append(del_data)
            copy_bulk_action.append(copy_del_data)
            count += 1

            if count % 100 == 0:
                es.bulk(bulk_action,
                        index=portrait_index_name,
                        doc_type=portrait_index_type,
                        timeout=30)
                es.bulk(copy_bulk_action,
                        index=copy_portrait_index_name,
                        doc_type=copy_portrait_index_type,
                        timeout=30)
                bulk_action = []
                copy_bulk_action = []

    if bulk_action:
        es.bulk(bulk_action,
                index=portrait_index_name,
                doc_type=portrait_index_type,
                timeout=30)
        es.bulk(copy_bulk_action,
                index=copy_portrait_index_name,
                doc_type=copy_portrait_index_type,
                timeout=30)
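
# expand_delete_action is not shown on this page. Delete actions in the bulk
# API carry no document body, which is why main() appends a single item per
# uid rather than an (action, doc) pair; a sketch under that assumption:
def expand_delete_action(uid, index_name, index_type):
    return {'delete': {'_index': index_name, '_type': index_type, '_id': uid}}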
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings
    '''
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))
    '''
    #get redis db
    comment_redis = comment_redis_dict[str(db_number)]
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list)==2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
            '''
            elif len(item_list)==3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index':{'_id': uid}}, save_dict])
            '''
        try:
            es.bulk(comment_bulk_action, index='1225_comment_'+str(db_number), doc_type='user')
        except:
            index_name = '1225_comment_'+str(db_number)
            split_bulk_action(comment_bulk_action, index_name)
        '''
        try:
            es.bulk(be_comment_bulk_action, index='1225_be_comment_'+str(db_number), doc_type='user')
        except:
            index_name = '1225_be_comment_'+str(db_number)
            split_bulk_action(be_comment_bulk_action, index_name)
        '''
        comment_bulk_action = []
        #be_comment_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor==0:
            break
    print 'count:', count
    print 'end'
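
# get_db_num is assumed to rotate writes across a small ring of redis dbs so
# that yesterday's db can be drained while today's is being written (the
# db-switch checks in the variants below depend on this). A sketch with a
# hypothetical ring size of 2:
def get_db_num(date_ts):
    return (date_ts / DAY) % 2  # Python 2 integer division; switches once per day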
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings
    
    #get redis db
    comment_redis = sensitive_comment_redis_dict[str(db_number)]

    """
    # 1. 判断即将切换的db中是否有数据
    sensitive_redis_host_list.remove(str(db_number))
    while 1:
        other_db_number = comment_redis_dict[redis_host_list[0]]
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break # 已经开始写入新的db,说明前一天的数据已经写完
        else:
            time.sleep(60)
    """

    # 2. drop and recreate the previous es mappings
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))

    # 3. scan
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list)==2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
            elif len(item_list)==3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index':{'_id': uid}}, save_dict])
            
        #try:
        if comment_bulk_action:
            es.bulk(comment_bulk_action, index=sensitive_comment_index_name_pre+str(db_number), doc_type='user')
        #except:
        #    index_name = '1225_comment_'+str(db_number)
        #    split_bulk_action(comment_bulk_action, index_name)
        
        #try:
        if be_comment_bulk_action:
            es.bulk(be_comment_bulk_action, index=sensitive_be_comment_index_name_pre+str(db_number), doc_type='user')
        #except:
        #    index_name = '1225_be_comment_'+str(db_number)
        #    split_bulk_action(be_comment_bulk_action, index_name)
        
        comment_bulk_action = []
        be_comment_bulk_action = []
        end_ts = time.time()
        #run_type
        #if RUN_TYPE == 1:
        print '%s sec scan %s count user' % (end_ts - start_ts, count)

        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor==0:
            break
def scan_retweet():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]

    """
    # 1. 判断即将切换的db中是否有数据
    while 1:
        redis_host_list.pop(str(db_number))
        other_db_number = retweet_redis_dict[redis_host_list[0]] # 获得对应的redis
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break # 已经开始写入新的db,说明前一天的数据已经写完
        else:
            time.sleep(60)
    """
    # 2. 删除之前的es
    retweet_es_mappings(str(db_number))
    be_retweet_es_mappings(str(db_number))

    # 3. scan
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list)==2:
                retweet_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_retweet'] = json.dumps(item_result)
                retweet_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
            elif len(item_list)==3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
        
        if retweet_bulk_action:
            es.bulk(retweet_bulk_action, index='1225_retweet_'+str(db_number), doc_type='user')
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index='1225_be_retweet_'+str(db_number), doc_type='user')
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        #run_type
        if RUN_TYPE == 0:
            print '%s sec scan %s count user:' %(end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor==0:
            break
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--keywords, topic, online_pattern            
    #get user topic results by bulk action
    topic_results_dict, topic_results_label = topic_classfiy(uid_list, user_keywords_dict)
    
    #update school attribute---is_school/school_string/school_dict
    school_results_dict = get_school(uid_list)
    #get bulk action
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        results['is_school'] = school_results_dict[uid]['is_school']
        results['school_string'] = school_results_dict[uid]['school_string']
        results['school_dict'] = school_results_dict[uid]['school_dict']
        #print 'is_school, school_string, school_dict:', results['is_school'],type(results['is_school']) ,results['school_string'],type(results['school_string']), results['school_dict'], type(results['school_dict'])
        #add user topic attribute
        user_topic_dict = topic_results_dict[uid]
        user_label_dict = topic_results_label[uid]
        results['topic'] = json.dumps(user_topic_dict)
        results['topic_string'] = topic_en2ch(user_label_dict)
        #add user keywords attribute
        try:
            keywords_dict = user_keywords_dict[uid]
        except:
            keywords_dict = {}
        keywords_top50 = sorted(keywords_dict.items(), key=lambda x:x[1], reverse=True)[:50]
        keywords_top50_string = '&'.join([keyword_item[0] for keyword_item in keywords_top50])
        results['keywords'] = json.dumps(keywords_top50)
        results['keywords_string'] = keywords_top50_string
        #add online_pattern
        try:
            user_online_pattern = online_pattern_dict[uid]
        except:
            user_online_pattern = {}
        results['online_pattern'] = json.dumps(user_online_pattern)
        try:
            #join on the dict, not the json string (the original called .keys() on a string)
            results['online_pattern_aggs'] = '&'.join(user_online_pattern.keys())
        except:
            results['online_pattern_aggs'] = ''
        #add user importance
        user_domain = user_info_list[uid]['domain'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(user_domain, results['topic_string'], user_fansnum, fansnum_max)
        
        #bulk action
        action = {'update':{'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    #print 'bulk_action:', bulk_action
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    #log_should_delete
    #print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
    #log_should_delete
    start_ts = end_ts
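
# get_school (used above) is not shown on this page; the per-uid shape it must
# return, inferred from the three fields consumed in the loop (values
# illustrative):
school_results_example = {
    '123456': {
        'is_school': 1,                              # school-user flag
        'school_string': 'Tsinghua&PKU',             # '&'-joined school names
        'school_dict': json.dumps({'Tsinghua': 3, 'PKU': 1}),
    }
}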
def scan_index_history():
    s_re = scan(es_user_portrait, query={'query':{'match_all':{}}, 'size':1000}, index=portrait_index_name, doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'  # hardcoded test date that overrides the computed value
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_'+now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY*31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_'+del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    #get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']

            activeness_key = 'activeness_'+now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            #save to normal activeness and normal influence
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)

            add_info[uid] = {activeness_key:normal_activeness, influence_key:normal_influence, importance_key:normal_importance}
            if count % 1000==0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids':uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_date_string = ''.join(s)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:  # update the low-activity streak (basis for evicting users)
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index':{'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1  # keep in step with evaluate_history_results (missing in the original)
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys() 
                evaluate_history_results = es_user_portrait.mget(index=copy_portrait_index_name, doc_type=copy_portrait_index_type, body={'ids':uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action, index=copy_portrait_index_name, doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception, e:
            raise  # bare raise preserves the original traceback in Python 2
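
# normal_index and get_max_index are likewise not shown. A minimal reading of
# normal_index consistent with the calls above (hypothetical implementation):
def normal_index(value, max_value):
    #scale a raw activeness/influence/importance value against the day's max
    if not max_value:
        return 0.0
    return float(value) / max_value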
def scan_retweet():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #retweet/be_retweet es mappings
    '''
    retweet_es_mappings(str(db_number))
    be_retweet_es_mappings(str(db_number))
    '''
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list)==2:
                retweet_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_retweet'] = json.dumps(item_result)
                retweet_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
            elif len(item_list)==3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
        es.bulk(retweet_bulk_action, index='1225_retweet_'+str(db_number), doc_type='user')
        es.bulk(be_retweet_bulk_action, index='1225_be_retweet_'+str(db_number), doc_type='user')
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user:' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    print 'end'
def save_user_results(bulk_action):
    print 'save utils bulk action len:', len(bulk_action)
    #print 'bulk action:', bulk_action
    print es.bulk(bulk_action, index='user_portrait_1222', doc_type=index_type, timeout=600)
    return True    
Exemple #32
0
def scan_index_history():
    s_re = scan(es_user_portrait,
                query={
                    'query': {
                        'match_all': {}
                    },
                    'size': 1000
                },
                index=portrait_index_name,
                doc_type=portrait_index_type)
    bulk_action = []
    add_info = {}
    count = 0
    start_ts = time.time()
    now_date = ts2datetime(start_ts - DAY)
    now_date = '2013-09-06'
    #now_date_string = ''.join(now_date.split('-'))
    now_date_string = now_date
    activeness_key = 'activeness_' + now_date_string
    #influence_key = now_date_string
    influence_key = now_date_string
    importance_key = "importance_" + now_date_string
    del_date = ts2datetime(time.time() - DAY * 31)
    #del_date_string = ''.join(del_date.split('-'))
    del_date_string = del_date
    del_activeness_key = 'activeness_' + del_date_string
    #del_influence_key = del_date_string
    del_influence_key = del_date_string
    del_importance_key = "importance_" + del_date_string
    #get max value for importance and activeness
    max_activeness = get_max_index('activeness')
    max_influence = get_max_index('influence')
    max_importance = get_max_index('importance')
    while True:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            uid = scan_re['uid']

            activeness_key = 'activeness_' + now_date_string
            influence_key = now_date_string
            importance_key = "importance_" + now_date_string
            #save to normal activeness and normal influence
            activeness_value = scan_re['activeness']
            influence_value = scan_re['influence']
            importance_value = scan_re['importance']
            normal_activeness = normal_index(activeness_value, max_activeness)
            normal_influence = normal_index(influence_value, max_influence)
            normal_importance = normal_index(importance_value, max_importance)

            add_info[uid] = {
                activeness_key: normal_activeness,
                influence_key: normal_influence,
                importance_key: normal_importance
            }
            if count % 1000 == 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(
                    index=copy_portrait_index_name,
                    doc_type=copy_portrait_index_type,
                    body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_date_string = ''.join(s)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[
                            iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    # yuankun-20151229
                    if add_info[uid][
                            influence_key] < LOW_INFLUENCE_THRESHOULD:  #更新活跃情况,出库
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(
                        new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    #print 'add_info:', add_info[uid]
                    #print 'user_history_item:', user_history_item
                    #print 'new_user_item:', new_user_item
                    action = {'index': {'_id': uid}}
                    #print 'action:', action
                    bulk_action.extend([action, new_user_item])
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
                end_ts = time.time()
                print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            print 'all done'
            if len(add_info) != 0:
                uid_list = add_info.keys()
                evaluate_history_results = es_user_portrait.mget(
                    index=copy_portrait_index_name,
                    doc_type=copy_portrait_index_type,
                    body={'ids': uid_list})['docs']
                '''
                del_date = ts2datetime(time.time() - DAY*31)
                del_activeness_key = 'activeness_'+del_date
                del_influence_key = del_date
                '''
                iter_count = 0
                for uid in uid_list:
                    try:
                        user_history_item = evaluate_history_results[
                            iter_count]['_source']
                    except:
                        user_history_item = {}
                    try:
                        user_history_item.pop(del_activeness_key)
                        user_history_item.pop(del_influence_key)
                        user_history_item.pop(del_importance_key)
                    except:
                        pass
                    new_user_item = dict(user_history_item, **add_info[uid])
                    if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                        try:
                            new_user_item["low_number"] += 1
                        except:
                            new_user_item["low_number"] = 1
                    else:
                        new_user_item["low_number"] = 0
                    aver_activeness, aver_influence, aver_importance = average_value(
                        new_user_item)
                    new_user_item['aver_activeness'] = aver_activeness
                    new_user_item['aver_influence'] = aver_influence
                    new_user_item['aver_importance'] = aver_importance
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, new_user_item])
                    iter_count += 1
                es_user_portrait.bulk(bulk_action,
                                      index=copy_portrait_index_name,
                                      doc_type=copy_portrait_index_type)
                bulk_action = []
                add_info = {}
                iter_count = 0
            break
        except Exception:
            raise  # re-raise, keeping the original traceback
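

# average_value is not defined in these snippets. A minimal sketch of what it
# plausibly computes, assuming influence history keys are bare 'YYYY-MM-DD'
# dates while activeness/importance keys carry a prefix (hypothetical
# reconstruction, not the original implementation):
def average_value(user_item):
    act = [v for k, v in user_item.items() if k.startswith('activeness_')]
    imp = [v for k, v in user_item.items() if k.startswith('importance_')]
    inf = [v for k, v in user_item.items()
           if len(k) == 10 and k[4] == '-' and k[7] == '-']  # bare date keys
    mean = lambda xs: float(sum(xs)) / len(xs) if xs else 0
    return mean(act), mean(inf), mean(imp)
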
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    # get redis db number
    db_number = get_db_num(now_date_ts)
    # comment/be_comment es mappings

    # get redis db
    comment_redis = comment_redis_dict[str(db_number)]

    # 1. wait until the next db has data: once writers have switched to it,
    # the previous day's db is complete
    # take the current db out of the candidate list before polling
    redis_host_list.remove(str(db_number))
    while 1:
        other_db_number = comment_redis_dict[redis_host_list[0]]
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break  # writing to the new db has started, so yesterday's data is complete
        else:
            time.sleep(60)

    # 2. delete the previous es indices and recreate the mappings
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))

    # 3. scan
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    # comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
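    # iterate the redis keyspace with SCAN: keys with two '_'-separated fields
    # hold comment hashes, keys with three fields hold be_comment hashes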
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        for item in re_scan[1]:
            count += 1
            item_list = item.split("_")
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict["uid"] = uid
                save_dict["uid_comment"] = json.dumps(item_result)
                comment_bulk_action.extend([{"index": {"_id": uid}}, save_dict])
            elif len(item_list) == 3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict["uid"] = uid
                save_dict["uid_be_comment"] = json.dumps(item_result)
                be_comment_bulk_action.extend([{"index": {"_id": uid}}, save_dict])

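        # flush this scan page; if the bulk call fails (e.g. the payload is
        # too large), split the batch and retry in smaller pieces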
        try:
            es.bulk(comment_bulk_action, index="1225_comment_" + str(db_number), doc_type="user")
        except:
            index_name = "1225_comment_" + str(db_number)
            split_bulk_action(comment_bulk_action, index_name)

        try:
            es.bulk(be_comment_bulk_action, index="1225_be_comment_" + str(db_number), doc_type="user")
        except:
            index_name = "1225_be_comment_" + str(db_number)
            split_bulk_action(be_comment_bulk_action, index_name)

        comment_bulk_action = []
        be_comment_bulk_action = []
        end_ts = time.time()
        # run_type
        if RUN_TYPE == 0:
            print "%s sec scan %s count user" % (end_ts - start_ts, count)

        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break

    # 4. flush redis
    comment_redis.flushdb()
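
# split_bulk_action is called above but not defined in these snippets. A
# minimal sketch of what it might do, assuming the fallback simply halves an
# oversized bulk payload and indexes each half separately (hypothetical
# reconstruction, not the original implementation):
def split_bulk_action(bulk_action, index_name):
    half = len(bulk_action) // 2
    if half % 2:
        half += 1  # keep {action, doc} pairs together
    for chunk in (bulk_action[:half], bulk_action[half:]):
        if chunk:
            es.bulk(chunk, index=index_name, doc_type="user")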
Exemple #34
0
def save_user_results(bulk_action):
    print es_user_portrait.bulk(bulk_action,
                                index=portrait_index_name,
                                doc_type=portrait_index_type,
                                timeout=60)
    return True
    count_n = 0
    search_list = []
    user_list = []
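    # scan driver: pull users off the scroll iterator and check/seed them in
    # the destination index in 1000-user batches via co_search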
    while 1:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            user_list.append(scan_re)
            if count % 1000 == 0:
                bulk_action, count_n, tb = co_search(es, user_list, bulk_action, count_n, tb)
                user_list = []
        except StopIteration:
            print "all done"
            bulk_action, count_n, tb = co_search(es, user_list, bulk_action, count_n, tb)
            break
        except Exception, r:
            print Exception, r
            sys.exit(0)


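    # final flush for any remainder smaller than a full batch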
    if bulk_action:
        es.bulk(bulk_action, index=index_destination, doc_type=index_destination_doctype, timeout=30)

    print count, count_n  # these two counts should match if every user was handled

    # 2. print the end-of-run log message
    now_ts = ts2datetime(time.time())
    print_log = "&".join([file_path, "end", now_ts])
    print print_log

Exemple #36
0
def save_user_results(bulk_action):
    #print 'bulk_action:', bulk_action[0:2]
    es.bulk(bulk_action, index=index_name, doc_type=index_type)
    return True
Exemple #37
0
    while 1:
        try:
            scan_re = s_re.next()['_source']
            count += 1
            user_list.append(scan_re)
            if count % 1000 == 0:
                bulk_action, count_n, tb = co_search(es, user_list,
                                                     bulk_action, count_n, tb)
                user_list = []
        except StopIteration:
            print "all done"
            bulk_action, count_n, tb = co_search(es, user_list, bulk_action,
                                                 count_n, tb)
            break
        except Exception, r:
            print Exception, r
            sys.exit(0)

    if bulk_action:
        es.bulk(bulk_action,
                index=index_destination,
                doc_type=index_destination_doctype,
                timeout=30)

    print count, count_n  # these two counts should match if every user was handled

    # 2. print the end-of-run log message
    now_ts = ts2datetime(time.time())
    print_log = "&".join([file_path, "end", now_ts])
    print print_log
Exemple #38
0
def scan_comment():
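    # sensitive-comment variant of scan_comment above: the same flow run
    # against the sensitive_* redis pools and es indices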
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings

    #get redis db
    comment_redis = sensitive_comment_redis_dict[str(db_number)]

    # 1. wait until the next db has data: once writers have switched to it,
    # the previous day's db is complete
    sensitive_redis_host_list.remove(str(db_number))
    while 1:
        other_db_number = sensitive_comment_redis_dict[sensitive_redis_host_list[0]]
        current_dbsize = other_db_number.dbsize()
        if current_dbsize:
            break  # writing to the new db has started, so yesterday's data is complete
        else:
            time.sleep(60)

    # 2. delete the previous es indices and recreate the mappings
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))

    # 3. scan
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])
            elif len(item_list) == 3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index': {'_id': uid}}, save_dict])

        #try:
        if comment_bulk_action:
            es.bulk(comment_bulk_action,
                    index=sensitive_comment_index_name_pre + str(db_number),
                    doc_type='user')
        #except:
        #    index_name = '1225_comment_'+str(db_number)
        #    split_bulk_action(comment_bulk_action, index_name)

        #try:
        if be_comment_bulk_action:
            es.bulk(be_comment_bulk_action,
                    index=sensitive_be_comment_index_name_pre + str(db_number),
                    doc_type='user')
        #except:
        #    index_name = '1225_be_comment_'+str(db_number)
        #    split_bulk_action(be_comment_bulk_action, index_name)

        comment_bulk_action = []
        be_comment_bulk_action = []
        end_ts = time.time()
        #run_type
        #if RUN_TYPE == 1:
        print '%s sec scan %s count user' % (end_ts - start_ts, count)

        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break

    # 4. flush redis
    comment_redis.flushdb()
Exemple #39
0
                user_history_item.pop(del_importance_key)
            except:
                pass
            new_user_item = dict(user_history_item, **add_info[uid])
            if add_info[uid][influence_key] < LOW_INFLUENCE_THRESHOULD:
                new_user_item["low_number"] = new_user_item.get("low_number", 0) + 1
            else:
                new_user_item["low_number"] = 0
            aver_activeness, aver_influence, aver_importance = average_value(
                new_user_item)
            new_user_item['aver_activeness'] = aver_activeness
            new_user_item['aver_influence'] = aver_influence
            new_user_item['aver_importance'] = aver_importance
            action = {'index': {'_id': uid}}
            bulk_action.extend([action, new_user_item])
            iter_count += 1
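        # flush the users accumulated after the scan iterator was exhausted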
        es_user_portrait.bulk(bulk_action,
                              index=copy_portrait_index_name,
                              doc_type=copy_portrait_index_type)
        bulk_action = []
        add_info = {}

    print 'count:', count


if __name__ == '__main__':
    scan_index_history()