def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text_sentiment(
            uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text(
            uid_list)
    #compute attribute--domain, character, importance
    #get user domain
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    #get user character
    character_start_time = ts2datetime(character_start_ts)
    character_end_time = ts2datetime(character_start_ts +
                                     DAY * CHARACTER_TIME_GAP - DAY)
    character_sentiment_result_dict = classify_sentiment(
        uid_list, user_weibo_dict, character_start_time, character_end_time,
        WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user domain attribute
        user_domain_dict = domain_results_dict[uid]
        user_label_dict = domain_results_label[uid]
        results['domain_v3'] = json.dumps(user_domain_dict)
        results['domain'] = domain_en2ch(user_label_dict)

        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[uid]
        results['character_sentiment'] = character_sentiment

        #add user character_text attribute
        character_text = character_text_result_dict[uid]
        results['character_text'] = character_text
        #get user importance
        user_topic_string = user_info_list[uid]['topic_string'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(results['domain'],
                                               user_topic_string, user_fansnum,
                                               fansnum_max)
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action,
                          index=portrait_index_name,
                          doc_type=portrait_index_type)
    end_ts = time.time()
    # debug timing log (remove before production)
    print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
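
# A minimal sketch (not the project's code) of the bulk body built above:
# the py2-era elasticsearch client's bulk() accepts a list of alternating
# action-metadata and payload dicts, and 'update' payloads are wrapped in
# {'doc': ...} so only the listed fields are overwritten.
def build_update_bulk(results_by_uid):
    bulk_action = []
    for uid, results in results_by_uid.items():
        bulk_action.append({'update': {'_id': uid}})  # action metadata
        bulk_action.append({'doc': results})          # partial document
    return bulk_action

# usage sketch: es_user_portrait.bulk(build_update_bulk(...),
#                                     index=portrait_index_name,
#                                     doc_type=portrait_index_type)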
Example #2
def deal_bulk_action(user_info_list, fansnum_max):
    start_ts = time.time()
    uid_list = user_info_list.keys()
    #acquire bulk user weibo data
    if WEIBO_API_INPUT_TYPE == 0:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text_sentiment(uid_list)
    else:
        user_keywords_dict, user_weibo_dict, character_start_ts = read_flow_text(uid_list)
    #compute attribute--domain, character, importance
    #get user domain
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    #get user character
    character_end_time = ts2datetime(character_start_ts)
    character_start_time = ts2datetime(character_start_ts - DAY * CHARACTER_TIME_GAP)
    character_sentiment_result_dict = classify_sentiment(uid_list, user_weibo_dict, character_start_time, character_end_time, WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    bulk_action = []
    for uid in uid_list:
        results = {}
        results['uid'] = uid
        #add user domain attribute
        user_domain_dict = domain_results_dict[uid]
        user_label_dict = domain_results_label[uid]
        results['domain_v3'] = json.dumps(user_domain_dict)
        results['domain'] = domain_en2ch(user_label_dict)

        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[uid]
        results['character_sentiment'] = character_sentiment

        #add user character_text attribute
        character_text = character_text_result_dict[uid]
        results['character_text'] = character_text
        #get user importance
        user_topic_string = user_info_list[uid]['topic_string'].encode('utf-8')
        user_fansnum = user_info_list[uid]['fansnum']
        results['importance'] = get_importance(results['domain'], user_topic_string, user_fansnum, fansnum_max)
        #bulk action
        action = {'update': {'_id': uid}}
        bulk_action.extend([action, {'doc': results}])
    es_user_portrait.bulk(bulk_action, index=portrait_index_name, doc_type=portrait_index_type)
    end_ts = time.time()
    # debug timing log (remove before production)
    print '%s sec count %s' % (end_ts - start_ts, len(uid_list))
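
# The two deal_bulk_action variants above differ only in how they compute the
# character-classification window: the first treats character_start_ts as the
# window start and looks CHARACTER_TIME_GAP days forward, this one treats it
# as the window end and looks the same number of days back. A minimal sketch
# with local stand-ins for the project's DAY constant and ts2datetime helper
# (CHARACTER_TIME_GAP = 7 is an assumed value):
import time

DAY = 3600 * 24
CHARACTER_TIME_GAP = 7

def ts2datetime(ts):
    # stand-in: format a unix timestamp as 'YYYY-MM-DD'
    return time.strftime('%Y-%m-%d', time.localtime(ts))

ts = time.mktime(time.strptime('2013-09-08', '%Y-%m-%d'))
print ts2datetime(ts), ts2datetime(ts + DAY * CHARACTER_TIME_GAP - DAY)  # forward window
print ts2datetime(ts - DAY * CHARACTER_TIME_GAP), ts2datetime(ts)        # backward window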
Example #3
def role_feature_analysis(role_label, uids_list, datetime_list, create_time):

    role_feature_analysis_results = dict()

    uid_weibo_keywords_dict, keywords_dict_all_users, uid_weibo = uid_list_2_uid_keywords_dict(
        uids_list, datetime_list, label='character')

    ## frequent keywords
    keywords_dict_all_users_sort = sorted(keywords_dict_all_users.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:TOP_KEYWORDS_NUM]

    ## political leaning
    political_side_count_sort = political_classify_sort(
        uids_list, uid_weibo_keywords_dict)

    ## topic preference
    topic_count_dict_sort = topic_classfiy_sort(uids_list, datetime_list)

    ## psychological features
    psy_feature_sort = get_psy_feature_sort(uids_list, create_time)

    ## personality traits
    start_date = datetime_list[-1]
    end_date = datetime_list[0]
    flag = 1
    com_result = classify_sentiment(uids_list, uid_weibo, start_date, end_date,
                                    flag)

    print 'com_result:', com_result

    com_result_list = com_result.values()
    com_result_set = set(com_result_list)
    character_result_dict = dict()

    for character in com_result_set:
        character_count = com_result_list.count(character)
        character_result_dict[character] = character_count
    character_result_dict_sort = sorted(character_result_dict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

    ## geographic location
    geo_cityTopic_results = dict()
    province_set = set()
    for datetime in datetime_list:
        flow_text_index_name = flow_text_index_name_pre + datetime
        geo_cityTopic_results_datetime = cityTopic(uids_list,
                                                   flow_text_index_name)
        geo_cityTopic_results[datetime] = geo_cityTopic_results_datetime
        #province_set = province_set | set(geo_cityTopic_results_datetime.keys())  ## set union

    geo_cityTopic_results_merge = dict()

    for datetime, province_city_dict in geo_cityTopic_results.iteritems():
        ## merge the per-day nested dicts with plain for loops
        for province, city_dict in province_city_dict.iteritems():
            if province in geo_cityTopic_results_merge:
                for city, count in city_dict.iteritems():
                    if city in geo_cityTopic_results_merge[province]:
                        geo_cityTopic_results_merge[province][city] += count
                    else:
                        geo_cityTopic_results_merge[province][city] = count
            else:
                #copy so later updates do not mutate the per-day source dict
                geo_cityTopic_results_merge[province] = dict(city_dict)

    ## daily post count
    day_post_median_all = []
    for datetime in datetime_list:
        day_post_median = day_post_num_compute(uids_list, datetime)
        day_post_median_all.append(day_post_median)

    ## active hours
    day_hour_counts_all = []
    for datetime in datetime_list:
        day_hour_counts = active_time_compute(uids_list, datetime)
        day_hour_counts_all.append(day_hour_counts)

    day_hour_counts_all_np = np.array(day_hour_counts_all)
    day_hour_counts_aver = np.mean(day_hour_counts_all_np,
                                   axis=0).astype(np.int)  ## column-wise mean over the 2-D array

    day_hour_counts_aver_time = np.argsort(
        -day_hour_counts_aver)  ### np.argsort(-x): indices in descending order of the values

    role_feature_analysis_results[
        'top_keywords'] = keywords_dict_all_users_sort
    role_feature_analysis_results['political_side'] = political_side_count_sort
    role_feature_analysis_results['topic_preference'] = topic_count_dict_sort
    role_feature_analysis_results['personality'] = character_result_dict_sort
    role_feature_analysis_results['geo'] = geo_cityTopic_results_merge
    role_feature_analysis_results['day_post_num'] = day_post_median_all
    role_feature_analysis_results['active_time'] = day_hour_counts_aver
    role_feature_analysis_results['psy_feature'] = psy_feature_sort
    role_feature_analysis_results['member_uids'] = uids_list

    return role_feature_analysis_results
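
# A minimal sketch of the geo merge above using collections.Counter: updating
# a Counter with a {city: count} dict adds counts key by key, which covers
# both the missing-province and missing-city branches and never shares a
# reference with the per-day source dicts.
from collections import defaultdict, Counter

def merge_city_counts(geo_cityTopic_results):
    merged = defaultdict(Counter)
    for province_city_dict in geo_cityTopic_results.values():
        for province, city_dict in province_city_dict.items():
            merged[province].update(city_dict)  # add counts per city
    return merged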
Example #4
def test_cron_text_attribute(user_weibo_dict):
    #get user weibo 7day {user:[weibos]}
    print 'start cron_text_attribute'
    uid_list = user_weibo_dict.keys()
    status = False  # returned save status; initialized in case nothing is saved
    print 'user count:', len(uid_list)
    
    #get user flow information: hashtag, activity_geo, keywords
    print 'get flow result'
    flow_result = get_flow_information(uid_list)
    print 'flow result len:', len(flow_result)
    
    #get user profile information
    print 'get register result'
    register_result = get_profile_information(uid_list)
    print 'register result len:', len(register_result)
    #get topic and domain input data
    user_weibo_string_dict = get_user_weibo_string(user_weibo_dict) # use as the tendency input data
    user_keywords_dict = get_user_keywords_dict(user_weibo_string_dict)
    #get user event results by bulk action
    event_results_dict = event_classfiy(user_weibo_string_dict)
    print 'event_result len:', len(event_results_dict)
    
    #get user topic and domain by bulk action
    print 'get topic and domain'
    topic_results_dict, topic_results_label = topic_classfiy(user_keywords_dict)
    domain_results = domain_classfiy(user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    print 'topic result len:', len(topic_results_dict)
    print 'domain result len:', len(domain_results_dict)
    
    #get user psy attribute
    #print 'get psy result'
    #psy_results_dict = psychology_classfiy(user_weibo_dict)
    #print 'psy result len:', len(psy_results_dict)
    
    #get user character attribute
    print 'get character result'
    #type_mark = 0/1 identifies the task input status---sentiment only or full text
    now_ts = time.time()
    #test
    now_ts = datetime2ts('2013-09-08')
    character_end_time = ts2datetime(now_ts - DAY)
    character_start_time = ts2datetime(now_ts - DAY * CHARACTER_TIME_GAP)
    character_type_mark = 1
    character_sentiment_result_dict = classify_sentiment(uid_list, character_start_time, character_end_time, character_type_mark)
    character_text_result_dict = classify_topic(uid_list, character_start_time, character_end_time, character_type_mark)
    print 'character result len:', len(character_sentiment_result_dict), len(character_text_result_dict)
    print 'character_sentiment_result:', character_sentiment_result_dict
    print 'character_text_result:', character_text_result_dict
    
    #get user fansnum max
    fansnum_max = get_fansnum_max()
    #get user activeness by bulk_action
    print 'get activeness results'
    activeness_results = get_activity_time(uid_list)
    print 'activeness result len:', len(activeness_results)
    #get user influence by bulk action
    print 'get influence'
    influence_results = get_influence(uid_list)
    print 'influence results len:', len(influence_results)
    
    # compute text attribute
    user_set = set()
    bulk_action = []
    count = 0
    for user in user_weibo_dict:
        count += 1
        results = {}       
        user_set.add(user)
        weibo_list = user_weibo_dict[user]
        uname = weibo_list[0]['uname']
        #get user text attribute: online_pattern
        results = compute_text_attribute(user, weibo_list)
        results['uid'] = str(user)
        #add user flow information: hashtag, activity_geo, keywords
        flow_dict = flow_result[str(user)]
        results = dict(results, **flow_dict)
        
        #add user topic attribute
        user_topic_dict = topic_results_dict[user]
        user_label_dict = topic_results_label[user]
        results['topic'] = json.dumps(user_topic_dict)         # {'topic1_en':pro1, 'topic2_en':pro2...}
        results['topic_string'] = topic_en2ch(user_label_dict) # 'topic1_ch&topic2_ch&topic3_ch'
        #add user event attribute
        results['tendency'] = event_results_dict[user]
        
        #add user domain attribute
        user_domain_dict = domain_results_dict[user]
        user_label_dict = domain_results_label[user]
        results['domain_v3'] = json.dumps(user_domain_dict) # [label1_en, label2_en, label3_en]
        results['domain'] = domain_en2ch(user_label_dict)      # label_ch
        
        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[user]
        results['character_sentiment'] = character_sentiment
        #add user character_text attribute
        character_text = character_text_result_dict[user]
        results['character_text'] = character_text
        
        #add user psy attribute (skipped: the psychology_classfiy call above is commented out)
        #user_psy_dict = [psy_results_dict[user]]
        #results['psycho_status'] = json.dumps(user_psy_dict)
        
        #add user profile attribute
        register_dict = register_result[str(user)]
        results = dict(results, **register_dict)
        #add user_evaluate attribute---importance
        results['importance'] = get_importance(results['domain'], results['topic_string'], results['fansnum'], fansnum_max)
        #add user_evaluate attribute---activeness
        user_activeness_time = activeness_results[user]
        user_activeness_geo = json.loads(results['activity_geo_dict'])[-1]
        results['activeness'] = get_activeness(user_activeness_geo, user_activeness_time)
        #add user_evaluate attribute---influence
        results['influence'] = influence_results[user]
        
        #bulk_action
        action = {'index':{'_id': str(user)}}
        bulk_action.extend([action, results])
        if count >= 20:
            status = save_user_results(bulk_action)
            print 'bulk_action:', bulk_action
            bulk_action = []
            count = 0
    
    end_ts = time.time()
    
    print 'user_set len:', len(user_set)
    print 'count:', count
    print 'bulk_action count:', len(bulk_action)
    
    print 'bulk_action:', bulk_action
    
    if bulk_action:
        status = save_user_results(bulk_action)
    
    return status # save by bulk
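
# A minimal generic sketch of the flush-every-20-users pattern used above,
# which keeps each bulk request small (save_func stands in for a saver such
# as save_user_results):
def save_in_chunks(pairs, save_func, chunk_size=20):
    # pairs: iterable of (action_metadata, document) tuples
    status, bulk_action = False, []
    for action, doc in pairs:
        bulk_action.extend([action, doc])
        if len(bulk_action) >= chunk_size * 2:  # two list entries per user
            status = save_func(bulk_action)
            bulk_action = []
    if bulk_action:
        status = save_func(bulk_action)  # flush the remainder
    return status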
Example #5
def test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts, filter_keywords_dict):
    status = False
    print 'start cron_text_attribute'
    uid_list = user_keywords_dict.keys()
    
    #get user flow information: hashtag, activity_geo, keywords
    print 'get flow result'
    flow_result = get_flow_information_v2(uid_list, user_keywords_dict)
    print 'flow result len:', len(flow_result)
    
    #get user profile information
    print 'get register result'
    register_result = get_profile_information(uid_list)
    print 'register result len:', len(register_result)
    #print user_keywords_dict
    
    #get user topic and domain by bulk action
    print 'get topic and domain'
    topic_results_dict, topic_results_label = topic_classfiy(uid_list, user_keywords_dict)
    print topic_results_dict, topic_results_label

    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    print 'topic result len:', len(topic_results_dict)
    print 'domain result len:', len(domain_results_dict)
    
    #get user character attribute
    print 'get character result'
    #type_mark = 0/1 identifies the task input status---sentiment only or full text
    character_start_time = ts2datetime(character_start_ts)
    character_end_time = ts2datetime(character_start_ts + DAY * CHARACTER_TIME_GAP - DAY)
    print 'character_start_time:', character_start_time
    print 'character_end_time:', character_end_time
    character_sentiment_result_dict = classify_sentiment(uid_list, user_weibo_dict, character_start_time, character_end_time, WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    print 'character result len:', len(character_sentiment_result_dict), len(character_text_result_dict)
    
    #get user fansnum max
    fansnum_max = get_fansnum_max()
    #get user activeness by bulk_action
    print 'get activeness results'
    activeness_results = get_activity_time(uid_list)
    print 'activeness result len:', len(activeness_results)
    #get user influence by bulk action
    print 'get influence'
    influence_results = get_influence(uid_list)
    print 'influence results len:', len(influence_results)
    #get user sensitive by bulk action
    print 'get sensitive'
    sensitive_results, sensitive_string_results, sensitive_dict_results = get_sensitive(uid_list)
    print 'sensitive results len:', len(sensitive_results)
    # compute text attribute
    bulk_action = []
    count = 0
    for user in uid_list:
        count += 1
        results = {}       
        #get user text attribute: online_pattern
        results['online_pattern'] = json.dumps(online_pattern_dict[user])
        try:
            results['online_pattern_aggs'] = '&'.join(online_pattern_dict[user].keys())
        except Exception:
            #online_pattern may be missing or not a dict for this user
            results['online_pattern_aggs'] = ''
        results['uid'] = str(user)
        #add user flow information: hashtag, activity_geo, keywords
        flow_dict = flow_result[str(user)]
        results = dict(results, **flow_dict)
        #add user filter_keywords attribute
        results['filter_keywords'] = json.dumps(filter_keywords_dict[user])

        #add user topic attribute
        user_topic_dict = topic_results_dict[user]
        user_label_dict = topic_results_label[user]
        results['topic'] = json.dumps(user_topic_dict)         # {'topic1_en':pro1, 'topic2_en':pro2...}
        results['topic_string'] = topic_en2ch(user_label_dict) # 'topic1_ch&topic2_ch&topic3_ch'
        
        #add user domain attribute
        user_domain_dict = domain_results_dict[user]
        user_label_dict = domain_results_label[user]
        results['domain_v3'] = json.dumps(user_domain_dict) # [label1_en, label2_en, label3_en]
        results['domain'] = domain_en2ch(user_label_dict)      # label_ch
        
        #add user character_sentiment attribute
        character_sentiment = character_sentiment_result_dict[user]
        results['character_sentiment'] = character_sentiment
        #add user character_text attribute
        character_text = character_text_result_dict[user]
        results['character_text'] = character_text
        
        #add user profile attribute
        register_dict = register_result[str(user)]
        results = dict(results, **register_dict)
        #add user_evaluate attribute---importance
        results['importance'] = get_importance(results['domain'], results['topic_string'], results['fansnum'], fansnum_max)
        #add user_evaluate attribute---activeness
        user_activeness_time = activeness_results[user]
        user_activeness_geo = json.loads(results['activity_geo_dict'])[-1]
        results['activeness'] = get_activeness(user_activeness_geo, user_activeness_time)
        #add user_evaluate attribute---influence
        results['influence'] = influence_results[user]
        #add user sensitive attribute
        results['sensitive'] = sensitive_results[user]
        results['sensitive_dict'] = sensitive_dict_results[user]
        results['sensitive_string'] = sensitive_string_results[user]
        #bulk_action
        action = {'index':{'_id': str(user)}}
        bulk_action.extend([action, results])
        
    status = save_user_results(bulk_action)
    
    return status
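
# Both loops above layer dicts into the final document with dict(results,
# **other): keyword keys overwrite existing ones, so profile fields from
# register_dict win over any same-named fields already in results. A minimal
# illustration with made-up values:
results = {'uid': '123', 'fansnum': 0}
register_dict = {'fansnum': 42, 'location': 'Beijing'}
merged = dict(results, **register_dict)
print merged  # fansnum becomes 42; location is added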