def main():
    """Build one portrait document per user and save them in a single bulk call.

    Reads the uid list, loads the weibo of those users, and for every user
    present in the weibo mapping merges (in this order) the text attributes,
    the flow information, the evaluation indexes and the profile information
    into one document.

    Returns:
        Always ``True``; the value returned by ``save_user_results`` is not
        propagated to the caller.
    """
    seed_uids = read_uid_list()
    # {user: [weibos]}; only users present in this mapping are processed.
    user_weibo_dict = read_user_weibo(seed_uids)
    active_uids = user_weibo_dict.keys()
    flow_by_uid = get_flow_information(active_uids)
    profile_by_uid = get_profile_information(active_uids)

    pending = []
    for uid in user_weibo_dict:
        weibos = user_weibo_dict[uid]
        # Not used below; the lookup still raises if the user has no weibo.
        uname = weibos[0]['uname']
        portrait = compute_text_attribute(uid, weibos)
        portrait['uid'] = str(uid)
        portrait = dict(portrait, **flow_by_uid[str(uid)])

        evaluation = get_evaluate_index(
            {
                'uid': str(uid),
                'domain': portrait['domain'],
                'topic': portrait['topic'],
                'activity_geo': portrait['activity_geo'],
            },
            status='insert'
        )
        portrait = dict(portrait, **evaluation)
        portrait = dict(portrait, **profile_by_uid[str(uid)])

        # The bulk payload alternates an action header and a document body.
        pending.extend([{'index': {'_id': str(uid)}}, portrait])

    status = save_user_results(pending)
    return True
# save by bulk
def compute_attribute(user_weibo_dict):
    """Compute portrait documents for the given users and index them in batches.

    Args:
        user_weibo_dict: mapping of uid to that user's weibo list.

    Returns:
        The string ``"1"``.

    Each document merges the text attributes, the flow information, the
    evaluation indexes and the profile information of one user.  Documents
    are sent to ``es.bulk`` every 200 users, plus one final call for the
    remainder.
    """
    chunk_size = 1000
    uid_list = list(user_weibo_dict.keys())

    # Flow data is fetched in chunks of 1000 uids.  The chunks are merged so
    # that every user can be looked up afterwards; previously each chunk
    # overwrote the one before it.
    # NOTE(review): assumes get_flow_information returns a dict keyed by
    # str(uid), as the lookup below implies -- confirm.
    flow_result = {}
    for start in range(0, len(uid_list), chunk_size):
        flow_result.update(
            get_flow_information(uid_list[start:start + chunk_size]))
    # profile (background) information update
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    for user in uid_list:
        weibo_list = user_weibo_dict[user]
        # text attribute computation
        results = compute_text_attribute(user, weibo_list)
        results['uid'] = str(user)
        results.update(flow_result[str(user)])
        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['geo_string'],
        }
        results.update(get_evaluate_index(user_info, status='insert'))
        results.update(register_result[user])
        # The bulk payload alternates an action header and a document body.
        bulk_action.extend([{'index': {'_id': str(user)}}, results])
        count += 1
        if count % 200 == 0:
            es.bulk(bulk_action, index='sensitive_user_portrait_0103',
                    doc_type="user", timeout=60)
            bulk_action = []
            print(count)
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_portrait_0103',
                doc_type="user", timeout=60)
    return "1"
def compute_attribute(uid_list=None):
    """Load weibo for ``uid_list``, compute portrait documents and save them.

    Args:
        uid_list: uids to read weibo for.  Defaults to an empty list; ``None``
            is used as the default so no list object is shared between calls.

    Returns:
        The string ``"1"``.

    Only users returned by ``read_user_weibo`` are processed.  Documents are
    sent to ``es.bulk`` (index ``index_name``) every 200 users; the remainder
    goes through ``save_user_results``.
    """
    if uid_list is None:
        uid_list = []
    user_weibo_dict = read_user_weibo(uid_list)
    uid_list = user_weibo_dict.keys()
    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    for user in uid_list:
        weibo_list = user_weibo_dict[user]
        uname = weibo_list[0]['uname']
        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        results.update(flow_result[str(user)])
        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['geo_string'],
        }
        results.update(get_evaluate_index(user_info, status='insert'))
        results.update(register_result[user])
        # The bulk payload alternates an action header and a document body.
        bulk_action.extend([{'index': {'_id': str(user)}}, results])
        count += 1
        if count % 200 == 0:
            es.bulk(bulk_action, index=index_name, doc_type="user",
                    timeout=60)
            bulk_action = []
            print(count)
    if bulk_action:
        # NOTE(review): full batches go through es.bulk directly while the
        # remainder goes through save_user_results -- confirm both write to
        # the same index.
        save_user_results(bulk_action)
    return "1"
def week_update_portrait(user_weibo_dict):
    """Send a partial-update document for every user to ``es.bulk``.

    Args:
        user_weibo_dict: mapping ``{uid: [weibo_text]}``.

    Returns:
        The string ``"1"``.

    The domain, psycho status, topic and politics values written here are
    hard-coded placeholders; the classifier calls they stand in for
    (``attri_domain``, ``attr_psycho_status``, ``attr_topic``,
    ``attri_politics``) are currently disabled.  Updates are flushed every
    1000 users, plus once for the remainder.
    """
    uid_list = user_weibo_dict.keys()
    # profile (background) information update
    register_result = get_profile_information(uid_list)
    bulk_action = []
    count = 0
    for user in uid_list:
        result = dict()
        result.update(register_result[user])

        # placeholder for attri_domain(weibo_list)
        result["domain"] = "test_domain"
        # NOTE(review): joining a plain string puts "&" between its
        # characters; this only yields a sensible value once "domain" is a
        # list of labels -- confirm the intended type.
        result["domain_string"] = "&".join(result["domain"])

        # placeholder for attr_psycho_status(user, weibo_list)
        psycho_status = {"positive": 0.5, "negetive": 0.2, "neutral": 0.3}
        result["psycho_status_string"] = "&".join(psycho_status.keys())
        result["psycho_status"] = json.dumps(psycho_status)

        # placeholder for attr_topic(weibo_list)
        topic = {"政治": 0.3, "民生": 0.7}
        result["topic"] = json.dumps(topic)
        # Join the keys of the dict itself: result["topic"] is already a
        # JSON string and has no keys().
        result["topic_string"] = "&".join(topic.keys())

        # placeholder for attri_politics(user, weibo_list)
        politics_trend = "left"
        result["politics_trend"] = politics_trend

        action = {"update": {"_id": str(user)}}
        bulk_action.extend([action, {"doc": result}])
        # The counter was never advanced before, which made the check below
        # true for every user and flushed one document at a time.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index="sensitive_user_portrait",
                    doc_type="user", timeout=60)
            bulk_action = []
            print(count)
    if bulk_action:
        es.bulk(bulk_action, index="sensitive_user_portrait",
                doc_type="user", timeout=60)
    return "1"
def compute2in(uid_list, user_weibo_dict):
    """Compute full portrait documents for the given users and bulk-save them.

    Args:
        uid_list: uids passed to the flow, profile, activeness and influence
            lookups.
        user_weibo_dict: mapping of uid to that user's weibo list; its keys
            decide which users get a document.

    Returns:
        Always ``True``; the value returned by ``save_user_results`` is not
        propagated.
    """
    # get user flow information: hashtag, activity_geo, keywords
    flow_result = get_flow_information(uid_list)
    # get user topic information (the argument used to be the undefined
    # name ``user_weibo_list``)
    topic_results_dict, topic_results_label = topic_classfiy(user_weibo_dict)
    # get user domain information
    domain_results = domain_classfiy(user_weibo_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    # get user psy information
    psy_results_dict = psychology_classfiy(user_weibo_dict)
    # get user profile information
    register_result = get_profile_information(uid_list)
    # get user fansnum max
    fansnum_max = get_fansnum_max()
    # get user activeness by bulk action
    activeness_results = get_activity_time(uid_list)
    # get user influence by bulk action
    influence_results = get_influence(uid_list)

    # Accumulator for the bulk payload; it was never initialised before.
    bulk_action = []
    for user in user_weibo_dict:
        weibo_list = user_weibo_dict[user]
        uname = weibo_list[0]['uname']
        # compute text attribute: online_pattern
        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        # add flow information: hashtag, activity_geo, keywords
        results = dict(results, **flow_result[str(user)])
        # add topic attribute
        topic_dict = topic_results_dict[user]
        # {topic1_en: pro1, topic2_en: pro2, ...}
        results['topic'] = json.dumps(topic_dict)
        topic_label = topic_results_label[user]
        # topic1_ch&topic2_ch&topic3_ch
        results['topic_string'] = topic_en2ch(topic_label)
        # add domain attribute
        user_domain_dict = domain_results_dict[user]
        user_domain_label = domain_results_label[user]
        # [domain_en1, domain_en2, domain_en3]
        results['domain_v3'] = json.dumps(user_domain_dict)
        # domain_ch
        results['domain_string'] = domain_en2ch(user_domain_label)
        # add psy attribute
        results['psycho_status'] = json.dumps(psy_results_dict[user])
        # add user profile attribute
        results = dict(results, **register_result[str(user)])
        # add user_evaluate attribute---importance
        # NOTE(review): 'domain' is not set in this function; it has to come
        # from compute_text_attribute, the flow data or the profile data --
        # confirm, or use results['domain_string'] instead.
        results['importance'] = get_importance(
            results['domain'], results['topic_string'],
            results['fansnum'], fansnum_max)
        # add user_evaluate attribute---activeness
        user_activeness_time = activeness_results[user]
        user_activeness_geo = json.loads(results['activity_geo_dict'])[-1]
        results['activeness'] = get_activeness(user_activeness_geo,
                                               user_activeness_time)
        # add user_evaluate attribute---influence
        results['influence'] = influence_results[user]
        # The bulk payload alternates an action header and a document body.
        action = {'index': {'_id': str(user)}}
        bulk_action.extend([action, results])
    save_user_results(bulk_action)
    return True
def week_update_portrait(user_weibo_dict):
    """Send a partial-update document for every user to ``es.bulk``.

    Args:
        user_weibo_dict: mapping ``{uid: [weibo_text]}``.

    Returns:
        The string ``'1'``.

    The domain, psycho status, topic and politics values written here are
    hard-coded placeholders; the classifier calls they stand in for
    (``attri_domain``, ``attr_psycho_status``, ``attr_topic``,
    ``attri_politics``) are currently disabled.  Updates are flushed every
    1000 users, plus once for the remainder.
    """
    uid_list = user_weibo_dict.keys()
    # profile (background) information update
    register_result = get_profile_information(uid_list)
    bulk_action = []
    count = 0
    for user in uid_list:
        result = dict()
        result.update(register_result[user])

        # placeholder for attri_domain(weibo_list)
        result['domain'] = 'test_domain'
        # NOTE(review): joining a plain string puts "&" between its
        # characters; this only yields a sensible value once 'domain' is a
        # list of labels -- confirm the intended type.
        result['domain_string'] = "&".join(result['domain'])

        # placeholder for attr_psycho_status(user, weibo_list)
        psycho_status = {'positive': 0.5, 'negetive': 0.2, 'neutral': 0.3}
        result['psycho_status_string'] = '&'.join(psycho_status.keys())
        result['psycho_status'] = json.dumps(psycho_status)

        # placeholder for attr_topic(weibo_list)
        topic = {'政治': 0.3, '民生': 0.7}
        result['topic'] = json.dumps(topic)
        # Join the keys of the dict itself: result['topic'] is already a
        # JSON string and has no keys().
        result['topic_string'] = '&'.join(topic.keys())

        # placeholder for attri_politics(user, weibo_list)
        politics_trend = 'left'
        result['politics_trend'] = politics_trend

        action = {'update': {'_id': str(user)}}
        bulk_action.extend([action, {'doc': result}])
        # The counter was never advanced before, which made the check below
        # true for every user and flushed one document at a time.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index='sensitive_user_portrait',
                    doc_type='user', timeout=60)
            bulk_action = []
            print(count)
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_portrait',
                doc_type='user', timeout=60)
    return '1'
def compute2in(uid_list, user_weibo_dict, status='insert'):
    """Compute portrait documents and bulk-save them as inserts or updates.

    Args:
        uid_list: uids passed to the flow and profile lookups.
        user_weibo_dict: mapping of uid to that user's weibo list; its keys
            decide which users get a document.
        status: ``'insert'`` emits ``index`` actions with the full document;
            any other value emits ``update`` actions with the document
            wrapped in ``{'doc': ...}``.

    Returns:
        Always ``True``; the value returned by ``save_user_results`` is not
        propagated.
    """
    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)

    # Accumulator for the bulk payload; it was never initialised before.
    bulk_action = []
    for user in user_weibo_dict:
        weibo_list = user_weibo_dict[user]
        uname = weibo_list[0]['uname']
        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        results = dict(results, **flow_result[str(user)])
        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['activity_geo'],
        }
        # NOTE(review): the evaluation index is always requested with
        # status='insert', even when this function runs in update mode --
        # confirm that is intended.
        evaluation_index = get_evaluate_index(user_info, status='insert')
        results = dict(results, **evaluation_index)
        results = dict(results, **register_result[user])
        if status == 'insert':
            action = {'index': {'_id': str(user)}}
        else:
            # The action metadata must be a dict; the original
            # {'_id', str(user)} built a set.
            action = {'update': {'_id': str(user)}}
            results = {'doc': results}
        bulk_action.extend([action, results])
    # The save result is deliberately not bound to ``status`` so the
    # parameter keeps its meaning.
    save_user_results(bulk_action)
    return True
def test_cron_text_attribute(user_weibo_dict):
    """Compute portrait documents for the given users and save them in batches.

    Args:
        user_weibo_dict: mapping ``{user: [weibos]}``.

    Returns:
        The value returned by the last ``save_user_results`` call, or
        ``False`` when nothing was saved.

    Documents are saved every 20 users and once more for the remainder.
    Progress and intermediate results are printed to stdout.
    """
    # Defined up front so the final return is valid even when the last batch
    # is empty (no users, or a user count that is a multiple of 20).
    status = False
    print('start cron_text_attribute')
    uid_list = user_weibo_dict.keys()
    print('user count: %s' % len(uid_list))
    # get user flow information: hashtag, activity_geo, keywords
    print('get flow result')
    flow_result = get_flow_information(uid_list)
    print('flow result len: %s' % len(flow_result))
    # get user profile information
    print('get register result')
    register_result = get_profile_information(uid_list)
    print('register result len: %s' % len(register_result))
    # get topic and domain input data
    # (the weibo strings are also the tendency input data)
    user_weibo_string_dict = get_user_weibo_string(user_weibo_dict)
    user_keywords_dict = get_user_keywords_dict(user_weibo_string_dict)
    # get user event results by bulk action
    event_results_dict = event_classfiy(user_weibo_string_dict)
    print('event_result len: %s' % len(event_results_dict))
    # get user topic and domain by bulk action
    print('get topic and domain')
    topic_results_dict, topic_results_label = topic_classfiy(
        user_keywords_dict)
    domain_results = domain_classfiy(user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    print('topic result len: %s' % len(topic_results_dict))
    print('domain result len: %s' % len(domain_results_dict))
    # get user character attribute
    print('get character result')
    now_ts = time.time()
    # NOTE(review): test override -- the current time is replaced by a fixed
    # date, so the character window never moves. Remove for production use.
    now_ts = datetime2ts('2013-09-08')
    character_end_time = ts2datetime(now_ts - DAY)
    character_start_time = ts2datetime(now_ts - DAY * CHARACTER_TIME_GAP)
    # type_mark = 0/1 identifies the task input status (sentiment or text).
    # NOTE(review): both calls below pass 1 -- confirm that is intended.
    character_type_mark = 1
    character_sentiment_result_dict = classify_sentiment(
        uid_list, character_start_time, character_end_time,
        character_type_mark)
    character_type_mark = 1
    character_text_result_dict = classify_topic(
        uid_list, character_start_time, character_end_time,
        character_type_mark)
    print('character result len: %s %s' % (
        len(character_sentiment_result_dict),
        len(character_text_result_dict)))
    print('character_sentiment_result: %s'
          % (character_sentiment_result_dict,))
    print('character_text_result: %s' % (character_text_result_dict,))
    # get user fansnum max
    fansnum_max = get_fansnum_max()
    # get user activeness by bulk action
    print('get activeness results')
    activeness_results = get_activity_time(uid_list)
    print('activeness result len: %s' % len(activeness_results))
    # get user influence by bulk action
    print('get influence')
    influence_results = get_influence(uid_list)
    print('influence results len: %s' % len(influence_results))

    # compute text attribute
    user_set = set()
    bulk_action = []
    count = 0
    for user in user_weibo_dict:
        count += 1
        user_set.add(user)
        weibo_list = user_weibo_dict[user]
        # get user text attribute: online_pattern
        results = compute_text_attribute(user, weibo_list)
        results['uid'] = str(user)
        # add user flow information: hashtag, activity_geo, keywords
        results = dict(results, **flow_result[str(user)])
        # add user topic attribute
        user_topic_dict = topic_results_dict[user]
        user_topic_label = topic_results_label[user]
        # {'topic1_en': pro1, 'topic2_en': pro2, ...}
        results['topic'] = json.dumps(user_topic_dict)
        # 'topic1_ch&topic2_ch&topic3_ch'
        results['topic_string'] = topic_en2ch(user_topic_label)
        # add user event attribute
        results['tendency'] = event_results_dict[user]
        # add user domain attribute
        user_domain_dict = domain_results_dict[user]
        user_domain_label = domain_results_label[user]
        # [label1_en, label2_en, label3_en]
        results['domain_v3'] = json.dumps(user_domain_dict)
        # label_ch
        results['domain'] = domain_en2ch(user_domain_label)
        # add user character_sentiment attribute
        results['character_sentiment'] = character_sentiment_result_dict[user]
        # add user character_text attribute
        results['character_text'] = character_text_result_dict[user]
        # 'psycho_status' is not emitted: the psychology_classfiy call that
        # produced psy_results_dict is disabled, so that name is never
        # assigned in this function and must not be read here.
        # add user profile attribute
        results = dict(results, **register_result[str(user)])
        # add user_evaluate attribute---importance
        results['importance'] = get_importance(
            results['domain'], results['topic_string'],
            results['fansnum'], fansnum_max)
        # add user_evaluate attribute---activeness
        user_activeness_time = activeness_results[user]
        user_activeness_geo = json.loads(results['activity_geo_dict'])[-1]
        results['activeness'] = get_activeness(user_activeness_geo,
                                               user_activeness_time)
        # add user_evaluate attribute---influence
        results['influence'] = influence_results[user]
        # The bulk payload alternates an action header and a document body.
        action = {'index': {'_id': str(user)}}
        bulk_action.extend([action, results])
        if count >= 20:
            status = save_user_results(bulk_action)
            print('bulk_action: %s' % (bulk_action,))
            bulk_action = []
            count = 0
    print('user_set len: %s' % len(user_set))
    print('count: %s' % count)
    print('bulk_action count: %s' % len(bulk_action))
    print('bulk_action: %s' % (bulk_action,))
    if bulk_action:
        status = save_user_results(bulk_action)
    return status
# save by bulk
def test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts,filter_keywords_dict):
    """Compute portrait documents from precomputed inputs and bulk-save them.

    Args:
        user_keywords_dict: per-user keywords; its keys decide which users
            are processed.
        user_weibo_dict: per-user weibo, passed to ``classify_sentiment``.
        online_pattern_dict: per-user online pattern, stored JSON-encoded.
        character_start_ts: timestamp of the first day of the character
            window.
        filter_keywords_dict: per-user filter keywords, stored JSON-encoded.

    Returns:
        The value returned by ``save_user_results`` for the single bulk call.
    """
    print('start cron_text_attribute')
    uid_list = user_keywords_dict.keys()
    # get user flow information: hashtag, activity_geo, keywords
    print('get flow result')
    flow_result = get_flow_information_v2(uid_list, user_keywords_dict)
    print('flow result len: %s' % len(flow_result))
    # get user profile information
    print('get register result')
    register_result = get_profile_information(uid_list)
    print('register result len: %s' % len(register_result))
    # get user topic and domain by bulk action
    print('get topic and domain')
    topic_results_dict, topic_results_label = topic_classfiy(
        uid_list, user_keywords_dict)
    print('%s %s' % (topic_results_dict, topic_results_label))
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    print('topic result len: %s' % len(topic_results_dict))
    print('domain result len: %s' % len(domain_results_dict))
    # get user character attribute
    print('get character result')
    character_start_time = ts2datetime(character_start_ts)
    character_end_time = ts2datetime(
        character_start_ts + DAY * CHARACTER_TIME_GAP - DAY)
    print('character_start_time: %s' % (character_start_time,))
    print('character_end_time: %s' % (character_end_time,))
    character_sentiment_result_dict = classify_sentiment(
        uid_list, user_weibo_dict, character_start_time, character_end_time,
        WEIBO_API_INPUT_TYPE)
    character_text_result_dict = classify_topic(uid_list, user_keywords_dict)
    print('character result len: %s %s' % (
        len(character_sentiment_result_dict),
        len(character_text_result_dict)))
    # get user fansnum max
    fansnum_max = get_fansnum_max()
    # get user activeness by bulk action
    print('get activeness results')
    activeness_results = get_activity_time(uid_list)
    print('activeness result len: %s' % len(activeness_results))
    # get user influence by bulk action
    print('get influence')
    influence_results = get_influence(uid_list)
    print('influence results len: %s' % len(influence_results))
    # get user sensitive by bulk action
    print('get sensitive')
    sensitive_results, sensitive_string_results, sensitive_dict_results = \
        get_sensitive(uid_list)
    print('sensitive results len: %s' % len(sensitive_results))

    # compute text attribute
    bulk_action = []
    for user in uid_list:
        results = {}
        # get user text attribute: online_pattern
        online_pattern = online_pattern_dict[user]
        results['online_pattern'] = json.dumps(online_pattern)
        try:
            results['online_pattern_aggs'] = '&'.join(online_pattern.keys())
        except (AttributeError, TypeError, ValueError):
            # Best effort: the pattern is not a mapping, or its keys cannot
            # be joined as text.
            results['online_pattern_aggs'] = ''
        results['uid'] = str(user)
        # add user flow information: hashtag, activity_geo, keywords
        results = dict(results, **flow_result[str(user)])
        # filter keywords
        results['filter_keywords'] = json.dumps(filter_keywords_dict[user])
        # add user topic attribute
        user_topic_dict = topic_results_dict[user]
        user_topic_label = topic_results_label[user]
        # {'topic1_en': pro1, 'topic2_en': pro2, ...}
        results['topic'] = json.dumps(user_topic_dict)
        # 'topic1_ch&topic2_ch&topic3_ch'
        results['topic_string'] = topic_en2ch(user_topic_label)
        # add user domain attribute
        user_domain_dict = domain_results_dict[user]
        user_domain_label = domain_results_label[user]
        # [label1_en, label2_en, label3_en]
        results['domain_v3'] = json.dumps(user_domain_dict)
        # label_ch
        results['domain'] = domain_en2ch(user_domain_label)
        # add user character_sentiment attribute
        results['character_sentiment'] = character_sentiment_result_dict[user]
        # add user character_text attribute
        results['character_text'] = character_text_result_dict[user]
        # add user profile attribute
        results = dict(results, **register_result[str(user)])
        # add user_evaluate attribute---importance
        results['importance'] = get_importance(
            results['domain'], results['topic_string'],
            results['fansnum'], fansnum_max)
        # add user_evaluate attribute---activeness
        user_activeness_time = activeness_results[user]
        user_activeness_geo = json.loads(results['activity_geo_dict'])[-1]
        results['activeness'] = get_activeness(user_activeness_geo,
                                               user_activeness_time)
        # add user_evaluate attribute---influence
        results['influence'] = influence_results[user]
        # add user sensitive attribute
        results['sensitive'] = sensitive_results[user]
        results['sensitive_dict'] = sensitive_dict_results[user]
        results['sensitive_string'] = sensitive_string_results[user]
        # The bulk payload alternates an action header and a document body.
        action = {'index': {'_id': str(user)}}
        bulk_action.extend([action, results])
    status = save_user_results(bulk_action)
    return status
def test_cron_text_attribute_v2(user_keywords_dict, user_weibo_dict, online_pattern_dict, character_start_ts, relation_mark_dict, task_mark, submit_user_dict, submit_ts_dict):
    """Compute portrait documents, save them, and optionally save relations.

    Args:
        user_keywords_dict: per-user keywords; its keys decide which users
            are processed.
        user_weibo_dict: not used by this function.
        online_pattern_dict: per-user online pattern, stored JSON-encoded.
        character_start_ts: not used by this function.
        relation_mark_dict: passed to ``person_organization`` when
            ``task_mark`` is ``'user'``.
        task_mark: ``'user'`` additionally triggers the relation computation.
        submit_user_dict: per-user submitter, stored on index.
        submit_ts_dict: per-user submit timestamp, stored on index.

    Returns:
        When ``task_mark == 'user'``: ``True`` only if both the portrait save
        and the relation save returned a truthy value, else ``False``.
        Otherwise the value returned by ``save_user_results``.

    Documents are written as ``index`` actions when both submit dicts are
    non-empty, and as ``update`` actions (wrapped in ``{'doc': ...}``)
    otherwise.
    """
    # mark index or update
    if submit_user_dict and submit_ts_dict:
        save_type = 'index'
    else:
        save_type = 'update'
    print('start cron_text_attribute')
    uid_list = user_keywords_dict.keys()
    # get user flow information: hashtag, activity_geo, keywords, ip
    print('get flow result')
    flow_result = get_flow_information_v2(uid_list, user_keywords_dict)
    print('flow result len: %s' % len(flow_result))
    # get user profile information
    print('get register result')
    register_result = get_profile_information(uid_list)
    print('register result len: %s' % len(register_result))
    # get user topic and domain by bulk action
    print('get topic and domain')
    topic_results_dict, topic_results_label = topic_classfiy(
        uid_list, user_keywords_dict)
    domain_results = domain_classfiy(uid_list, user_keywords_dict)
    domain_results_dict = domain_results[0]
    domain_results_label = domain_results[1]
    print('topic result len: %s' % len(topic_results_dict))
    print('domain result len: %s' % len(domain_results_dict))
    # get user fansnum max
    fansnum_max, user_fansnum_dict = get_fansnum_max(uid_list)
    print('fansnum len: %s' % len(user_fansnum_dict))
    # get user activeness by bulk action
    print('get activeness results')
    activeness_results = get_activity_time(uid_list)
    print('activeness result len: %s' % len(activeness_results))
    # get user influence by bulk action
    print('get influence')
    influence_results = get_influence(uid_list)
    print('influence results len: %s' % len(influence_results))

    # compute text attribute
    bulk_action = []
    for user in uid_list:
        results = {}
        # add submit_user and submit_ts
        if save_type == 'index':
            results['submit_user'] = submit_user_dict[user]
            results['submit_ts'] = submit_ts_dict[user]
        # get user text attribute: online_pattern
        online_pattern = online_pattern_dict[user]
        results['online_pattern'] = json.dumps(online_pattern)
        try:
            results['online_pattern_aggs'] = '&'.join(online_pattern.keys())
        except (AttributeError, TypeError, ValueError):
            # Best effort: the pattern is not a mapping, or its keys cannot
            # be joined as text.
            results['online_pattern_aggs'] = ''
        results['uid'] = str(user)
        # add user flow information: hashtag, activity_geo, keywords, ip
        results = dict(results, **flow_result[str(user)])
        # add user topic attribute
        user_topic_dict = topic_results_dict[user]
        user_topic_label = topic_results_label[user]
        # {'topic1_en': pro1, 'topic2_en': pro2, ...}
        results['topic'] = json.dumps(user_topic_dict)
        # 'topic1_ch&topic2_ch&topic3_ch'
        results['topic_string'] = topic_en2ch(user_topic_label)
        # add user domain attribute
        user_domain_dict = domain_results_dict[user]
        user_domain_label = domain_results_label[user]
        # [label1_en, label2_en, label3_en]
        results['domain_v3'] = json.dumps(user_domain_dict)
        # label_ch
        results['domain'] = domain_en2ch(user_domain_label)
        # add user profile attribute
        results = dict(results, **register_result[str(user)])
        # add user_evaluate attribute---importance
        results['importance'] = get_importance(
            results['domain'], results['topic_string'],
            user_fansnum_dict[user], fansnum_max)
        # add user_evaluate attribute---activeness
        user_activeness_time = activeness_results[user]
        user_activeness_geo = json.loads(results['activity_geo_dict'])[-1]
        results['activeness'] = get_activeness(user_activeness_geo,
                                               user_activeness_time)
        # add user_evaluate attribute---influence
        results['influence'] = influence_results[user]
        # The bulk payload alternates an action header and a document body;
        # updates carry the document under 'doc'.
        if save_type == 'index':
            action = {'index': {'_id': str(user)}}
            bulk_action.extend([action, results])
        else:
            action = {'update': {'_id': str(user)}}
            bulk_action.extend([action, {'doc': results}])
    status = save_user_results(bulk_action)
    print('save es_user_portrait: %s' % (status,))
    # compute relation
    if task_mark == 'user':
        save_status = person_organization(uid_list, relation_mark_dict)
        print('save_status: %s' % (save_status,))
        # save_status only exists in this branch, so the combined status is
        # computed here as well.
        status = bool(status and save_status)
    return status