def compute_attribute(user_weibo_dict):
    """Compute portrait attributes for every user and bulk-index them.

    For each uid in ``user_weibo_dict`` the text attributes, flow
    information, evaluation indexes and profile information are merged into
    one document. Documents are sent to the ``sensitive_user_portrait_0103``
    index (doc type ``user``) in bulk requests of 200 documents.

    Args:
        user_weibo_dict: mapping of uid -> list of that user's weibo items.

    Returns:
        The string "1" after all documents have been submitted.

    Raises:
        KeyError: if a uid is missing from the flow or profile results, or
            if the text attributes lack 'domain', 'topic' or 'geo_string'.
    """
    index_name = 'sensitive_user_portrait_0103'
    flow_batch_size = 1000  # uids per get_flow_information call
    bulk_size = 200         # documents per es.bulk request

    uid_list = list(user_weibo_dict.keys())

    # Flow data update. Results of every batch are merged; previously each
    # batch overwrote the last, so lookups failed beyond 1000 users.
    # Assumes get_flow_information returns a dict keyed by str(uid), as the
    # lookup below requires.
    flow_result = {}
    for start in range(0, len(uid_list), flow_batch_size):
        batch = uid_list[start:start + flow_batch_size]
        flow_result.update(get_flow_information(batch))

    # Background (profile) information update; takes the full uid list, so
    # it only needs to be fetched once.
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    for user in uid_list:
        weibo_list = user_weibo_dict[user]
        results = compute_text_attribute(user, weibo_list)  # text attributes
        results['uid'] = str(user)
        results.update(flow_result[str(user)])

        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['geo_string'],
        }
        results.update(get_evaluate_index(user_info, status='insert'))
        # NOTE(review): profile results are keyed by the raw uid here while
        # flow results use str(uid) -- confirm against the helpers.
        results.update(register_result[user])

        # Bulk body alternates an action line and the document itself.
        bulk_action.extend([{'index': {'_id': str(user)}}, results])
        count += 1
        if count % bulk_size == 0:
            es.bulk(bulk_action, index=index_name, doc_type="user", timeout=60)
            bulk_action = []
            print(count)

    # Flush the final partial batch.
    if bulk_action:
        es.bulk(bulk_action, index=index_name, doc_type="user", timeout=60)
    return "1"
def main():
    """Build the portrait document of every user and save them in bulk.

    Reads the uid list, loads each user's weibo items, merges text
    attributes, flow information, evaluation indexes and profile
    information into one document per user, then hands the whole bulk
    body to ``save_user_results``.

    Returns:
        True, unconditionally.
    """
    # {uid: [weibo, ...]} for the users read from the uid list
    user_weibo_dict = read_user_weibo(read_uid_list())
    uid_list = user_weibo_dict.keys()

    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)

    bulk_action = []
    for user in user_weibo_dict:
        uid = str(user)
        weibo_list = user_weibo_dict[user]
        # NOTE(review): the name is looked up but never stored; sibling
        # functions write it to results['uname'] -- confirm whether that
        # is intended here too.
        uname = weibo_list[0]['uname']

        doc = compute_text_attribute(user, weibo_list)
        doc['uid'] = uid
        doc = dict(doc, **flow_result[uid])

        # Evaluation indexes are derived from a subset of the merged fields.
        evaluation_index = get_evaluate_index(
            {
                'uid': uid,
                'domain': doc['domain'],
                'topic': doc['topic'],
                'activity_geo': doc['activity_geo'],
            },
            status='insert',
        )
        doc = dict(doc, **evaluation_index)
        doc = dict(doc, **register_result[uid])

        # Bulk body: action line followed by the document.
        bulk_action.append({'index': {'_id': uid}})
        bulk_action.append(doc)

    # Save everything with a single bulk call.
    save_user_results(bulk_action)
    return True
def update_attribute_day():
    """Refresh the evaluation indexes of every document in ``user_portrait``.

    Scans the ``user_portrait`` index (doc type ``user``), recomputes the
    evaluation indexes of each user with ``get_evaluate_index(...,
    status="update")`` and writes them back as partial-document updates
    through ``save_user_results``.

    Updates are flushed every ``batch_size`` documents instead of being
    accumulated for the whole index, which is what the original nested
    loops did because the inner loop never broke out.

    This function does not return: it ends the process with
    ``sys.exit(0)`` when the scan is exhausted, and also when reading from
    the scan raises any other exception (after printing it).
    """
    index_name = "user_portrait"
    index_type = "user"
    batch_size = 1000  # same value as the "size" in the scan query

    s_re = scan(es, query={"query": {"match_all": {}}, "size": 1000},
                index=index_name, doc_type=index_type)

    bulk_action = []
    count = 0
    while True:
        try:
            scan_re = next(s_re)["_source"]
        except StopIteration:
            print("all done")
            # Flush whatever is left from the last partial batch.
            if bulk_action:
                save_user_results(bulk_action)
            sys.exit(0)
        except Exception as r:
            # NOTE(review): a failed scan still exits with status 0, as
            # before -- confirm whether a non-zero code is wanted.
            print("%s %s" % (Exception, r))
            sys.exit(0)

        count += 1
        uid = scan_re["uid"]
        evaluate_result = get_evaluate_index({"uid": uid}, status="update")

        # Bulk body: update action line followed by the partial document.
        action = {"update": {"_id": str(uid)}}
        bulk_action.extend([action, {"doc": dict(evaluate_result)}])

        if count % batch_size == 0:
            save_user_results(bulk_action)
            bulk_action = []
def compute_attribute(uid_list=None):
    """Compute portrait attributes for the given uids and bulk-index them.

    Loads the weibo items of ``uid_list``; only uids present in the loaded
    mapping are processed. For each of them the text attributes, user
    name, flow information, evaluation indexes and profile information
    are merged into one document.

    Args:
        uid_list: iterable of uids to process. Defaults to an empty list
            (a ``None`` sentinel replaces the former mutable default).

    Returns:
        The string "1" after all documents have been submitted.
    """
    if uid_list is None:
        uid_list = []

    user_weibo_dict = read_user_weibo(uid_list)
    uid_list = list(user_weibo_dict.keys())
    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    for user in uid_list:
        weibo_list = user_weibo_dict[user]
        uname = weibo_list[0]['uname']  # name taken from the first weibo item

        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        results.update(flow_result[str(user)])

        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['geo_string'],
        }
        results.update(get_evaluate_index(user_info, status='insert'))
        # NOTE(review): profile results are keyed by the raw uid here while
        # flow results use str(uid) -- confirm against the helpers.
        results.update(register_result[user])

        # Bulk body alternates an action line and the document itself.
        bulk_action.extend([{'index': {'_id': str(user)}}, results])
        count += 1
        if count % 200 == 0:
            # NOTE(review): index_name is not defined in this function; it
            # is expected to exist at module level -- confirm.
            es.bulk(bulk_action, index=index_name, doc_type="user", timeout=60)
            bulk_action = []
            print(count)

    # Final partial batch goes through save_user_results, as before.
    if bulk_action:
        save_user_results(bulk_action)
    return "1"
def compute2in(uid_list, user_weibo_dict, status='insert'):
    """Build portrait documents and save them as bulk inserts or updates.

    For each user in ``user_weibo_dict`` the text attributes, user name,
    flow information, evaluation indexes and profile information are
    merged into one document, and the whole bulk body is passed to
    ``save_user_results``.

    Args:
        uid_list: uids passed to the flow and profile lookups.
        user_weibo_dict: mapping of uid -> list of that user's weibo items.
        status: 'insert' emits ``index`` actions with the full document;
            any other value emits ``update`` actions with the document
            wrapped as ``{'doc': ...}``.

    Returns:
        True, unconditionally.
    """
    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)

    # Was never initialised before, so the first extend() raised NameError.
    bulk_action = []
    for user in user_weibo_dict:
        weibo_list = user_weibo_dict[user]
        uname = weibo_list[0]['uname']  # name taken from the first weibo item

        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        results = dict(results, **flow_result[str(user)])

        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['activity_geo'],
        }
        # NOTE(review): the evaluation status stays 'insert' even when this
        # function runs in update mode -- confirm that is intended.
        evaluation_index = get_evaluate_index(user_info, status='insert')
        results = dict(results, **evaluation_index)
        results = dict(results, **register_result[user])

        if status == 'insert':
            action = {'index': {'_id': str(user)}}
        else:
            # The metadata must be a dict; the former {'_id', uid} literal
            # built a set.
            action = {'update': {'_id': str(user)}}
            results = {'doc': results}
        bulk_action.extend([action, results])

    # Result is not bound to `status`, so the parameter is not shadowed.
    save_user_results(bulk_action)
    return True