def compute_attribute(user_weibo_dict):
    """Build and index one attribute document per user.

    For every uid in ``user_weibo_dict`` the text attributes, flow
    information, evaluation indexes and profile information are merged into
    a single document, which is bulk-indexed into
    ``sensitive_user_portrait_0103`` under the uid as ``_id``.

    Args:
        user_weibo_dict: mapping of uid to that user's weibo list; the list
            is handed to ``compute_text_attribute`` unchanged.

    Returns:
        The string "1" once all documents have been submitted.
    """
    chunk_size = 1000  # uids sent to get_flow_information() per request
    flush_every = 200  # documents sent to es.bulk() per request
    index = 'sensitive_user_portrait_0103'

    uid_list = list(user_weibo_dict.keys())
    if not uid_list:
        # Nothing to compute; avoid issuing empty lookups.
        return "1"

    # Profile (background) information update: fetched once for all users.
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    # Flow information is fetched chunk by chunk, and only the users of the
    # current chunk are processed against it, so every user is looked up in
    # the flow_result that actually covers them.
    for start in range(0, len(uid_list), chunk_size):
        chunk = uid_list[start:start + chunk_size]
        flow_result = get_flow_information(chunk)  # flow data update
        for user in chunk:
            weibo_list = user_weibo_dict[user]
            # Text attribute computation.
            results = compute_text_attribute(user, weibo_list)
            results['uid'] = str(user)
            results.update(flow_result[str(user)])
            user_info = {
                'uid': str(user),
                'domain': results['domain'],
                'topic': results['topic'],
                'activity_geo': results['geo_string'],
            }
            results.update(get_evaluate_index(user_info, status='insert'))
            results.update(register_result[user])
            action = {'index': {'_id': str(user)}}
            bulk_action.extend([action, results])
            count += 1
            if count % flush_every == 0:
                es.bulk(bulk_action, index=index, doc_type="user", timeout=60)
                bulk_action = []
                print(count)
    # Flush whatever is left after the last full batch.
    if bulk_action:
        es.bulk(bulk_action, index=index, doc_type="user", timeout=60)
    return "1"
def compute_attribute(uid_list=None):
    """Build and index one attribute document per user read for ``uid_list``.

    The weibo of the requested users are loaded with ``read_user_weibo``;
    for every uid that comes back, the text attributes, user name, flow
    information, evaluation indexes and profile information are merged into
    one document and bulk-indexed under the uid as ``_id``.

    Args:
        uid_list: uids to load; defaults to an empty list, which is passed to
            ``read_user_weibo`` as-is.

    Returns:
        The string "1" once all documents have been submitted.
    """
    # A None sentinel replaces the former mutable ``[]`` default.
    if uid_list is None:
        uid_list = []
    user_weibo_dict = read_user_weibo(uid_list)
    # From here on only the uids that actually have weibo are processed.
    uid_list = list(user_weibo_dict.keys())
    flow_result = get_flow_information(uid_list)
    register_result = get_profile_information(uid_list)

    bulk_action = []
    count = 0
    for user in uid_list:
        weibo_list = user_weibo_dict[user]
        # The user name is taken from the first weibo of the user.
        uname = weibo_list[0]['uname']
        results = compute_text_attribute(user, weibo_list)
        results['uname'] = uname
        results['uid'] = str(user)
        results.update(flow_result[str(user)])
        user_info = {
            'uid': str(user),
            'domain': results['domain'],
            'topic': results['topic'],
            'activity_geo': results['geo_string'],
        }
        results.update(get_evaluate_index(user_info, status='insert'))
        results.update(register_result[user])
        action = {'index': {'_id': str(user)}}
        bulk_action.extend([action, results])
        count += 1
        if count % 200 == 0:
            es.bulk(bulk_action, index=index_name, doc_type="user", timeout=60)
            bulk_action = []
            print(count)
    # NOTE(review): the remainder goes through save_user_results() while full
    # batches call es.bulk() directly with doc_type="user" and timeout=60 --
    # confirm both paths are meant to target the same index/type.
    if bulk_action:
        save_user_results(bulk_action)
    return "1"
def save_text2es():
    """Index every row of ./sensitive_uid_text_2.csv into Elasticsearch.

    Each CSV row is read as (uid, text, mid, ip, timestamp, message_type).
    The row is enriched with a geo value, a JSON-encoded sentiment and the
    sensitive words found in the text, then bulk-indexed into
    ``sensitive_user_text`` with the weibo ``mid`` as ``_id``.
    """
    count = 0
    bulk_action = []
    # The file is opened in binary mode and the text column is decoded by
    # hand; the context manager guarantees the handle is closed even if a
    # row fails.
    with open('./sensitive_uid_text_2.csv', 'rb') as csvfile:
        for line in csv.reader(csvfile):
            count += 1
            weibo = dict()
            weibo['text'] = line[1].decode('utf-8', 'ignore')
            weibo['mid'] = line[2]
            weibo['geo'] = ip2geo(line[3])
            weibo['timestamp'] = line[4]
            weibo['message_type'] = line[5]
            weibo['uid'] = line[0]
            sentiment = attr_liwc([weibo['text']])
            weibo['sentiment'] = json.dumps(sentiment)
            # sensitive_words_extract() is fed an encoded str; ``text`` is
            # bound on both paths so it can never be undefined or left over
            # from a previous row.
            text = weibo['text']
            if not isinstance(text, str):
                text = text.encode('utf-8', 'ignore')
            sw_dict = sensitive_words_extract(text)
            if sw_dict:
                weibo['sensitive_words'] = json.dumps(sw_dict)
                weibo['sensitive'] = 1
            else:
                weibo['sensitive'] = 0
            action = {'index': {'_id': weibo['mid']}}
            bulk_action.extend([action, weibo])
            if count % 1000 == 0:
                es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
                bulk_action = []
                print(count)
    # Flush the rows left over after the last full batch of 1000.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
def week_update_portrait(user_weibo_dict):
    """Send a partial-update document for every user in ``user_weibo_dict``.

    Each update carries the user's profile information plus domain,
    psychological status, topic and political tendency fields, and is
    bulk-submitted to ``sensitive_user_portrait`` in batches of 1000.

    Args:
        user_weibo_dict: mapping {uid: [weibo_text]}; only its keys are used
            while the attribute computations below are stubbed out.

    Returns:
        The string "1" once all updates have been submitted.
    """
    uid_list = list(user_weibo_dict.keys())
    # Profile (background) information update.
    register_result = get_profile_information(uid_list)
    bulk_action = []
    count = 0
    for user in uid_list:
        result = dict()
        result.update(register_result[user])

        # TODO: the values below are hard-coded placeholders standing in for
        # attri_domain(weibo_list), attr_psycho_status(user, weibo_list),
        # attr_topic(weibo_list) and attri_politics(user, weibo_list).
        result["domain"] = "test_domain"
        # NOTE(review): joining a plain str puts "&" between its characters;
        # this only yields a meaningful value once domain is a sequence.
        result["domain_string"] = "&".join(result["domain"])

        psycho_status = {"positive": 0.5, "negetive": 0.2, "neutral": 0.3}
        result["psycho_status_string"] = "&".join(psycho_status.keys())
        result["psycho_status"] = json.dumps(psycho_status)

        topic = {"政治": 0.3, "民生": 0.7}
        result["topic"] = json.dumps(topic)
        # Join the keys of the dict itself: result["topic"] is already a JSON
        # string at this point and has no keys().
        result["topic_string"] = "&".join(topic.keys())

        result["politics_trend"] = "left"

        action = {"update": {"_id": str(user)}}
        bulk_action.extend([action, {"doc": result}])
        # The counter must advance per user, otherwise the modulo test below
        # is always true and every document is flushed on its own.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index="sensitive_user_portrait", doc_type="user", timeout=60)
            bulk_action = []
            print(count)
    # Flush the updates left over after the last full batch.
    if bulk_action:
        es.bulk(bulk_action, index="sensitive_user_portrait", doc_type="user", timeout=60)
    return "1"
def save_user_results(bulk_action):
    """Submit prepared bulk actions to Elasticsearch.

    Args:
        bulk_action: list of bulk entries, handed to ``es.bulk`` unchanged.

    Returns:
        True after the bulk request has been issued.
    """
    # NOTE: a follow-up step that was disabled inside a string literal used
    # to set each user's status to '3' in the Redis hash
    # 'identify_in_sensitive_<date>' (or 'identify_in_influence_<date>' when
    # no status was present) for yesterday's date. It was never executed, so
    # it and the date value it alone consumed have been dropped.
    es.bulk(bulk_action, index=index_name, doc_type=index_type)
    return True
def save_user_results(bulk_action):
    """Submit prepared bulk actions to Elasticsearch.

    Args:
        bulk_action: list of bulk entries, handed to ``es.bulk`` unchanged.

    Returns:
        True after the bulk request has been issued.
    """
    # NOTE: a follow-up step that was disabled inside a string literal used
    # to set each user's status to '3' in the Redis hash
    # 'identify_in_sensitive_<date>' (or 'identify_in_influence_<date>' when
    # no status was present) for yesterday's date. It was never executed, so
    # it and the date value it alone consumed have been dropped.
    es.bulk(bulk_action, index=index_name, doc_type=index_type)
    return True
def update_portrait():
    """Push the flow information of every stored user as a partial update.

    Users are read with ``read_user_weibo()``, their flow information is
    fetched in one call, and one update per user is bulk-submitted to
    ``sensitive_user_portrait`` in batches of 500.

    Returns:
        The string '1' once all updates have been submitted.
    """
    weibo_by_user = read_user_weibo()
    users = weibo_by_user.keys()
    flow_by_user = get_flow_information(users)

    pending = []
    for processed, uid in enumerate(users, 1):
        header = {'update': {'_id': str(uid)}}
        payload = {'doc': flow_by_user[uid]}
        pending += [header, payload]
        if processed % 500 != 0:
            continue
        # A full batch has accumulated: send it and start a new one.
        es.bulk(pending, index='sensitive_user_portrait', doc_type='user', timeout=60)
        pending = []
        print(processed)

    # Send the remainder that did not fill a whole batch.
    if pending:
        es.bulk(pending, index='sensitive_user_portrait', doc_type='user', timeout=60)
    return '1'
def week_update_portrait(user_weibo_dict):
    """Send a partial-update document for every user in ``user_weibo_dict``.

    Each update carries the user's profile information plus domain,
    psychological status, topic and political tendency fields, and is
    bulk-submitted to ``sensitive_user_portrait`` in batches of 1000.

    Args:
        user_weibo_dict: mapping {uid: [weibo_text]}; only its keys are used
            while the attribute computations below are stubbed out.

    Returns:
        The string '1' once all updates have been submitted.
    """
    uid_list = list(user_weibo_dict.keys())
    # Profile (background) information update.
    register_result = get_profile_information(uid_list)
    bulk_action = []
    count = 0
    for user in uid_list:
        result = dict()
        result.update(register_result[user])

        # TODO: the values below are hard-coded placeholders standing in for
        # attri_domain(weibo_list), attr_psycho_status(user, weibo_list),
        # attr_topic(weibo_list) and attri_politics(user, weibo_list).
        result['domain'] = 'test_domain'
        # NOTE(review): joining a plain str puts "&" between its characters;
        # this only yields a meaningful value once domain is a sequence.
        result['domain_string'] = "&".join(result['domain'])

        psycho_status = {'positive': 0.5, 'negetive': 0.2, 'neutral': 0.3}
        result['psycho_status_string'] = '&'.join(psycho_status.keys())
        result['psycho_status'] = json.dumps(psycho_status)

        topic = {'政治': 0.3, '民生': 0.7}
        result['topic'] = json.dumps(topic)
        # Join the keys of the dict itself: result['topic'] is already a JSON
        # string at this point and has no keys().
        result['topic_string'] = '&'.join(topic.keys())

        result['politics_trend'] = 'left'

        action = {'update': {'_id': str(user)}}
        bulk_action.extend([action, {'doc': result}])
        # The counter must advance per user, otherwise the modulo test below
        # is always true and every document is flushed on its own.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index='sensitive_user_portrait', doc_type='user', timeout=60)
            bulk_action = []
            print(count)
    # Flush the updates left over after the last full batch.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_portrait', doc_type='user', timeout=60)
    return '1'
def save_text2es():
    """Index every row of ./sensitive_uid_text_2.csv into Elasticsearch.

    Each CSV row is read as (uid, text, mid, ip, timestamp, message_type).
    The row is enriched with a geo value, a JSON-encoded sentiment and the
    sensitive words found in the text, then bulk-indexed into
    ``sensitive_user_text`` with the weibo ``mid`` as ``_id``.
    """
    count = 0
    bulk_action = []
    # The file is opened in binary mode and the text column is decoded by
    # hand; the context manager guarantees the handle is closed even if a
    # row fails.
    with open('./sensitive_uid_text_2.csv', 'rb') as csvfile:
        for line in csv.reader(csvfile):
            count += 1
            weibo = dict()
            weibo['text'] = line[1].decode('utf-8', 'ignore')
            weibo['mid'] = line[2]
            weibo['geo'] = ip2geo(line[3])
            weibo['timestamp'] = line[4]
            weibo['message_type'] = line[5]
            weibo['uid'] = line[0]
            sentiment = attr_liwc([weibo['text']])
            weibo['sentiment'] = json.dumps(sentiment)
            # sensitive_words_extract() is fed an encoded str; ``text`` is
            # bound on both paths so it can never be undefined or left over
            # from a previous row.
            text = weibo['text']
            if not isinstance(text, str):
                text = text.encode('utf-8', 'ignore')
            sw_dict = sensitive_words_extract(text)
            if sw_dict:
                weibo['sensitive_words'] = json.dumps(sw_dict)
                weibo['sensitive'] = 1
            else:
                weibo['sensitive'] = 0
            action = {'index': {'_id': weibo['mid']}}
            bulk_action.extend([action, weibo])
            if count % 1000 == 0:
                es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
                bulk_action = []
                print(count)
    # Flush the rows left over after the last full batch of 1000.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_text', doc_type='user', timeout=30)
def daily_update_portrait(user_weibo_dict):
    """Send a partial-update document for every user in ``user_weibo_dict``.

    For each user the flow information, temporary text attributes and
    evaluation indexes are merged into one document and bulk-submitted as an
    update to ``sensitive_user_portrait`` in batches of 1000.

    Args:
        user_weibo_dict: mapping {uid: [weibo_text]}.

    Returns:
        The string '1' once all updates have been submitted.
    """
    uid_list = list(user_weibo_dict.keys())
    bulk_action = []
    count = 0
    for user in uid_list:
        results = dict()
        weibo_list = user_weibo_dict[user]
        flow_result = get_flow_information(user)
        text_result = temporary_text_update(user, weibo_list)
        evaluate_result = evaluate_index(user, status='update')
        # Later updates win on key clashes: flow, then text, then evaluation.
        results.update(flow_result)
        results.update(text_result)
        results.update(evaluate_result)
        action = {'update': {'_id': str(user)}}
        bulk_action.extend([action, {'doc': results}])
        # The counter must advance per user, otherwise the modulo test below
        # is always true and every document is flushed on its own.
        count += 1
        if count % 1000 == 0:
            es.bulk(bulk_action, index='sensitive_user_portrait', doc_type='user', timeout=60)
            bulk_action = []
            print(count)
    # Flush the updates left over after the last full batch.
    if bulk_action:
        es.bulk(bulk_action, index='sensitive_user_portrait', doc_type='user', timeout=60)
    return '1'
# add sentiment field to weibo sentiment = get_sentiment_attribute(text) item['sentiment'] = sentiment # add hashtag field to weibo hashtag_string = get_hashtag_attribute(text) if hashtag_string != '': item['hashtag'] = hashtag_string # save action, xdata = expand_index_action(item) bulk_action.extend([action, xdata]) count += 1 if count % 1 == 0 and count != 0: print 'start bulk_action %s' % count es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60) bulk_action = [] count = 0 if read_count % 10000 == 0: te = time.time() print '[%s] cal speed: %s sec/per %s' % ( datetime.now().strftime('%Y-%m-%d %H:%M:%S'), te - ts, 10000) if read_count % 100000 == 0: print '[%s] total cal %s, cost %s sec [avg %s per/sec]' % ( datetime.now().strftime('%Y-%m-%d %H:%M:%S'), read_count, te - tb, read_count / (te - tb)) ts = te
word = word.decode('utf-8') word_set.add(word) sensitive_word_string = '&'.join(list(word_set)) item['sensitive_word'] = sensitive_word_string else: item['sensitive'] = 0 # add sentiment field to weibo sentiment = get_sentiment_attribute(text) item['sentiment'] = sentiment # add hashtag field to weibo hashtag_string = get_hashtag_attribute(text) if hashtag_string != '': item['hashtag'] = hashtag_string # save action, xdata = expand_index_action(item) bulk_action.extend([action, xdata]) count += 1 if count % 1 == 0 and count != 0: print 'start bulk_action %s' % count es.bulk(bulk_action, index=index_name, doc_type=index_type, timeout=60) bulk_action = [] count = 0 if read_count % 10000 == 0: te = time.time() print '[%s] cal speed: %s sec/per %s' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), te - ts, 10000) if read_count % 100000 == 0: print '[%s] total cal %s, cost %s sec [avg %s per/sec]' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), read_count, te - tb, read_count / (te - tb)) ts = te