def get_topic_user():
    result = {}
    index_name = 'weibo_user'
    index_type = 'user'
    '''
    test_user_list = [['1499104401', '1265965213', '3270699555', '2073915493', '1686474312'],
                      ['2803301701', '2105426467', '1665372775', '3716504593', '2892376557'],
                      ['1457530250', '1698513182', '2793591492', '2218894100', '1737961042'],
                      ['1656818110', '1660127070', '1890124610', '1182391230', '1243861100'],
                      ['1680430844', '2998045524', '2202896360', '1639498782', '3494698730'],
                      ['2587093162', '1677675054', '1871767009', '1193111400', '1672418622'],
                      ['1730726640', '1752502540', '1868725480', '1262486750', '1235733080'],
                      ['1250041100', '2275231150', '1268642530', '1658606270', '1857599860'],
                      ['1929496477', '2167425990', '1164667670', '2417139911', '1708853044'],
                      ['1993292930', '1645823930', '1890926610', '1641561810', '2023833990'],
                      ['2005471590', '1233628160', '2074684140', '1396715380', '1236762250'],
                      ['1423592890', '2612799560', '1926127090', '2684951180', '1760607220']]
    '''
    for topic in search_dict:
        result[topic] = []
        user_list = search_dict[topic]
        profile_result = es_user_profile.mget(index=index_name, doc_type=index_type,
                                              body={'ids': user_list})['docs']
        for profile in profile_result:
            uid = profile['_id']
            try:
                uname = profile['_source']['nick_name']
                photo_url = profile['_source']['photo_url']
            except KeyError:  # document not found or field missing
                uname = 'unknown'
                photo_url = 'unknown'
            result[topic].append([uid, uname, photo_url])
    return result
def recommend_new_words(date_list):
    results = []
    for date in date_list:
        date = date.replace('-', '')
        words_dict = r.hgetall('recommend_sensitive_words_' + date)
        if words_dict:
            for key, value in words_dict.items():
                # each hash entry: word -> json [uid_list, count]
                detail = [key]
                value = json.loads(value)
                uid_list = value[0]
                uname = []
                try:
                    search_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                                          body={'ids': uid_list})['docs']
                    for item in search_results:
                        if item['found']:
                            uname.append(item['_source']['nick_name'])
                        else:
                            uname.append('unknown')
                except Exception:
                    uname = uid_list  # fall back to the raw uids if the lookup fails
                detail.extend([uname, value[1]])
                results.append(detail)
    # sort by the recommendation count (index 2), descending
    sorted_results = sorted(results, key=lambda x: x[2], reverse=True)
    return sorted_results
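# A minimal seeding sketch (hypothetical helper, not part of the original
# module): shows the Redis hash layout recommend_new_words() reads. The word,
# uids and count below are made-up; `r` is assumed to be the configured client.
def _seed_recommend_words_example():
    entry = json.dumps([['1499104401', '1265965213'], 5])  # [uid_list, count]
    r.hset('recommend_sensitive_words_20130901', 'example_word', entry)
    # recommend_new_words(['2013-09-01']) would then return entries shaped
    # like ['example_word', [uname, uname], 5], sorted by the trailing count.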
def get_sensitive_user_detail(uid_list, date, sensitive):
    results = []
    index_name = str(date).replace('-', '')  # index_name: 20130901
    user_bci_results = es_cluster.mget(index=index_name, doc_type='bci',
                                       body={'ids': uid_list}, _source=True)['docs']
    user_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user",
                                                body={"ids": uid_list}, _source=True)['docs']
    for i in range(0, len(uid_list)):
        # [uid, uname, location, fansnum, statusnum, influence]
        personal_info = [''] * 6
        uid = uid_list[i]
        personal_info[0] = uid_list[i]
        if user_profile_results[i]['found']:
            profile_dict = user_profile_results[i]['_source']
            personal_info[1] = profile_dict['nick_name']
            personal_info[2] = profile_dict['user_location']
            personal_info[3] = profile_dict['fansnum']
            personal_info[4] = profile_dict['statusnum']
        if user_bci_results[i]['found']:
            personal_info[5] = user_bci_results[i]['_source'].get('user_index', 0)
        else:
            personal_info[5] = 0
        if sensitive:
            sensitive_words = r_cluster.hget('sensitive_' + index_name, str(uid))
            if sensitive_words:
                sensitive_dict = json.loads(sensitive_words)
                personal_info.append(sensitive_dict.keys())
            else:
                personal_info.append([])
        results.append(personal_info)
    return results
def get_sensitive_user_detail(uid_list, date, sensitive):
    es_cluster = es_user_profile
    ts = datetime2ts(date)
    results = []
    index_name = pre_influence_index + str(date).replace('-', '')  # index_name: 20130901
    user_bci_results = es_bci.mget(index=index_name, doc_type='bci', body={'ids': uid_list},
                                   _source=False, fields=['user_index'])['docs']
    user_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user",
                                                body={"ids": uid_list}, _source=True)['docs']
    top_influence_value = get_top_value("user_index", es_bci, index_name, "bci")
    for i in range(0, len(uid_list)):
        # [uid, uname, location, fansnum, statusnum, influence]
        personal_info = [''] * 6
        uid = uid_list[i]
        personal_info[0] = uid_list[i]
        personal_info[1] = uid_list[i]  # fall back to the uid if no nick_name
        if user_profile_results[i]['found']:
            profile_dict = user_profile_results[i]['_source']
            uname = profile_dict['nick_name']
            if uname:
                personal_info[1] = uname
            personal_info[2] = profile_dict['user_location']
            personal_info[3] = profile_dict['fansnum']
            personal_info[4] = profile_dict['statusnum']
        if user_bci_results[i]['found']:
            try:
                tmp_bci = user_bci_results[i]['fields']['user_index'][0]
                # log-scale the raw BCI value onto [0, 100] relative to the day's top value
                influence = math.log(tmp_bci / float(top_influence_value) * 9 + 1, 10) * 100
                personal_info[5] = influence
            except Exception:
                personal_info[5] = 0
        else:
            personal_info[5] = 0
        if sensitive:
            sensitive_words = redis_cluster.hget('sensitive_' + str(ts), str(uid))
            if sensitive_words:
                sensitive_dict = json.loads(sensitive_words)
                personal_info.append(sensitive_dict.keys())
            else:
                personal_info.append([])
        else:
            personal_info.append([])
        results.append(personal_info)
    return results
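# A standalone sketch (hypothetical helper, not in the original module) of the
# influence normalization used above: a raw value in [0, top_value] is mapped
# onto [0, 100] with log10(x / top * 9 + 1) * 100, so the day's top user scores
# exactly 100 and small values are spread out instead of crushed near zero.
def _scale_influence_example(value, top_value):
    if not top_value:
        return 0
    return math.log(value / float(top_value) * 9 + 1, 10) * 100

# _scale_influence_example(top, top) == 100.0 and _scale_influence_example(0, top) == 0.0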
def search_follower(uid, sensitive):
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if sensitive:
            br_uid_results = r.hgetall('sensitive_be_retweet_' + str(uid))
        else:
            br_uid_results = r.hgetall('be_retweet_' + str(uid))
        if br_uid_results:
            for br_uid in br_uid_results:
                if br_uid != uid:
                    # redis returns string values; accumulate as integers
                    count = int(br_uid_results[br_uid])
                    try:
                        stat_results[br_uid] += count
                    except KeyError:
                        stat_results[br_uid] = count
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user',
                                  body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            uname = item['_source']['nick_name']
        except KeyError:
            uname = u'unknown'
        portrait_item = es_portrait_results[i]
        in_status = 1 if portrait_item['found'] else 0  # 1 = in sensitive_user_portrait
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
def identify_user_out(input_uid_list):
    out_user_list = []
    in_user_list = []
    input_len = len(input_uid_list)
    iter_count = 0
    print 'identify user out'
    # split uids into those inside and outside user_portrait, in batches
    while iter_count < input_len:
        iter_user_list = input_uid_list[iter_count: iter_count + DETECT_ITER_COUNT]
        try:
            portrait_result = es_user_portrait.mget(index=portrait_index_name,
                                                    doc_type=portrait_index_type,
                                                    body={'ids': iter_user_list},
                                                    _source=False)['docs']
        except Exception:
            portrait_result = []
        for item in portrait_result:
            uid = item['_id']
            if not item['found']:
                out_user_list.append(uid)
            else:
                in_user_list.append(uid)
        iter_count += DETECT_ITER_COUNT
    print 'get out user portrait information'
    # fetch profile information for the users outside user_portrait
    iter_count = 0
    out_user_count = len(out_user_list)
    out_user_result = []
    while iter_count < out_user_count:
        iter_user_list = out_user_list[iter_count: iter_count + DETECT_ITER_COUNT]
        try:
            profile_result = es_user_profile.mget(index=profile_index_name,
                                                  doc_type=profile_index_type,
                                                  body={'ids': iter_user_list},
                                                  _source=True)['docs']
        except Exception:
            profile_result = []
        for item in profile_result:
            uid = item['_id']
            if item['found']:
                source = item['_source']
                uname = source['nick_name']
                fansnum = source['fansnum']
                statusnum = source['statusnum']
                friendsnum = source['friendsnum']
            else:
                uname = u'未知'  # "unknown"
                fansnum = u'未知'
                statusnum = u'未知'
                friendsnum = u'未知'
            out_user_result.append([uid, uname, fansnum, statusnum, friendsnum])
        iter_count += DETECT_ITER_COUNT
    # sort by fansnum, descending
    sort_out_user_result = sorted(out_user_result, key=lambda x: x[2], reverse=True)
    return in_user_list, sort_out_user_result
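# A generic sketch of the batching pattern above (hypothetical helper, not in
# the original module): mget a long id list in fixed-size chunks so a single
# oversized or failing request cannot take the whole lookup down with it.
def _chunked_mget_example(es_client, index, doc_type, id_list, chunk=1000):
    docs = []
    for start in range(0, len(id_list), chunk):
        try:
            docs.extend(es_client.mget(index=index, doc_type=doc_type,
                                       body={'ids': id_list[start:start + chunk]})['docs'])
        except Exception:
            pass  # skip a failed chunk instead of aborting the run, as above
    return docs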
def search_retweet(uid, sensitive):
    stat_results = dict()
    for db_num in R_DICT:
        r = R_DICT[db_num]
        if not sensitive:
            ruid_results = r.hgetall('retweet_' + str(uid))
        else:
            # sensitive weibo relations are kept in a separate hash
            ruid_results = r.hgetall('sensitive_retweet_' + str(uid))
        if ruid_results:
            for ruid in ruid_results:
                if ruid != uid:
                    # redis returns string values; accumulate as integers
                    count = int(ruid_results[ruid])
                    if ruid in stat_results:
                        stat_results[ruid] += count
                    else:
                        stat_results[ruid] = count
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user',
                                  body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        if item['found']:
            uname = item['_source']['nick_name']
        else:
            uname = u'unknown'
        portrait_item = es_portrait_results[i]
        in_status = 1 if portrait_item['found'] else 0  # 1 = in sensitive_user_portrait
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
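# Illustrative sketch (made-up uids, hypothetical helper, not in the original
# module) of the hash layout search_retweet()/search_follower() aggregate
# over: each hash maps a neighbor uid to how many times the relation was seen.
def _seed_retweet_example():
    r = R_DICT.values()[0]  # any shard; R_DICT is assumed configured
    r.hset('retweet_1499104401', '1265965213', 3)     # 1499104401 retweeted 1265965213 three times
    r.hset('be_retweet_1265965213', '1499104401', 3)  # the reverse edge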
def get_top_user():
    results = dict()
    for domain in domain_dict:
        results[domain] = []
        user_list = domain_dict[domain]
        profile_result = es_user_profile.mget(index=index_name, doc_type=index_type,
                                              body={'ids': user_list})['docs']
        for profile in profile_result:
            uid = profile['_id']
            try:
                uname = profile['_source']['nick_name']
                photo_url = profile['_source']['photo_url']
            except KeyError:
                uname = 'unknown'
                photo_url = 'unknown'
            results[domain].append([uid, uname, photo_url])
    return results
def search_follower(uid, sensitive):
    stat_results = dict()
    r = r_cluster
    if sensitive:
        br_uid_results = r.hgetall('sensitive_be_retweet_' + str(uid))
    else:
        br_uid_results = r.hgetall('be_retweet_' + str(uid))
    if br_uid_results:
        for br_uid in br_uid_results:
            if br_uid != uid:
                # redis returns string values; accumulate as integers
                count = int(br_uid_results[br_uid])
                try:
                    stat_results[br_uid] += count
                except KeyError:
                    stat_results[br_uid] = count
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user',
                                  body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        try:
            uname = item['_source']['nick_name']
        except KeyError:
            uname = u'unknown'
        portrait_item = es_portrait_results[i]
        in_status = 1 if portrait_item['found'] else 0  # 1 = in sensitive_user_portrait
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
def get_user_url(uid_list):
    results = []
    try:
        es_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                     body={"ids": uid_list})['docs']
    except Exception:
        es_results = {}
    for item in es_results:
        # [photo_url, nick_name, uid]; fall back to the uid when not found
        temp = []
        if item['found']:
            temp.append(item['_source']["photo_url"])
            temp.append(item['_source']['nick_name'])
            temp.append(item['_id'])
        else:
            temp.append("unknown")
            temp.append(item['_id'])
            temp.append(item['_id'])
        results.append(temp)
    return results
def search_retweet(uid, sensitive):
    stat_results = dict()
    r = r_cluster
    if not sensitive:
        ruid_results = r.hgetall('retweet_' + str(uid))
    else:
        # sensitive weibo relations are kept in a separate hash
        ruid_results = r.hgetall('sensitive_retweet_' + str(uid))
    if ruid_results:
        for ruid in ruid_results:
            if ruid != uid:
                # redis returns string values; accumulate as integers
                count = int(ruid_results[ruid])
                if ruid in stat_results:
                    stat_results[ruid] += count
                else:
                    stat_results[ruid] = count
    if not stat_results:
        return [None, 0]
    sort_stat_results = sorted(stat_results.items(), key=lambda x: x[1], reverse=True)[:20]
    uid_list = [item[0] for item in sort_stat_results]
    es_profile_results = es_user_profile.mget(index='weibo_user', doc_type='user',
                                              body={'ids': uid_list})['docs']
    es_portrait_results = es.mget(index='sensitive_user_portrait', doc_type='user',
                                  body={'ids': uid_list})['docs']
    result_list = []
    for i in range(len(es_profile_results)):
        item = es_profile_results[i]
        uid = item['_id']
        if item['found']:
            uname = item['_source']['nick_name']
        else:
            uname = u'unknown'
        portrait_item = es_portrait_results[i]
        in_status = 1 if portrait_item['found'] else 0  # 1 = in sensitive_user_portrait
        result_list.append([uid, [uname, stat_results[uid], in_status]])
    return [result_list[:20], len(stat_results)]
def get_task_detail_2(task_name, ts, user):
    results = dict()
    index_name = task_name
    _id = user + "-" + task_name
    task_detail = es.get(index=index_manage_sensing_task, doc_type=task_doc_type, id=_id)["_source"]
    task_name = task_detail['task_name']
    history_status = json.loads(task_detail['history_status'])
    start_time = task_detail['create_at']
    create_by = task_detail['create_by']
    stop_time = task_detail['stop_time']
    remark = task_detail.get('remark', '')
    portrait_detail = []
    count = 0  # counter
    top_influence = get_top_influence("influence")
    top_activeness = get_top_influence("activeness")
    top_importance = get_top_influence("importance")
    time_series = []  # timestamps
    #positive_sentiment_list = []  # sentiment series
    #neutral_sentiment_list = []
    #negetive_sentiment_list = []
    all_weibo_list = []
    origin_weibo_list = []  # weibo counts per snapshot
    retweeted_weibo_list = []
    #retweeted_weibo_count = []  # times others retweeted him
    #comment_weibo_count = []
    #total_number_count = []
    #burst_time_list = []  # burst timestamps
    important_user_set = set()  # important users
    out_portrait_users = set()  # users not in the portrait index
    ts = int(ts)
    time_series = history_status
    #for item in history_status:
    #    if int(item[0]) <= ts:
    #        time_series.append(item[0])  # all timestamps up to now
    # get detailed task information from es
    if time_series:
        flow_detail = es.mget(index=index_sensing_task, doc_type=_id, body={"ids": time_series})['docs']
    else:
        flow_detail = {}
    if flow_detail:
        for item in flow_detail:
            item = item['_source']
            timestamp = item['timestamp']
            #sentiment_distribution = json.loads(item["sentiment_distribution"])
            #positive_sentiment_list.append(int(sentiment_distribution['1']))
            #negetive_sentiment_list.append(int(sentiment_distribution['2']) + int(sentiment_distribution['3'])
            #                               + int(sentiment_distribution['4']) + int(sentiment_distribution['5'])
            #                               + int(sentiment_distribution['6']))
            #neutral_sentiment_list.append(int(sentiment_distribution['0']))
            origin_weibo_list.append(item["origin_weibo_number"])        # real
            retweeted_weibo_list.append(item['retweeted_weibo_number'])  # real
            all_weibo_list.append(item["origin_weibo_number"] + item['retweeted_weibo_number'])
            #retweeted_weibo_count.append(item['retweeted_weibo_count'])
            #comment_weibo_count.append(item['comment_weibo_count'])
            #total_number_count.append(item['weibo_total_number'])
            temp_important_user_list = json.loads(item['important_users'])
            unfiltered_users = json.loads(item['unfilter_users'])
            temp_out_portrait_users = set(unfiltered_users) - set(temp_important_user_list)  # not in portrait
            important_user_set = important_user_set | set(temp_important_user_list)
            out_portrait_users = out_portrait_users | set(temp_out_portrait_users)
            #burst_reason = item.get("burst_reason", "")
            #if burst_reason:
            #    burst_time_list.append([timestamp, count, burst_reason])
            count += 1
    ####################################################################################
    # tally burst reasons and draw the corresponding conclusions (disabled in the source)
    """
    weibo_variation_count = 0
    weibo_variation_time = []
    sentiment_variation_count = 0
    sentiment_variation_time = []
    sensitive_variation_count = 0  # sensitive
    sensitive_variation_time = []  # sensitive
    common_variation_count = 0
    common_variation_time = []
    if burst_time_list:
        for item in burst_time_list:
            tmp_common = 0
            x1 = 0
            x2 = 0
            x3 = 0
            if signal_count_varition in item[2]:
                weibo_variation_count += 1
                weibo_variation_time.append([ts2date_min(item[0]), total_number_count[item[1]]])
                x1 = total_number_count[item[1]]
                tmp_common += 1
            if signal_sentiment_varition in item[2]:
                tmp_common += 1
                sentiment_variation_count += 1
                x2 = negetive_sentiment_list[item[1]]
                sentiment_variation_time.append([ts2date_min(item[0]), negetive_sentiment_list[item[1]]])
            if signal_sensitive_variation in item[2]:
                tmp_common += 1
                sensitive_variation_count += 1
                x3 = sensitive_total_number_list[item[1]]
                sensitive_variation_time.append([ts2date_min(item[0]), all_weibo_list[item[1]]])
            if tmp_common >= 2:
                common_variation_count += 1
                common_variation_time.append([ts2date_min(item[0]), x1, x2, x3])
    warning_conclusion = remark
    variation_distribution = []
    if weibo_variation_count:
        variation_distribution.append(weibo_variation_time)
    else:
        variation_distribution.append([])
    if sentiment_variation_count:
        variation_distribution.append(sentiment_variation_time)
    else:
        variation_distribution.append([])
    if sensitive_variation_count:
        variation_distribution.append(sensitive_variation_time)
    else:
        variation_distribution.append([])
    if common_variation_count:
        variation_distribution.append(common_variation_time)
    else:
        variation_distribution.append([])
    results['warning_conclusion'] = warning_conclusion
    results['variation_distribution'] = variation_distribution
    # per-user heat
    """
    # fetch personal information for the important users
    important_uid_list = list(important_user_set)
    out_portrait_users_list = list(out_portrait_users)
    user_detail_info = []
    out_user_detail_info = []
    if important_uid_list:
        user_results = es.mget(index=portrait_index_name, doc_type=portrait_index_type,
                               body={"ids": important_uid_list},
                               fields=['uid', 'uname', 'domain', 'topic_string', 'photo_url',
                                       'importance', 'influence', 'activeness'])['docs']
        for item in user_results:
            if item['found']:
                temp = []
                #if int(item['fields']['importance'][0]) < IMPORTANT_USER_THRESHOULD:
                #    continue
                temp.append(item['fields']['uid'][0])
                uname = item['fields']['uname'][0]
                if not uname or uname == "未知":  # fall back to the uid when the name is unknown
                    uname = item['fields']['uid'][0]
                temp.append(uname)
                temp.append(item['fields']['photo_url'][0])
                temp.append(item['fields']['domain'][0])
                temp.append(item['fields']['topic_string'][0].split('&'))
                #hot_count = count_hot_uid(item['fields']['uid'][0], start_time, stop_time)
                #temp.append(hot_count)
                temp.append(math.log(item['fields']['importance'][0] / float(top_importance) * 9 + 1, 10) * 100)
                temp.append(math.log(item['fields']['influence'][0] / float(top_influence) * 9 + 1, 10) * 100)
                temp.append(math.log(item['fields']['activeness'][0] / float(top_activeness) * 9 + 1, 10) * 100)
                user_detail_info.append(temp)
    # sort by influence (index 6), descending
    if user_detail_info:
        user_detail_info = sorted(user_detail_info, key=lambda x: x[6], reverse=True)
    else:
        user_detail_info = []
    if out_portrait_users_list:
        profile_results = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={"ids": out_portrait_users_list})["docs"]
        bci_index = "bci_" + ts2datetime(ts - DAY).replace('-', '')
        influence_results = es.mget(index=bci_index, doc_type="bci",
                                    body={"ids": out_portrait_users_list}, fields=["user_index"])['docs']
        bci_results = es_profile.mget(index="bci_history", doc_type="bci",
                                      body={"ids": out_portrait_users_list}, fields=['user_fansnum'])['docs']
        top_influence = get_top_all_influence("user_index", ts)
        count = 0
        if profile_results:
            for item in profile_results:
                temp = []
                if item['found']:
                    uid = item['_source']['uid']
                    temp = get_user_profile([uid], ['nick_name', 'user_location', 'statusnum', 'fansnum'])[0]
                else:
                    temp = [item['_id'], item['_id'], '', '', '']
                """
                if item['found']:
                    temp.append(item['_source']['uid'])
                    if item['_source']['nick_name']:
                        temp.append(item['_source']['nick_name'])
                    else:
                        temp.append(item['_source']['uid'])
                    temp.append(item['_source']['user_location'])
                    temp.append(item['_source']['statusnum'])
                    temp.append(item['_source']['friendsnum'])
                else:
                    temp.append(item['_id'])
                    temp.append(item['_id'])
                    temp.extend([''])
                temp.append('--')
                temp.append('--')
                try:
                    user_fansnum = bci_results[count]["fields"]["user_fansnum"][0]
                except:
                    user_fansnum = 0
                temp.append(user_fansnum)
                temp_influ = influence_results[count]
                if temp_influ.get('found', 0):
                    user_index = temp_influ['fields']['user_index'][0]
                    temp.append(math.log(user_index / float(top_influence) * 9 + 1, 10) * 100)
                else:
                    temp.append(0)
                """
                count += 1
                out_user_detail_info.append(temp)
    revise_time_series = []
    for item in time_series:
        revise_time_series.append(ts2date_min(item))
    results['important_user_detail'] = user_detail_info
    results['out_portrait_user_detail'] = out_user_detail_info
    #results['burst_time'] = burst_time_list  # burst times and reasons
    results['time_series'] = revise_time_series
    #results['positive_sentiment_list'] = positive_sentiment_list
    #results['negetive_sentiment_list'] = negetive_sentiment_list
    #results['neutral_sentiment_list'] = neutral_sentiment_list
    results['all_weibo_list'] = all_weibo_list
    results['origin_weibo_list'] = origin_weibo_list
    results['retweeted_weibo_list'] = retweeted_weibo_list
    #results['comment_weibo_count'] = comment_weibo_count
    #results['retweeted_weibo_count'] = retweeted_weibo_count
    #results['total_number_list'] = total_number_count
    return results
def get_sensitive_text_detail(task_name, ts, user, order):
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            # [mid, total, retweeted, comment]
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        }
    }
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    uid_list = []
    text_dict = dict()      # weibo text, keyed by mid
    portrait_dict = dict()  # user profile info, keyed by uid
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0],
                                                  "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}
    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list
    count_n = 0
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        temp = []
        # [uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type, retweeted, comment, sensitive]
        if iter_text:
            uid = iter_text['uid']
            temp.append(uid)
            iter_portrait = portrait_dict.get(uid, {})
            if iter_portrait:
                temp.append(iter_portrait['nick_name'])
                temp.append(iter_portrait['photo_url'])
            else:
                temp.extend([uid, ''])
            temp.append(iter_text["text"])
            temp.append(iter_text["sentiment"])
            temp.append(ts2date(iter_text['timestamp']))
            temp.append(iter_text['geo'])
            temp.append(iter_text['message_type'])
            temp.append(item[2])
            temp.append(item[3])
            temp.append(iter_text.get('sensitive', 0))
            count_n += 1
            results.append(temp)
    if results and order == "ts":
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results
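# A minimal sketch of the daily-index lookup pattern above (hypothetical
# helper and doc_type, not in the original module; assumes an ES 1.x-style
# cluster where flow text is split into one index per day and filtered with a
# `filtered` terms query):
def _search_mids_example(es_client, index_prefix, day_list, mid_list):
    index_list = [index_prefix + day for day in day_list
                  if es_client.indices.exists(index_prefix + day)]
    if not (index_list and mid_list):
        return []
    query_body = {"query": {"filtered": {"filter": {"terms": {"mid": mid_list}}}}}
    # elasticsearch-py accepts a list of index names for a multi-index search
    return es_client.search(index=index_list, doc_type='text',
                            body=query_body)["hits"]["hits"]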
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    origin_weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    retweeted_weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    mid_list = []
    mid_list.extend(origin_weibo_detail.keys())
    mid_list.extend(retweeted_weibo_detail.keys())
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms": {"root_mid": mid_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }
    if text_type == "message_type":
        query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
    if text_type == "sentiment":
        #if isinstance(type_value, str):
        if len(type_value) == 1:
            query_body['query']['filtered']['filter']['bool']['must'].append({"term": {text_type: type_value}})
        else:
            query_body['query']['filtered']['filter']['bool']['must'].append({"terms": {text_type: type_value}})
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    # 1. query the weibo text
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    # 2. attach user information to each weibo
    results = []
    uid_list = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # [uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type]
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append(item['uid'])
                    temp.append("")
                temp.append(item["text"])
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                temp.append(item["message_type"])
                results.append(temp)
    return results
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    # mids from the previous window and from the current one
    former_mid_list = query_mid_list(ts - time_interval, keywords_list, time_segment, social_sensors)
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, social_sensors)
    mid_list = []
    mid_list.extend(former_mid_list)
    mid_list.extend(current_mid_list)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                        ],
                        "should": [
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms": {"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }
    #if social_sensors and int(sentiment_type) == 1:
    #    query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"terms": {"uid": social_sensors}})
    if int(sentiment_type) == 1 or int(sentiment_type) == 0:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"sentiment": sentiment_type}})
    else:
        query_body["query"]["filtered"]["filter"]["bool"]["must"] = [{"terms": {"sentiment": ["2", "3"]}}]
    # check whether ts and ts - time_interval fall on the same day, to decide which index to query
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    # 1. collect the matching weibo
    if datetime == datetime_1 and exist_es:
        search_results = es_text.search(index=index_name, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    elif datetime != datetime_1 and exist_es_1:
        search_results = es_text.search(index=index_name_1, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    # 2. attach user information to each weibo
    uid_list = []
    results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for i in range(len(uid_list)):
                item = search_results[i]['_source']
                temp = []
                # [uid, nick_name, photo_url, text, sentiment, timestamp, geo, common_keywords, message_type]
                temp.append(item['uid'])
                if portrait_result[i]['found']:
                    temp.append(portrait_result[i]["fields"]["nick_name"][0])
                    temp.append(portrait_result[i]["fields"]["photo_url"][0])
                else:
                    temp.append("unknown")
                    temp.append("")
                temp.append(item["text"])
                temp.append(item["sentiment"])
                temp.append(ts2date(item['timestamp']))
                temp.append(item['geo'])
                keywords_set = set(item['keywords_string'].split('&'))
                common_keywords = set(keywords_list) & keywords_set
                temp.append(list(common_keywords))
                temp.append(item['message_type'])
                results.append(temp)
    return results
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])
    # invert duplicate_dict: representative mid -> [duplicate mids, incl. itself]
    tmp_duplicate_dict = dict()
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except KeyError:
            tmp_duplicate_dict[v] = [k, v]
    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail_list = []
    if weibo_detail:
        for iter_mid, item in weibo_detail.iteritems():
            # [mid, total, retweeted, comment]
            tmp = []
            tmp.append(iter_mid)
            tmp.append(item[iter_mid])
            tmp.append(item['retweeted'])
            tmp.append(item['comment'])
            weibo_detail_list.append(tmp)
    mid_list = weibo_detail.keys()
    results = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }
    index_list = []
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - DAY)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        index_list.append(index_name)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es_1 = es_text.indices.exists(index_name_1)
    if exist_es_1:
        index_list.append(index_name_1)
    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []
    uid_list = []
    text_dict = dict()      # weibo text, keyed by mid
    portrait_dict = dict()  # user profile info, keyed by uid
    sort_results = []
    if search_results:
        for item in search_results:
            uid_list.append(item["_source"]['uid'])
            text_dict[item['_id']] = item['_source']  # _id is the mid
        if uid_list:
            portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list},
                                              fields=['nick_name', 'photo_url'])["docs"]
            for item in portrait_result:
                if item['found']:
                    portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0],
                                                  "photo_url": item["fields"]["photo_url"][0]}
                else:
                    portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}
    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list
    count_n = 0
    results_dict = dict()
    mid_index_dict = dict()
    for item in sorted_list:  # size
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        temp = []
        # [uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type,
        #  retweeted, comment, sensitive, raw timestamp, topic value, mid]
        if iter_text:
            uid = iter_text['uid']
            temp.append(uid)
            iter_portrait = portrait_dict.get(uid, {})
            if iter_portrait:
                temp.append(iter_portrait['nick_name'])
                temp.append(iter_portrait['photo_url'])
            else:
                temp.extend([uid, ''])
            temp.append(iter_text["text"])
            temp.append(iter_text["sentiment"])
            temp.append(ts2date(iter_text['timestamp']))
            temp.append(iter_text['geo'])
            if message_type == 1:
                temp.append(1)
            elif message_type == 2:
                temp.append(3)
            else:
                temp.append(iter_text['message_type'])
            temp.append(item[2])
            temp.append(item[3])
            temp.append(iter_text.get('sensitive', 0))
            temp.append(iter_text['timestamp'])
            temp.append(mid_value[mid])
            temp.append(mid)
            results.append(temp)
            count_n += 1
    # sort by sensitive flag (-4), topic value (-2) and retweet count (-6)
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)
    sort_results = []
    count = 0
    for item in results:
        sort_results.append([item])
        mid_index_dict[item[-1]] = count
        count += 1
    if tmp_duplicate_dict:
        # fold lower-ranked duplicates into the highest-ranked copy
        remove_list = []
        value_list = tmp_duplicate_dict.values()  # [[mid, mid], ...]
        for item in value_list:
            tmp = []
            for mid in item:
                if mid_index_dict.get(mid, 0):
                    tmp.append(mid_index_dict[mid])
            if len(tmp) > 1:
                tmp_min = min(tmp)
            else:
                continue
            tmp.remove(tmp_min)
            for iter_count in tmp:
                sort_results[tmp_min].extend(sort_results[iter_count])
                remove_list.append(sort_results[iter_count])
        if remove_list:
            for item in remove_list:
                sort_results.remove(item)
    return sort_results
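# A small worked example (made-up mids, hypothetical helper, not in the
# original module) of the duplicate grouping above: duplicate_dict maps each
# duplicate mid to its representative, and the inverse map collects each
# group so lower-ranked copies can be folded into the best-ranked entry.
def _invert_duplicates_example():
    duplicate_dict = {'mid_b': 'mid_a', 'mid_c': 'mid_a'}  # b and c duplicate a
    tmp_duplicate_dict = {}
    for k, v in duplicate_dict.iteritems():
        try:
            tmp_duplicate_dict[v].append(k)
        except KeyError:
            tmp_duplicate_dict[v] = [k, v]
    # e.g. {'mid_a': ['mid_b', 'mid_a', 'mid_c']} (member order depends on dict iteration)
    return tmp_duplicate_dict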
def get_temporal_rank(task_type, sort="retweeted", number=100):
    number = int(number) - 1
    if int(task_type) == 0:  # overall, up to now
        sort_list = r.zrange("influence_%s" % sort, 0, number, withscores=True, desc=True)
    elif int(task_type) == 1:
        sort_list = r.zrange("influence_%s_1" % sort, 0, number, withscores=True, desc=True)
    elif int(task_type) == 2:
        sort_list = r.zrange("influence_%s_2" % sort, 0, number, withscores=True, desc=True)
    elif int(task_type) == 3:
        sort_list = r.zrange("influence_%s_3" % sort, 0, number, withscores=True, desc=True)
    else:
        sort_list = r.zrange("influence_%s_4" % sort, 0, number, withscores=True, desc=True)
    uid_list = []
    for item in sort_list:
        uid_list.append(item[0])
    if sort == "retweeted":
        other = "comment"
    else:
        other = "retweeted"
    results = []
    # look up profile and bci_history information
    if uid_list:
        profile_result = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                              body={"ids": uid_list})["docs"]
        bci_result = es_user_profile.mget(index="bci_history", doc_type="bci",
                                          body={"ids": uid_list},
                                          fields=['user_fansnum', 'weibo_month_sum'])["docs"]
        count = 0
        for item in profile_result:
            _id = item['_id']
            # [uid, nick_name, statusnum, location, fansnum]
            tmp = []
            tmp.append(item['_id'])
            if item['found']:
                item = item['_source']
                tmp.append(item['nick_name'])
                tmp.append(item['statusnum'])
                tmp.append(item['user_location'])
                tmp.append(item['fansnum'])
            else:
                tmp.extend(['', 0, '', 0])
            # prefer the bci_history numbers when present
            try:
                tmp[4] = bci_result[count]['fields']['user_fansnum'][0]
            except Exception:
                pass
            try:
                tmp[2] = bci_result[count]['fields']['weibo_month_sum'][0]
            except Exception:
                pass
            count_1 = int(sort_list[count][1])
            if int(task_type) == 0:
                tmp_count = r.zscore("influence_%s" % other, _id)
            else:
                tmp_count = r.zscore("influence_%s_%s" % (other, task_type), _id)
            count_2 = int(tmp_count) if tmp_count else 0
            if sort == "retweeted":
                tmp.append(count_1)
                tmp.append(count_2)
            else:
                tmp.append(count_2)
                tmp.append(count_1)
            results.append(tmp)
            count += 1
    if uid_list:
        count = 0
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                                                body={"ids": uid_list})["docs"]
        for item in portrait_result:
            if item['found']:
                results[count].append("1")
            else:
                results[count].append("0")
            count += 1
    return results
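# A read-side sketch (hypothetical key contents, not in the original module)
# of the sorted sets get_temporal_rank() consumes: each
# "influence_<sort>[_<segment>]" zset maps uid -> count, so the top N and a
# single member's paired score come straight from zrange/zscore, as above.
def _rank_read_example(number=10):
    top = r.zrange("influence_retweeted", 0, number - 1, withscores=True, desc=True)
    # top is a list of (uid, score) pairs, highest score first
    for uid, score in top:
        paired = r.zscore("influence_comment", uid)  # None if the uid is absent
        print uid, int(score), int(paired) if paired else 0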