def get_user_sensitive_words(uid):
    user_sensitive_words_dict = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)  # 2015-09-22
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        results = r_cluster.hget('sensitive_' + str(date), uid)
        if results:
            sensitive_words_dict = json.loads(results)
            for word in sensitive_words_dict:
                if user_sensitive_words_dict.has_key(word):
                    user_sensitive_words_dict[word] += sensitive_words_dict[word]
                else:
                    user_sensitive_words_dict[word] = sensitive_words_dict[word]
    sort_sensitive_words_dict = sorted(user_sensitive_words_dict.items(), key=lambda x: x[1], reverse=True)
    return sort_sensitive_words_dict
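# The seven-day merge above recurs throughout this module (sensitive words,
# hashtags, IPs, mentions): walk back one day at a time, hget a per-day JSON
# dict of counts, and sum it into an accumulator. A minimal generic sketch of
# that pattern follows; redis_conn, key_prefix, and end_ts are illustrative
# parameter names, not identifiers used elsewhere in this codebase.
def merge_last_week_counts(redis_conn, key_prefix, uid, end_ts):
    # Sum the per-day JSON count dicts stored at hash key
    # "<key_prefix><yyyymmdd>", field uid, over the 7 days before end_ts.
    merged = {}
    ts = end_ts
    for i in range(1, 8):
        ts -= 24 * 3600
        date = ts2datetime(ts).replace('-', '')
        raw = redis_conn.hget(key_prefix + date, uid)
        if not raw:
            continue
        day_counts = json.loads(raw)
        for key in day_counts:
            merged[key] = merged.get(key, 0) + day_counts[key]
    return merged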
def get_user_geo(uid):
    results = []
    user_geo_result = {}
    user_ip_dict = {}
    user_ip_result = {}            # ordinary ip
    user_sensitive_ip_result = {}  # sensitive ip
    now_ts = time.time()
    now_date = ts2datetime(now_ts)  # 2015-09-22
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        results = r_cluster.hget('ip_' + str(date), uid)
        sensitive_results = r_cluster.hget('sensitive_ip' + str(date), uid)
        if results:
            ip_results = json.loads(results)
            for ip in ip_results:
                if user_ip_result.has_key(ip):
                    user_ip_result[ip] += ip_results[ip]
                else:
                    user_ip_result[ip] = ip_results[ip]
        if sensitive_results:
            sensitive_ip_results = json.loads(sensitive_results)
            for ip in sensitive_ip_results:
                if user_sensitive_ip_result.has_key(ip):
                    user_sensitive_ip_result[ip] += sensitive_ip_results[ip]
                else:
                    user_sensitive_ip_result[ip] = sensitive_ip_results[ip]
    ordinary_key_set = set(user_ip_result.keys())
    sensitive_key_set = set(user_sensitive_ip_result.keys())
    for key in sensitive_key_set:
        if key in ordinary_key_set:
            user_ip_result[key] += user_sensitive_ip_result[key]
        else:
            user_ip_result[key] = user_sensitive_ip_result[key]
    user_geo_dict = ip2geo(user_ip_result)
    sorted_user_geo_dict = sorted(user_geo_dict.items(), key=lambda x: x[1], reverse=True)
    sensitive_user_geo_dict = ip2geo(user_sensitive_ip_result)
    sorted_sensitive_user_geo_dict = sorted(sensitive_user_geo_dict.items(), key=lambda x: x[1], reverse=True)
    return_list = [sorted_user_geo_dict, sorted_sensitive_user_geo_dict]  # total and sensitive
    return return_list
def get_influence_content(uid, timestamp_from, timestamp_to):
    weibo_list = []
    #split timestamp range into day-aligned range dicts
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    if from_date_ts != to_date_ts:
        iter_date_ts = from_date_ts
        while iter_date_ts < timestamp_to:
            iter_next_date_ts = iter_date_ts + DAY
            new_range_dict_list.append({'range': {'timestamp': {'gte': iter_date_ts, 'lt': iter_next_date_ts}}})
            iter_date_ts = iter_next_date_ts
        if new_range_dict_list[0]['range']['timestamp']['gte'] < timestamp_from:
            new_range_dict_list[0]['range']['timestamp']['gte'] = timestamp_from
        if new_range_dict_list[-1]['range']['timestamp']['lt'] > timestamp_to:
            new_range_dict_list[-1]['range']['timestamp']['lt'] = timestamp_to
    else:
        new_range_dict_list = [{'range': {'timestamp': {'gte': timestamp_from, 'lt': timestamp_to}}}]
    #iter date to search flow_text
    iter_result = []
    for range_item in new_range_dict_list:
        range_from_ts = range_item['range']['timestamp']['gte']
        range_from_date = ts2datetime(range_from_ts)
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = []
        query.append({'term': {'uid': uid}})
        query.append(range_item)
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                    body={'query': {'bool': {'must': query}}, 'sort': [{'timestamp': 'asc'}]})['hits']['hits']
        except:
            flow_text_exist = []
        iter_result.extend(flow_text_exist)
    #get weibo list from all collected hits
    for item in iter_result:
        source = item['_source']
        weibo = {}
        weibo['timestamp'] = ts2date(source['timestamp'])
        weibo['ip'] = source['ip']
        weibo['text'] = source['text']
        if source['geo']:
            weibo['geo'] = '\t'.join(source['geo'].split('&'))
        else:
            weibo['geo'] = ''
        weibo_list.append(weibo)
    return weibo_list
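# A worked example of the day-splitting above (underscore-prefixed names are
# illustrative, not part of the module). With DAY = 86400 and epoch-second
# timestamps, a range from 2013-09-07 10:00 to 2013-09-08 02:00 is cut at the
# midnight boundary and the two ends are clamped to the requested bounds:
def _demo_split_range():
    day1 = datetime2ts('2013-09-07')
    day2 = day1 + DAY
    ts_from = day1 + 10 * 3600   # 2013-09-07 10:00
    ts_to = day2 + 2 * 3600      # 2013-09-08 02:00
    # get_influence_content builds two per-day filters from this range:
    #   {'range': {'timestamp': {'gte': ts_from, 'lt': day2}}}  -> the 2013-09-07 flow_text index
    #   {'range': {'timestamp': {'gte': day2, 'lt': ts_to}}}    -> the 2013-09-08 flow_text index
    return (ts_from, ts_to)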
def get_network(task_exist):
    task_name = task_exist['task_name']
    submit_date = task_exist['submit_date']
    submit_ts = date2ts(submit_date)
    time_segment = 24 * 3600
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    now_date_ts = datetime2ts(now_date)
    #test
    now_date_ts = datetime2ts('2013-09-07')
    iter_date_ts = now_date_ts
    iter_count = 1
    date_list = []
    top_list_dict = {}
    while True:
        if iter_count >= 8 or iter_date_ts < submit_ts:
            break
        iter_date = ts2datetime(iter_date_ts)
        date_list.append(iter_date)
        key = 'inner_' + str(iter_date)
        try:
            task_date_result = es.get(index=monitor_index_name, doc_type=task_name, id=key)['_source']
        except:
            task_date_result = {}
        #print 'task_name, key, task_date_result:', task_name, key, task_date_result
        iter_field = ['top1', 'top2', 'top3', 'top4', 'top5']
        for field in iter_field:
            if field not in task_date_result:
                continue
            user_count_item = json.loads(task_date_result[field])
            uid = user_count_item[0]
            uname = uid2uname(uid)
            count = user_count_item[1]
            try:
                top_list_dict[field].append([uid, uname, count])
            except KeyError:
                top_list_dict[field] = [[uid, uname, count]]
        iter_date_ts -= time_segment
        iter_count += 1
    #get inner-retweet group from es---field: inner_graph
    '''
    try:
        inner_graph = json.loads(task_date_result['inner_graph'])
    except:
        inner_graph = {}
    '''
    abnormal_index = compute_inner_polarization(top_list_dict)
    return [date_list, top_list_dict, abnormal_index]
def ajax_upload_track_file():
    results = {}
    upload_data = request.form['upload_data']
    task_name = request.form['task_name']
    state = request.form.get('state', '')
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    now_date_ts = datetime2ts(now_date)
    #submit task; start time is aligned to a 15min multiple
    time_segment = int((now_ts - now_date_ts) / 900) + 1
    trans_ts = now_date_ts + time_segment * 900
    line_list = upload_data.split('\n')
    input_data = {}
    input_data['submit_date'] = trans_ts
    input_data['task_name'] = task_name
    uid_list = []
    for line in line_list:
        uid = line[:10]
        if len(uid) == 10:
            uid_list.append(uid)
    input_data['uid_list'] = uid_list
    input_data['status'] = 1  # status marks whether the track task is running: 1 running, 0 ended
    input_data['count'] = len(uid_list)
    status = submit_track_task(input_data)
    return json.dumps(status)
def search_mention(uid, sensitive):
    date = ts2datetime(time.time()).replace('-', '')
    stat_results = dict()
    results = dict()
    test_ts = time.time()
    test_ts = datetime2ts('2013-09-07')
    for i in range(0, 7):
        ts = test_ts - i * 24 * 3600
        date = ts2datetime(ts).replace('-', '')
        if not sensitive:
            at_temp = r_cluster.hget('at_' + str(date), str(uid))
        else:
            at_temp = r_cluster.hget('sensitive_at_' + str(date), str(uid))
        if not at_temp:
            continue
        result_dict = json.loads(at_temp)
        for at_uid in result_dict:
            if stat_results.has_key(at_uid):
                stat_results[at_uid] += result_dict[at_uid]
            else:
                stat_results[at_uid] = result_dict[at_uid]
    if not stat_results:
        return [None, 0]
    in_status = identify_uid_list_in(stat_results.keys())
    for at_uid in stat_results:
        if at_uid in in_status:
            results[at_uid] = [stat_results[at_uid], '1']
        else:
            results[at_uid] = [stat_results[at_uid], '0']
    sorted_results = sorted(results.items(), key=lambda x: x[1][0], reverse=True)
    return [sorted_results[0:20], len(results)]
def influence_distribute():
    row = [0, 200, 500, 700, 900, 1100, 10000]
    result = []
    ts = time.time()
    ts = datetime2ts('2013-09-08')  # test
    ts = ts - 8 * 3600 * 24
    for j in range(7):
        detail = []
        ts += 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        for i in range(6):
            low_limit = row[i]
            upper_limit = row[i + 1]
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                date: {
                                    "gte": low_limit,
                                    "lt": upper_limit
                                }
                            }
                        }
                    }
                }
            }
            number = es.count(index='copy_sensitive_user_portrait', doc_type="user", body=query_body)['count']
            detail.append(number)
        result.append(detail)
    return [row, result]
def get_user_sensitive_words(uid):
    user_sensitive_words_dict = {}
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # 2015-09-22
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)
    #test
    #ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        results = r_cluster.hget('sensitive_' + str(ts), uid)
        if results:
            sensitive_words_dict = json.loads(results)
            for word in sensitive_words_dict:
                if user_sensitive_words_dict.has_key(word):
                    user_sensitive_words_dict[word] += sensitive_words_dict[word]
                else:
                    user_sensitive_words_dict[word] = sensitive_words_dict[word]
    sort_sensitive_words_dict = sorted(user_sensitive_words_dict.items(), key=lambda x: x[1], reverse=True)
    return sort_sensitive_words_dict
def get_group_user_track(uid):
    results = []
    #step1: get user_portrait activity_geo_dict
    try:
        portrait_result = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type,
                                               id=uid, _source=False, fields=['activity_geo_dict'])
    except:
        portrait_result = {}
    if portrait_result == {}:
        return 'uid is not in user_portrait'
    activity_geo_dict = json.loads(portrait_result['fields']['activity_geo_dict'][0])
    now_date_ts = datetime2ts(ts2datetime(int(time.time())))
    start_ts = now_date_ts - DAY * len(activity_geo_dict)
    #step2: iter date to get month track
    for geo_item in activity_geo_dict:
        iter_date = ts2datetime(start_ts)
        sort_day_dict = sorted(geo_item.items(), key=lambda x: x[1], reverse=True)
        if sort_day_dict:
            results.append([iter_date, sort_day_dict[0][0]])
        else:
            results.append([iter_date, ''])
        start_ts = start_ts + DAY
    return results
def end_track_task(task_name):
    status = 0
    try:
        task_exist = es.get(index=index_name, doc_type=index_type, id=task_name)['_source']
    except:
        return 'task name does not exist'
    task_status = task_exist['status']
    if int(task_status) == 0:
        return 'task has already ended'
    task_exist['status'] = 0
    #mark end time, rounded up to the next 15min boundary
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    now_date_ts = datetime2ts(now_date)
    time_segment = int((now_ts - now_date_ts) / 900) + 1
    end_ts = now_date_ts + time_segment * 900
    end_date = ts2date(end_ts)
    task_exist['end_date'] = end_date
    task_user = task_exist['uid_list']
    status = change_user_count(task_user)
    if status == 0:
        return 'failed to change user task count'
    es.index(index=index_name, doc_type=index_type, id=task_name, body=task_exist)
    status = delete_task_redis(task_name)
    if status == 0:
        return 'failed to delete task from redis'
    return 'success change status to end'
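# Both the upload handler and end_track_task round timestamps up to the next
# 15-minute (900 s) boundary. A standalone sketch of that alignment (the
# function name is illustrative, not used elsewhere):
def ceil_to_quarter_hour(now_ts, day_start_ts):
    # Number of 900 s segments since midnight, plus one, gives the next
    # quarter-hour boundary strictly after now_ts.
    segment = int((now_ts - day_start_ts) / 900) + 1
    return day_start_ts + segment * 900
# e.g. 00:07:00 -> 00:15:00, and 00:15:01 -> 00:30:00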
def get_user_hashtag(uid):
    user_hashtag_dict = {}
    sensitive_user_hashtag_dict = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)  # 2015-09-22
    ts = datetime2ts(now_date)
    #test
    ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        results = r_cluster.hget('hashtag_' + str(date), uid)
        sensitive_results = r_cluster.hget('sensitive_hashtag_' + str(date), uid)
        if results:
            hashtag_dict = json.loads(results)
            for hashtag in hashtag_dict:
                if user_hashtag_dict.has_key(hashtag):
                    user_hashtag_dict[hashtag] += hashtag_dict[hashtag]
                else:
                    user_hashtag_dict[hashtag] = hashtag_dict[hashtag]
        if sensitive_results:
            sensitive_hashtag_dict = json.loads(sensitive_results)
            for hashtag in sensitive_hashtag_dict:
                if sensitive_user_hashtag_dict.has_key(hashtag):
                    sensitive_user_hashtag_dict[hashtag] += sensitive_hashtag_dict[hashtag]
                else:
                    sensitive_user_hashtag_dict[hashtag] = sensitive_hashtag_dict[hashtag]
    ordinary_key_set = set(user_hashtag_dict.keys())
    sensitive_key_set = set(sensitive_user_hashtag_dict.keys())
    for key in sensitive_key_set:
        if key in ordinary_key_set:
            user_hashtag_dict[key] += sensitive_user_hashtag_dict[key]
        else:
            user_hashtag_dict[key] = sensitive_user_hashtag_dict[key]
    sort_hashtag_dict = sorted(user_hashtag_dict.items(), key=lambda x: x[1], reverse=True)
    sort_sensitive_dict = sorted(sensitive_user_hashtag_dict.items(), key=lambda x: x[1], reverse=True)
    return [sort_hashtag_dict, sort_sensitive_dict]
def get_text_index(date):
    now_ts = datetime2ts(date)
    index_list = []
    for i in range(7):
        ts = now_ts - i * DAY
        tmp_index = pre_text_index + ts2datetime(ts)
        index_list.append(tmp_index)
    return index_list
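# Illustrative call (the actual value of pre_text_index comes from module
# config; 'flow_text_' below is only an assumed example prefix):
#   get_text_index('2013-09-08')
#   -> ['flow_text_2013-09-08', 'flow_text_2013-09-07', ..., 'flow_text_2013-09-02']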
def ajax_ip():
    uid = request.args.get('uid', '')
    #run_type
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts('2013-09-02')
    result = search_ip(now_ts, uid)
    if not result:
        result = {}
    return json.dumps(result)
def ajax_activity_day():
    results = {}
    uid = str(request.args.get('uid', ''))
    #run_type
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts("2013-09-01")
    results = search_activity(now_ts, uid)
    if not results:
        results = {}
    return json.dumps(results)
def ajax_location():
    uid = str(request.args.get('uid', ''))
    time_type = request.args.get('time_type', '')  # time_type: day; week; month
    #run_type
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts("2013-09-01")
    results = search_location(now_ts, uid, time_type)
    return json.dumps(results)
def search_detect_task(task_name, submit_date, state, process, detect_type, submit_user):
    results = []
    query = [{'match': {'task_type': 'detect'}}]
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(' ')
        for item in task_name_list:
            query.append({'wildcard': {'task_name': '*' + item + '*'}})
        condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_from = submit_date_ts
        submit_date_to = submit_date_ts + DAY
        query.append({'range': {'submit_date': {'gte': submit_date_from, 'lt': submit_date_to}}})
        condition_num += 1
    if state:
        state_list = state.split(' ')
        for item in state_list:
            query.append({'wildcard': {'state': '*' + item + '*'}})
        condition_num += 1
    if process:
        query.append({'range': {'detect_process': {'from': int(process), 'to': MAX_PROCESS}}})
        condition_num += 1
    if detect_type:
        detect_type_list = detect_type.split(',')
        nest_body_list = []
        for type_item in detect_type_list:
            nest_body_list.append({'wildcard': {'detect_type': '*' + type_item + '*'}})
        query.append({'bool': {'should': nest_body_list}})
        condition_num += 1
    if submit_user:
        query.append({'wildcard': {'submit_user': '*' + submit_user + '*'}})
        condition_num += 1
    try:
        search_result = es_group_result.search(index=group_index_name, doc_type=group_index_type,
                body={'query': {'bool': {'must': query}}, 'sort': [{'submit_date': {'order': 'desc'}}], 'size': MAX_VALUE})['hits']['hits']
    except:
        search_result = []
    #get group information table
    for group_item in search_result:
        source = group_item['_source']
        task_name = source['task_name']
        submit_date = ts2datetime(int(source['submit_date']))
        submit_user = source['submit_user']
        detect_type = source['detect_type']
        state = source['state']
        process = source['detect_process']
        results.append([task_name, submit_user, submit_date, detect_type, state, process])
    return results
def lastest_identify_in():
    results = dict()
    now_ts = time.time()
    now_ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = now_ts - i * 3600 * 24
        date = ts2datetime(ts).replace('-', '')
        words_dict = r.hgetall('history_in_' + date)
        for item in words_dict:
            results[item] = json.loads(words_dict[item])
    return results
def show_in_history(date):
    print date
    results = []
    sensitive_uid_list = []
    influence_uid_list = []
    sen_iden_in_name = "identify_in_sensitive_" + str(date)
    inf_iden_in_name = "identify_in_influence_" + str(date)
    man_iden_in_name = "identify_in_manual_" + str(date)
    sen_iden_in_results = r.hgetall(sen_iden_in_name)
    inf_iden_in_results = r.hgetall(inf_iden_in_name)
    man_iden_in_results = r.hgetall(man_iden_in_name)
    sensitive_uid_list = sen_iden_in_results.keys()
    influence_uid_list = inf_iden_in_results.keys()
    manual_uid_list = man_iden_in_results.keys()
    #compute_results = r.hgetall('compute')
    work_date = ts2datetime(datetime2ts(date) - DAY)
    if sensitive_uid_list:
        sensitive_results = get_sensitive_user_detail(sensitive_uid_list, work_date, 1)
    else:
        sensitive_results = []
    for item in sensitive_results:
        uid = item[0]
        status = sen_iden_in_results[uid]
        item.append(status)
        results.append(item)
    if influence_uid_list:
        influence_results = get_sensitive_user_detail(influence_uid_list, work_date, 0)
    else:
        influence_results = []
    for item in influence_results:
        uid = item[0]
        status = inf_iden_in_results[uid]
        item.append(status)
        results.append(item)
    if manual_uid_list:
        manual_results = get_sensitive_user_detail(manual_uid_list, work_date, 0)
    else:
        manual_results = []
    for item in manual_results:
        uid = item[0]
        status = man_iden_in_results[uid]
        item.append(status)
        results.append(item)
    sorted_results = sorted(results, key=lambda x: x[5], reverse=True)
    return sorted_results
def get_sensitive_user_detail(uid_list, date, sensitive):
    es_cluster = es_user_profile
    ts = datetime2ts(date)
    results = []
    index_name = pre_influence_index + str(date).replace('-', '')  # index_name: 20130901
    user_bci_results = es_bci.mget(index=index_name, doc_type='bci', body={'ids': uid_list}, _source=False, fields=['user_index'])['docs']
    user_profile_results = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids": uid_list}, _source=True)['docs']
    top_influence_value = get_top_value("user_index", es_bci, index_name, "bci")
    for i in range(0, len(uid_list)):
        personal_info = [''] * 6
        uid = uid_list[i]
        personal_info[0] = uid_list[i]
        personal_info[1] = uid_list[i]
        if user_profile_results[i]['found']:
            profile_dict = user_profile_results[i]['_source']
            uname = profile_dict['nick_name']
            if uname:
                personal_info[1] = uname
            personal_info[2] = profile_dict['user_location']
            personal_info[3] = profile_dict['fansnum']
            personal_info[4] = profile_dict['statusnum']
        if user_bci_results[i]['found']:
            try:
                tmp_bci = user_bci_results[i]['fields']['user_index'][0]
                influence = math.log(tmp_bci / float(top_influence_value) * 9 + 1, 10) * 100
                personal_info[5] = influence
            except:
                personal_info[5] = 0
        else:
            personal_info[5] = 0
        if sensitive:
            sensitive_words = redis_cluster.hget('sensitive_' + str(ts), str(uid))
            if sensitive_words:
                sensitive_dict = json.loads(sensitive_words)
                personal_info.append(sensitive_dict.keys())
            else:
                personal_info.append([])
        else:
            personal_info.append([])
        results.append(personal_info)
    return results
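# The influence score above maps a raw BCI value onto a 0-100 log scale:
# log10(bci / top * 9 + 1) * 100, so bci == top gives log10(10) * 100 = 100
# and bci == 0 gives log10(1) * 100 = 0. A small self-contained check (the
# function name and sample values are illustrative):
def _demo_influence_scale():
    top = 2000.0
    for bci in (0.0, 200.0, 2000.0):
        score = math.log(bci / top * 9 + 1, 10) * 100
        # bci=0 -> 0.0; bci=200 (a tenth of top) -> ~27.9; bci=top -> 100.0
        print bci, score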
def ajax_full_text_search():
    if RUN_TYPE:
        ts = time.time()
    else:
        ts = datetime2ts("2013-09-02")
    now_date = ts2datetime(ts)
    start_time = request.args.get("start_time", now_date)  # 2013-09-01
    end_time = request.args.get("end_time", now_date)
    uid = request.args.get("uid", "")
    size = request.args.get("number", 100)
    keywords = request.args.get("keywords", "")  # comma-separated
    results = full_text_search(keywords, uid, start_time, end_time, size)
    return json.dumps(results)
def recommend_in_sensitive(date):
    sensitive_name = "recomment_" + str(date) + "_sensitive"
    compute_name = "compute"
    re_sen_set = r.hkeys(sensitive_name)  # recommended sensitive users
    iden_in_set = r.hkeys(compute_name)   # users already admitted to the portrait store
    if not re_sen_set:
        return []  # no data for that day
    uid_list = list(set(re_sen_set) - set(iden_in_set))
    sensitive = 1
    work_date = ts2datetime(datetime2ts(date) - DAY)
    if uid_list:
        results = get_sensitive_user_detail(uid_list, work_date, sensitive)
    else:
        results = []
    return results
def identify_in(date, words_list):
    # identify_in date and words_list (each item is [word, level, category])
    # date is the date when the new words were recommended
    ts = time.time()
    ts = datetime2ts('2013-09-07')
    time_list = []
    for i in range(7):
        now_ts = int(ts) - i * 24 * 3600
        now_date = ts2datetime(now_ts).replace('-', '')
        time_list.append(now_date)
    for item in words_list:
        r.hset('sensitive_words', item[0], json.dumps([item[1], item[2]]))
        r.hset('history_in_' + date, item[0], json.dumps([item[1], item[2]]))
        for iter_date in time_list:
            r.hdel('recommend_sensitive_words_' + iter_date, item[0])
    return '1'
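# Illustrative call: promote two recommended words into the live
# 'sensitive_words' hash and clear them from the last week's recommendation
# hashes. The words, levels, and categories are made-up sample values:
#   identify_in('20130907', [['wordA', 2, 'politics'], ['wordB', 1, 'other']])
# Afterwards, r.hget('sensitive_words', 'wordA') == '[2, "politics"]'.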
def recommend_in_top_influence(date):
    influence_name = "recomment_" + date + "_influence"
    identify_in_name = "compute"
    re_inf_set = r.hkeys(influence_name)
    iden_in_set = r.hkeys(identify_in_name)  # users already admitted to the portrait store
    if not re_inf_set:
        return []
    uid_list = list(set(re_inf_set) - set(iden_in_set))
    sensitive = 0
    work_date = ts2datetime(datetime2ts(date) - DAY)
    if uid_list:
        results = get_sensitive_user_detail(uid_list, work_date, sensitive)
    else:
        results = []
    return results
def ajax_show_sensitive_history_in():
    results = []
    now_date = ts2datetime(time.time())
    date = request.args.get('date', now_date)  # in date: 2013-09-01
    if str(date) == "all":
        now_ts = datetime2ts(now_date)
        for i in range(7):
            ts = now_ts - i * 24 * 3600
            date = ts2datetime(ts)
            temp = show_in_history(date, 1)
            results.extend(temp)
    else:
        results = show_in_history(date, 1)  # history in, include status
    if results:
        return json.dumps(results)
    else:
        return json.dumps([])
def ajax_show_influence_history_in():
    results = []
    now_date = ts2datetime(time.time())
    date = request.args.get('date', now_date)
    if str(date) == "all":
        now_ts = datetime2ts('2013-09-07')
        for i in range(7):
            ts = now_ts - i * 24 * 3600
            date = ts2datetime(ts)
            date = str(date).replace('-', '')
            temp = show_in_history(date, 0)
            results.extend(temp)
    else:
        date = str(date).replace('-', '')
        results = show_in_history(date, 0)  # history in, include status
    if results:
        return json.dumps(results)
    else:
        return json.dumps([])
def get_inner_top_weibo(task_name, date, uid):
    result = []
    #step1: identify that the task exists
    #step2: search weibo from monitor_user_text by condition: task_user, date
    task_exist = identify_task(task_name)
    if not task_exist:
        return 'the task does not exist'
    task_user = task_exist['uid_list']
    if uid not in task_user:
        return 'the user does not exist'
    end_ts = datetime2ts(date)
    time_segment = 24 * 3600
    start_ts = end_ts - time_segment
    query_body = []
    #term search: uid
    query_body.append({'term': {'uid': uid}})
    #range search: date - 24*3600 to date
    query_body.append({'range': {'timestamp': {'from': start_ts, 'to': end_ts}}})
    try:
        weibo_result = es.search(index=text_index_name, doc_type=text_index_type,
                body={'query': {'bool': {'must': query_body}}, 'sort': [{'timestamp': {'order': 'asc'}}], 'size': 10000})['hits']['hits']
    except Exception, e:
        raise e
    return weibo_result
def get_user_geo(uid):
    results = []
    user_geo_result = {}
    user_ip_dict = {}
    user_ip_result = {}            # ordinary ip
    user_sensitive_ip_result = {}  # sensitive ip
    if RUN_TYPE:
        now_ts = time.time()
        now_date = ts2datetime(now_ts)  # 2015-09-22
    else:
        now_date = "2013-09-08"
    ts = datetime2ts(now_date)
    for i in range(1, 8):
        ts = ts - 3600 * 24
        date = ts2datetime(ts)
        if WORK_TYPE == 0:
            index_name = ip_index_pre + str(date)
            sensitive_index_name = sen_ip_index_pre + str(date)
            exist_bool = es_cluster.indices.exists(index=index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_index_name)
            if exist_bool:
                try:
                    tmp_ip_result = es_cluster.get(index=index_name, doc_type="ip", id=uid)['_source']
                    results = tmp_ip_result['ip_dict']
                except:
                    results = dict()
            else:
                results = dict()
            if sensitive_exist_bool:
                try:
                    tmp_sensitive_ip_result = es_cluster.get(index=sensitive_index_name, doc_type="sensitive_ip", id=uid)['_source']
                    sensitive_results = tmp_sensitive_ip_result['sensitive_ip_dict']
                except:
                    sensitive_results = dict()
            else:
                sensitive_results = dict()
        else:
            results = redis_ip.hget('ip_' + str(ts), uid)
            sensitive_results = redis_ip.hget('sensitive_ip' + str(ts), uid)
        if results:
            ip_results = json.loads(results)
            for ip in ip_results:
                if user_ip_result.has_key(ip):
                    user_ip_result[ip] += ip_results[ip]
                else:
                    user_ip_result[ip] = ip_results[ip]
        if sensitive_results:
            sensitive_ip_results = json.loads(sensitive_results)
            for ip in sensitive_ip_results:
                if user_sensitive_ip_result.has_key(ip):
                    user_sensitive_ip_result[ip] += sensitive_ip_results[ip]
                else:
                    user_sensitive_ip_result[ip] = sensitive_ip_results[ip]
    ordinary_key_set = set(user_ip_result.keys())
    sensitive_key_set = set(user_sensitive_ip_result.keys())
    for key in sensitive_key_set:
        if key in ordinary_key_set:
            user_ip_result[key] += user_sensitive_ip_result[key]
        else:
            user_ip_result[key] = user_sensitive_ip_result[key]
    user_geo_dict = ip2geo(user_ip_result)
    sorted_user_geo_dict = sorted(user_geo_dict.items(), key=lambda x: x[1], reverse=True)
    sensitive_user_geo_dict = ip2geo(user_sensitive_ip_result)
    sorted_sensitive_user_geo_dict = sorted(sensitive_user_geo_dict.items(), key=lambda x: x[1], reverse=True)
    return_list = [sorted_user_geo_dict, sorted_sensitive_user_geo_dict]  # total and sensitive
    return return_list
def ajax_event_detect():
    results = {}
    query_dict = {}   # {'attribute': attribute_query_list, 'event': event_query_list, 'filter': filter_dict}
    input_dict = {}   # {'task_information': task_information_dict, 'query_dict': query_dict}
    attribute_query_list = []
    event_query_list = []
    query_condition_num = 0
    #step1: get attribute query dict
    for item in DETECT_EVENT_ATTRIBUTE:
        item_value_string = request.args.get(item, '')
        if item_value_string != '':
            item_value_list = item_value_string.split(',')
            nest_body_list = []
            nest_body_list.append({'terms': {item: item_value_list}})
            query_condition_num += 1
            attribute_query_list.extend(nest_body_list)
    for item in DETECT_EVENT_SELECT_ATTRIBUTE:
        item_value_string = request.args.get(item, '')
        if item_value_string != '':
            attribute_query_list.append({"term": {item: item_value_string}})
            query_condition_num += 1
    query_dict['attribute'] = attribute_query_list
    #step2: get event query dict
    #step2.1: get event fuzz item
    for item in DETECT_TEXT_FUZZ_ITEM:
        item_value_string = request.args.get(item, '')
        item_value_list = item_value_string.split(' ')
        nest_body_list = []
        if item_value_string != '':
            for item_value in item_value_list:
                nest_body_list.append({'wildcard': {item: '*' + item_value + '*'}})
            event_query_list.append({'bool': {'should': nest_body_list}})
            query_condition_num += 1
    #step2.2: get event range item
    for item in DETECT_EVENT_TEXT_RANGE_ITEM:
        now_time = int(time.time())
        now_date_ts = datetime2ts(ts2datetime(now_time))
        item_value_from = request.args.get(item + '_from', now_date_ts - DAY)
        item_value_to = request.args.get(item + '_to', now_date_ts)
        if item_value_from != '' and item_value_to != '':
            if int(item_value_from) > int(item_value_to):
                return 'invalid input for range'
            query_condition_num += 1
            event_query_list.append({'range': {item: {'gte': int(item_value_from), 'lt': int(item_value_to)}}})
        else:
            return 'invalid input for range'
    query_dict['event'] = event_query_list
    #require at least one query condition
    if query_condition_num < 1:
        return 'invalid input for query'
    #step3: get filter dict
    filter_dict = {}
    for filter_item in DETECT_QUERY_FILTER:
        if filter_item == 'count':
            filter_item_value = request.args.get(filter_item, DETECT_DEFAULT_COUNT)
            filter_item_value = int(filter_item_value)
        else:
            filter_item_from = request.args.get(filter_item + '_from', DETECT_FILTER_VALUE_FROM)
            filter_item_to = request.args.get(filter_item + '_to', DETECT_FILTER_VALUE_TO)
            if int(filter_item_from) > int(filter_item_to) or (not filter_item_from) or (not filter_item_to):
                return 'invalid input for filter'
            filter_item_value = {'gte': int(filter_item_from), 'lt': int(filter_item_to)}
        filter_dict[filter_item] = filter_item_value
    if filter_dict['count'] == 0:
        return 'invalid input for count'
    query_dict['filter'] = filter_dict
    #step4: get task information dict
    task_information_dict = {}
    task_information_dict['task_name'] = request.args.get('task_name', '')
    task_information_dict['submit_date'] = int(time.time())
    task_information_dict['state'] = request.args.get('state', '')
    task_information_dict['submit_user'] = request.args.get('submit_user', 'admin')
    task_information_dict['task_id'] = task_information_dict['submit_user'] + task_information_dict['task_name']
    task_information_dict['task_type'] = 'detect'
    task_information_dict['detect_type'] = 'event'
    task_information_dict['detect_process'] = 0
    #step5: save to es and redis
    input_dict['task_information'] = task_information_dict
    input_dict['query_condition'] = query_dict
    status = save_detect_event_task(input_dict)
    return json.dumps(status)
def search_task(task_name, submit_user, submit_date, state, status):
    results = []
    #query = [{"term": {"submit_user": submit_user}}]
    query = []
    condition_num = 0
    if task_name:
        task_name_list = task_name.split(" ")
        for item in task_name_list:
            query.append({"wildcard": {"task_name": "*" + item + "*"}})
        condition_num += 1
    if submit_date:
        submit_date_ts = datetime2ts(submit_date)
        submit_date_start = submit_date_ts
        submit_date_end = submit_date_ts + DAY
        query.append({"range": {"submit_date": {"gte": submit_date_start, "lt": submit_date_end}}})
        condition_num += 1
    if state:
        state_list = state.split(" ")
        for item in state_list:
            query.append({"wildcard": {"state": "*" + item + "*"}})
        condition_num += 1
    if status:
        query.append({"match": {"status": status}})
        condition_num += 1
    query.append({"term": {"task_type": "analysis"}})
    try:
        source = es_group_result.search(index=group_index_name, doc_type=group_index_type,
                body={"query": {"bool": {"must": query}}, "sort": [{"count": {"order": "desc"}}], "size": MAX_VALUE})
    except Exception, e:
        raise e
    try:
        task_dict_list = source["hits"]["hits"]
    except:
        return None
    result = []
    for task_dict in task_dict_list:
        try:
            state = task_dict["_source"]["state"]
        except:
            state = ""
        try:
            status = task_dict["_source"]["status"]
        except:
            status = 0
        result.append([
            task_dict["_source"]["task_name"],
            task_dict["_source"]["submit_date"],
            task_dict["_source"]["count"],
            state,
            status,
        ])
    return result
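# For reference, searching for task_name "weibo test" on a given submit date
# sends a bool-must body shaped like this (timestamp values illustrative):
#   {'query': {'bool': {'must': [
#        {'wildcard': {'task_name': '*weibo*'}},
#        {'wildcard': {'task_name': '*test*'}},
#        {'range': {'submit_date': {'gte': 1378483200, 'lt': 1378569600}}},
#        {'term': {'task_type': 'analysis'}}]}},
#    'sort': [{'count': {'order': 'desc'}}], 'size': MAX_VALUE}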
def influenced_people(uid, mid, influence_style, date, default_number=20):
    # uid
    # which weibo---mid; for a retweeted weibo, seek its root_mid
    # influence_style: retweeted(0) or comment(1)
    date1 = ts2datetime(datetime2ts(date)).replace('-', '')
    index_name = pre_index + date1
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type, id=mid)["_source"]
    temp_mid = text_result.get("root_mid", '')
    #decide whether the weibo is original
    if temp_mid:
        mid_type = 1  # not original
    else:
        mid_type = 0  # original
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size": 30000
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": "desc"}}
    if int(mid_type) == 0:
        if int(influence_style) == 0:
            #original weibo: all retweeting users
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"root_uid": uid}},
                {"term": {"message_type": 3}},
                {"term": {"root_mid": mid}}])
        else:
            #commenting users
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"directed_uid": uid}},
                {"term": {"message_type": 2}},
                {"term": {"root_mid": mid}}])
    else:
        if int(influence_style) == 0:
            #retweeted weibo: all retweeting users
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"directed_uid": uid}},
                {"term": {"message_type": 3}},
                {"term": {"root_mid": temp_mid}}])
        else:
            #commenting users
            query_body["query"]["filtered"]["filter"]["bool"]["must"].extend([
                {"term": {"directed_uid": uid}},
                {"term": {"message_type": 2}},
                {"term": {"root_mid": temp_mid}}])
    search_results = es.search(index=index_flow_text, doc_type=flow_text_index_type, body=query_body,
                               _source=False, fields=["uid"], timeout=30)["hits"]["hits"]
    results = []  # uid_list
    if search_results:
        for item in search_results:
            if int(item["fields"]["uid"][0]) == int(uid):
                continue
            results.append(item["fields"]["uid"][0])
        results = list(set(results))
    else:
        results = []
    bci_index = "bci_" + date.replace('-', '')
    if results:
        portrait_results = es_user_portrait.mget(index=user_portrait, doc_type=portrait_index_type, body={"ids": results},
                                                 fields=["domain", "topic_string", "activity_geo_dict", "importance", "influence"])["docs"]
        bci_results = es_cluster.mget(index=bci_index, doc_type='bci', body={"ids": results}, fields=['user_index'])['docs']
    else:
        portrait_results = {}
        bci_results = {}
    in_portrait = []
    out_portrait = []
    in_portrait_info = []
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    average_influence = 0
    total_influence = 0
    count = 0
    if bci_results:
        total_influence = 0
        for item in bci_results:
            if item['found']:
                total_influence += item['fields']['user_index'][0]
        try:
            average_influence = total_influence / len(results)
        except:
            average_influence = 0
    if portrait_results:
        for item in portrait_results:
            if item["found"]:
                temp = []
                count += 1
                temp.append(item['_id'])
                temp.append(item["fields"]["importance"][0])
                in_portrait.append(temp)
                temp_domain = item["fields"]["domain"][0].split('&')
                temp_topic = item["fields"]["topic_string"][0].split('&')
                temp_geo = json.loads(item["fields"]["activity_geo_dict"][0])[-1].keys()
                #total_influence += item["fields"]["influence"][0]
                retweeted_domain = aggregation(temp_domain, retweeted_domain)
                retweeted_topic = aggregation(temp_topic, retweeted_topic)
                retweeted_geo = aggregation(temp_geo, retweeted_geo)
            else:
                out_portrait.append(item['_id'])
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)
    #try:
    #    average_influence = total_influence / count
    #except:
    #    average_influence = 0
    sorted_retweeted_domain = sorted(retweeted_domain.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(), key=lambda x: x[1], reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(), key=lambda x: x[1], reverse=True)
    retweeted_results = dict()
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)
    temp_list = []
    for item in in_portrait:
        temp_list.append(item[0])
    retweeted_results['in_portrait_number'] = len(temp_list)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    in_portrait_url = get_user_url(temp_list[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])
    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results
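# aggregation() and proportion() are helpers defined elsewhere in this repo.
# From their use above, plausible minimal implementations would look like the
# sketches below (bodies are assumptions, hence the distinct names):
# aggregation() bumps a counter per label into an accumulator dict, and
# proportion() normalizes the counts into fractions of the total.
def aggregation_sketch(label_list, acc):
    #assumed behavior: count occurrences of each label
    for label in label_list:
        acc[label] = acc.get(label, 0) + 1
    return acc
def proportion_sketch(count_dict):
    #assumed behavior: convert counts to ratios
    total = float(sum(count_dict.values()))
    if not total:
        return count_dict
    return dict((k, v / total) for k, v in count_dict.items())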
def get_social_inter_content(uid1, uid2, type_mark):
    weibo_list = []
    #get the two-way relation between uid1 and uid2
    #search weibo list
    now_ts = int(time.time())
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    #uid2uname
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type,
                body={'ids': [uid1, uid2]}, _source=False, fields=['uid', 'uname'])['docs']
    except:
        portrait_result = []
    for item in portrait_result:
        uid = item['_id']
        if item['found'] == True:
            uname = item['fields']['uname'][0]
            uid2uname[uid] = uname
        else:
            uid2uname[uid] = 'unknown'
    #iter date to search weibo list
    for i in range(7, 0, -1):
        iter_date_ts = now_date_ts - i * DAY
        iter_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        query = []
        query.append({'bool': {'must': [{'term': {'uid': uid1}}, {'term': {'directed_uid': int(uid2)}}]}})
        if type_mark == 'out':
            query.append({'bool': {'must': [{'term': {'uid': uid2}}, {'term': {'directed_uid': int(uid1)}}]}})
        try:
            flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,
                    body={'query': {'bool': {'should': query}}, 'sort': [{'timestamp': {'order': 'asc'}}], 'size': MAX_VALUE})['hits']['hits']
        except:
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            weibo['geo'] = source['geo']
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            weibo['uname'] = uid2uname[weibo['uid']]
            weibo['directed_uid'] = str(source['directed_uid'])
            weibo['directed_uname'] = uid2uname[str(source['directed_uid'])]
            weibo_list.append(weibo)
    return weibo_list
def search_full_text(uid, date):
    result = []
    ts = datetime2ts(date)
    next_ts = ts + 24 * 3600
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"uid": uid}},
                            {"range": {"timestamp": {"gte": ts, "lt": next_ts}}}
                        ]
                    }
                }
            }
        },
        "size": 200
    }
    search_results = es.search(index='sensitive_user_text', doc_type="user", body=query_body)['hits']['hits']
    for item in search_results:
        detail = []
        source = item['_source']
        detail.append(source['sensitive'])
        detail.append(source['message_type'])
        ts = source['timestamp']
        re_time = time.strftime('%H:%M:%S', time.localtime(float(ts)))
        detail.append(re_time)
        geo_string = source['geo']
        geo_list = geo_string.split('\t')
        if len(geo_list) >= 3:
            geo = '\t'.join(geo_list[-2:])
        else:
            geo = geo_string
        detail.append(geo)
        detail.append(source['text'])
        date = date.replace('-', '')
        mid = source['mid']
        try:
            weibo_bci = es.get(index=date, doc_type='bci', id=uid)['_source']
        except:
            weibo_bci = {}
        retweeted_number = 0
        comment_number = 0
        if source['sensitive']:
            if int(source['message_type']) == 1:
                if weibo_bci:
                    if weibo_bci.get('s_origin_weibo_retweeted_detail', {}):
                        retweeted_detail = json.loads(weibo_bci['s_origin_weibo_retweeted_detail'])
                    else:
                        retweeted_detail = {}
                    retweeted_number = retweeted_detail.get(mid, 0)
                    if weibo_bci.get('s_origin_weibo_comment_detail', {}):
                        comment_detail = json.loads(weibo_bci['s_origin_weibo_comment_detail'])
                    else:
                        comment_detail = {}
                    comment_number = comment_detail.get(mid, 0)
            elif int(source['message_type']) == 2:
                if weibo_bci:
                    if weibo_bci.get('s_retweeted_weibo_retweeted_detail', {}):
                        retweeted_detail = json.loads(weibo_bci['s_retweeted_weibo_retweeted_detail'])
                    else:
                        retweeted_detail = {}
                    retweeted_number = retweeted_detail.get(mid, 0)
                    if weibo_bci.get('s_retweeted_weibo_comment_detail', {}):
                        comment_detail = json.loads(weibo_bci['s_retweeted_weibo_comment_detail'])
                    else:
                        comment_detail = {}
                    comment_number = comment_detail.get(mid, 0)
            else:
                pass
        else:
            if int(source['message_type']) == 1:
                if weibo_bci:
                    if weibo_bci.get('origin_weibo_retweeted_detail', {}):
                        retweeted_detail = json.loads(weibo_bci['origin_weibo_retweeted_detail'])
                    else:
                        retweeted_detail = {}
                    retweeted_number = retweeted_detail.get(mid, 0)
                    if weibo_bci.get('origin_weibo_comment_detail', {}):
                        comment_detail = json.loads(weibo_bci['origin_weibo_comment_detail'])
                    else:
                        comment_detail = {}
                    comment_number = comment_detail.get(mid, 0)
            elif int(source['message_type']) == 2:
                if weibo_bci:
                    if weibo_bci.get('retweeted_weibo_retweeted_detail', {}):
                        retweeted_detail = json.loads(weibo_bci['retweeted_weibo_retweeted_detail'])
                    else:
                        retweeted_detail = {}
                    retweeted_number = retweeted_detail.get(mid, 0)
                    if weibo_bci.get('retweeted_weibo_comment_detail', {}):
                        comment_detail = json.loads(weibo_bci['retweeted_weibo_comment_detail'])
                    else:
                        comment_detail = {}
                    comment_number = comment_detail.get(mid, 0)
            else:
                pass
        detail.append(retweeted_number)
        detail.append(comment_number)
        result.append(detail)
    return result
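# Every branch above repeats one pattern: load a JSON "detail" field from the
# bci document if present, then look up this mid's count. A compact helper
# expressing that pattern (a sketch; not wired into the code above):
def _detail_count(weibo_bci, field, mid):
    raw = weibo_bci.get(field, {})
    if not raw:
        return 0
    return json.loads(raw).get(mid, 0)
# e.g. retweeted_number = _detail_count(weibo_bci, 's_origin_weibo_retweeted_detail', mid)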
def search_attribute_portrait(uid):
    return_results = {}
    index_name = "sensitive_user_portrait"
    index_type = "user"
    try:
        search_result = es.get(index=index_name, doc_type=index_type, id=uid)
    except:
        return None
    results = search_result['_source']
    user_sensitive = user_type(uid)
    if user_sensitive:
        #return_results.update(sensitive_attribute(uid))
        return_results['user_type'] = 1
        return_results['sensitive'] = 1
    else:
        return_results['user_type'] = 0
        return_results['sensitive'] = 0
    # basic profile fields; 0 means the crawler stored no value
    if results['photo_url'] == 0:
        results['photo_url'] = 'unknown'
    if results['location'] == 0:
        results['location'] = 'unknown'
    return_results['photo_url'] = results['photo_url']
    return_results['uid'] = results['uid']
    return_results['uname'] = results['uname']
    if return_results['uname'] == 0:
        return_results['uname'] = 'unknown'
    return_results['location'] = results['location']
    return_results['fansnum'] = results['fansnum']
    return_results['friendsnum'] = results['friendsnum']
    return_results['gender'] = results['gender']
    return_results['psycho_status'] = json.loads(results['psycho_status'])
    if results['keywords']:
        keywords_dict = json.loads(results['keywords'])
        return_results['keywords'] = sorted(keywords_dict.items(),
                                            key=lambda x: x[1], reverse=True)
    else:
        return_results['keywords'] = []
    return_results['retweet'] = search_retweet(uid, 0)
    return_results['follow'] = search_follower(uid, 0)
    return_results['at'] = search_mention(uid, 0)
    if results['ip'] and results['geo_activity']:
        ip_dict = json.loads(results['ip'])
        geo_dict = json.loads(results['geo_activity'])
        return_results['geo_description'] = active_geo_description(ip_dict, geo_dict)
    else:
        return_results['geo_description'] = ''
    geo_top = []
    temp_geo = {}
    if results['geo_activity']:
        geo_dict = json.loads(results['geo_activity'])
        if len(geo_dict) < 7:
            # pad the 7-day window with empty days
            ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test
            for i in range(7):
                ts = ts + 24 * 3600
                date = ts2datetime(ts).replace('-', '')
                if date not in geo_dict:
                    geo_dict[date] = {}
        activity_geo_list = sorted(geo_dict.items(), key=lambda x: x[0], reverse=False)
        for k, v in activity_geo_list:
            sort_v = sorted(v.items(), key=lambda x: x[1], reverse=True)
            top_geo = [item[0] for item in sort_v]
            geo_top.append([k, top_geo[0:2]])  # top-2 places per day
            for iter_key in v.keys():
                if iter_key in temp_geo:
                    temp_geo[iter_key] += v[iter_key]
                else:
                    temp_geo[iter_key] = v[iter_key]
        sort_geo_dict = sorted(temp_geo.items(), key=lambda x: x[1], reverse=True)
        return_results['top_activity_geo'] = sort_geo_dict
        return_results['activity_geo_distribute'] = geo_top
    else:
        return_results['top_activity_geo'] = []
        return_results['activity_geo_distribute'] = geo_top
    return_results['hashtag'] = get_user_hashtag(uid)[0]
    '''
    emotion_result = {}
    emotion_conclusion_dict = {}
    if results['emotion_words']:
        emotion_words_dict = json.loads(results['emotion_words'])
        for word_type in emotion_mark_dict:
            try:
                word_dict = emotion_words_dict[word_type]
                if word_type == '126' or word_type == '127':
                    emotion_conclusion_dict[word_type] = word_dict
                sort_word_dict = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
                word_list = sort_word_dict[:5]
            except:
                results['emotion_words'] = emotion_result
            emotion_result[emotion_mark_dict[word_type]] = word_list
    return_results['emotion_words'] = emotion_result
    '''
    # topic
    if results['topic']:
        topic_dict = json.loads(results['topic'])
        sort_topic_dict = sorted(topic_dict.items(), key=lambda x: x[1], reverse=True)
        return_results['topic'] = sort_topic_dict[:5]
    else:
        return_results['topic'] = []
    # domain
    if results['domain']:
        return_results['domain'] = results['domain'].split('_')
    else:
        return_results['domain'] = []
    '''
    # emoticon
    if results['emotion']:
        emotion_dict = json.loads(results['emotion'])
        sort_emotion_dict = sorted(emotion_dict.items(), key=lambda x: x[1], reverse=True)
        return_results['emotion'] = sort_emotion_dict[:5]
    else:
        return_results['emotion'] = []
    '''
    # online pattern
    if results['online_pattern']:
        online_pattern_dict = json.loads(results['online_pattern'])
        sort_online_pattern_dict = sorted(online_pattern_dict.items(),
                                          key=lambda x: x[1], reverse=True)
        return_results['online_pattern'] = sort_online_pattern_dict[:5]
    else:
        return_results['online_pattern'] = []
    '''
    # psycho_feature
    if results['psycho_feature']:
        psycho_feature_list = results['psycho_feature'].split('_')
        return_results['psycho_feature'] = psycho_feature_list
    else:
        return_results['psycho_feature'] = []
    '''
    # self description from the profile index
    try:
        profile_result = es_user_profile.get(index='weibo_user', doc_type='user', id=uid)
        return_results['description'] = profile_result['_source'].get('description', '')
    except:
        return_results['description'] = ''
    # rank the user among all portrait users by each index value
    if results['importance']:
        query_body = {
            'query': {
                'range': {
                    'importance': {'from': results['importance'], 'to': 100000}
                }
            }
        }
        importance_rank = es.count(index='sensitive_user_portrait', doc_type='user', body=query_body)
        if importance_rank['_shards']['successful'] != 0:
            return_results['importance_rank'] = importance_rank['count']
        else:
            return_results['importance_rank'] = 0
    else:
        return_results['importance_rank'] = 0
    return_results['importance'] = results['importance']
    if results['activeness']:
        query_body = {
            'query': {
                'range': {
                    'activeness': {'from': results['activeness'], 'to': 10000}
                }
            }
        }
        activeness_rank = es.count(index='sensitive_user_portrait', doc_type='user', body=query_body)
        if activeness_rank['_shards']['successful'] != 0:
            return_results['activeness_rank'] = activeness_rank['count']
        else:
            return_results['activeness_rank'] = 0
    else:
        return_results['activeness_rank'] = 0
    return_results['activeness'] = results['activeness']
    if results['influence']:
        query_body = {
            'query': {
                'range': {
                    'influence': {'from': results['influence'], 'to': 100000}
                }
            }
        }
        influence_rank = es.count(index='sensitive_user_portrait', doc_type='user', body=query_body)
        if influence_rank['_shards']['successful'] != 0:
            return_results['influence_rank'] = influence_rank['count']
        else:
            return_results['influence_rank'] = 0
    else:
        return_results['influence_rank'] = 0
    return_results['influence'] = results['influence']
    if results['sensitive']:
        query_body = {
            'query': {
                'range': {
                    'sensitive': {'from': results['sensitive'], 'to': 100000}
                }
            }
        }
        # variable renamed: this rank reused the name 'influence_rank'
        sensitive_rank = es.count(index='sensitive_user_portrait', doc_type='user', body=query_body)
        if sensitive_rank['_shards']['successful'] != 0:
            return_results['sensitive_rank'] = sensitive_rank['count']
        else:
            return_results['sensitive_rank'] = 0
    else:
        return_results['sensitive_rank'] = 0
    return_results['sensitive'] = results['sensitive']
    all_count = es.count(index='sensitive_user_portrait', doc_type='user',
                         body={'query': {'match_all': {}}})
    if all_count['_shards']['successful'] != 0:
        return_results['all_count'] = all_count['count']
    else:
        print 'es_sensitive_user_portrait error'
        return_results['all_count'] = 0
    # link ratio
    return_results['link'] = results['link']
    weibo_trend = get_user_trend(uid)[0]
    return_results['time_description'] = active_time_description(weibo_trend)
    return_results['time_trend'] = weibo_trend
    # user influence trend over the last 7 days
    influence_detail = []
    influence_value = []
    attention_value = []
    ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test
    for i in range(1, 8):
        date = ts2datetime(ts + i * 24 * 3600).replace('-', '')
        detail = [0] * 10
        try:
            item = es.get(index=date, doc_type='bci', id=uid)['_source']
            '''
            if return_results['utype']:
                detail[0] = item.get('s_origin_weibo_number', 0)
                detail[1] = item.get('s_retweeted_weibo_number', 0)
                detail[2] = item.get('s_origin_weibo_retweeted_total_number', 0) + item.get('s_retweeted_weibo_retweeted_total_number', 0)
                detail[3] = item.get('s_origin_weibo_comment_total_number', 0) + item.get('s_retweeted_weibo_comment_total_number', 0)
            else:
            '''
            if 1:  # ordinary counts; the sensitive branch above is disabled
                detail[0] = item.get('origin_weibo_number', 0)
                detail[1] = item.get('retweeted_weibo_number', 0)
                detail[2] = item.get('origin_weibo_retweeted_total_number', 0) + \
                            item.get('retweeted_weibo_retweeted_total_number', 0)
                detail[3] = item.get('origin_weibo_comment_total_number', 0) + \
                            item.get('retweeted_weibo_comment_total_number', 0)
            retweeted_id = item.get('origin_weibo_top_retweeted_id', '0')
            detail[4] = retweeted_id
            if retweeted_id:
                try:
                    detail[5] = es.get(index='sensitive_user_text', doc_type='user',
                                       id=retweeted_id)['_source']['text']
                except:
                    detail[5] = ''
            else:
                detail[5] = ''
            detail[6] = item.get('origin_weibo_retweeted_top_number', 0)
            detail[7] = item.get('origin_weibo_top_comment_id', '0')
            if detail[7]:
                try:
                    detail[8] = es.get(index='sensitive_user_text', doc_type='user',
                                       id=detail[7])['_source']['text']
                except:
                    detail[8] = ''
            else:
                detail[8] = ''
            detail[9] = item.get('origin_weibo_comment_top_number', 0)
            attention_number = detail[2] + detail[3]
            # squash the raw retweet+comment count into [0, 1) with a scaled logistic
            attention = 2 / (1 + math.exp(-0.005 * attention_number)) - 1
            influence_value.append([date, item['user_index']])
            influence_detail.append([date, detail])
            attention_value.append(attention)
        except:
            influence_value.append([date, 0])
            influence_detail.append([date, detail])
            attention_value.append(0)
    return_results['influence_trend'] = influence_value
    return_results['common_influence_detail'] = influence_detail
    return_results['attention_degree'] = attention_value
    return return_results
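
# The four rank lookups above repeat one query shape. A hedged refactoring
# sketch (not part of the original module): `field` and `upper` are the only
# assumptions; `es` is the same portrait client used above.
def get_index_rank(field, value, upper=100000):
    # count portrait users whose `field` value falls in [value, upper]
    query_body = {'query': {'range': {field: {'from': value, 'to': upper}}}}
    resp = es.count(index='sensitive_user_portrait', doc_type='user', body=query_body)
    if resp['_shards']['successful'] != 0:
        return resp['count']
    return 0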
from flask import Blueprint, request  # required by the blueprint and route below
import json  # used by the route below
from search import sensitive_search_mention, sensitive_search_attention, sensitive_search_follower, \
    sensitive_search_be_comment, sensitive_search_bidirect_interaction
from search import delete_action, search_identify_uid, get_activeness_trend
from search import get_activity_weibo, search_comment, search_be_comment, sensitive_search_comment
from search import search_bidirect_interaction, search_preference_attribute, search_sentiment_trend
from search import search_sentiment_weibo, get_influence_trend, search_remark, edit_remark, search_user_group
from sensitive_user_portrait.search_user_profile import es_get_source
from sensitive_user_portrait.global_utils import es_user_portrait as es
from sensitive_user_portrait.parameter import SOCIAL_DEFAULT_COUNT, SENTIMENT_TREND_DEFAULT_TYPE
from sensitive_user_portrait.parameter import DEFAULT_SENTIMENT, DAY
from sensitive_user_portrait.parameter import RUN_TYPE, RUN_TEST_TIME, WORK_TYPE
from sensitive_user_portrait.time_utils import ts2datetime, datetime2ts
from personal_influence import get_user_influence, influenced_detail, influenced_people, \
    influenced_user_detail, statistics_influence_people, tag_vector, comment_on_influence, \
    detail_weibo_influence, influence_summary

# used to test: 13-09-08
test_time = datetime2ts(RUN_TEST_TIME)

# custom_attribute index
attribute_index_name = 'custom_attribute'
attribute_index_type = 'attribute'

mod = Blueprint('attribute', __name__, url_prefix='/attribute')


@mod.route('/search_user_group/')
def ajax_search_user_group():
    uid = request.args.get('uid', '')
    results = search_user_group(uid)
    return json.dumps(results)
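
# A hedged sketch of how this blueprint would be mounted and exercised; the
# Flask app object and the uid here are hypothetical, not part of this module.
def _demo_attribute_route():
    from flask import Flask
    app = Flask(__name__)
    app.register_blueprint(mod)  # routes appear under /attribute/...
    client = app.test_client()
    resp = client.get('/attribute/search_user_group/?uid=1234567890')  # hypothetical uid
    print(resp.data)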
def get_user_trend(uid):
    if RUN_TYPE:
        now_date = ts2datetime(time.time())  # e.g. 2015-09-22
    else:
        now_date = "2013-09-08"  # test
    timestamp = datetime2ts(now_date)
    return_results = dict()
    return_sensitive_results = {}
    # collect per-15-minute activity counts for the last 7 days
    for i in range(1, 8):
        ts = timestamp - 24 * 3600 * i
        date = ts2datetime(ts)
        if WORK_TYPE == 0:
            index_name = act_index_pre + date
            sensitive_index_name = sen_act_index_pre + date
            exist_bool = es_cluster.indices.exists(index=index_name)
            sensitive_exist_bool = es_cluster.indices.exists(index=sensitive_index_name)
            if exist_bool:
                try:
                    tmp_act_result = es_cluster.get(index=index_name, doc_type="activity",
                                                    id=uid)['_source']
                    results = tmp_act_result['activity_dict']
                except:
                    results = dict()
            else:
                results = dict()
            if sensitive_exist_bool:
                try:
                    tmp_sensitive_act_result = es_cluster.get(index=sensitive_index_name,
                                                              doc_type="sensitive_activity",
                                                              id=uid)['_source']
                    # fixed: this read 'tmp_sensitive_ip_result', an undefined name
                    sensitive_results = tmp_sensitive_act_result['sensitive_activity_dict']
                except:
                    sensitive_results = dict()
            else:
                sensitive_results = dict()
        else:
            results = redis_activity.hget('activity_' + str(ts), uid)
            sensitive_results = redis_activity.hget('sensitive_activity_' + str(ts), uid)
        if results:
            result_dict = json.loads(results)
            # keys are 15-minute (900 s) slot numbers within the day
            for key in result_dict.keys():
                return_results[int(key) * 900 + ts] = result_dict[key]
        if sensitive_results:
            sensitive_result_dict = json.loads(sensitive_results)
            for key in sensitive_result_dict.keys():
                return_sensitive_results[int(key) * 900 + ts] = sensitive_result_dict[key]
    # aggregate the 15-minute slots into 4-hour segments (16 slots each)
    trend_dict = {}
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        for j in range(0, 6):
            base_time = ts + j * 900 * 16
            num = 0
            for k in range(16):
                seg_time = base_time + k * 900
                if seg_time in return_results:
                    num += return_results[seg_time]
            trend_dict[base_time] = num
    sensitive_trend_dict = {}
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        for j in range(0, 6):
            base_time = ts + j * 900 * 16
            num = 0
            for k in range(16):
                seg_time = base_time + k * 900
                if seg_time in return_sensitive_results:
                    num += return_sensitive_results[seg_time]
            sensitive_trend_dict[base_time] = num
    # fold sensitive counts into the total trend
    for key in sensitive_trend_dict:
        if key in trend_dict:
            trend_dict[key] += sensitive_trend_dict[key]
        else:
            trend_dict[key] = sensitive_trend_dict[key]
    sorted_dict = sorted(trend_dict.items(), key=lambda x: x[0], reverse=False)
    sorted_sensitive_dict = sorted(sensitive_trend_dict.items(), key=lambda x: x[0], reverse=False)
    return [sorted_dict, sorted_sensitive_dict]  # total and sensitive
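
# A minimal, self-contained sketch of the binning above, assuming the same
# layout: per-day activity keyed by 15-minute slot number, reported in 4-hour
# (16-slot) segments. The timestamp and counts are hypothetical sample data.
def _demo_segment_binning():
    day_start = 1378569600                     # 2013-09-08 00:00 UTC, for illustration
    slot_counts = {'0': 2, '1': 1, '40': 5}    # slot number -> weibo count
    per_slot = dict((day_start + int(k) * 900, v) for k, v in slot_counts.items())
    segments = {}
    for j in range(6):                         # 6 segments of 4 hours each
        base = day_start + j * 900 * 16
        segments[base] = sum(per_slot.get(base + k * 900, 0) for k in range(16))
    print(segments)  # slots 0 and 1 land in segment 0, slot 40 in segment 2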
def full_text_search(keywords, uid, start_time, end_time, size):
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": 'desc'}}
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": 'desc'}}
    if uid:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append({"term": {"uid": uid}})
    if keywords:
        # comma-separated keywords, each matched as a substring wildcard
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["filtered"]["filter"]["bool"]["must"].append(
                {'wildcard': {'text': {'wildcard': '*' + word + '*'}}})
    # collect the daily flow_text indices that actually exist in [start_time, end_time]
    index_list = []
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_" + ts2datetime(ts)
            if es_flow_text.indices.exists(index=index_name):
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600 * 24
    print index_list
    # no usable flow_text index
    if not index_list:
        return [], []  # fixed: a bare [] broke callers that unpack two values
    search_results = es_flow_text.search(index=index_list, doc_type="text",
                                         body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    history_max = get_history_max()
    personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
    user_info = get_user_profile(uid_list, personal_field)
    bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci",
                                        body={"ids": uid_list}, _source=False,
                                        fields=["bci_day_last"])["docs"]
    sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive",
                                                  body={"ids": uid_list}, _source=False,
                                                  fields=["last_value"])["docs"]
    count = 0
    # each row: uid, uname, text, date, geo, sensitive_words, retweeted, comment
    for item in search_results:
        item = item['_source']
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)
    # build one profile row per distinct uid; enumerate keeps the counter
    # aligned with bci_results/sensitive_results (it never advanced before)
    user_set = set()
    for count, item in enumerate(user_info):
        if item[0] in user_set:
            continue
        user_set.add(item[0])
        if bci_results[count]["found"]:
            bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        user_profile_list.append(item)
    return results, user_profile_list
def full_text_search(keywords, uid, start_time, end_time, size):
    # revised version: plain bool query, empty-result guards, and an
    # in-portrait flag appended to each user profile row
    results = []
    uid_list = []
    user_profile_list = []
    query_body = {
        "query": {
            "bool": {
                "must": []
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": 'desc'}}
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": 'desc'}}
    if uid:
        query_body["query"]["bool"]["must"].append({"term": {"uid": uid}})
    if keywords:
        keywords_list = keywords.split(',')
        for word in keywords_list:
            query_body["query"]["bool"]["must"].append(
                {'wildcard': {'text': {'wildcard': '*' + word + '*'}}})
    # collect the daily flow_text indices that actually exist in [start_time, end_time]
    index_list = []
    if start_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        ts = end_ts
        while 1:
            index_name = "flow_text_" + ts2datetime(ts)
            if es_flow_text.indices.exists(index=index_name):
                index_list.append(index_name)
            if ts == start_ts:
                break
            else:
                ts -= 3600 * 24
    print index_list
    # no usable flow_text index
    if not index_list:
        return [[], []]
    search_results = es_flow_text.search(index=index_list, doc_type="text",
                                         body=query_body)["hits"]["hits"]
    for item in search_results:
        uid_list.append(item['_source']['uid'])
    user_info = []
    if uid_list:
        history_max = get_history_max()
        personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
        user_info = get_user_profile(uid_list, personal_field)
        bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci",
                                            body={"ids": uid_list}, _source=False,
                                            fields=["bci_day_last"])["docs"]
        in_portrait = es_user_portrait.mget(index="sensitive_user_portrait", doc_type="user",
                                            body={"ids": uid_list}, _source=False)["docs"]
        sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive",
                                                      body={"ids": uid_list}, _source=False,
                                                      fields=["last_value"])["docs"]
    print "len search: ", len(search_results)
    count = 0
    # each row: uid, uname, text, date, geo, sensitive_words, retweeted, comment
    for item in search_results:
        item = item['_source']
        iter_item = []
        iter_item.append(item['uid'])
        iter_item.append(user_info[count][1])
        iter_item.append(item['text'])
        iter_item.append(ts2date(item['timestamp']))
        iter_item.append(item['geo'])
        if item.get("sensitive_words_string", ''):
            iter_item.append(item['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(item.get('retweeted', 0))
        iter_item.append(item.get('comment', 0))
        count += 1
        results.append(iter_item)
    # each profile row: uid, nick_name, fansnum, statusnum, user_location,
    # bci, sensitive, in_portrait; enumerate keeps the counter aligned with
    # the mget result lists (it never advanced before)
    user_set = set()
    for count, item in enumerate(user_info):
        if item[0] in user_set:
            continue
        user_set.add(item[0])
        if bci_results[count]["found"]:
            if "fields" in bci_results[count]:
                bci_value = bci_results[count]["fields"]["bci_day_last"][0]
            else:
                bci_value = 0
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        if sensitive_results[count]["found"]:
            if "fields" in sensitive_results[count]:
                sensitive_value = sensitive_results[count]['fields']['last_value'][0]
            else:
                sensitive_value = 0
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)
        if in_portrait[count]["found"]:
            item.append("1")
        else:
            item.append("0")
        user_profile_list.append(item)
    return results, user_profile_list
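
# A hedged illustration of the query body full_text_search() builds; the
# keywords and uid here are hypothetical, printed only to show the shape
# sent to ES.
def _demo_full_text_query():
    import json
    keywords, uid = 'foo,bar', '1234567890'  # hypothetical inputs
    must = [{"term": {"uid": uid}}]
    must += [{'wildcard': {'text': {'wildcard': '*' + w + '*'}}} for w in keywords.split(',')]
    query_body = {"query": {"bool": {"must": must}},
                  "size": 100,
                  "sort": {"timestamp": {"order": 'desc'}}}
    print(json.dumps(query_body, indent=2))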
def sensitive_attribute(uid, date):
    results = {}
    utype = user_type(uid)
    if not utype:
        results['utype'] = 0
        return results
    results['utype'] = 1
    results['uid'] = uid
    portrait_result = es.get(index='sensitive_user_portrait', doc_type='user', id=uid)['_source']
    results['uname'] = portrait_result['uname']
    if portrait_result['uname'] == 0:
        results['uname'] = 'unknown'
    if portrait_result['photo_url'] == 0:
        portrait_result['photo_url'] = 'unknown'
    if portrait_result['location'] == 0:
        portrait_result['location'] = 'unknown'
    results['photo_url'] = portrait_result['photo_url']
    # sensitive weibo number statistics
    date = ts2datetime(time.time() - 24 * 3600).replace('-', '')
    date = '20130907'  # test
    try:
        influence_results = es.get(index=date, doc_type='bci', id=uid)['_source']
        results['sensitive_origin_weibo_number'] = influence_results.get('s_origin_weibo_number', 0)
        results['sensitive_retweeted_weibo_number'] = influence_results.get('s_retweeted_weibo_number', 0)
        results['sensitive_comment_weibo_number'] = int(influence_results.get('s_comment_weibo_number', 0))
        results['sensitive_retweeted_weibo_retweeted_total_number'] = influence_results.get('s_retweeted_weibo_retweeted_total_number', 0)
        results['sensitive_origin_weibo_retweeted_total_number'] = influence_results.get('s_origin_weibo_retweeted_total_number', 0)
        results['sensitive_origin_weibo_comment_total_number'] = influence_results.get('s_origin_weibo_comment_total_number', 0)
        results['sensitive_retweeted_weibo_comment_total_number'] = influence_results.get('s_retweeted_weibo_comment_total_number', 0)
    except:
        results['sensitive_origin_weibo_number'] = 0
        results['sensitive_retweeted_weibo_number'] = 0
        results['sensitive_comment_weibo_number'] = 0
        results['sensitive_origin_weibo_retweeted_total_number'] = 0
        results['sensitive_origin_weibo_comment_total_number'] = 0
        results['sensitive_retweeted_weibo_retweeted_total_number'] = 0
        results['sensitive_retweeted_weibo_comment_total_number'] = 0
    try:
        item = es.get(index=date, doc_type='bci', id=uid)['_source']
    except:
        item = {}
    # totals = ordinary counts + sensitive counts
    results['origin_weibo_total_number'] = item.get('origin_weibo_number', 0) + results['sensitive_origin_weibo_number']
    results['retweeted_weibo_total_number'] = item.get('retweeted_weibo_number', 0) + results['sensitive_retweeted_weibo_number']
    results['comment_weibo_total_number'] = int(item.get('comment_weibo_number', 0)) + int(results['sensitive_comment_weibo_number'])
    results['origin_weibo_retweeted_total_number'] = item.get('origin_weibo_retweeted_total_number', 0) + results['sensitive_origin_weibo_retweeted_total_number']
    results['origin_weibo_comment_total_number'] = item.get('origin_weibo_comment_total_number', 0) + results['sensitive_origin_weibo_comment_total_number']
    results['retweeted_weibo_retweeted_total_number'] = item.get('retweeted_weibo_retweeted_total_number', 0) + results['sensitive_retweeted_weibo_retweeted_total_number']
    results['retweeted_weibo_comment_total_number'] = item.get('retweeted_weibo_comment_total_number', 0) + results['sensitive_retweeted_weibo_comment_total_number']
    results['sensitive_text'] = sort_sensitive_text(uid)
    results['sensitive_geo_distribute'] = []
    results['sensitive_time_distribute'] = get_user_trend(uid)[1]
    results['sensitive_hashtag'] = []
    results['sensitive_words'] = []
    results['sensitive_hashtag_dict'] = []
    results['sensitive_words_dict'] = []
    results['sensitive_hashtag_description'] = ''
    # sentiment trend; 'negetive' is kept as spelled because downstream
    # consumers read these keys
    sentiment_trend = user_sentiment_trend(uid)
    emotion_number = sentiment_trend[0]
    # guard added: the original divided without checking for an all-zero count
    emotion_total = emotion_number[0] + emotion_number[1] + emotion_number[2]
    if emotion_total:
        results['negetive_index'] = float(emotion_number[2]) / emotion_total
        results['negetive_influence'] = float(emotion_number[1]) / emotion_total
    else:
        results['negetive_index'] = 0
        results['negetive_influence'] = 0
    sentiment_dict = sentiment_trend[1]
    return_sentiment = dict()
    return_sentiment['positive'] = []
    return_sentiment['neutral'] = []
    return_sentiment['negetive'] = []
    ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test
    for i in range(1, 8):
        ts = ts + 24 * 3600
        date = ts2datetime(ts).replace('-', '')
        temp = sentiment_dict.get(date, {})
        return_sentiment['positive'].append([temp.get('positive', 0), date])
        return_sentiment['negetive'].append([temp.get('negetive', 0), date])
        return_sentiment['neutral'].append([temp.get('neutral', 0), date])
    results['sentiment_trend'] = return_sentiment
    if 1:
        portrait_results = es.get(index="sensitive_user_portrait", doc_type='user', id=uid)['_source']
        results['politics_trend'] = portrait_results['politics_trend']
        results['domain'] = portrait_results['domain']
        results['sensitive'] = portrait_results['sensitive']
        temp_hashtag = portrait_results['sensitive_hashtag_dict']
        temp_sensitive_words = portrait_results['sensitive_words_dict']
        temp_sensitive_geo = portrait_results['sensitive_geo_activity']
        if temp_sensitive_geo:
            sensitive_geo_dict = json.loads(temp_sensitive_geo)
            if len(sensitive_geo_dict) < 7:
                # pad the 7-day window with empty days
                ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test
                for i in range(7):
                    ts = ts + 24 * 3600
                    date = ts2datetime(ts).replace('-', '')
                    if date not in sensitive_geo_dict:
                        sensitive_geo_dict[date] = {}
            sorted_sensitive_geo = sorted(sensitive_geo_dict.items(), key=lambda x: x[0], reverse=False)
            sensitive_geo_list = []
            for k, v in sorted_sensitive_geo:
                sorted_geo = sorted(v.items(), key=lambda x: x[1], reverse=True)[0:2]
                sensitive_geo_list.append([k, sorted_geo])
            results['sensitive_geo_distribute'] = sensitive_geo_list
        if temp_hashtag:
            hashtag_dict = json.loads(temp_hashtag)
            # pad the 7-day window and sort each day's hashtags; the original
            # only did this when fewer than 7 days were present, which left the
            # per-day values as dicts and broke the aggregation below
            ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test
            for i in range(7):
                ts = ts + 24 * 3600
                date = ts2datetime(ts).replace('-', '')
                if date in hashtag_dict:
                    hashtag_dict[date] = sorted(hashtag_dict[date].items(),
                                                key=lambda x: x[1], reverse=True)
                else:
                    hashtag_dict[date] = {}
            results['sensitive_hashtag_description'] = hashtag_description(hashtag_dict)
        else:
            hashtag_dict = {}
        if temp_sensitive_words:
            sensitive_words_dict = json.loads(temp_sensitive_words)
            if len(sensitive_words_dict) < 7:
                # pad the 7-day window with empty days
                ts = datetime2ts('2013-09-08') - 8 * 24 * 3600  # test
                for i in range(7):
                    ts = ts + 24 * 3600
                    date = ts2datetime(ts).replace('-', '')
                    if date not in sensitive_words_dict:
                        sensitive_words_dict[date] = {}
        else:
            sensitive_words_dict = {}
        date = ts2datetime(time.time() - 24 * 3600).replace('-', '')
        date = '20130907'  # test
        results['today_sensitive_words'] = sensitive_words_dict.get(date, {})
        # aggregate hashtags over the whole week; each day's value is a
        # sorted list of (hashtag, count) pairs
        all_hashtag_dict = {}
        for item in hashtag_dict:
            for key in hashtag_dict[item]:
                if key[0] in all_hashtag_dict:
                    all_hashtag_dict[key[0]] += key[1]
                else:
                    all_hashtag_dict[key[0]] = key[1]
        # aggregate sensitive words; each day's value is a word -> count dict
        all_sensitive_words_dict = {}
        for item in sensitive_words_dict:
            detail_words_dict = sensitive_words_dict[item]
            for key in detail_words_dict:
                if key in all_sensitive_words_dict:
                    all_sensitive_words_dict[key] += detail_words_dict[key]
                else:
                    all_sensitive_words_dict[key] = detail_words_dict[key]
        sorted_hashtag = sorted(all_hashtag_dict.items(), key=lambda x: x[1], reverse=True)
        sorted_words = sorted(all_sensitive_words_dict.items(), key=lambda x: x[1], reverse=True)
        sorted_hashtag_dict = sorted(hashtag_dict.items(), key=lambda x: x[0], reverse=False)
        sorted_words_dict = sorted(sensitive_words_dict.items(), key=lambda x: x[0], reverse=False)
        new_sorted_dict = sort_sensitive_words(sorted_words)
        results['sensitive_hashtag'] = sorted_hashtag
        results['sensitive_words'] = new_sorted_dict
        results['sensitive_hashtag_dict'] = sorted_hashtag_dict
        results['sensitive_words_dict'] = sorted_words_dict
    results['sensitive_retweet'] = search_retweet(uid, 1)
    results['sensitive_follow'] = search_follower(uid, 1)
    results['sensitive_at'] = search_mention(uid, 1)
    return results
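
# The 7-day padding above recurs for geo, hashtags, and sensitive words. A
# hedged refactoring sketch (not in the original module): `end_ts` marks the
# day after the window, mirroring the test timestamps used above.
def _pad_week(day_dict, end_ts):
    # ensure exactly the last 7 days are present, adding empty dicts as needed
    ts = end_ts - 8 * 24 * 3600
    for i in range(7):
        ts += 24 * 3600
        date = ts2datetime(ts).replace('-', '')
        day_dict.setdefault(date, {})
    return day_dict

# e.g. _pad_week(json.loads(temp_sensitive_words), datetime2ts('2013-09-08'))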