def statistics_influence_people(uid, date, style, sensitive=0):
    """Summarise the users who retweeted or commented on ``uid``'s weibo.

    Collects the mids of the user's original weibo (message_type 1) and the
    root mids of the weibo the user retweeted (message_type 3) from the
    day's flow-text index, then delegates to ``influenced_user_detail``.

    Args:
        uid: user id whose weibo are looked up.
        date: day string; appended as-is to ``pre_text_index`` and, with the
            dashes removed, to ``pre_index``.
        style: ``0`` (after ``int()``) selects retweeting users, any other
            value selects commenting users.
        sensitive: when truthy, only weibo with ``sensitive > 0`` are used.

    Returns:
        The value returned by ``influenced_user_detail``, or ``{}`` when the
        user's influence record for that day cannot be fetched.
    """
    results = {}
    date1 = str(date).replace('-', '')
    index_name = pre_index + date1
    print(index_name)
    index_flow_text = pre_text_index + date

    # The record itself is not used; it only gates the rest of the work.
    try:
        es_cluster.get(index=index_name, doc_type=influence_doctype, id=uid)["_source"]
    except Exception:
        return results

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": []
                    }
                }
            }
        },
        "size": 1000
    }
    if sensitive:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].append(
            {"range": {"sensitive": {"gt": 0}}})

    origin_body = copy.deepcopy(query_body)
    retweeted_body = copy.deepcopy(query_body)

    # Original weibo posted by the user.
    origin_body["query"]["filtered"]["filter"]["bool"]["must"].extend(
        [{"term": {"message_type": 1}}, {"term": {"uid": uid}}])
    origin_hits = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                            body=origin_body)["hits"]["hits"]
    origin_mid = [hit['_id'] for hit in origin_hits]

    # Weibo retweeted by the user. The filters must go on the second body;
    # otherwise this search would run without any uid/message_type filter.
    retweeted_body["query"]["filtered"]["filter"]["bool"]["must"].extend(
        [{"term": {"message_type": 3}}, {"term": {"uid": uid}}])
    retweeted_hits = es.search(index=index_flow_text, doc_type=flow_text_index_type,
                               body=retweeted_body)["hits"]["hits"]
    retweeted_mid = [hit['_source']['root_mid'] for hit in retweeted_hits
                     if hit['_source'].get('root_mid', '')]

    # 3 = people who retweeted, 2 = people who commented.
    if int(style) == 0:
        influence_type = 3
    else:
        influence_type = 2
    return influenced_user_detail(uid, date, origin_mid, retweeted_mid, influence_type)
def query_hot_mid(ts, keywords_list, text_type,size=100):
    """Return the most frequent ``root_mid`` values in the window before ``ts``.

    The window is ``[ts - time_interval, ts)``. Documents must match one of
    ``keywords_list`` in ``keywords_string`` and have ``message_type`` "0".
    When both window ends fall on the same day, that day's index is queried;
    otherwise the index of the window start's day is queried. A missing
    index yields an empty list.

    Args:
        ts: end of the window (exclusive).
        keywords_list: values for the ``keywords_string`` terms filter.
        text_type: accepted but not used by this function.
        size: number of aggregation buckets requested.

    Returns:
        A list of ``[root_mid, doc_count]`` pairs in bucket order.
    """
    window_start = ts - time_interval
    must_filters = [
        {"range": {"timestamp": {"gte": window_start, "lt": ts}}},
        {"terms": {"keywords_string": keywords_list}},
        {"term": {"message_type": "0"}},
    ]
    query_body = {
        "query": {"filtered": {"filter": {"bool": {"must": must_filters}}}},
        "aggs": {"all_interests": {"terms": {"field": "root_mid", "size": size}}},
    }

    end_day = ts2datetime(ts)
    start_day = ts2datetime(window_start)
    end_index = flow_text_index_name_pre + end_day
    end_index_exists = es_text.indices.exists(end_index)
    start_index = flow_text_index_name_pre + start_day
    start_index_exists = es_text.indices.exists(start_index)
    print("%s %s" % (end_day, start_day))

    # Pick the single index to query, or None when it does not exist.
    if end_day == start_day:
        target_index = end_index if end_index_exists else None
    else:
        target_index = start_index if start_index_exists else None

    buckets = []
    if target_index is not None:
        response = es_text.search(index=target_index, doc_type=flow_text_index_type,
                                  body=query_body)
        buckets = response["aggregations"]["all_interests"]["buckets"]

    hot_mid_list = []
    for bucket in buckets:
        print(bucket)
        hot_mid_list.append([bucket['key'], bucket['doc_count']])
    return hot_mid_list
def get_activity_weibo(task_name, submit_user, start_ts, time_segment=FOUR_HOUR):
    """Return the weibo posted by a group's members during one time segment.

    Args:
        task_name: group task name; appended to ``submit_user`` to form the
            group document id.
        submit_user: user who submitted the group task.
        start_ts: segment start (inclusive lower bound on ``timestamp``);
            also selects the daily flow-text index.
        time_segment: segment length added to ``start_ts``.

    Returns:
        A list of dicts with keys ``timestamp``, ``ip``, ``text`` and
        ``geo`` in the order returned by the timestamp-sorted search, or the
        string 'task name invalid' / 'task uid list null' when the group
        document or its uid list cannot be read.
    """
    results = []

    # step1: get the uid list of the group task
    task_id = submit_user + task_name
    try:
        group_result = es_group_result.get(index=group_index_name, doc_type=group_index_type,
                                           id=task_id, _source=False, fields=['uid_list'])
    except Exception:
        group_result = {}
    if group_result == {}:
        return 'task name invalid'
    try:
        uid_list = group_result['fields']['uid_list']
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return 'task uid list null'

    # step2: search the weibo of the time segment
    end_ts = start_ts + time_segment
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_ts)
    query = [
        {'terms': {'uid': uid_list}},
        {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}},
    ]
    try:
        flow_text_es_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body={'query': {'bool': {'must': query}}, 'sort': 'timestamp', 'size': MAX_VALUE}
        )['hits']['hits']
    except Exception:
        # Best effort: a failed search is treated as "no weibo".
        flow_text_es_result = []

    for item in flow_text_es_result:
        source = item['_source']
        geo = source['geo']
        if not geo:
            geo_text = ''
        elif isinstance(geo, (list, tuple)):
            geo_text = '\t'.join(geo)
        else:
            # The other flow-text readers in this file treat geo as an
            # '&'-separated string; joining the raw string would put a tab
            # between every character.
            geo_text = '\t'.join(geo.split('&'))
        results.append({
            'timestamp': ts2date(source['timestamp']),
            'ip': source['ip'],
            'text': source['text'],
            'geo': geo_text,
        })
    return results
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return the weibo posted by ``uid`` within ``[timestamp_from, timestamp_to)``.

    The range is split into one sub-range per calendar day because the
    flow-text data lives in one index per day; each day is searched
    separately and the hits of all days are combined.

    Args:
        uid: user id to filter on.
        timestamp_from: inclusive start timestamp.
        timestamp_to: exclusive end timestamp.

    Returns:
        A list of dicts with keys ``timestamp``, ``ip``, ``text`` and
        ``geo``, day by day in ascending timestamp order. Empty when the
        range is empty or reversed.
    """
    weibo_list = []

    # split the timestamp range into one range filter per day
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    iter_date_ts = from_date_ts
    # `<=` so that the (possibly partial) last day is searched as well.
    while iter_date_ts <= to_date_ts:
        range_gte = max(iter_date_ts, timestamp_from)
        range_lt = min(iter_date_ts + DAY, timestamp_to)
        if range_gte < range_lt:
            new_range_dict_list.append(
                {"range": {"timestamp": {"gte": range_gte, "lt": range_lt}}})
        iter_date_ts += DAY

    # iterate the days and collect the weibo of every day
    # NOTE(review): no "size" is passed, so Elasticsearch applies its default
    # page size to each daily search - confirm that this is intended.
    for range_item in new_range_dict_list:
        range_from_date = ts2datetime(range_item["range"]["timestamp"]["gte"])
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = [{"term": {"uid": uid}}, range_item]
        try:
            day_hits = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body={"query": {"bool": {"must": query}}, "sort": [{"timestamp": "asc"}]},
            )["hits"]["hits"]
        except Exception:
            # Best effort: a failed daily search contributes no weibo.
            day_hits = []

        for item in day_hits:
            source = item["_source"]
            weibo = {}
            weibo["timestamp"] = ts2date(source["timestamp"])
            weibo["ip"] = source["ip"]
            weibo["text"] = source["text"]
            if source["geo"]:
                weibo["geo"] = "\t".join(source["geo"].split("&"))
            else:
                weibo["geo"] = ""
            weibo_list.append(weibo)
    return weibo_list
def influenced_detail(uid, date, weibo_style, order):
    """Return the text details of one kind of weibo posted by ``uid`` on ``date``.

    Args:
        uid: user id to filter on.
        date: day string appended to ``"flow_text_"`` to name the index; also
            passed through to ``get_text``.
        weibo_style: after ``int()``, 1 selects message_type 1 (original),
            2 selects message_type 2 (comment) and any other value selects
            message_type 3 (retweeted).
        order: passed through to ``get_text``.

    Returns:
        The value returned by ``get_text`` for the rows
        ``[mid, retweeted, comment, sensitive]`` of the matching weibo.
    """
    weibo_style = int(weibo_style)
    message_type = weibo_style if weibo_style in (1, 2) else 3
    index_text = "flow_text_" + date

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {"message_type": message_type}},
                            {"term": {"uid": uid}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }
    hits = es.search(index=index_text, doc_type="text", body=query_body)['hits']['hits']

    weibo_set = []
    for hit in hits:
        source = hit['_source']
        weibo_set.append([
            hit['_id'],
            source.get("retweeted", 0),
            source.get("comment", 0),
            source.get("sensitive", 0),
        ])
    return get_text(weibo_set, date, order)
def search_group_sentiment_weibo(task_name, submit_user, start_ts, sentiment):
    """Return one day of a group's weibo that carry the given sentiment.

    Args:
        task_name: group task name; appended to ``submit_user`` to form the
            group document id.
        submit_user: user who submitted the group task.
        start_ts: start of the day window ``[start_ts, start_ts + DAY)``;
            also selects the daily flow-text index.
        sentiment: sentiment value to match. The string "2" matches any of
            the values in ``SENTIMENT_SECOND`` instead of a single value.

    Returns:
        A list of dicts with keys ``uid``, ``uname``, ``ip``, ``geo``,
        ``text``, ``timestamp`` and ``sentiment`` in ascending timestamp
        order, or the string "task name invalid" / "task uid list null"
        when the group document or its uid list cannot be read. ``uname``
        is "unknown" for users without a portrait name.
    """
    # step1: get the uid list of the group task
    task_id = submit_user + task_name
    try:
        group_result = es_group_result.get(
            index=group_index_name, doc_type=group_index_type, id=task_id,
            _source=False, fields=["uid_list"]
        )
    except Exception:
        group_result = {}
    if group_result == {}:
        return "task name invalid"
    try:
        uid_list = group_result["fields"]["uid_list"]
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return "task uid list null"

    # step2: map uid -> uname for the users found in the portrait index
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(
            index=portrait_index_name,
            doc_type=portrait_index_type,
            body={"ids": uid_list},
            _source=False,
            fields=["uname"],
        )["docs"]
    except Exception:
        user_portrait_result = []
    for item in user_portrait_result:
        if item.get("found"):
            try:
                uid2uname[item["_id"]] = item["fields"]["uname"][0]
            except (KeyError, IndexError):
                # Found, but without a stored uname: leave it unmapped.
                pass

    # step3: build the query for the day
    flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(start_ts))
    if sentiment != "2":
        sentiment_filter = {"term": {"sentiment": sentiment}}
    else:
        sentiment_filter = {"terms": {"sentiment": SENTIMENT_SECOND}}
    query_body = [
        {"terms": {"uid": uid_list}},
        sentiment_filter,
        {"range": {"timestamp": {"gte": start_ts, "lt": start_ts + DAY}}},
    ]
    try:
        flow_text_result = es_flow_text.search(
            index=flow_text_index_name,
            doc_type=flow_text_index_type,
            body={
                "query": {"bool": {"must": query_body}},
                "sort": [{"timestamp": {"order": "asc"}}],
                "size": MAX_VALUE,
            },
        )["hits"]["hits"]
    except Exception:
        # Best effort: a failed search is treated as "no weibo".
        flow_text_result = []

    # step4: convert the hits
    weibo_list = []
    for flow_text_item in flow_text_result:
        source = flow_text_item["_source"]
        weibo = {}
        weibo["uid"] = source["uid"]
        # Users missing from the portrait lookup must not abort the listing.
        weibo["uname"] = uid2uname.get(weibo["uid"], "unknown")
        weibo["ip"] = source["ip"]
        try:
            weibo["geo"] = "\t".join(source["geo"].split("&"))
        except Exception:
            weibo["geo"] = ""
        weibo["text"] = source["text"]
        weibo["timestamp"] = source["timestamp"]
        weibo["sentiment"] = source["sentiment"]
        weibo_list.append(weibo)
    return weibo_list
def get_social_inter_content(uid1, uid2, type_mark):
    """Return the weibo of the last seven days in which two users interact.

    A weibo matches when it was posted by ``uid1`` with ``directed_uid``
    equal to ``uid2``; when ``type_mark`` is "out" the opposite direction
    matches as well. The reference day is today when ``RUN_TYPE == 1`` and
    ``RUN_TEST_TIME`` otherwise; the seven days before it are searched,
    oldest first.

    Args:
        uid1: first user id.
        uid2: second user id.
        type_mark: "out" to include weibo from ``uid2`` directed at ``uid1``.

    Returns:
        A list of dicts with keys ``timestamp``, ``ip``, ``geo``, ``text``,
        ``uid``, ``uname``, ``directed_uid`` and ``directed_uname``. Names
        that cannot be resolved are reported as "unknown".
    """
    weibo_list = []
    now_ts = int(time.time())
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    # uid2uname
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(
            index=portrait_index_name,
            doc_type=portrait_index_type,
            body={"ids": [uid1, uid2]},
            _source=False,
            fields=["uid", "uname"],
        )["docs"]
    except Exception:
        portrait_result = []
    for item in portrait_result:
        uname = "unknown"
        if item.get("found"):
            try:
                uname = item["fields"]["uname"][0]
            except (KeyError, IndexError):
                uname = "unknown"
        uid2uname[item["_id"]] = uname

    # The interaction query is the same for every day, so build it once.
    query = [{"bool": {"must": [{"term": {"uid": uid1}}, {"term": {"directed_uid": int(uid2)}}]}}]
    if type_mark == "out":
        query.append({"bool": {"must": [{"term": {"uid": uid2}}, {"term": {"directed_uid": int(uid1)}}]}})
    search_body = {
        "query": {"bool": {"should": query}},
        "sort": [{"timestamp": {"order": "asc"}}],
        "size": MAX_VALUE,
    }

    # iterate the days to search the weibo list
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_date_ts - i * DAY)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        try:
            flow_text_result = es_flow_text.search(
                index=flow_text_index_name,
                doc_type=flow_text_index_type,
                body=search_body,
            )["hits"]["hits"]
        except Exception:
            # Best effort: a failed daily search contributes no weibo.
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text["_source"]
            weibo = {}
            weibo["timestamp"] = source["timestamp"]
            weibo["ip"] = source["ip"]
            # NOTE(review): other readers in this file split ``geo`` (not
            # ``text``) on "&"; kept as-is here - confirm which is intended.
            weibo["geo"] = source["geo"]
            weibo["text"] = "\t".join(source["text"].split("&"))
            weibo["uid"] = source["uid"]
            # The portrait lookup may have failed, so never index directly.
            weibo["uname"] = uid2uname.get(weibo["uid"], "unknown")
            weibo["directed_uid"] = str(source["directed_uid"])
            weibo["directed_uname"] = uid2uname.get(weibo["directed_uid"], "unknown")
            weibo_list.append(weibo)
    return weibo_list
def get_activity_weibo(task_name, submit_user, start_ts, time_segment=FOUR_HOUR):
    """Return the weibo posted by a group's members during one time segment.

    Args:
        task_name: group task name; appended to ``submit_user`` to form the
            group document id.
        submit_user: user who submitted the group task.
        start_ts: segment start (inclusive lower bound on ``timestamp``);
            also selects the daily flow-text index.
        time_segment: segment length added to ``start_ts``.

    Returns:
        A list of dicts with keys ``timestamp``, ``ip``, ``text`` and
        ``geo`` in the order returned by the timestamp-sorted search, or the
        string "task name invalid" / "task uid list null" when the group
        document or its uid list cannot be read.
    """
    results = []

    # step1: get the uid list of the group task
    task_id = submit_user + task_name
    try:
        group_result = es_group_result.get(
            index=group_index_name, doc_type=group_index_type, id=task_id,
            _source=False, fields=["uid_list"]
        )
    except Exception:
        group_result = {}
    if group_result == {}:
        return "task name invalid"
    try:
        uid_list = group_result["fields"]["uid_list"]
    except (KeyError, TypeError):
        uid_list = []
    if not uid_list:
        return "task uid list null"

    # step2: search the weibo of the time segment
    end_ts = start_ts + time_segment
    flow_text_index_name = flow_text_index_name_pre + ts2datetime(start_ts)
    query = [
        {"terms": {"uid": uid_list}},
        {"range": {"timestamp": {"gte": start_ts, "lt": end_ts}}},
    ]
    try:
        flow_text_es_result = es_flow_text.search(
            index=flow_text_index_name,
            doc_type=flow_text_index_type,
            body={"query": {"bool": {"must": query}}, "sort": "timestamp", "size": MAX_VALUE},
        )["hits"]["hits"]
    except Exception:
        # Best effort: a failed search is treated as "no weibo".
        flow_text_es_result = []

    for item in flow_text_es_result:
        source = item["_source"]
        geo = source["geo"]
        if not geo:
            geo_text = ""
        elif isinstance(geo, (list, tuple)):
            geo_text = "\t".join(geo)
        else:
            # The other flow-text readers in this file treat geo as an
            # "&"-separated string; joining the raw string would put a tab
            # between every character.
            geo_text = "\t".join(geo.split("&"))
        results.append({
            "timestamp": ts2date(source["timestamp"]),
            "ip": source["ip"],
            "text": source["text"],
            "geo": geo_text,
        })
    return results
def _bci_add_sensitive_filter(query):
    """Append the ``sensitive > 0`` range filter to ``query`` and return it."""
    query["query"]["filtered"]["filter"]["bool"]["must"].append(
        {"range": {"sensitive": {"gt": 0}}}
    )
    return query


def _bci_collect_stats(hits, sensitive):
    """Sum and maximise the retweeted/comment counters of search hits.

    When ``sensitive`` is truthy the per-weibo ``sensitive_words_dict`` JSON
    is merged into a word -> count dict as well.
    """
    stats = {
        "retweeted_total": 0,
        "comment_total": 0,
        "retweeted_top": 0,
        "comment_top": 0,
        "sensitive_words": dict(),
    }
    for hit in hits:
        source = hit["_source"]
        retweeted = source.get("retweeted", 0)
        comment = source.get("comment", 0)
        stats["retweeted_total"] += retweeted
        stats["comment_total"] += comment
        if stats["retweeted_top"] < retweeted:
            stats["retweeted_top"] = retweeted
        if stats["comment_top"] < comment:
            stats["comment_top"] = comment
        if sensitive:
            words = json.loads(source["sensitive_words_dict"])
            if words:
                merged = stats["sensitive_words"]
                for word, freq in words.items():
                    merged[word] = merged.get(word, 0) + freq
    return stats


def _bci_average(total, count):
    """Return ``total / count`` with the file's original ``/`` semantics, or 0 for no items."""
    try:
        return total / count
    except ZeroDivisionError:
        return 0


def bci_detail(date, uid, sensitive=0):
    """Summarise the retweet/comment statistics of ``uid``'s weibo on ``date``.

    Original weibo (``query_body(1, uid)``) and retweeted weibo
    (``query_body(3, uid)``) are searched in the day's flow-text index and
    their ``retweeted`` / ``comment`` counters are aggregated.

    Args:
        date: day string; appended to ``"flow_text_"`` and, with the dashes
            removed, to ``"bci_"``.
        uid: user id.
        sensitive: when truthy, only weibo with ``sensitive > 0`` are
            counted and sensitive-word statistics replace the burst/index
            fields in the result.

    Returns:
        A dict with weibo counts, totals, averages and top values for both
        weibo kinds. Without ``sensitive`` it also carries the four
        ``*_brust_average`` values and ``user_index`` read from the bci
        record (0 when missing); with ``sensitive`` it carries the sorted
        sensitive word lists and their sizes.
    """
    bci_result = dict()
    if not sensitive:
        bci_index = "bci_" + date.replace("-", "")
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci", id=uid)["_source"]
        except Exception:
            # No influence record for that day: fall back to the defaults.
            bci_result = dict()

    text_index = "flow_text_" + date

    origin_query = query_body(1, uid)
    if sensitive:
        # list.append() returns None, so the query dict itself must be sent.
        _bci_add_sensitive_filter(origin_query)
    origin_text = es_text.search(index=text_index, doc_type="text", body=origin_query)["hits"]["hits"]

    retweeted_query = query_body(3, uid)
    if sensitive:
        _bci_add_sensitive_filter(retweeted_query)
    retweeted_text = es_text.search(index=text_index, doc_type="text", body=retweeted_query)["hits"]["hits"]

    origin_weibo_number = len(origin_text)
    retweeted_weibo_number = len(retweeted_text)
    origin_stats = _bci_collect_stats(origin_text, sensitive)
    retweeted_stats = _bci_collect_stats(retweeted_text, sensitive)

    result = dict()
    result["origin_weibo_number"] = origin_weibo_number
    result["retweeted_weibo_number"] = retweeted_weibo_number
    # totals: how often the original / retweeted weibo were retweeted or commented
    result["origin_weibo_retweeted_total_number"] = origin_stats["retweeted_total"]
    result["origin_weibo_comment_total_number"] = origin_stats["comment_total"]
    result["retweeted_weibo_retweeted_total_number"] = retweeted_stats["retweeted_total"]
    result["retweeted_weibo_comment_total_number"] = retweeted_stats["comment_total"]
    # averages per weibo of each kind
    result["origin_weibo_retweeted_average_number"] = _bci_average(
        origin_stats["retweeted_total"], origin_weibo_number
    )
    result["origin_weibo_comment_average_number"] = _bci_average(
        origin_stats["comment_total"], origin_weibo_number
    )
    result["retweeted_weibo_retweeted_average_number"] = _bci_average(
        retweeted_stats["retweeted_total"], retweeted_weibo_number
    )
    result["retweeted_weibo_comment_average_number"] = _bci_average(
        retweeted_stats["comment_total"], retweeted_weibo_number
    )
    # highest single-weibo values
    result["origin_weibo_retweeted_top_number"] = origin_stats["retweeted_top"]
    result["origin_weibo_comment_top_number"] = origin_stats["comment_top"]
    result["retweeted_weibo_retweeted_top_number"] = retweeted_stats["retweeted_top"]
    result["retweeted_weibo_comment_top_number"] = retweeted_stats["comment_top"]

    if not sensitive:
        # burst averages and the overall index come from the bci record
        result["origin_weibo_comment_brust_average"] = bci_result.get(
            "origin_weibo_comment_brust_average", 0
        )
        result["origin_weibo_retweeted_brust_average"] = bci_result.get(
            "origin_weibo_retweeted_brust_average", 0
        )
        result["retweeted_weibo_comment_brust_average"] = bci_result.get(
            "retweeted_weibo_comment_brust_average", 0
        )
        result["retweeted_weibo_retweeted_brust_average"] = bci_result.get(
            "retweeted_weibo_retweeted_brust_average", 0
        )
        result["user_index"] = bci_result.get("user_index", 0)
    else:
        retweeted_words = retweeted_stats["sensitive_words"]
        origin_words = origin_stats["sensitive_words"]
        result["retweeted_sensitive_words_list"] = sorted(
            retweeted_words.items(), key=lambda x: x[1], reverse=True
        )
        result["origin_sensitive_words_list"] = sorted(
            origin_words.items(), key=lambda x: x[1], reverse=True
        )
        result["retweeted_sensitive_words_number"] = len(retweeted_words)
        result["origin_sensitive_words_number"] = len(origin_words)
    return result
def full_text_search(keywords, uid, start_time, end_time, size):
    """Search weibo text by keywords and/or user over a range of days.

    Args:
        keywords: comma-separated words; every word must occur in ``text``
            (wildcard match). Falsy to skip keyword filtering.
        uid: restrict the search to this user when truthy.
        start_time: first day of the range. When falsy no index is selected
            and the empty result is returned.
        end_time: last day of the range.
        size: maximum number of hits requested.

    Returns:
        ``[[], []]`` when none of the daily indices exists; otherwise the
        tuple ``(results, user_profile_list)``. Each ``results`` row is
        ``[uid, user_info[i][1], text, date, geo, sensitive_words,
        retweeted, comment]``. Each ``user_profile_list`` row is the row
        returned by ``get_user_profile`` (first occurrence per user)
        extended with the normalised bci value, the normalised sensitive
        value and "1"/"0" for portrait membership.
    """
    results = []
    user_profile_list = []
    query_body = {
        "query": {
            "bool": {
                "must": []
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": 'desc'}}
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": 'desc'}}
    if uid:
        query_body["query"]["bool"]["must"].append({"term": {"uid": uid}})
    if keywords:
        for word in keywords.split(','):
            query_body["query"]["bool"]["must"].append(
                {'wildcard': {'text': {'wildcard': '*' + word + '*'}}})

    # Collect the existing daily indices, newest first.
    index_list = []
    if start_time:
        start_ts = datetime2ts(start_time)
        ts = datetime2ts(end_time)
        # `>=` instead of an equality test so the loop also terminates when
        # start_time is after end_time or the two are not whole days apart.
        while ts >= start_ts:
            index_name = "flow_text_" + ts2datetime(ts)
            if es_flow_text.indices.exists(index=index_name):
                index_list.append(index_name)
            ts -= 3600 * 24
    print(index_list)
    # no usable index
    if not index_list:
        return [[], []]

    search_results = es_flow_text.search(index=index_list, doc_type="text",
                                         body=query_body)["hits"]["hits"]
    uid_list = [hit['_source']['uid'] for hit in search_results]
    if not uid_list:
        print("len search:  %s" % len(search_results))
        return results, user_profile_list

    history_max = get_history_max()
    personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
    user_info = get_user_profile(uid_list, personal_field)
    bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci",
                                        body={"ids": uid_list}, _source=False,
                                        fields=["bci_day_last"])["docs"]
    in_portrait = es_user_portrait.mget(index="sensitive_user_portrait", doc_type="user",
                                        body={"ids": uid_list}, _source=False)["docs"]
    sensitive_results = es_sensitive_history.mget(index="sensitive_history", doc_type="sensitive",
                                                  body={"ids": uid_list}, _source=False,
                                                  fields=["last_value"])["docs"]
    print("len search:  %s" % len(search_results))

    # uid uname text date geo sensitive_words retweeted comment
    for position, hit in enumerate(search_results):
        source = hit['_source']
        iter_item = []
        iter_item.append(source['uid'])
        iter_item.append(user_info[position][1])
        iter_item.append(source['text'])
        iter_item.append(ts2date(source['timestamp']))
        iter_item.append(source['geo'])
        if source.get("sensitive_words_string", ''):
            iter_item.append(source['sensitive_words_string'].split('&'))
        else:
            iter_item.append([])
        iter_item.append(source.get('retweeted', 0))
        iter_item.append(source.get('comment', 0))
        results.append(iter_item)

    # uid "nick_name", "fansnum", "statusnum", "user_location", bci, sensitive
    # The mget answers are positional (one doc per entry of uid_list), so the
    # position must advance with every profile row, including skipped
    # duplicates. Assumes get_user_profile returns one row per uid_list entry
    # in order, as the hit loop above already relies on.
    user_set = set()
    for position, item in enumerate(user_info):
        if item[0] in user_set:
            continue
        user_set.add(item[0])

        bci_doc = bci_results[position]
        if bci_doc["found"]:
            if "fields" in bci_doc:
                bci_value = bci_doc["fields"]["bci_day_last"][0]
            else:
                bci_value = 0
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)

        sensitive_doc = sensitive_results[position]
        if sensitive_doc["found"]:
            if "fields" in sensitive_doc:
                sensitive_value = sensitive_doc['fields']['last_value'][0]
            else:
                sensitive_value = 0
            item.append(normalize_index(sensitive_value, history_max["max_sensitive"]))
        else:
            item.append(0)

        if in_portrait[position]["found"]:
            item.append("1")
        else:
            item.append("0")
        user_profile_list.append(item)
    return results, user_profile_list
def get_social_inter_content(uid1, uid2, type_mark):
    """Return the weibo of the last seven days in which two users interact.

    A weibo matches when it was posted by ``uid1`` with ``directed_uid``
    equal to ``uid2``; when ``type_mark`` is 'out' the opposite direction
    matches as well. The reference day is today when ``RUN_TYPE == 1`` and
    ``RUN_TEST_TIME`` otherwise; the seven days before it are searched,
    oldest first.

    Args:
        uid1: first user id.
        uid2: second user id.
        type_mark: 'out' to include weibo from ``uid2`` directed at ``uid1``.

    Returns:
        A list of dicts with keys ``timestamp``, ``ip``, ``geo``, ``text``,
        ``uid``, ``uname``, ``directed_uid`` and ``directed_uname``. Names
        that cannot be resolved are reported as 'unknown'.
    """
    weibo_list = []
    now_ts = int(time.time())
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    #uid2uname
    uid2uname = {}
    try:
        portrait_result = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type,
            body={'ids': [uid1, uid2]}, _source=False, fields=['uid', 'uname'])['docs']
    except Exception:
        portrait_result = []
    for item in portrait_result:
        uname = 'unknown'
        if item.get('found'):
            try:
                uname = item['fields']['uname'][0]
            except (KeyError, IndexError):
                uname = 'unknown'
        uid2uname[item['_id']] = uname

    # The interaction query is the same for every day, so build it once.
    query = [{'bool': {'must': [{'term': {'uid': uid1}},
                                {'term': {'directed_uid': int(uid2)}}]}}]
    if type_mark == 'out':
        query.append({'bool': {'must': [{'term': {'uid': uid2}},
                                        {'term': {'directed_uid': int(uid1)}}]}})
    search_body = {
        'query': {'bool': {'should': query}},
        'sort': [{'timestamp': {'order': 'asc'}}],
        'size': MAX_VALUE,
    }

    #iter date to search weibo list
    for i in range(7, 0, -1):
        iter_date = ts2datetime(now_date_ts - i * DAY)
        flow_text_index_name = flow_text_index_name_pre + str(iter_date)
        try:
            flow_text_result = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body=search_body)['hits']['hits']
        except Exception:
            # Best effort: a failed daily search contributes no weibo.
            flow_text_result = []
        for flow_text in flow_text_result:
            source = flow_text['_source']
            weibo = {}
            weibo['timestamp'] = source['timestamp']
            weibo['ip'] = source['ip']
            # NOTE(review): other readers in this file split ``geo`` (not
            # ``text``) on '&'; kept as-is here - confirm which is intended.
            weibo['geo'] = source['geo']
            weibo['text'] = '\t'.join(source['text'].split('&'))
            weibo['uid'] = source['uid']
            # The portrait lookup may have failed, so never index directly.
            weibo['uname'] = uid2uname.get(weibo['uid'], 'unknown')
            weibo['directed_uid'] = str(source['directed_uid'])
            weibo['directed_uname'] = uid2uname.get(weibo['directed_uid'], 'unknown')
            weibo_list.append(weibo)
    return weibo_list
def get_influence_content(uid, timestamp_from, timestamp_to):
    """Return the weibo posted by ``uid`` within ``[timestamp_from, timestamp_to)``.

    The range is split into one sub-range per calendar day because the
    flow-text data lives in one index per day; each day is searched
    separately and the hits of all days are combined.

    Args:
        uid: user id to filter on.
        timestamp_from: inclusive start timestamp.
        timestamp_to: exclusive end timestamp.

    Returns:
        A list of dicts with keys ``timestamp``, ``ip``, ``text`` and
        ``geo``, day by day in ascending timestamp order. Empty when the
        range is empty or reversed.
    """
    weibo_list = []

    #split timestamp range to new_range_dict_list (one range filter per day)
    from_date_ts = datetime2ts(ts2datetime(timestamp_from))
    to_date_ts = datetime2ts(ts2datetime(timestamp_to))
    new_range_dict_list = []
    iter_date_ts = from_date_ts
    # `<=` so that the (possibly partial) last day is searched as well.
    while iter_date_ts <= to_date_ts:
        range_gte = max(iter_date_ts, timestamp_from)
        range_lt = min(iter_date_ts + DAY, timestamp_to)
        if range_gte < range_lt:
            new_range_dict_list.append(
                {'range': {'timestamp': {'gte': range_gte, 'lt': range_lt}}})
        iter_date_ts += DAY

    #iter date to search flow_text and collect the weibo of every day
    # NOTE(review): no 'size' is passed, so Elasticsearch applies its default
    # page size to each daily search - confirm that this is intended.
    for range_item in new_range_dict_list:
        range_from_date = ts2datetime(range_item['range']['timestamp']['gte'])
        flow_text_index_name = flow_text_index_name_pre + range_from_date
        query = [{'term': {'uid': uid}}, range_item]
        try:
            day_hits = es_flow_text.search(
                index=flow_text_index_name, doc_type=flow_text_index_type,
                body={'query': {'bool': {'must': query}}, 'sort': [{'timestamp': 'asc'}]}
            )['hits']['hits']
        except Exception:
            # Best effort: a failed daily search contributes no weibo.
            day_hits = []

        for item in day_hits:
            source = item['_source']
            weibo = {}
            weibo['timestamp'] = ts2date(source['timestamp'])
            weibo['ip'] = source['ip']
            weibo['text'] = source['text']
            if source['geo']:
                weibo['geo'] = '\t'.join(source['geo'].split('&'))
            else:
                weibo['geo'] = ''
            weibo_list.append(weibo)
    return weibo_list
def get_origin_weibo_detail(ts, user, task_name, size, order, message_type=1):
    """Return the detailed weibo rows of a sensing task, grouped by duplicates.

    The task document ``ts`` of type ``user + '-' + task_name`` provides the
    per-mid counters; the weibo text is fetched from the flow-text indices
    of the day of ``ts`` and the day before, and the author names/photos
    from the profile index.

    Args:
        ts: timestamp identifying the task document and the days searched.
        user: task owner; part of the task document type.
        task_name: task name; part of the task document type.
        size: accepted but not used by this function.
        order: "total", "retweeted" or "comment" pre-sorts the mids by that
            counter; any other value keeps the stored order.
        message_type: 1 reads ``origin_weibo_detail``, 2 reads
            ``retweeted_weibo_detail``, anything else reads
            ``sensitive_weibo_detail``.

    Returns:
        A list of groups. Each group is a list of rows
        ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type, retweeted, comment, sensitive, timestamp,
        mid_topic_value, mid]``; weibo recorded as duplicates of each other
        share one group, headed by the best-ranked of them.
    """
    _id = user + '-' + task_name
    task_detail = es_user_portrait.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    mid_value = json.loads(task_detail['mid_topic_value'])
    duplicate_dict = json.loads(task_detail['duplicate_dict'])

    # Invert duplicate -> representative into representative -> group of mids.
    duplicate_groups = dict()
    for duplicate_mid, representative_mid in duplicate_dict.items():
        if representative_mid in duplicate_groups:
            duplicate_groups[representative_mid].append(duplicate_mid)
        else:
            duplicate_groups[representative_mid] = [duplicate_mid, representative_mid]

    if message_type == 1:
        weibo_detail = json.loads(task_detail['origin_weibo_detail'])
    elif message_type == 2:
        weibo_detail = json.loads(task_detail['retweeted_weibo_detail'])
    else:
        weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])
    weibo_detail = weibo_detail or {}

    # rows of [mid, total, retweeted, comment]
    weibo_detail_list = []
    for iter_mid, item in weibo_detail.items():
        weibo_detail_list.append([iter_mid, item[iter_mid], item['retweeted'], item['comment']])
    mid_list = list(weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        "size": 1000,
        "sort": {"timestamp": {"order": "desc"}}
    }

    # Search the index of the task day and of the day before, when present.
    index_list = []
    index_name = flow_text_index_name_pre + ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + ts2datetime(ts - DAY)
    if es_text.indices.exists(index_name):
        index_list.append(index_name)
    if es_text.indices.exists(index_name_1):
        index_list.append(index_name_1)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()  # text information, keyed by mid (the document _id)
    portrait_dict = dict()  # profile information, keyed by uid
    for item in search_results:
        uid_list.append(item["_source"]['uid'])
        text_dict[item['_id']] = item['_source']
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for item in portrait_result:
            if item['found']:
                portrait_dict[item['_id']] = {"nick_name": item["fields"]["nick_name"][0],
                                              "photo_url": item["fields"]["photo_url"][0]}
            else:
                portrait_dict[item['_id']] = {"nick_name": item['_id'], "photo_url": ""}

    if order == "total":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[1], reverse=True)
    elif order == "retweeted":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[2], reverse=True)
    elif order == "comment":
        sorted_list = sorted(weibo_detail_list, key=lambda x: x[3], reverse=True)
    else:
        sorted_list = weibo_detail_list

    results = []
    for item in sorted_list:
        mid = item[0]
        iter_text = text_dict.get(mid, {})
        if not iter_text:
            # The weibo text was not found in the searched indices.
            continue
        # uid, nick_name, photo_url, text, sentiment, timestamp, geo, message_type, ...
        temp = []
        uid = iter_text['uid']
        temp.append(uid)
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            temp.append(iter_portrait['nick_name'])
            temp.append(iter_portrait['photo_url'])
        else:
            temp.extend([uid, ''])
        temp.append(iter_text["text"])
        temp.append(iter_text["sentiment"])
        temp.append(ts2date(iter_text['timestamp']))
        temp.append(iter_text['geo'])
        if message_type == 1:
            temp.append(1)
        elif message_type == 2:
            temp.append(3)
        else:
            temp.append(iter_text['message_type'])
        temp.append(item[2])
        temp.append(item[3])
        temp.append(iter_text.get('sensitive', 0))
        temp.append(iter_text['timestamp'])
        temp.append(mid_value[mid])
        temp.append(mid)
        results.append(temp)

    # Rank by sensitive value (-4), then topic value (-2), then retweeted (-6).
    results = sorted(results, key=operator.itemgetter(-4, -2, -6), reverse=True)

    sort_results = [[row] for row in results]
    mid_index_dict = dict()
    for position, row in enumerate(results):
        mid_index_dict[row[-1]] = position

    # Merge duplicates into the best-ranked member of their group. Membership
    # is tested explicitly because position 0 is a valid (falsy) position.
    merged_positions = set()
    for group in duplicate_groups.values():
        positions = []
        for mid in group:
            position = mid_index_dict.get(mid)
            if position is not None and position not in positions:
                positions.append(position)
        if len(positions) < 2:
            continue
        head = min(positions)
        for position in positions:
            if position == head:
                continue
            sort_results[head].extend(sort_results[position])
            merged_positions.add(position)

    # Drop the merged groups by position rather than by value equality.
    return [group for position, group in enumerate(sort_results)
            if position not in merged_positions]
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Profile the users who retweeted or commented on one weibo of ``uid``.

    Args:
        uid: author of the weibo identified by ``mid``.
        mid: id of the weibo inside the flow-text index of ``date``.
        influence_style: 0 selects retweets (message_type 3); any other value
            selects comments (message_type 2).
        date: day string appended to ``pre_text_index``; with its dashes
            removed it also names the ``bci_`` index.
        default_number: cap on how many in-portrait and out-of-portrait uids
            are handed to ``get_user_url``.

    Returns:
        dict with two keys. ``influence_users`` is
        ``[in_portrait_url, out_portrait_url]``. ``influence_distribution``
        holds the top-5 ``domian`` (sic, key kept for callers), ``topic`` and
        ``geo`` proportions, the average ``influence`` and the in/out
        portrait user counts.

    Exceptions raised by ``es.get`` (e.g. unknown ``mid``) are not caught.
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text, doc_type=flow_text_index_type,
                         id=mid)["_source"]

    # A non-empty root_mid means the weibo is itself a retweet; the influenced
    # users are then looked up through the root weibo instead of ``mid``.
    root_mid = text_result.get("root_mid", '')
    is_origin = not root_mid
    wants_retweets = int(influence_style) == 0

    # Only "retweets of an original weibo" match on root_uid; the three other
    # combinations match on directed_uid.
    if is_origin and wants_retweets:
        user_term = {"term": {"root_uid": uid}}
    else:
        user_term = {"term": {"directed_uid": uid}}
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            user_term,
                            {"term": {"message_type": 3 if wants_retweets else 2}},
                            {"term": {"root_mid": mid if is_origin else root_mid}},
                        ]
                    }
                }
            }
        },
        "size": 30000
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": "desc"}}

    search_results = es.search(index=index_flow_text,
                               doc_type=flow_text_index_type,
                               body=query_body, _source=False, fields=["uid"],
                               timeout=30)["hits"]["hits"]

    # Distinct influenced uids, excluding the author.
    influenced = set()
    for hit in search_results:
        hit_uid = hit["fields"]["uid"][0]
        if int(hit_uid) != int(uid):
            influenced.add(hit_uid)
    results = list(influenced)

    portrait_results = []
    bci_results = []
    if results:
        portrait_results = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type,
            body={"ids": results},
            fields=["domain", "topic_string", "activity_geo_dict",
                    "importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(index=bci_index, doc_type='bci',
                                      body={"ids": results},
                                      fields=['user_index'])['docs']

    total_influence = 0
    for item in bci_results:
        if item['found']:
            total_influence += item['fields']['user_index'][0]
    # Averaged over every influenced user, including those without a bci doc.
    average_influence = total_influence / len(results) if results else 0

    in_portrait = []    # [uid, importance] of users present in the portrait index
    out_portrait = []   # uids absent from the portrait index
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    for item in portrait_results:
        if not item["found"]:
            out_portrait.append(item['_id'])
            continue
        fields = item["fields"]
        in_portrait.append([item['_id'], fields["importance"][0]])
        # Only the last entry of the geo history is counted; an empty history
        # contributes nothing instead of raising IndexError.
        geo_history = json.loads(fields["activity_geo_dict"][0])
        latest_geo = geo_history[-1].keys() if geo_history else []
        retweeted_domain = aggregation(fields["domain"][0].split('&'),
                                       retweeted_domain)
        retweeted_topic = aggregation(fields["topic_string"][0].split('&'),
                                      retweeted_topic)
        retweeted_geo = aggregation(latest_geo, retweeted_geo)
    if portrait_results:
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)

    def _top5(distribution):
        """Return the five largest (key, share) pairs."""
        return sorted(distribution.items(), key=lambda x: x[1],
                      reverse=True)[:5]

    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)
    in_portrait_uids = [entry[0] for entry in in_portrait]

    retweeted_results = dict()
    retweeted_results["domian"] = _top5(retweeted_domain)
    retweeted_results["topic"] = _top5(retweeted_topic)
    retweeted_results["geo"] = _top5(retweeted_geo)
    retweeted_results["influence"] = average_influence
    retweeted_results['in_portrait_number'] = len(in_portrait_uids)
    retweeted_results['out_portrait_number'] = len(out_portrait)

    in_portrait_url = get_user_url(in_portrait_uids[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results
def get_positive_weibo_detail(ts, social_sensors, keywords_list, size, sentiment_type=1):
    """Return up to 100 recent weibo of one sentiment class in the last window.

    The window is ``[ts - time_interval, ts)``. Candidate weibo are those whose
    ``root_mid``/``mid`` is in the mid lists returned by ``query_mid_list`` or
    whose ``keywords_string`` matches ``keywords_list``.

    Args:
        ts: end timestamp of the window.
        social_sensors: forwarded to ``query_mid_list``.
        keywords_list: keywords to match and to intersect with each weibo.
        size: accepted for interface compatibility; the query is capped at
            100 hits regardless of this value.
        sentiment_type: 0 or 1 filters on exactly that sentiment; any other
            value filters on sentiments "2" and "3".

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, common_keywords, message_type]``, newest first.
    """
    window_start = ts - time_interval
    # mids seen in the previous segment followed by mids of the current window
    mid_list = []
    mid_list.extend(query_mid_list(window_start, keywords_list, time_segment,
                                   social_sensors))
    mid_list.extend(query_mid_list(ts, keywords_list, time_interval,
                                   social_sensors))

    if int(sentiment_type) in (0, 1):
        sentiment_filter = {"term": {"sentiment": sentiment_type}}
    else:
        sentiment_filter = {"terms": {"sentiment": ["2", "3"]}}

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        # The sentiment filter is added next to the time range.
                        # Previously the "2"/"3" case overwrote this list and
                        # silently dropped the timestamp restriction.
                        "must": [
                            {"range": {
                                "timestamp": {
                                    "gte": window_start,
                                    "lt": ts
                                }
                            }},
                            sentiment_filter,
                        ],
                        "should": [
                            {"terms": {"root_mid": mid_list}},
                            {"terms": {"mid": mid_list}},
                            {"terms": {"keywords_string": keywords_list}}
                        ]
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    # The index queried is always the one for the day the window starts on:
    # when both ends fall on the same day the two names are identical, and
    # otherwise the original code picked the start-day index as well.
    index_name = flow_text_index_name_pre + ts2datetime(window_start)
    if es_text.indices.exists(index_name):
        search_results = es_text.search(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    results = []
    if not search_results:
        return results

    uid_list = [hit["_source"]['uid'] for hit in search_results]
    # mget answers in request order, so docs line up with search_results.
    portrait_result = es_profile.mget(index=profile_index_name,
                                      doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    wanted_keywords = set(keywords_list)
    for hit, portrait in zip(search_results, portrait_result):
        item = hit['_source']
        if portrait['found']:
            nick_name = portrait["fields"]["nick_name"][0]
            photo_url = portrait["fields"]["photo_url"][0]
        else:
            nick_name = "unknown"
            photo_url = ""
        common_keywords = wanted_keywords & set(item['keywords_string'].split('&'))
        results.append([
            item['uid'],
            nick_name,
            photo_url,
            item["text"],
            item["sentiment"],
            ts2date(item['timestamp']),
            item['geo'],
            list(common_keywords),
            item['message_type'],
        ])
    return results
def get_retweet_weibo_detail(ts, user, task_name, size, text_type, type_value):
    """Return up to 100 recent weibo that stem from a sensing task's weibo.

    The task document (doc_type ``user + '-' + task_name``, id ``ts``) lists
    original and retweeted weibo; this fetches weibo in
    ``[ts - time_interval, ts)`` whose ``root_mid`` is one of them.

    Args:
        ts: end timestamp of the window and id of the task document.
        user: task owner, first half of the task doc_type.
        task_name: task name, second half of the task doc_type.
        size: accepted for interface compatibility; the query is capped at
            100 hits regardless of this value.
        text_type: ``"message_type"`` or ``"sentiment"`` adds a filter on that
            field; any other value adds none.
        type_value: value to filter on. For ``"sentiment"`` a value of length
            1 becomes a ``term`` filter, anything longer a ``terms`` filter.

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type]``, newest first.
    """
    task_detail = es_user_portrait.get(index=index_sensing_task,
                                       doc_type=user + '-' + task_name,
                                       id=ts)['_source']
    mid_list = []
    mid_list.extend(json.loads(task_detail['origin_weibo_detail']).keys())
    mid_list.extend(json.loads(task_detail['retweeted_weibo_detail']).keys())

    window_start = ts - time_interval
    must = [
        {"range": {
            "timestamp": {
                "gte": window_start,
                "lt": ts
            }
        }},
        {"terms": {"root_mid": mid_list}}
    ]
    if text_type == "message_type":
        must.append({"term": {text_type: type_value}})
    elif text_type == "sentiment":
        if len(type_value) == 1:
            must.append({"term": {text_type: type_value}})
        else:
            must.append({"terms": {text_type: type_value}})
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": must
                    }
                }
            }
        },
        "sort": {"timestamp": {"order": "desc"}},
        "size": 100
    }

    # 1. Query the weibo. The index used is always the one for the day the
    # window starts on: if both ends share a day the names coincide, and
    # otherwise the original code chose the start-day index as well.
    index_name = flow_text_index_name_pre + ts2datetime(window_start)
    if es_text.indices.exists(index_name):
        search_results = es_text.search(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    # 2. Attach profile information to every weibo.
    results = []
    if not search_results:
        return results

    uid_list = [hit["_source"]['uid'] for hit in search_results]
    # mget answers in request order, so docs line up with search_results.
    portrait_result = es_profile.mget(index=profile_index_name,
                                      doc_type=profile_index_type,
                                      body={"ids": uid_list},
                                      fields=['nick_name', 'photo_url'])["docs"]
    for hit, portrait in zip(search_results, portrait_result):
        item = hit['_source']
        if portrait['found']:
            nick_name = portrait["fields"]["nick_name"][0]
            photo_url = portrait["fields"]["photo_url"][0]
        else:
            # No profile: show the uid in place of the nick name.
            nick_name = item['uid']
            photo_url = ""
        results.append([
            item['uid'],
            nick_name,
            photo_url,
            item["text"],
            item["sentiment"],
            ts2date(item['timestamp']),
            item['geo'],
            item["message_type"],
        ])
    return results
def bci_detail(date, uid, sensitive=0):
    """Summarise retweet/comment counts of one user's weibo for one day.

    Args:
        date: day string; names the ``flow_text_`` index and, with dashes
            removed, the ``bci_`` index.
        uid: user whose original (message_type 1) and retweeted
            (message_type 3) weibo are summarised.
        sensitive: when truthy only weibo with ``sensitive > 0`` are counted
            and sensitive-word statistics replace the bci fields.

    Returns:
        dict with weibo counts, retweet/comment totals, averages and maxima
        for original and retweeted weibo. When ``sensitive`` is falsy it also
        carries the four ``*_brust_average`` values and ``user_index`` from
        the bci document (0 when that document is unavailable); otherwise it
        carries the sorted sensitive-word lists and their sizes.
    """
    bci_result = dict()
    if not sensitive:
        bci_index = "bci_" + date.replace('-', '')
        try:
            bci_result = es_bci.get(index=bci_index, doc_type="bci",
                                    id=uid)['_source']
        except Exception:
            # Best effort: without a bci document the bci fields default to 0.
            bci_result = dict()

    text_index = "flow_text_" + date

    def _search_texts(message_type):
        """Return the user's hits of ``message_type`` for the day."""
        query = query_body(message_type, uid)
        if sensitive:
            # list.append() returns None, so the filter is added in place and
            # the query dict itself is what gets sent to Elasticsearch.
            query["query"]["filtered"]["filter"]["bool"]["must"].append(
                {"range": {"sensitive": {"gt": 0}}})
        return es_text.search(index=text_index, doc_type="text",
                              body=query)["hits"]["hits"]

    def _summarize(hits):
        """Return (retweet_total, comment_total, retweet_top, comment_top, words)."""
        retweet_total = 0
        comment_total = 0
        retweet_top = 0
        comment_top = 0
        words = dict()
        for hit in hits:
            source = hit['_source']
            retweeted = source.get('retweeted', 0)
            comment = source.get('comment', 0)
            retweet_total += retweeted
            comment_total += comment
            if retweet_top < retweeted:
                retweet_top = retweeted
            if comment_top < comment:
                comment_top = comment
            if sensitive:
                hit_words = json.loads(source['sensitive_words_dict'])
                if hit_words:
                    for word, weight in hit_words.items():
                        words[word] = words.get(word, 0) + weight
        return retweet_total, comment_total, retweet_top, comment_top, words

    def _average(total, number):
        """Return total / number, or 0 when there is nothing to average."""
        return total / number if number else 0

    origin_text = _search_texts(1)
    retweeted_text = _search_texts(3)
    origin_weibo_number = len(origin_text)
    retweeted_weibo_number = len(retweeted_text)

    (origin_retweet_total_number, origin_comment_total_number,
     origin_retweet_top_number, origin_comment_top_number,
     origin_sensitive_words_dict) = _summarize(origin_text)
    (retweet_retweet_total_number, retweet_comment_total_number,
     retweet_retweet_top_number, retweet_comment_top_number,
     retweeted_sensitive_words_dict) = _summarize(retweeted_text)

    result = dict()
    result["origin_weibo_number"] = origin_weibo_number
    result["retweeted_weibo_number"] = retweeted_weibo_number
    result["origin_weibo_retweeted_total_number"] = origin_retweet_total_number
    result["origin_weibo_comment_total_number"] = origin_comment_total_number
    result["retweeted_weibo_retweeted_total_number"] = retweet_retweet_total_number
    result["retweeted_weibo_comment_total_number"] = retweet_comment_total_number
    result["origin_weibo_retweeted_average_number"] = _average(
        origin_retweet_total_number, origin_weibo_number)
    result["origin_weibo_comment_average_number"] = _average(
        origin_comment_total_number, origin_weibo_number)
    result["retweeted_weibo_retweeted_average_number"] = _average(
        retweet_retweet_total_number, retweeted_weibo_number)
    result["retweeted_weibo_comment_average_number"] = _average(
        retweet_comment_total_number, retweeted_weibo_number)
    result["origin_weibo_retweeted_top_number"] = origin_retweet_top_number
    result["origin_weibo_comment_top_number"] = origin_comment_top_number
    result["retweeted_weibo_retweeted_top_number"] = retweet_retweet_top_number
    result["retweeted_weibo_comment_top_number"] = retweet_comment_top_number

    if not sensitive:
        # "brust" is the spelling used by the stored bci documents and callers.
        result["origin_weibo_comment_brust_average"] = bci_result.get(
            "origin_weibo_comment_brust_average", 0)
        result["origin_weibo_retweeted_brust_average"] = bci_result.get(
            "origin_weibo_retweeted_brust_average", 0)
        result["retweeted_weibo_comment_brust_average"] = bci_result.get(
            'retweeted_weibo_comment_brust_average', 0)
        result["retweeted_weibo_retweeted_brust_average"] = bci_result.get(
            'retweeted_weibo_retweeted_brust_average', 0)
        result['user_index'] = bci_result.get('user_index', 0)
    else:
        result["retweeted_sensitive_words_list"] = sorted(
            retweeted_sensitive_words_dict.items(),
            key=lambda x: x[1],
            reverse=True)
        result["origin_sensitive_words_list"] = sorted(
            origin_sensitive_words_dict.items(),
            key=lambda x: x[1],
            reverse=True)
        result["retweeted_sensitive_words_number"] = len(
            retweeted_sensitive_words_dict)
        result["origin_sensitive_words_number"] = len(
            origin_sensitive_words_dict)
    return result
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return display rows for the sensitive weibo recorded by a sensing task.

    NOTE: this file defines ``get_sensitive_text_detail`` a second time
    further down; the later definition is the one bound at import time.

    Args:
        task_name: task name, second half of the task doc_type.
        ts: id of the task document; also selects the flow-text indices of
            that day and the day before.
        user: task owner, first half of the task doc_type.
        order: ``"total"``, ``"retweeted"`` or ``"comment"`` sorts by the
            task's stored counts; ``"ts"`` sorts by the formatted date;
            ``"sensitive"`` sorts by the sensitive score. Descending in all
            cases; any other value keeps the task's own order.

    Returns:
        list of rows ``[uid, nick_name, photo_url, text, sentiment, date,
        geo, message_type, retweeted, comment, sensitive]``. Weibo whose
        text is not found in the searched indices are omitted.
    """
    task_detail = es.get(index=index_sensing_task,
                         doc_type=user + '-' + task_name,
                         id=ts)['_source']
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail'])

    # Rows of [mid, total, retweeted, comment]; each entry stores its total
    # count under its own mid.
    weibo_detail_list = [
        [iter_mid, counts[iter_mid], counts['retweeted'], counts['comment']]
        for iter_mid, counts in weibo_detail.items()
    ]
    mid_list = list(weibo_detail.keys())

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {"mid": mid_list}
                }
            }
        },
        # Without an explicit size Elasticsearch returns only 10 hits, which
        # silently dropped every weibo beyond the first ten.
        "size": len(mid_list)
    }

    index_list = []
    for day in (ts2datetime(ts), ts2datetime(ts - DAY)):
        index_name = flow_text_index_name_pre + day
        if es_text.indices.exists(index_name):
            index_list.append(index_name)

    if index_list and mid_list:
        search_results = es_text.search(index=index_list,
                                        doc_type=flow_text_index_type,
                                        body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = dict()      # mid -> weibo source (the hit _id is the mid)
    portrait_dict = dict()  # uid -> {"nick_name", "photo_url"}
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name,
                                          doc_type=profile_index_type,
                                          body={"ids": uid_list},
                                          fields=['nick_name', 'photo_url'])["docs"]
        for doc in portrait_result:
            if doc['found']:
                portrait_dict[doc['_id']] = {
                    "nick_name": doc["fields"]["nick_name"][0],
                    "photo_url": doc["fields"]["photo_url"][0]
                }
            else:
                # No profile: show the uid in place of the nick name.
                portrait_dict[doc['_id']] = {
                    "nick_name": doc['_id'],
                    "photo_url": ""
                }

    sort_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if sort_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list,
                             key=lambda x: x[sort_column], reverse=True)

    results = []
    for item in sorted_list:
        iter_text = text_dict.get(item[0], {})
        if not iter_text:
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            nick_name = iter_portrait['nick_name']
            photo_url = iter_portrait['photo_url']
        else:
            nick_name = uid
            photo_url = ''
        results.append([
            uid,
            nick_name,
            photo_url,
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            iter_text['message_type'],
            item[2],
            item[3],
            iter_text.get('sensitive', 0),
        ])

    if results and order == "ts":
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if results and order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)
    return results
def search_group_sentiment_weibo(task_name, submit_user, start_ts, sentiment):
    """Return one day of weibo with a given sentiment for a group task's users.

    Args:
        task_name: group task name; ``submit_user + task_name`` is the task id.
        submit_user: user who submitted the task.
        start_ts: start timestamp; weibo in ``[start_ts, start_ts + DAY)``
            are read from the flow-text index of that day.
        sentiment: sentiment to filter on. The string ``'2'`` selects every
            value in ``SENTIMENT_SECOND``; anything else is matched exactly.

    Returns:
        list of dicts with ``uid``, ``uname``, ``ip``, ``geo``, ``text``,
        ``timestamp`` and ``sentiment``, oldest first; or the string
        ``'task name invalid'`` / ``'task uid list null'`` when the task or
        its uid list cannot be read. A failed weibo search yields ``[]``.
    """
    # step1: get the uid list of the task
    task_id = submit_user + task_name
    try:
        group_result = es_group_result.get(index=group_index_name,
                                           doc_type=group_index_type,
                                           id=task_id, _source=False,
                                           fields=['uid_list'])
    except Exception:
        group_result = {}
    if not group_result:
        return 'task name invalid'
    uid_list = group_result.get('fields', {}).get('uid_list', [])
    if not uid_list:
        return 'task uid list null'

    # step2: map uid -> uname for the users present in the portrait index
    uid2uname = {}
    try:
        user_portrait_result = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type,
            body={'ids': uid_list}, _source=False, fields=['uname'])['docs']
    except Exception:
        # Best effort: names fall back to the uid below.
        user_portrait_result = []
    for item in user_portrait_result:
        if item['found'] == True:
            uid2uname[item['_id']] = item['fields']['uname'][0]

    # step3: build the query for the requested day
    flow_text_index_name = flow_text_index_name_pre + str(ts2datetime(start_ts))
    if sentiment != '2':
        sentiment_filter = {'term': {'sentiment': sentiment}}
    else:
        sentiment_filter = {'terms': {'sentiment': SENTIMENT_SECOND}}
    query_body = [
        {'terms': {'uid': uid_list}},
        sentiment_filter,
        {'range': {'timestamp': {'gte': start_ts, 'lt': start_ts + DAY}}},
    ]
    try:
        flow_text_result = es_flow_text.search(
            index=flow_text_index_name, doc_type=flow_text_index_type,
            body={'query': {'bool': {'must': query_body}},
                  'sort': [{'timestamp': {'order': 'asc'}}],
                  'size': MAX_VALUE})['hits']['hits']
    except Exception:
        flow_text_result = []

    # step4: shape the hits for the caller
    weibo_list = []
    for flow_text_item in flow_text_result:
        source = flow_text_item['_source']
        uid = source['uid']
        try:
            geo = '\t'.join(source['geo'].split('&'))
        except (KeyError, AttributeError):
            geo = ''
        weibo_list.append({
            'uid': uid,
            # Users missing from the portrait index (or a failed mget) used
            # to raise KeyError here; show the uid instead.
            'uname': uid2uname.get(uid, uid),
            'ip': source['ip'],
            'geo': geo,
            'text': source['text'],
            'timestamp': source['timestamp'],
            'sentiment': source['sentiment'],
        })
    return weibo_list
def influenced_user_detail(uid, date, origin_retweeted_mid, retweeted_retweeted_mid, message_type, default_number=20):
    """Profile every user who reacted to a set of weibo published by ``uid``.

    Args:
        uid: author whose influence is measured; excluded from the result.
        date: day string appended to ``pre_text_index``; with its dashes
            removed it also names the ``bci_`` index.
        origin_retweeted_mid: mids of the author's original weibo; reactions
            are matched on ``root_mid`` plus ``root_uid == uid``.
        retweeted_retweeted_mid: root mids of weibo the author retweeted;
            reactions are matched on ``root_mid`` plus ``directed_uid == uid``.
        message_type: message_type of the reactions to collect.
        default_number: cap on how many in-portrait and out-of-portrait uids
            are handed to ``get_user_url``.

    Returns:
        dict with top-5 ``domian`` (sic, key kept for callers), ``topic`` and
        ``geo`` proportions, average ``influence``, ``in_portrait`` /
        ``out_portrait`` url lists, their counts and ``total_number``.
    """
    index_flow_text = pre_text_index + date

    def _reacting_uids(mid_list, user_field):
        """Return uids of reactions to ``mid_list`` addressed to ``uid``."""
        if not mid_list:
            return []
        query = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"terms": {"root_mid": mid_list}},
                                {"term": {"message_type": message_type}},
                                {"term": {user_field: uid}},
                            ]
                        }
                    }
                }
            },
            "size": 20000,
        }
        if RUN_TYPE == 1:
            query["sort"] = {"user_fansnum": {"order": "desc"}}
        hits = es.search(index=index_flow_text,
                         doc_type=flow_text_index_type,
                         body=query, fields=["uid"])["hits"]["hits"]
        return [hit["fields"]["uid"][0] for hit in hits]

    # users reacting to the author's original weibo
    origin_retweeted_uid = _reacting_uids(origin_retweeted_mid, "root_uid")
    # users reacting to weibo the author retweeted
    retweeted_retweeted_uid = _reacting_uids(retweeted_retweeted_mid,
                                             "directed_uid")

    # Distinct influenced users, without the author.
    retweeted_uid_list = list(
        (set(origin_retweeted_uid) | set(retweeted_retweeted_uid)) - set([uid]))

    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    bci_results = []
    in_portrait = []    # [uid, importance] of users present in the portrait index
    out_portrait = []   # uids absent from the portrait index

    if retweeted_uid_list:
        user_portrait_result = es_user_portrait.mget(
            index=user_portrait, doc_type=portrait_index_type,
            body={"ids": retweeted_uid_list},
            fields=["domain", "topic_string", "activity_geo_dict",
                    "importance", "influence"])["docs"]
        bci_index = "bci_" + date.replace('-', '')
        bci_results = es_cluster.mget(index=bci_index, doc_type="bci",
                                      body={"ids": retweeted_uid_list},
                                      fields=['user_index'])["docs"]
        for item in user_portrait_result:
            if not item["found"]:
                out_portrait.append(item['_id'])
                continue
            fields = item["fields"]
            in_portrait.append([item['_id'], fields["importance"][0]])
            # Only the last entry of the geo history is counted; an empty
            # history contributes nothing instead of raising IndexError.
            geo_history = json.loads(fields["activity_geo_dict"][0])
            latest_geo = geo_history[-1].keys() if geo_history else []
            retweeted_domain = aggregation(fields["domain"][0].split('&'),
                                           retweeted_domain)
            retweeted_topic = aggregation(fields["topic_string"][0].split('&'),
                                          retweeted_topic)
            retweeted_geo = aggregation(latest_geo, retweeted_geo)
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)

    total_influence = 0
    for item in bci_results:
        if item['found']:
            total_influence += item['fields']['user_index'][0]
    # Averaged over every influenced user, including those without a bci doc.
    if retweeted_uid_list:
        average_influence = total_influence / len(retweeted_uid_list)
    else:
        average_influence = 0

    def _top5(distribution):
        """Return the five largest (key, share) pairs."""
        return sorted(distribution.items(), key=lambda x: x[1],
                      reverse=True)[:5]

    in_portrait = sorted(in_portrait, key=lambda x: x[1], reverse=True)
    in_portrait_uids = [entry[0] for entry in in_portrait]

    retweeted_results = {}
    retweeted_results["domian"] = _top5(retweeted_domain)
    retweeted_results["topic"] = _top5(retweeted_topic)
    retweeted_results["geo"] = _top5(retweeted_geo)
    retweeted_results["influence"] = average_influence
    retweeted_results['in_portrait_number'] = len(in_portrait_uids)
    retweeted_results['out_portrait_number'] = len(out_portrait)
    retweeted_results["in_portrait"] = get_user_url(
        in_portrait_uids[:default_number])
    retweeted_results["out_portrait"] = get_user_url(
        out_portrait[:default_number])
    retweeted_results["total_number"] = len(in_portrait_uids) + len(out_portrait)
    return retweeted_results
def aggregation_hot_keywords(start_time, stop_time, keywords_list):
    """Count co-occurring keywords of matching weibo between two timestamps.

    Every daily flow-text index from the day of ``stop_time`` back to the day
    of ``start_time`` is aggregated on ``keywords_string`` and the per-day
    counts are summed. Missing indices are skipped.

    Args:
        start_time: start timestamp (anything ``int()`` accepts).
        stop_time: stop timestamp (anything ``int()`` accepts).
        keywords_list: weibo must match at least one of these keywords.

    Returns:
        list of ``(keyword, count)`` pairs sorted by count descending, cut to
        ``AGGRAGATION_KEYWORDS_NUMBER`` entries.
    """
    start_time = int(start_time)
    stop_time = int(stop_time)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"terms": {"keywords_string": keywords_list}},
                            {"range": {
                                "timestamp": {
                                    "gte": start_time,
                                    "lt": stop_time
                                }
                            }}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "all_keywords": {
                "terms": {"field": "keywords_string",
                          "size": PRE_AGGREGATION_NUMBER}
            }
        }
    }

    def _day_buckets(day):
        """Return the aggregation buckets of one day's index, [] if missing."""
        index_name = flow_text_index_name_pre + day
        if not es_text.indices.exists(index_name):
            return []
        return es_text.search(
            index=index_name, doc_type=flow_text_index_type,
            body=query_body)["aggregations"]['all_keywords']['buckets']

    keywords_dict = dict()
    stop_day = ts2datetime(float(stop_time))
    for bucket in _day_buckets(stop_day):
        keywords_dict[bucket['key']] = bucket['doc_count']

    # Walk back one day at a time until the start day has been processed.
    # The ``ts > start_time`` guard ends the walk when start_time lies after
    # stop_time, a case in which the dates never match and the loop used to
    # run forever.
    start_day = ts2datetime(float(start_time))
    current_day = stop_day
    ts = float(stop_time)
    while current_day != start_day and ts > start_time:
        ts = ts - day_time
        current_day = ts2datetime(ts)
        buckets = _day_buckets(current_day)
        if buckets:
            print(buckets)
        for bucket in buckets:
            key = bucket['key']
            keywords_dict[key] = keywords_dict.get(key, 0) + bucket['doc_count']

    print(keywords_dict)
    return_dict = sorted(keywords_dict.items(), key=lambda x: x[1],
                         reverse=True)[:AGGRAGATION_KEYWORDS_NUMBER]
    return return_dict
def full_text_search(keywords, uid, start_time, end_time, size):
    """Search weibo text by keywords and/or author over a range of days.

    Args:
        keywords: comma-separated words; every word must occur in the text
            (wildcard match). Falsy for no keyword filter.
        uid: restrict to this author when truthy.
        start_time: first day to search, as accepted by ``datetime2ts``.
            When falsy no index is selected and ``[]`` is returned.
        end_time: last day to search, as accepted by ``datetime2ts``.
        size: maximum number of hits.

    Returns:
        ``[]`` when none of the daily indices exists; otherwise a tuple
        ``(results, user_profile_list)``. ``results`` has one row per hit:
        ``[uid, <user_info column 1>, text, date, geo, sensitive_words,
        retweeted, comment]``. ``user_profile_list`` has one entry per
        distinct author: the row from ``get_user_profile`` extended with the
        normalised bci and sensitive values (0 when unknown).
    """
    results = []
    user_profile_list = []

    must = []
    if uid:
        must.append({"term": {"uid": uid}})
    if keywords:
        for word in keywords.split(','):
            must.append({'wildcard': {'text': {'wildcard': '*' + word + '*'}}})
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": must
                    }
                }
            }
        },
        "size": size,
        "sort": {"timestamp": {"order": 'desc'}}
    }
    if RUN_TYPE:
        query_body["sort"] = {"user_fansnum": {"order": 'desc'}}

    # Collect the existing daily indices from end_time back to start_time.
    # Bounding the walk by ``ts >= start_ts`` (instead of waiting for an exact
    # match) keeps it finite when start_time is later than end_time.
    index_list = []
    if start_time:
        start_ts = datetime2ts(start_time)
        ts = datetime2ts(end_time)
        while ts >= start_ts:
            index_name = "flow_text_" + ts2datetime(ts)
            if es_flow_text.indices.exists(index=index_name):
                index_list.append(index_name)
            ts -= 3600 * 24
    print(index_list)

    # No usable index. The bare list is kept for existing callers.
    if not index_list:
        return []

    search_results = es_flow_text.search(index=index_list, doc_type="text",
                                         body=query_body)["hits"]["hits"]
    if not search_results:
        # Nothing to enrich, and mget must not be called without ids.
        return results, user_profile_list

    uid_list = [hit['_source']['uid'] for hit in search_results]
    history_max = get_history_max()
    personal_field = ["nick_name", "fansnum", "statusnum", "user_location"]
    # user_info, bci_results and sensitive_results are all indexed like
    # uid_list, i.e. one entry per hit, duplicates included.
    user_info = get_user_profile(uid_list, personal_field)
    bci_results = ES_CLUSTER_FLOW1.mget(index="bci_history", doc_type="bci",
                                        body={"ids": uid_list}, _source=False,
                                        fields=["bci_day_last"])["docs"]
    sensitive_results = es_sensitive_history.mget(
        index="sensitive_history", doc_type="sensitive",
        body={"ids": uid_list}, _source=False,
        fields=["last_value"])["docs"]

    for position, hit in enumerate(search_results):
        item = hit['_source']
        if item.get("sensitive_words_string", ''):
            sensitive_words = item['sensitive_words_string'].split('&')
        else:
            sensitive_words = []
        results.append([
            item['uid'],
            user_info[position][1],
            item['text'],
            ts2date(item['timestamp']),
            item['geo'],
            sensitive_words,
            item.get('retweeted', 0),
            item.get('comment', 0),
        ])

    user_set = set()
    # ``position`` follows user_info so each user reads its own bci and
    # sensitive document; the old counter was never advanced and every user
    # was given the values of the first hit.
    for position, item in enumerate(user_info):
        if item[0] in user_set:
            continue
        user_set.add(item[0])
        bci_doc = bci_results[position]
        if bci_doc["found"]:
            bci_value = bci_doc["fields"]["bci_day_last"][0]
            item.append(normalize_index(bci_value, history_max["max_bci"]))
        else:
            item.append(0)
        sensitive_doc = sensitive_results[position]
        if sensitive_doc["found"]:
            sensitive_value = sensitive_doc['fields']['last_value'][0]
            item.append(normalize_index(sensitive_value,
                                        history_max["max_sensitive"]))
        else:
            item.append(0)
        user_profile_list.append(item)

    return results, user_profile_list
def get_sensitive_text_detail(task_name, ts, user, order):
    """Return the sensitive weibo recorded for one sensing-task timestamp.

    The task document ``ts`` (doc type ``user + '-' + task_name``) stores a
    JSON object ``sensitive_weibo_detail`` keyed by mid. The matching weibo
    texts are looked up in the flow-text indexes of the day of ``ts`` and of
    the day before, and their authors in the profile index.

    Args:
        task_name: task name, combined with ``user`` to build the doc type.
        ts: task timestamp; used as the document id and, numerically, to
            derive the two daily index names.
        user: task owner, combined with ``task_name``.
        order: ``"total"``, ``"retweeted"`` or ``"comment"`` sorts by the
            matching counter stored in the task; ``"ts"`` sorts by the
            formatted date; ``"sensitive"`` sorts by the sensitive score.
            All sorts are descending; any other value keeps the stored order.

    Returns:
        A list of rows, one per weibo whose text was found:
        ``[uid, nick_name, photo_url, text, sentiment, date, geo,
        message_type, retweeted, comment, sensitive]``.
    """
    _id = user + '-' + task_name
    task_detail = es.get(index=index_sensing_task, doc_type=_id, id=ts)['_source']
    # A stored JSON ``null`` is treated like an empty object so the key
    # lookups below cannot fail on None.
    weibo_detail = json.loads(task_detail['sensitive_weibo_detail']) or {}

    # Rows of [mid, total, retweeted, comment]; the total is stored under
    # the mid itself inside each detail object.
    weibo_detail_list = [
        [mid, detail[mid], detail['retweeted'], detail['comment']]
        for mid, detail in weibo_detail.items()
    ]
    mid_list = list(weibo_detail)
    if not mid_list:
        return []

    # Daily indexes for the day of ``ts`` and the day before, if they exist.
    index_list = []
    for day_ts in (ts, ts - DAY):
        index_name = flow_text_index_name_pre + ts2datetime(day_ts)
        if es_text.indices.exists(index_name):
            index_list.append(index_name)

    if index_list:
        query_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "terms": {"mid": mid_list}
                    }
                }
            },
            # Elasticsearch returns only 10 hits unless told otherwise,
            # which silently dropped weibo once a task had more than ten.
            # Ask for enough hits to cover every mid in every index.
            "size": len(mid_list) * len(index_list)
        }
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []

    uid_list = []
    text_dict = {}  # text information, keyed by hit _id (the mid)
    for hit in search_results:
        uid_list.append(hit["_source"]['uid'])
        text_dict[hit['_id']] = hit['_source']

    portrait_dict = {}  # profile information, keyed by profile _id
    if uid_list:
        portrait_result = es_profile.mget(index=profile_index_name, doc_type=profile_index_type, body={"ids": uid_list}, fields=['nick_name', 'photo_url'])["docs"]
        for doc in portrait_result:
            if doc['found']:
                portrait_dict[doc['_id']] = {"nick_name": doc["fields"]["nick_name"][0], "photo_url": doc["fields"]["photo_url"][0]}
            else:
                # Unknown user: fall back to the id as the display name.
                portrait_dict[doc['_id']] = {"nick_name": doc['_id'], "photo_url": ""}

    # Counter-based orders are applied to the task rows before joining.
    sort_column = {"total": 1, "retweeted": 2, "comment": 3}.get(order)
    if sort_column is None:
        sorted_list = weibo_detail_list
    else:
        sorted_list = sorted(weibo_detail_list, key=lambda row: row[sort_column], reverse=True)

    results = []
    for row in sorted_list:
        iter_text = text_dict.get(row[0], {})
        if not iter_text:
            # The text was not found in either daily index; skip the weibo.
            continue
        uid = iter_text['uid']
        iter_portrait = portrait_dict.get(uid, {})
        if iter_portrait:
            nick_name = iter_portrait['nick_name']
            photo_url = iter_portrait['photo_url']
        else:
            nick_name, photo_url = uid, ''
        results.append([
            uid,
            nick_name,
            photo_url,
            iter_text["text"],
            iter_text["sentiment"],
            ts2date(iter_text['timestamp']),
            iter_text['geo'],
            iter_text['message_type'],
            row[2],
            row[3],
            iter_text.get('sensitive', 0),
        ])

    # Orders that depend on the joined text are applied afterwards.
    if order == "ts":
        results = sorted(results, key=lambda x: x[5], reverse=True)
    if order == "sensitive":
        results = sorted(results, key=lambda x: x[-1], reverse=True)

    return results