def get_event_uid_count(task_name, start_ts, end_ts):
    # Count distinct uids (including root_uid and directed_uid) of weibos in [start_ts, end_ts).
    event_uid_set = set()
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {'gte': start_ts, 'lt': end_ts}
                    }
                }
            }
        },
        'size': 99999999
    }
    es_results = es.search(index=task_name, doc_type='text', body=query_body)['hits']['hits']
    for result in es_results:
        event_uid_set.add(result['_source']['uid'])
        try:
            event_uid_set.add(result['_source']['root_uid'])
        except KeyError:
            pass
        try:
            event_uid_set.add(result['_source']['directed_uid'])
        except KeyError:
            pass
    uids_count = len(event_uid_set)
    return uids_count
def create_task_list(given_ts):
    # 1. search tasks from the sensing-task management index
    # 2. push each task onto the redis list 'task_name' for the workers
    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log

    query_body = {"query": {"match_all": {}}}
    search_results = es.search(index=index_sensing, doc_type=type_sensing, body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])                    # task_name
            task.append(json.loads(item['social_sensors']))   # social sensors
            task.append(given_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
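# Hedged usage sketch (not part of the original module): the counterpart worker is
# assumed to pop the JSON-encoded task pushed by create_task_list above and unpack
# it; the helper name below is hypothetical.
def _example_pop_task():
    raw = r.rpop('task_name')
    if raw:
        task_name, social_sensors, ts = json.loads(raw)
        return task_name, social_sensors, ts
    return None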
def search_times(task_name, uid, ts):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {"timestamp": {"lt": ts}}}
                        ],
                        "should": [
                            {"term": {"directed_uid": int(uid)}},
                            {"term": {"root_uid": str(uid)}}
                        ]
                    }
                }
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {"field": "uid"}
            }
        }
    }
    count = es.search(index=task_name, doc_type="text", body=query_body)["aggregations"]["uid_count"]["value"]
    return count
def create_task():
    index_name = index_manage_event_analysis
    index_type = type_manage_event_analysis

    query_body = {
        "query": {
            "term": {
                "event_value_finish": "0"
            }
        },
        "size": 10000
    }
    results = es.search(index=index_name, doc_type=index_type, body=query_body)["hits"]["hits"]

    for item in results:
        topic = item["_source"]["task_name"]
        en_name = item["_source"]["pinyin_task_name"]
        start_ts = item['_source']['start_time']
        end_ts = item['_source']['stop_time']
        print "push task_name: ", en_name
        r_event_analysis.lpush(task_event_analysis, json.dumps([topic, en_name, start_ts, end_ts]))
def create_task():
    ts = time.time()
    current_ts = datehour2ts(ts2datehour(ts))

    query_body = {"query": {"term": {"finish": "0"}}, "size": 10000}
    results = es_prediction.search(index=index_manage_prediction_task,
                                   doc_type=type_manage_prediction_task,
                                   body=query_body)["hits"]["hits"]

    for item in results:
        task_name = item["_source"]["pinyin_task_name"]
        print "push task_name: ", task_name
        update_time = item["_source"]["scan_text_time"]
        stop_time = item["_source"]["stop_time"]
        if current_ts > stop_time:
            # the task has passed its stop time: mark it finished
            es_prediction.update(index=index_manage_prediction_task,
                                 doc_type=type_manage_prediction_task,
                                 id=task_name, body={"doc": {"finish": "1"}})
        during = item["_source"]["micro_during"]
        if current_ts - update_time >= during:
            # a full micro-prediction interval has elapsed since the last scan: queue the task
            r_micro.lpush(task_micro_prediction,
                          json.dumps([task_name, item["_source"]["scan_text_time"], current_ts, during]))

    return True
def create_task():
    ts = time.time()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(ts))
    else:
        current_ts = 1482861600

    query_body = {
        "query": {
            "term": {"finish": "0"}
        },
        "size": 10000
    }
    results = es_prediction.search(index=index_manage_prediction_task,
                                   doc_type=type_manage_prediction_task,
                                   body=query_body)["hits"]["hits"]

    for item in results:
        print item
        task_name = item["_source"]["pinyin_task_name"]
        stop_time = item["_source"]["stop_time"]
        print stop_time, current_ts
        if stop_time < current_ts:
            es_prediction.update(index=index_manage_prediction_task,
                                 doc_type=type_manage_prediction_task, id=task_name,
                                 body={"doc": {"macro_trendline_finish": "1", "finish": "1"}})
        else:
            r_trendline.lpush(task_trendline, task_name)
def extend_network(task_name, ts):
    index_name = task_name

    # number of users participating so far
    query_uid = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "timestamp": {
                            "lt": ts
                        }
                    }
                }
            }
        },
        "aggs": {
            "uid_count": {"cardinality": {"field": "uid"}}
        }
    }
    uid_count = es_prediction.search(index=index_name, doc_type="text",
                                     body=query_uid)["aggregations"]["uid_count"]["value"]

    try:
        extend_retweet_threshold = float(r_stimulation.get("extend_retweet_threshold"))
    except:
        r_stimulation.set("extend_retweet_threshold", 10000)
        extend_retweet_threshold = 10000

    user_list = organize_network(task_name, ts)
    exist_user_set = set(user_list)
    in_user_list = list()    # users already in the network
    in_user_info = []
    count = 0
    all_user_dict = dict()   # participating user >>> extended list
    list_len = len(user_list)
    len_1000 = list_len / 1000
    for i in range(len_1000 + 1):
        tmp_uid = user_list[i*1000: (i+1)*1000]
        es_results = es_retweet.mget(index=index_be_retweet, doc_type=index_type_be_retweet,
                                     body={"ids": tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                count += 1
                if count % 1000 == 0:
                    print "extend network: ", count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                retweet_count = len(uid_be_retweet)
                if retweet_count < extend_retweet_threshold:
                    # below the outward-extension threshold
                    continue
                uid_retweet_list = uid_be_retweet.keys()
                uid_retweet_list = list(set(uid_retweet_list) - exist_user_set)
                all_user_dict[item["_id"]] = uid_retweet_list   # extended users
                in_user_list.append(item["_id"])
                in_user_info.append([math.log(retweet_count + 1), math.log(uid_count + 1)])

    return uid_count, in_user_list, in_user_info, all_user_dict
def search_hot_mid(task_name, ts):
    query_body = {
        "query": {
            "range": {
                "timestamp": {
                    "lt": ts
                }
            }
        },
        "aggs": {
            "hot_mid": {
                "terms": {"field": "root_mid", "size": 100}
            }
        }
    }

    mid_list = []
    return_list = []   # return hot mid
    uid_list = []
    es_results = es.search(index=task_name, doc_type="text",
                           body=query_body)["aggregations"]["hot_mid"]["buckets"]
    for item in es_results:
        if item["doc_count"] >= 500:
            mid_list.append(item["key"])

    if mid_list:
        weibo_results = es.mget(index=task_name, doc_type="text", body={"ids": mid_list})["docs"]
        for item in weibo_results:
            if item["found"]:
                mid = item["_id"]
                retweet, comment = search_retweet_comment(task_name, mid)
                detail = item["_source"]
                detail["retweet"] = retweet
                detail["comment"] = comment
                uid_list.append(detail["uid"])
                return_list.append(detail)

    if uid_list:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={"ids": uid_list})["docs"]
        for i in range(len(uid_list)):
            detail = profile_results[i]
            if detail["found"]:
                return_list[i]["uname"] = detail["_source"]["nick_name"]
                return_list[i]["photo_url"] = detail["_source"]["photo_url"]
                return_list[i]["fansnum"] = detail["_source"]["fansnum"]
                return_list[i]["statusnum"] = detail["_source"]["statusnum"]
            else:
                return_list[i]["uname"] = detail["_id"]
                return_list[i]["photo_url"] = ""
                return_list[i]["fansnum"] = ""
                return_list[i]["statusnum"] = ""

    return return_list
def create_task():
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"finish": "0"}}
                ]
            }
        },
        "size": 10000
    }

    es_results = es_prediction.search(index=index_manage_interfere_task,
                                      doc_type=type_manage_interfere_task,
                                      body=query_body)["hits"]["hits"]

    task_list = []
    if int(RUN_TYPE) == 1:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482681600 + 18 * 3600

    for item in es_results:
        tmp = []
        task_detail = item["_source"]
        task_name = task_detail['pinyin_task_name']
        update_time = task_detail["update_time"]
        sti_during = task_detail["stimulation_during"]
        stop_time = task_detail["stop_time"]
        if int(RUN_TYPE) == 1:
            if stop_time < current_ts:
                # the task has passed its stop time: mark it finished
                es_prediction.update(index=index_manage_interfere_task,
                                     doc_type=type_manage_interfere_task,
                                     id=task_name, body={"doc": {"finish": "1"}})
        tmp.append(task_name)
        tmp.append(task_detail["stop_time"])
        tmp.append(task_detail["scan_text_finish"])
        tmp.append(current_ts)
        if current_ts - update_time >= sti_during:
            r_stimulation.lpush(task_stimulation, json.dumps(tmp))
            # update: processing status
            es_prediction.update(index=index_manage_interfere_task,
                                 doc_type=type_manage_interfere_task,
                                 id=task_name,
                                 body={"doc": {"stimulation_processing_status": "1"}})

    return True
def dispose_data(task_name, current_ts, during=3600):
    K = 2   # number of past windows to load

    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=task_name)["_source"]
    start_time = int(task_detail["start_time"])
    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        },
        "size": K,
        "sort": {"update_time": {"order": "desc"}}
    }

    sort_query_body = {
        "query": {
            "range": {
                "update_time": {
                    "lte": current_ts
                }
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []
    feature_list = []

    results = es_prediction.search(index=task_name, doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name, doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]

    if len(results) != K:
        # pad with empty windows so the feature lists always have K entries
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()

    for item in results:
        if item:
            item = item["_source"]
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and predict
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "r") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "r") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "r") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
        prediction_value = math.exp(prediction_value)
    print "prediction_value: ", prediction_value

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task,
                                    id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"
        # update task info
        es_prediction.index(index=index_manage_prediction_task,
                            doc_type=type_manage_prediction_task,
                            id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name, doc_type=index_type_prediction_task,
                         id=current_ts,
                         body={"doc": {"prediction_value": prediction_value}})

    return True
def dispose_data(task_name, current_ts):
    es_result = es_prediction.get(index=index_manage_prediction_task,
                                  doc_type=type_manage_prediction_task,
                                  id=task_name)["_source"]
    macro_during = es_result['macro_during']
    start_ts = datehour2ts(ts2datehour(es_result["submit_time"]))
    task_start_ts = start_ts
    end_ts = datehour2ts(ts2datehour(es_result["stop_time"]))

    index_micro = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "update_time": {
                            "lte": current_ts
                        }
                    }
                }
            }
        },
        "size": 10000,
        "sort": {"update_time": {"order": "asc"}}
    }

    micro_results = es_prediction.search(index=index_micro, doc_type="micro_task",
                                         body=query_body)["hits"]["hits"]
    total_list = []
    for item in micro_results:
        total_list.append(item["_source"]["total_count"])   # weibo count in each micro window

    total_len = (end_ts - start_ts) / macro_during
    times = int(macro_during) / 3600
    lenth = len(total_list) / times
    adjust_list = []
    time_list = []
    count = 0
    i = 0
    for item in total_list:
        count += item
        i += 1
        if i % times == 0:
            if start_ts <= current_ts:
                adjust_list.append(count)
                count = 0
                time_list.append(start_ts)
            else:
                break
        start_ts += 3600

    # full series of macro time points for the task
    total_time_list = []
    for i in range(total_len):
        total_time_list.append(task_start_ts + i * macro_during)
    left_time = list(set(total_time_list) - set(time_list))
    left_time = sorted(left_time)

    return adjust_list, total_len, time_list, left_time
def organize_feature(task_name, mid, ts):
    # Build the feature vector for one weibo (mid); returns zeros if the weibo is missing.
    result = dict()
    try:
        result = es.get(index=task_name, doc_type="text", id=mid)["_source"]
    except:
        pass
    if not result:
        return [0, 0, 0, 0, 0, 0, 0]
    ts = result["timestamp"]

    query_body = {"query": {"term": {"root_mid": mid}}}
    # total_weibo
    #count = es.count(index=index_list, doc_type="text", body=query_body)["count"]

    query_body_uid = {
        "query": {
            "term": {
                "root_mid": mid
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    # total_uid
    #total_uid_count = es.search(index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"]

    feature_list = []
    feature_list.append(math.log(result["user_fansnum"] + 1))

    query_body_ts = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"root_mid": mid}},
                    {"range": {"timestamp": {"lt": ts + 3600 * 10}}}
                ]
            }
        },
        "aggs": {
            "weibo_type": {
                "terms": {"field": "message_type"}
            }
        }
    }

    comment = 0
    retweet = 0
    tmp_count = es.search(index=task_name, doc_type="text",
                          body=query_body_ts)['aggregations']["weibo_type"]["buckets"]
    if tmp_count:
        for item in tmp_count:
            if int(item["key"]) == 2:
                comment = item["doc_count"]
            elif int(item["key"]) == 3:
                retweet = item["doc_count"]
    feature_list.append(comment + retweet)
    feature_list.append(retweet)
    feature_list.append(comment)
    feature_list.append(retweet / float(comment + retweet + 1))
    feature_list.append(comment / float(comment + retweet + 1))

    query_body_uid = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"root_mid": mid}},
                    {"range": {"timestamp": {"lt": ts + 3600 * 10}}}
                ]
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {"field": "uid"}
            }
        }
    }
    uid_count = es.search(index=task_name, doc_type="text",
                          body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list.append(uid_count)
    #feature_list.append(topic_field_dict[topic])

    return feature_list
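# Hedged usage sketch (hypothetical caller, mirroring how potential_user below uses
# these pickles): feed the single feature vector from organize_feature to the pickled
# weibo-count model for one prediction.
def _example_predict_weibo_count(task_name, mid, ts):
    features = organize_feature(task_name, mid, ts)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    return weibo_model.predict([features])[0]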
def update_prediction(ts):
    # current ts
    query_body = {
        "query": {
            "range": {
                "timestamp": {
                    "gte": ts - 10 * 3600,
                    "lte": ts
                }
            }
        },
        "size": 20000,
        "sort": {"timestamp": {"order": "asc"}}
    }

    es_results = es_prediction.search(index="social_sensing_text", doc_type="text",
                                      body=query_body, _source=False,
                                      fields=["mid", "timestamp"])["hits"]["hits"]
    print "get results length: ", len(es_results)

    mid_list = []
    mid_ts_list = []
    feature_list = []
    count = 0
    bulk_action = []

    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    print "finish loading"

    for item in es_results:
        mid = item["fields"]["mid"][0]
        mid_ts = item["fields"]["timestamp"][0]
        iter_feature = organize_feature(mid, mid_ts)
        feature_list.append(iter_feature)
        mid_list.append(mid)
        mid_ts_list.append(mid_ts)
        count += 1
        if count % 100 == 0:
            """
            weibo_prediction_result = weibo_model.predict(feature_list)
            uid_prediction_result = uid_model.predict(feature_list)
            print "finish prediction"
            for i in range(len(mid_list)):
                iter_dict = dict()
                iter_dict["mid"] = mid_list[i]
                iter_dict["uid_prediction"] = uid_prediction_result[i]
                iter_dict["weibo_prediction"] = weibo_prediction_result[i]
                tmp_trendline = trendline_list(mid_list[i], weibo_prediction_result[i], mid_ts_list[i])
                iter_dict["trendline"] = json.dumps(tmp_trendline)
                bulk_action.extend([{"update": {"_id": mid_list[i]}}, {"doc": iter_dict}])
                print uid_prediction_result[i], weibo_prediction_result[i], mid_list[i]
            print es_prediction.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600)
            """
            bulk_action = []
            mid_list = []
            mid_ts_list = []
            feature_list = []
            print "iter count: ", count

    if mid_list:
        weibo_prediction_result = weibo_model.predict(feature_list)
        uid_prediction_result = uid_model.predict(feature_list)
        print "finish prediction"
        for i in range(len(mid_list)):
            iter_dict = dict()
            iter_dict["mid"] = mid_list[i]
            iter_dict["uid_prediction"] = uid_prediction_result[i]
            iter_dict["weibo_prediction"] = weibo_prediction_result[i]
            tmp_trendline = trendline_list(mid_list[i], weibo_prediction_result[i], mid_ts_list[i])
            iter_dict["trendline"] = json.dumps(tmp_trendline)
            bulk_action.extend([{"update": {"_id": mid_list[i]}}, {"doc": iter_dict}])
            print uid_prediction_result[i], weibo_prediction_result[i], mid_list[i]
        es_prediction.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600)
def potential_user(task_name, ts):
    index_name = "stimulation_" + task_name
    index_type = "stimulation_results"

    # query current root weibos (original posts from users with enough fans)
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"range": {"timestamp": {"lt": ts}}},
                    {"term": {"message_type": 1}},
                    {"range": {"user_fansnum": {"gte": 10000}}}
                ]
            }
        },
        "size": 10000
    }
    es_results = es.search(index=task_name, doc_type="text", body=query_body)["hits"]["hits"]

    mid_list = []
    uid_list = []
    feature_list = []
    prediction_uid = []
    prediction_weibo = []

    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)

    for item in es_results:
        mid_list.append(item["_id"])
        uid_list.append(item["_source"]["uid"])
        tmp_feature_list = organize_feature(task_name, item["_id"], ts)
        feature_list.append(tmp_feature_list)

    weibo_prediction_result = weibo_model.predict(feature_list)
    uid_prediction_result = uid_model.predict(feature_list)

    future_total = 0
    current_total = 0
    results_dict = dict()
    in_potential_list = []
    for i in range(len(mid_list)):
        mid = mid_list[i]
        uid = uid_list[i]
        iter_count = es.count(index=task_name, doc_type="text",
                              body={"query": {"term": {"root_mid": mid}}})["count"]
        pre_count = weibo_prediction_result[i]
        future_total += abs(pre_count - iter_count)
        if pre_count >= 500 and iter_count <= 500:
            current_total += abs(pre_count - iter_count)
            if not results_dict.has_key(uid):
                results_dict[uid] = dict()
            tmp = dict()
            tmp["mid"] = mid
            tmp["current_count"] = iter_count
            tmp["prediction_count"] = int(pre_count)
            weibo_detail = es.get(index=task_name, doc_type="text", id=mid)["_source"]
            tmp.update(weibo_detail)
            retweet, comment = search_retweet_comment(task_name, mid)
            tmp["retweeted"] = retweet
            tmp["comment"] = comment
            results_dict[uid][mid] = tmp

    # attach user profiles
    tmp_in_list = results_dict.keys()
    if tmp_in_list:
        profile_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                               body={"ids": tmp_in_list})["docs"]
        for i in range(len(tmp_in_list)):
            detail = profile_results[i]
            tmp = []
            uid = tmp_in_list[i]
            if detail["found"]:
                tmp.append(detail["_source"]["nick_name"])
                tmp.append(detail["_source"]["photo_url"])
                tmp.append(detail["_source"]["fansnum"])
                tmp.append(detail["_source"]["statusnum"])
            else:
                tmp.append(detail["_id"])
                tmp.extend(["", "", ""])
            results_dict[uid]["user_profile"] = tmp

    return results_dict, future_total, current_total
def get_event_trend(task_name, start_ts, end_ts):
    # Event trend: weibo counts over consecutive 2-hour windows, returned as deltas.
    event_trend = []
    trend_input_list = []
    event_trend_delta = []

    i = 0
    while i < 10:
        end_ts = start_ts + 2 * 3600
        trend_input_list.append((task_name, start_ts, end_ts))
        start_ts = end_ts + 1
        i = i + 2

    for item in trend_input_list:
        trend_event = item[0]
        trend_start_ts = item[1]
        trend_end_ts = item[2]
        query_body = {
            "query": {
                "bool": {
                    "must": [
                        {"range": {
                            "timestamp": {
                                "gte": trend_start_ts,
                                "lt": trend_end_ts
                            }
                        }}
                    ]
                }
            },
            "size": 99999999,
            "sort": {"user_fansnum": "desc"}
        }
        es_results = es.search(index=task_name, doc_type='text', body=query_body)["hits"]["hits"]
        # weibo count in this window
        event_trend.append(len(es_results))

    for i in range(len(event_trend) - 1):
        delta = event_trend[i + 1] - event_trend[i]
        print 'delta::::', delta
        event_trend_delta.append(delta)

    print 'event_trend_delta:::', event_trend_delta
    return event_trend_delta
def feature_compute(task_name, start_ts, end_ts):
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"range": {
                        "timestamp": {
                            "gte": start_ts,
                            "lt": end_ts
                        }
                    }}
                ]
            }
        },
        "size": 99999999,
        "sort": {"user_fansnum": "desc"}
    }
    es_results = es.search(index=task_name, doc_type='text', body=query_body)["hits"]["hits"]

    # totals and averages of participants' fans, retweets and comments
    total_user_fans = 0
    average_user_fans = 0
    total_comment = 0
    average_comment = 0
    total_retweet = 0
    average_retweet = 0
    # count and ratio of sensitive weibos
    total_sensitive = 0
    total_sensitive_ratio = 0
    # count and ratio of negative-sentiment weibos
    total_negtive = 0
    total_negtive_ratio = 0
    # count and ratio of important users (fansnum > 10000)
    total_important_user = 0
    total_important_user_ratio = 0
    # total number of weibos
    total_num = len(es_results)
    # distribution of '@' nesting depth
    at_count = {}
    at_count['at_0'] = 0
    at_count['at_1'] = 0
    at_count['at_2'] = 0
    at_count['at>3'] = 0

    for result in es_results:
        total_user_fans += result['_source']['user_fansnum']
        total_comment += result['_source']['comment']
        total_retweet += result['_source']['retweeted']
        if result['_source']['sensitive'] > 0:
            total_sensitive += 1
        if result['_source']['sentiment'] > 1:
            total_negtive += 1
        if result['_source']['user_fansnum'] > 10000:
            total_important_user += 1
        text = result['_source']['text']
        at_list = re.findall('//@', text)
        if len(at_list) == 0:
            at_count['at_0'] += 1
        elif len(at_list) == 1:
            at_count['at_1'] += 1
        elif len(at_list) == 2:
            at_count['at_2'] += 1
        else:
            at_count['at>3'] += 1

    average_user_fans = float(total_user_fans) / total_num
    average_comment = float(total_comment) / total_num
    average_retweet = float(total_retweet) / total_num
    total_sensitive_ratio = float(total_sensitive) / total_num
    total_negtive_ratio = float(total_negtive) / total_num
    total_important_user_ratio = float(total_important_user) / total_num

    query_body_type_count = {
        'query': {
            'bool': {
                'must': [
                    {'range': {
                        'timestamp': {
                            'gte': start_ts,
                            'lt': end_ts
                        }
                    }}
                ]
            }
        },
        'size': 999999999,
        'aggs': {
            'all_weibo': {
                'terms': {'field': 'message_type'}
            }
        }
    }
    # count weibos of each message type
    es_weibo_type_count = es.search(index=task_name, doc_type='text', body=query_body_type_count,
                                    request_timeout=999999)['aggregations']['all_weibo']['buckets']
    total_origin_type = 0
    total_retweet_type = 0
    total_comment_type = 0
    total_type = 0
    weibo_type_count = dict()
    for item in es_weibo_type_count:
        if item['key'] == 1:
            total_origin_type = item['doc_count']
        elif item['key'] == 2:
            total_retweet_type = item['doc_count']
        elif item['key'] == 3:
            total_comment_type = item['doc_count']
    total_type = total_origin_type + total_retweet_type + total_comment_type
    origin_ratio = float(total_origin_type) / total_type
    retweet_ratio = float(total_retweet_type) / total_type
    comment_ratio = float(total_comment_type) / total_type

    uids_count = get_event_uid_count(task_name, start_ts, end_ts)
    event_trend_delta_list = get_event_trend(task_name, start_ts, end_ts)

    feature_list = [uids_count, total_num, total_user_fans, total_comment, total_retweet,
                    total_sensitive, total_sensitive_ratio, total_negtive, total_important_user,
                    total_origin_type, origin_ratio, total_retweet_type, retweet_ratio,
                    total_comment_type, comment_ratio, at_count, event_trend_delta_list]
    return feature_list