def create_task():
    ts = time.time()
    current_ts = datehour2ts(ts2datehour(ts))
    query_body = {"query": {"term": {"finish": "0"}}, "size": 10000}
    results = es_prediction.search(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        task_name = item["_source"]["pinyin_task_name"]
        print "push task_name: ", task_name
        update_time = item["_source"]["scan_text_time"]
        stop_time = item["_source"]["stop_time"]
        if current_ts > stop_time:
            # the stop time has passed: mark the prediction task as finished
            es_prediction.update(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, \
                    id=task_name, body={"doc": {"finish": "1"}})
        during = item["_source"]["micro_during"]
        if current_ts - update_time >= during:
            # enough time has elapsed since the last scan: push into the micro-prediction queue
            r_micro.lpush(task_micro_prediction, json.dumps([task_name,
                    item["_source"]["scan_text_time"], current_ts, during]))

    return True
def create_task():
    ts = time.time()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(ts))
    else:
        current_ts = 1482861600
    query_body = {
        "query": {
            "term": {"finish": "0"}
        },
        "size": 10000
    }
    results = es_prediction.search(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, body=query_body)["hits"]["hits"]
    for item in results:
        print item
        task_name = item["_source"]["pinyin_task_name"]
        stop_time = item["_source"]["stop_time"]
        print stop_time, current_ts
        if stop_time < current_ts:
            # expired task: mark both the trendline stage and the task itself as finished
            es_prediction.update(index=index_manage_prediction_task, \
                    doc_type=type_manage_prediction_task, id=task_name,
                    body={"doc": {"macro_trendline_finish": "1", "finish": "1"}})
        else:
            # still running: push into the trendline queue
            r_trendline.lpush(task_trendline, task_name)
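# A minimal sketch (not part of the original source) of how a trendline worker
# could drain the queue filled by create_task() above; "compute_trendline" is a
# hypothetical placeholder for the downstream computation.
def trendline_worker():
    while 1:
        task_name = r_trendline.rpop(task_trendline)
        if not task_name:
            break
        print "pop trendline task: ", task_name
        # compute_trendline(task_name)  # hypothetical downstream step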
def extend_network(task_name):
    file_name = task_name + ".txt"
    f = open(file_name, "w")
    line = 0
    user2number_dict = dict()   # mapping: uid -> number
    number2user_dict = dict()   # mapping: number -> uid
    count = 0
    user_list = organize_network(task_name)
    list_len = len(user_list)
    len_1000 = list_len / 1000
    for i in range(len_1000 + 1):
        tmp_uid = user_list[i * 1000:(i + 1) * 1000]
        if not tmp_uid:
            # avoid an mget call with an empty id list on the last slice
            continue
        es_results = es_retweet.mget(index=index_be_retweet, doc_type=index_type_be_retweet,
                body={"ids": tmp_uid})["docs"]
        for item in es_results:
            if item["found"]:
                print count
                uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                be_retweet_list = uid_be_retweet.keys()
                uid = item["_id"]
                if user2number_dict.has_key(uid):
                    uid_count = user2number_dict[uid]
                else:
                    count += 1
                    uid_count = count
                    user2number_dict[uid] = count
                    number2user_dict[count] = uid
                for each in be_retweet_list:
                    if user2number_dict.has_key(each):
                        each_number = user2number_dict[each]
                    else:
                        count += 1
                        user2number_dict[each] = count
                        number2user_dict[count] = each  # map the new number to "each", not "uid"
                        each_number = count
                    if each_number != uid_count:
                        f.write(str(uid_count) + " " + str(each_number) + "\n")
                        line += 1
    f.close()

    # prepend a "node_count edge_count" header line to the edge file
    cmd = 'sed -i "" -e "1i %s %s" %s' % (count, line, file_name)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task, \
            id=task_name, body={"doc": {"network_exist": "1"}})
    print "finish: ", count

    file_user = open("user_" + task_name + ".txt", "w")
    for uid in user2number_dict.keys():
        file_user.write(str(uid) + '\n')
    file_user.close()
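# A minimal sketch, assuming the file layout produced by extend_network(): the
# first line is "node_count edge_count" (prepended by sed) and every following
# line is "source_number target_number". Not part of the original source; it
# only illustrates how the edge file could be read back into an adjacency dict.
def read_network_file(file_name):
    network = dict()
    with open(file_name) as f:
        header = f.readline().split()
        node_count, edge_count = int(header[0]), int(header[1])
        for edge in f:
            source, target = edge.split()
            network.setdefault(int(source), []).append(int(target))
    return node_count, edge_count, network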
def dispose_results(task_name, ts, future_total, current_total):
    index_name = "stimulation_" + task_name
    index_type = "stimulation_results"
    results = es.get(index=index_name, doc_type=index_type, id=ts)["_source"]
    future_results = json.loads(results["future_results"])
    future_list = []
    # future diffusion paths
    diffusion_path = dict()
    # future diffusion values
    diffusion_value = dict()
    for start_uid, end_dict in future_results.iteritems():
        diffusion_path[start_uid] = end_dict.keys()
        future_list.extend(end_dict.keys())
        diffusion_value.update(end_dict)

    # info about future diffusers:
    # uid, nick_name, photo_url, fans_num, weibo_num, prediction_value
    future_list = list(set(future_list))
    future_user_info = get_future_user(future_list)
    #print future_user_info
    for i in range(len(future_list)):
        uid = future_user_info[i][0]
        future_user_info[i].append(int(diffusion_value[uid]))

    # current hot weibo and user info
    current_hot_mid = search_hot_mid(task_name, ts)

    # current potential hot weibo
    potential_mid, t1, t2 = potential_user(task_name, ts)
    future_total += t1
    current_total += t2
    ratio = float(current_total) / future_total

    update_dict = dict()
    update_dict["diffusion_path"] = json.dumps(diffusion_path)
    update_dict["future_user_info"] = json.dumps(future_user_info)
    update_dict["current_hot_weibo"] = json.dumps(current_hot_mid)
    update_dict["potential_hot_weibo"] = json.dumps(potential_mid)
    update_dict["ratio"] = ratio
    es.update(index=index_name, doc_type=index_type, id=ts, body={"doc": update_dict})

    return True
def create_task():
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"finish": "0"}}
                ]
            }
        },
        "size": 10000
    }
    es_results = es_prediction.search(index=index_manage_interfere_task, doc_type=type_manage_interfere_task, \
            body=query_body)["hits"]["hits"]
    task_list = []
    if int(RUN_TYPE) == 1:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482681600 + 18 * 3600
    for item in es_results:
        tmp = []
        task_detail = item["_source"]
        task_name = task_detail['pinyin_task_name']
        update_time = task_detail["update_time"]
        sti_during = task_detail["stimulation_during"]
        stop_time = task_detail["stop_time"]
        if int(RUN_TYPE) == 1:
            if stop_time < current_ts:
                # the stop time has passed: mark the interfere task as finished
                es.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task, \
                        id=task_name, body={"doc": {"finish": "1"}})
        tmp.append(task_name)
        tmp.append(task_detail["stop_time"])
        tmp.append(task_detail["scan_text_finish"])
        tmp.append(current_ts)
        if current_ts - update_time >= sti_during:
            r_stimulation.lpush(task_stimulation, json.dumps(tmp))
            # update: processing status
            es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task, \
                    id=task_name, body={"doc": {"stimulation_processing_status": "1"}})

    return True
def scan_flow_text():
    while 1:
        task_detail = r_scan_text.rpop(task_scan_text)
        if not task_detail:
            break
        task_detail = json.loads(task_detail)
        task_name = task_detail[0]
        must_keys = task_detail[1]
        should_keys = task_detail[2]
        end_ts = task_detail[3]
        start_ts = task_detail[4]
        source = task_detail[5]
        scan_event_text(task_name, must_keys, should_keys, end_ts, start_ts)
        scan_weibo(task_name, start_ts, end_ts)
        # record scanning progress in the index that issued the task
        if source == "prediction":
            es_prediction.update(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, \
                    id=task_name, body={"doc": {"scan_text_processing": "2"}})
        elif source == "analysis":
            es_prediction.update(index=index_event_analysis, doc_type=type_event_analysis, \
                    id=task_name, body={"doc": {"scan_text_processing": "0"}})
        elif source == "interfere":
            es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task, \
                    id=task_name, body={"doc": {"scan_text_processing": "1"}})
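# A minimal sketch (not part of the original source) of the producer side of the
# scan queue: scan_flow_text() expects a JSON list of
# [task_name, must_keys, should_keys, end_ts, start_ts, source], where source is
# one of "prediction", "analysis" or "interfere". All values below are
# illustrative placeholders only.
example_task = ["example_task_name", ["keyword_a"], ["keyword_b"],
        1482724800, 1482638400, "prediction"]
r_scan_text.lpush(task_scan_text, json.dumps(example_task))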
def create_task():
    #ts = time.time()
    #current_ts = datehour2ts(ts2datehour(ts))
    index_name = index_manage_event_analysis
    index_type = type_manage_event_analysis
    query_body = {
        "query": {
            "term": {"event_value_finish": "0"}
        },
        "size": 10000
    }
    results = es.search(index=index_name, doc_type=index_type, body=query_body)["hits"]["hits"]
    item_finish_status = {}
    for item in results:
        topic = item["_source"]["task_name"]
        en_name = item["_source"]["pinyin_task_name"]
        start_ts = item['_source']['start_ts']
        end_ts = item['_source']['end_ts']
        print "push task_name: ", en_name
        r_event_analysis.lpush(task_event_analysis, json.dumps([topic, en_name, start_ts, end_ts]))
        # set the status to "queued but not yet computed"
        item_finish_status['event_value_finish'] = 1
        es.update(index=index_name, doc_type=index_type, id=en_name, body={'doc': item_finish_status})
def dispose_data(task_name, current_ts, during=3600):
    K = 2   # number of historical time points fed to the model
    task_detail = es_prediction.get(index=index_manage_prediction_task,
            doc_type=type_manage_prediction_task, id=task_name)["_source"]
    start_time = int(task_detail["start_time"])
    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    query_body = {
        "query": {
            "range": {
                "update_time": {"lte": current_ts}
            }
        },
        "size": K,
        "sort": {"update_time": {"order": "desc"}}
    }
    sort_query_body = {
        "query": {
            "range": {
                "update_time": {"lte": current_ts}
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []
    feature_list = []

    results = es_prediction.search(index=task_name, doc_type=index_type_prediction_task,
            body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name, doc_type=index_type_prediction_task,
            body=sort_query_body)["count"]
    if len(results) != K:
        # pad with empty slots so K time points are always present
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()
    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)
    print "total_count: ", total_count

    # log-scaled features for the GBDT model
    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and predict
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "rb") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "rb") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "rb") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict(feature_list)
    for item in pred:
        prediction_value = item
    prediction_value = math.exp(prediction_value)
    print "prediction_value: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc":{"scan_text_processing":"0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"
        # update task info
        es_prediction.index(index=index_manage_prediction_task, \
                doc_type=type_manage_prediction_task, id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name, doc_type=index_type_prediction_task, id=current_ts,
            body={"doc": {"prediction_value": prediction_value}})

    return True
}, "future_user_info": { "type": "string", "index": "no" }, "current_hot_weibo": { "type": "string", "index": "no" }, "potential_hot_weibo": { "type": "string", "index": "no" } } } } } index_name = "stimulation_" + task_name exist_bool = es.indices.exists(index=index_name) if not exist_bool: es.indices.create(index=index_name, body=index_info, ignore=400) return "1" if __name__ == "__main__": es.update(index="manage_interfere_task", doc_type="interfere_task", id=\ "mao_ze_dong_dan_chen_ji_nian_ri", body={"doc":{"scan_text_finish":"1"}})
                }
            }
        }
    }
}

if not es.indices.exists(index=index_manage_interfere_task):
    es.indices.create(index=index_manage_interfere_task, body=index_info, ignore=400)


if __name__ == "__main__":
    #manage_interfere_task()
    es.indices.put_mapping(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,
            body={"properties": {"update_time": {"type": "long"}}})
    es.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task,
            id="mao_ze_dong_dan_chen_ji_nian_ri", body={"doc": {"update_time": 1482724800}})
"timestamp": { "type": "long", }, # data update time "update_time": { "type": "long" }, "prediction_value": { "type": "double" }, "create_by": { "type": "string", "index": "not_analyzed" } } } } } if not es.indices.exists(index=task_name): es.indices.create(index=task_name, body=index_info, ignore=400) return "1" if __name__ == "__main__": #es.indices.create(index="micro_prediction_task", ignore=400) es.update(index="manage_prediction_task", doc_type="prediction_task", id="mao_ze_dong_dan_chen_ji_nian_ri",\ body={"doc":{"finish":"0"}})
def predict_user_influence(task_name, stop_time, ts):
    future_total = 0    # total predicted future diffusion
    current_total = 0   # controllable part of the diffusion
    uid_count, in_user_list, in_user_info, all_user_dict = extend_network(task_name, ts)
    with open("gbdt.pkl", "rb") as f:
        gbdt = pickle.load(f)

    # threshold for important users that have already appeared
    try:
        in_user_threshold = float(r_stimulation.get("in_user_threshold"))
    except:
        r_stimulation.set("in_user_threshold", 1000)
        in_user_threshold = 1000

    in_results = gbdt.predict(in_user_info)
    print "len(in_user_list): ", len(in_user_list)
    prediction_in = dict()
    for i in range(len(in_user_list)):
        if math.exp(in_results[i]) > in_user_threshold:     # 1000
            prediction_in[in_user_list[i]] = math.exp(in_results[i])

    future_dict = dict()
    count = 0
    for k, v in all_user_dict.iteritems():
        uid = k
        print "k: ", k
        print "v: ", len(v)
        tmp_prediction_list = []    # tmp storage
        tmp_uid_list = []
        user_list = v
        list_len = len(user_list)
        len_1000 = list_len / 1000
        for i in range(len_1000 + 1):
            tmp_uid = user_list[i * 1000:(i + 1) * 1000]
            if not tmp_uid:
                continue
            es_results = es_retweet.mget(index=index_be_retweet, doc_type=index_type_be_retweet,
                    body={"ids": tmp_uid})["docs"]
            for item in es_results:
                if item["found"]:
                    count += 1
                    uid_be_retweet = json.loads(item["_source"]["uid_be_retweet"])
                    retweet_count = len(uid_be_retweet)
                    if retweet_count < 1000:
                        continue
                    tmp = []
                    tmp.append(math.log(retweet_count + 1))
                    tmp.append(math.log(uid_count + 1))
                    tmp_prediction_list.append(tmp)
                    tmp_uid_list.append(item["_id"])
                    if count % 1000 == 0:
                        # flush the current batch through the prediction model
                        iter_prediction_list, t1, t2 = prediction_model(uid, gbdt,
                                tmp_prediction_list, tmp_uid_list, future_dict)
                        future_dict = iter_prediction_list
                        tmp_prediction_list = []
                        tmp_uid_list = []
                        future_total += t1
                        current_total += t2
                        print "iter prediction: ", count
        if tmp_prediction_list:
            # flush the remaining partial batch
            iter_prediction_list, t1, t2 = prediction_model(uid, gbdt,
                    tmp_prediction_list, tmp_uid_list, future_dict)
            future_dict = iter_prediction_list
            future_total += t1
            current_total += t2
    print "future_dict: ", future_dict

    # storage
    save_results(task_name, ts, prediction_in, future_dict)

    # do the remaining post-processing
    dispose_results(task_name, ts, future_total, current_total)

    # update processing state
    es_prediction.update(index=index_manage_interfere_task, doc_type=type_manage_interfere_task, \
            id=task_name, body={"doc": {"stimulation_processing_status": "0",
                    "update_time": ts, "scan_text_finish": "0"}})

    # stop task
    if ts >= stop_time:
        es_prediction.update(index=index_manage_interfere_task, doc_type= \
                type_manage_interfere_task, id=task_name, body={"doc": {"finish": "1"}})
def rank_predict(event, start_ts, end_ts):
    feature_list = feature_compute(event, start_ts, end_ts)
    print 'feature_list:::::', feature_list
    feature_list_gbdt = []
    for i in range(len(feature_list)):
        # split the multi-valued features into separate columns
        if i == 15:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'type::', type(feature_list[i])
            print 'feature_list[i][at_0]::', feature_list[i]['at_0']
            feature_list_gbdt.append(feature_list[i]['at_0'])
            feature_list_gbdt.append(feature_list[i]['at_1'])
            feature_list_gbdt.append(feature_list[i]['at_2'])
            feature_list_gbdt.append(feature_list[i]['at>3'])
        elif i == 16:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'feature_list[i]:::', feature_list[i]
            feature_list_gbdt.append(feature_list[i][0])
            feature_list_gbdt.append(feature_list[i][1])
            feature_list_gbdt.append(feature_list[i][2])
            feature_list_gbdt.append(feature_list[i][3])
        else:
            feature_list_gbdt.append(feature_list[i])
    print 'feature_list_gbdt:::::', feature_list_gbdt

    # load the weibo-count model
    with open("0305_macro-prediction-weibos-value.pkl", "rb") as f:
        gbdt = pickle.load(f)
    pred = gbdt.predict(feature_list_gbdt)
    for item in pred:
        predict_weibo_value = item

    # load the user-count model
    with open("0305_macro-prediction-uids-value.pkl", "rb") as f:
        gbdt = pickle.load(f)
    pred = gbdt.predict(feature_list_gbdt)
    for item in pred:
        predict_user_value = item
    predict_rank = get_rank(predict_user_value)

    ## save into the event info table
    #for i in range(len(feature_list)):
    feature_results = {}
    feature_results['event'] = event
    '''
    feature_results['topic_field'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = feature_list[15]
    feature_results['event_uid_count'] = feature_list[16]
    feature_results['event_trend_delta'] = feature_list[17]
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    #feature_results['topic_field'] = feature_list[0]
    feature_results['uid_count'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = json.dumps(feature_list[15])
    feature_results['event_trend_delta'] = json.dumps(feature_list[16])
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    save_event_info_results(event,topic_field,total_num,total_user_fans,\
            total_comment,total_retweet,total_sensitive,\
            total_sensitive_ratio,total_negtive,total_important_user,\
            total_origin_type,origin_ratio,total_retweet_type,retweet_ratio,\
            total_comment_type,comment_ratio,at_count,event_uid_count,\
            event_trend_delta,predict_value,predict_rank,update_time)
    '''
    # update macro features & results
    # keep feature_results as a dict: elasticsearch-py serializes it itself, and
    # passing a JSON string inside {"doc": ...} would store a string, not an object
    try:
        item_exists = es_prediction.get(index=index_macro_feature_result, doc_type=type_macro_feature_result, \
                id=event)['_source']
        es_prediction.update(index=index_macro_feature_result, doc_type=type_macro_feature_result, \
                id=event, body={'doc': feature_results})
    except:
        es_prediction.index(index=index_macro_feature_result, doc_type=type_macro_feature_result, \
                id=event, body=feature_results)

    # update task info: "macro_value_finish"
    task_detail = es_prediction.get(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event)["_source"]
    task_detail["macro_value_finish"] = '1'
    es_prediction.index(index=index_manage_prediction_task, \
            doc_type=type_manage_prediction_task, id=event, body=task_detail)

    print 'feature_results::::', feature_results
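# A minimal sketch (not part of the original source): the try/get-update-except-index
# pattern used in rank_predict() can also be collapsed into a single call with
# Elasticsearch's "doc_as_upsert" flag, which updates the document if it exists and
# creates it otherwise. "upsert_feature_results" is a hypothetical helper name.
def upsert_feature_results(event, feature_results):
    es_prediction.update(index=index_macro_feature_result, doc_type=type_macro_feature_result,
            id=event, body={"doc": feature_results, "doc_as_upsert": True})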