def test():
    item = {}
    '''
    item['task_name'] = '天津老太摆射击摊被判刑'  # '毛泽东诞辰纪念日'
    item['pinyin_task_name'] = 'tian_jin_lao_tai_she_ji_qiang_bei_pan_xing'  # "mao_ze_dong_dan_chen_ji_nian_ri"
    item['start_time'] = 1482768502  # 1482681600
    item['stop_time'] = 1483455435  # 1483113600
    item['submit_user'] = '******'
    item['submit_time'] = time.time()
    item['must_keywords'] = ['射击', '判刑']
    item['should_keywords'] = ['天津', '老太']
    item['event_value_finish'] = 0
    item['scan_text_finish'] = 0
    '''
    item['task_name'] = '毛泽东诞辰纪念日'
    item['pinyin_task_name'] = "mao_ze_dong_dan_chen_ji_nian_ri"
    item['start_time'] = 1482681600
    item['stop_time'] = 1483113600
    item['submit_user'] = '******'
    item['submit_time'] = time.time()
    item['must_keywords'] = ['毛泽东']
    item['should_keywords'] = ['诞辰', '纪念日']
    item['event_value_finish'] = 0
    item['scan_text_finish'] = 0
    mappings_event_analysis_task()
    es.index(index=index_manage_event_analysis, doc_type=type_manage_event_analysis,
             id=item['pinyin_task_name'], body=item)
def save_results(task_name, ts, prediction_in, future_dict):
    mappings_stimulation(task_name)
    work_index = "stimulation_" + task_name
    work_type = "stimulation_results"
    update_body = {
        "update_time": ts,
        "in_results": json.dumps(prediction_in),
        "future_results": json.dumps(future_dict)
    }
    es_prediction.index(index=work_index, doc_type=work_type, id=ts, body=update_body)
    return True
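# A minimal, hypothetical usage sketch for save_results(). The task name, timestamp and
# prediction dicts below are illustration values only; the exact shapes of prediction_in
# and future_dict are not fixed in this file, so plain dicts mapping a timestamp to a
# predicted count are assumed here.
def _example_save_results():
    prediction_in = {"1482768000": 1250, "1482771600": 1430}   # observed-window predictions (made up)
    future_dict = {"1482775200": 1600, "1482778800": 1510}     # future-window predictions (made up)
    return save_results("mao_ze_dong_dan_chen_ji_nian_ri", 1482771600, prediction_in, future_dict)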
def test():
    item = {}
    item['task_name'] = '毛泽东诞辰纪念日'
    item['pinyin_task_name'] = "mao_ze_dong_dan_chen_ji_nian_ri"
    item['start_ts'] = 1482681600
    item['end_ts'] = 1483113600
    item['event_value_finish'] = 0
    item['scan_text_finish'] = 0
    mappings_event_analysis_task()
    es.index(index=index_manage_event_analysis, doc_type=type_manage_event_analysis,
             id=item['pinyin_task_name'], body=item)
def organize_feature(task_name, event, start_ts, end_ts, during=3600):
    data = []
    index_list = []
    task_name = "micro_prediction_" + task_name
    while 1:
        data_dict = dict()
        if start_ts >= end_ts:
            break
        results_list = user_fansnum(event, start_ts, start_ts + during)
        for i in range(len(data_order)):
            data_dict[data_order[i]] = results_list[i]
        data_dict["update_time"] = start_ts + minimal_time_interval
        start_ts += during
        print "start timestamp: ", start_ts
        es_prediction.index(index=task_name, doc_type=index_type_prediction_task,
                            id=start_ts, body=data_dict)
def dispose_data(task_name, current_ts, during=3600):
    K = 2  # number of historical windows pulled from ES
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task, id=task_name)["_source"]
    start_time = int(task_detail["start_time"])
    origin_task_name = task_name
    task_name = "micro_prediction_" + task_name

    query_body = {
        "query": {
            "range": {
                "update_time": {"lte": current_ts}
            }
        },
        "size": K,
        "sort": {"update_time": {"order": "desc"}}
    }

    sort_query_body = {
        "query": {
            "range": {
                "update_time": {"lte": current_ts}
            }
        }
    }

    total_count = []
    total_fans_list = []
    total_origin_list = []
    total_retweet_list = []
    total_comment_list = []
    total_uid_list = []
    total_positive_list = []
    total_negetive_list = []
    average_origin_ts = []
    average_retweet_ts = []
    feature_list = []

    results = es_prediction.search(index=task_name, doc_type=index_type_prediction_task,
                                   body=query_body)["hits"]["hits"]
    location = es_prediction.count(index=task_name, doc_type=index_type_prediction_task,
                                   body=sort_query_body)["count"]

    if len(results) != K:
        short_len = K - len(results)
        results.extend([[]] * short_len)
    print "former result: ", len(results), K
    results.reverse()  # oldest window first

    for item in results:
        if item:
            item = item["_source"]
            #total_fans_list.append(item["total_fans_number"])
            total_origin_list.append(item["origin_weibo_number"])
            total_retweet_list.append(item["retweeted_weibo_number"])
            total_comment_list.append(item["comment_weibo_number"])
            total_count.append(item["total_count"])
            total_uid_list.append(item["total_uid_count"])
            total_positive_list.append(item["positive_count"])
            total_negetive_list.append(item["negetive_count"])
            average_origin_ts.append(item["average_origin_ts"])
            average_retweet_ts.append(item["average_retweet_ts"])
        else:
            #total_fans_list.append(0)
            total_origin_list.append(0)
            total_retweet_list.append(0)
            total_comment_list.append(0)
            total_uid_list.append(0)
            total_count.append(0)
            total_positive_list.append(0)
            total_negetive_list.append(0)
            average_origin_ts.append(0)
            average_retweet_ts.append(0)

    print "total_count: ", total_count

    # build the log-scaled feature vector for the current window
    feature_list = []
    feature_list.append(math.log(int(total_retweet_list[-1] + 1)))
    feature_list.append(math.log(int(total_comment_list[-1] + 1)))
    feature_list.append(math.log(int(total_positive_list[-1] + 1)))
    feature_list.append(math.log(int(total_negetive_list[-2] + 1)))  # previous window
    feature_list.append(math.log(int(total_negetive_list[-1] + 1)))
    feature_list.append(math.log(int(total_count[-1] + 1)))
    feature_list.append(math.log(int(total_uid_list[-1] + 1)))
    if int(during) == 3 * 3600:
        feature_list.append(average_origin_ts[-1])
        feature_list.append(average_retweet_ts[-1])

    # load model and predict
    if int(during) == 3600:
        if total_count[-1] - total_count[-2] >= -0.2 * total_count[-2]:
            with open("model-up.pkl", "rb") as f:
                gbdt = pickle.load(f)
        else:
            with open("model-down.pkl", "rb") as f:
                gbdt = pickle.load(f)
    elif int(during) == 3 * 3600:
        with open("model-3.pkl", "rb") as f:
            gbdt = pickle.load(f)

    print "feature_list: ", feature_list
    pred = gbdt.predict([feature_list])  # predict expects a 2D array: one sample
    for item in pred:
        prediction_value = item
    prediction_value = math.exp(prediction_value)
    print "prediction_value: ", prediction_value

    # update scan processing
    #es_prediction.update(index=index_manage_prediction_task, doc_type=type_manage_prediction_task, \
    #        id=origin_task_name, body={"doc": {"scan_text_processing": "0"}})

    # update prediction value in es
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task, id=origin_task_name)["_source"]
    if current_ts >= int(task_detail["stop_time"]):
        task_detail["finish"] = "1"
        task_detail["processing_status"] = "0"
        # update task info
        es_prediction.index(index=index_manage_prediction_task,
                            doc_type=type_manage_prediction_task, id=origin_task_name, body=task_detail)

    # update prediction
    es_prediction.update(index=task_name, doc_type=index_type_prediction_task, id=current_ts,
                         body={"doc": {"prediction_value": prediction_value}})
    return True
import json
from elasticsearch import Elasticsearch

# es, index_sensing, type_sensing and id_sensing are expected to be provided by the
# project's configuration module (not shown in this snippet).

social_sensors = ["1738004582", "1784473157", "2286908003", "1717833412", "1314608344", "1644114654",
                  "1686546714", "1656737654", "2028810631", "1677991972", "3881380517", "1847582585", "1651428902",
                  "1420157965", "1913382117", "1884334303", "1734530730", "1893278624", "1720962692", "1700648435",
                  "3288875501", "1672519561", "2034347300", "1688864597", "2615417307", "1191965271", "1643971635",
                  "1778758223", "1216431741", "1698823241", "1977460817", "1644729004", "1231759973", "1231759973",
                  "1315591982", "1656831930", "1926909715", "1699432410", "1660452532", "1722628512", "1267454277",
                  "1640601392", "2443459455", "3921730119", "1867571077", "1718493627", "1653460650", "1737737970",
                  "2616293707", "3271121353", "1642591402", "1326410461", "1645705403", "1985593262", "1654164742",
                  "1638781994", "2993049293", "1653944045", "5977555696", "1992613670", "1726393244", "1216431741",
                  "1724367710", "1880087643", "2827102952", "1974808274", "1700720163", "3164957712", "3266943013",
                  "2127460165", "2083844833", "5305757517", "2803301701", "2656274875", "1618051664", "1974576991",
                  "1642512402", "1649173367", "1658388624", "1697601814", "1703371307", "1638782947", "1402977920",
                  "1893801487", "2108053230", "1649469284", "1975995305", "2810373291", "1749990115", "1663937380",
                  "1497087080", "1652484947", "2162541102", "2462605080", "1650111241", "1265998927", "1698857957",
                  "1887790981", "1698233740", "3712035812", "5044281310", "1701401324", "1571497285", "1635764393"]

user = "******"

task_detail = dict()
task_detail["task_name"] = id_sensing
task_detail["remark"] = "感知热门事件"  # "sense trending events"
task_detail["social_sensors"] = json.dumps(list(social_sensors))
task_detail["history_status"] = json.dumps([])

print es.index(index=index_sensing, doc_type=type_sensing, id=id_sensing, body=task_detail)
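# The sensor list above contains a few repeated uids (e.g. "1231759973" and "1216431741"
# each appear twice). If deduplication is wanted before the list is stored, an
# order-preserving helper such as the hypothetical one below could be applied to
# social_sensors first; it is a sketch and not part of the original registration script.
def dedup_preserve_order(uids):
    seen = set()
    unique = []
    for uid in uids:
        if uid not in seen:
            seen.add(uid)
            unique.append(uid)
    return unique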
def social_sensing(task_detail):
    # task name, sensors, end timestamp, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    ts = int(task_detail[2])

    wb = Workbook()
    ws = wb.create_sheet()

    print ts2date(ts)

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)

    # original / retweeted weibo mids from the preceding time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original / retweeted weibo mids in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)

    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)

    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))

    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibo
    all_retweeted_list = list(set(all_retweeted_list))

    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count retweets/comments on these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    """
    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
        # match important users in the portrait index by uid_list
        if important_uid_list:
            important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
        else:
            important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list
    """

    # sensing: the detected events come from all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()            # text content
    mid_value = dict()            # per-mid topic value
    duplicate_dict = dict()       # duplication map
    portrait_dict = dict()        # background (portrait) info
    classify_text_dict = dict()   # texts for classification
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)

        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']

            tmp_sensitive_warning = ""
            text_dict = dict()            # text content
            mid_value = dict()            # per-mid topic value
            duplicate_dict = dict()       # duplication map
            portrait_dict = dict()        # background (portrait) info
            classify_text_dict = dict()   # texts for classification
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()

            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # classify
                mid_value = dict()
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    #print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid: value
                        mid_value[k] = topic_value_dict[v[0]]

            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['timestamp'] = ts

    # store results of the current window in ES
    es_prediction.index(index=index_sensing_task, doc_type=type_sensing_task, id=ts, body=results)

    #print results
    #temp_titles = list(results.keys())
    #temp_results = list(results.values())
    #ws.append(temp_titles)
    #ws.append(temp_results)
    #wb.save('./temp/temp'+str(ts)+'.xlsx')

    # find and display economics-related weibo
    #eco_mid_list = get_economics_mids(mid_value)
    #size = 10
    #get_origin_weibo_detail(ts, size, 'retweeted')
    #print eco_mid_list
    #eco_weibos = get_weibo_content(index_list, eco_mid_list)
    #print eco_weibos
    #eco_content = eco_weibos['_source']['text']
    #weibo_content = ''
    #for aaa in eco_weibos:
    #    weibo_content += aaa['_source']['text']+'\n'
    #save_results(weibo_content, ts)

    return "1"
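# The "filtered" query used in social_sensing() is Elasticsearch 1.x/2.x syntax; it was
# deprecated in 2.0 and removed in 5.0. If the cluster is ever upgraded, the equivalent
# bool/filter form would look roughly like the sketch below (a hypothetical helper, not
# referenced elsewhere in this code).
def build_mid_filter_query(mid_list, size=5000):
    return {
        "query": {
            "bool": {
                "filter": {"terms": {"mid": mid_list}}
            }
        },
        "size": size
    }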
def rank_predict(event, start_ts, end_ts):
    feature_list = feature_compute(event, start_ts, end_ts)
    print 'feature_list:::::', feature_list

    feature_list_gbdt = []
    for i in range(len(feature_list)):
        # flatten the composite features into scalars (extend)
        if i == 15:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'type::', type(feature_list[i])
            print 'feature_list[i][at_0]::', feature_list[i]['at_0']
            feature_list_gbdt.append(feature_list[i]['at_0'])
            feature_list_gbdt.append(feature_list[i]['at_1'])
            feature_list_gbdt.append(feature_list[i]['at_2'])
            feature_list_gbdt.append(feature_list[i]['at>3'])
        elif i == 16:
            feature_list[i] = json.loads(json.dumps(feature_list[i]))
            print 'feature_list[i]:::', feature_list[i]
            feature_list_gbdt.append(feature_list[i][0])
            feature_list_gbdt.append(feature_list[i][1])
            feature_list_gbdt.append(feature_list[i][2])
            feature_list_gbdt.append(feature_list[i][3])
        else:
            feature_list_gbdt.append(feature_list[i])
    print 'feature_list_gbdt:::::', feature_list_gbdt

    # load the weibo-count model
    with open("0305_macro-prediction-weibos-value.pkl", "rb") as f:
        gbdt = pickle.load(f)
    pred = gbdt.predict([feature_list_gbdt])  # predict expects a 2D array: one sample
    for item in pred:
        predict_weibo_value = item

    # load the user-count model
    with open("0305_macro-prediction-uids-value.pkl", "rb") as f:
        gbdt = pickle.load(f)
    pred = gbdt.predict([feature_list_gbdt])
    for item in pred:
        predict_user_value = item
    predict_rank = get_rank(predict_user_value)

    ## store into the event info table
    #for i in range(len(feature_list)):
    feature_results = {}
    feature_results['event'] = event
    '''
    feature_results['topic_field'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = feature_list[15]
    feature_results['event_uid_count'] = feature_list[16]
    feature_results['event_trend_delta'] = feature_list[17]
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    #feature_results['topic_field'] = feature_list[0]
    feature_results['uid_count'] = feature_list[0]
    feature_results['total_num'] = feature_list[1]
    feature_results['total_user_fans'] = feature_list[2]
    feature_results['total_comment'] = feature_list[3]
    feature_results['total_retweet'] = feature_list[4]
    feature_results['total_sensitive'] = feature_list[5]
    feature_results['total_sensitive_ratio'] = feature_list[6]
    feature_results['total_negtive'] = feature_list[7]
    feature_results['total_important_user'] = feature_list[8]
    feature_results['total_origin_type'] = feature_list[9]
    feature_results['origin_ratio'] = feature_list[10]
    feature_results['total_retweet_type'] = feature_list[11]
    feature_results['retweet_ratio'] = feature_list[12]
    feature_results['total_comment_type'] = feature_list[13]
    feature_results['comment_ratio'] = feature_list[14]
    feature_results['at_count'] = json.dumps(feature_list[15])
    feature_results['event_trend_delta'] = json.dumps(feature_list[16])
    feature_results['predict_weibo_value'] = predict_weibo_value
    feature_results['predict_user_value'] = predict_user_value
    feature_results['predict_rank'] = predict_rank
    feature_results['update_time'] = time.time()
    '''
    save_event_info_results(event, topic_field, total_num, total_user_fans,\
            total_comment, total_retweet, total_sensitive,\
            total_sensitive_ratio, total_negtive, total_important_user,\
            total_origin_type, origin_ratio, total_retweet_type, retweet_ratio,\
            total_comment_type, comment_ratio, at_count, event_uid_count,\
            event_trend_delta, predict_value, predict_rank, update_time)
    '''

    # update macro features & results (feature_results is kept as a dict so it can be
    # used both as a partial "doc" update and as a full index body)
    try:
        item_exists = es_prediction.get(index=index_macro_feature_result, doc_type=type_macro_feature_result,
                                        id=event)['_source']
        es_prediction.update(index=index_macro_feature_result, doc_type=type_macro_feature_result,
                             id=event, body={'doc': feature_results})
    except Exception:
        es_prediction.index(index=index_macro_feature_result, doc_type=type_macro_feature_result,
                            id=event, body=feature_results)

    # update task info: "macro_value_finish"
    task_detail = es_prediction.get(index=index_manage_prediction_task,
                                    doc_type=type_manage_prediction_task, id=event)["_source"]
    task_detail["macro_value_finish"] = '1'
    es_prediction.index(index=index_manage_prediction_task,
                        doc_type=type_manage_prediction_task, id=event, body=task_detail)

    print 'feature_results::::', feature_results
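# The get/update/index fallback at the end of rank_predict() can also be expressed as a
# single call using Elasticsearch's doc_as_upsert. A minimal sketch, assuming the same
# es_prediction client and macro-feature index; the helper name is hypothetical and is
# not used by the code above.
def upsert_macro_feature(event, feature_results):
    es_prediction.update(index=index_macro_feature_result, doc_type=type_macro_feature_result,
                         id=event, body={"doc": feature_results, "doc_as_upsert": True})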