def create_task_list(given_ts):
    # 1. search tasks from manage_sensing_task
    # 2. push each task onto the redis list "task_name"
    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600
    query_body = {"query": {"match_all": {}}}
    search_results = es.search(index=index_sensing, doc_type=type_sensing,
                               body=query_body)['hits']['hits']
    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])                     # task name
            task.append(json.loads(item['social_sensors']))    # social sensors
            #task.append(now_ts)
            task.append(given_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1
    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log

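# The datehour2ts(ts2datehour(...)) round-trip above truncates a Unix
# timestamp to the top of its hour, so time.time() - 3600 yields the
# previous full hour. A minimal illustrative stand-in, assuming the
# helpers round down to whole hours and the timezone offset is a whole
# number of hours (this helper is not part of the source):
def truncate_to_hour(ts):
    # e.g. truncate_to_hour(1482861650) -> 1482861600
    return int(ts) - int(ts) % 3600
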
def create_task():
    ts = time.time()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(ts))
    else:
        current_ts = 1482861600
    query_body = {
        "query": {
            "term": {"finish": "0"}
        },
        "size": 10000
    }
    results = es_prediction.search(index=index_manage_prediction_task,
                                   doc_type=type_manage_prediction_task,
                                   body=query_body)["hits"]["hits"]
    for item in results:
        print item
        task_name = item["_source"]["pinyin_task_name"]
        stop_time = item["_source"]["stop_time"]
        print stop_time, current_ts
        if stop_time < current_ts:
            # past its stop time: mark the task finished
            es_prediction.update(index=index_manage_prediction_task,
                                 doc_type=type_manage_prediction_task,
                                 id=task_name,
                                 body={"doc": {"macro_trendline_finish": "1",
                                               "finish": "1"}})
        else:
            # still running: queue it for trendline computation
            r_trendline.lpush(task_trendline, task_name)

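# create_task() above is the producer half of a simple Redis work
# queue: live task names are LPUSHed onto `task_trendline`, and
# task_list() (later in this section) drains them with RPOP, giving
# FIFO order. A hedged round-trip sketch with an illustrative task name:
#   r_trendline.lpush(task_trendline, "some_pinyin_task")   # producer
#   r_trendline.rpop(task_trendline)   # consumer -> "some_pinyin_task"
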
def create_task():
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"term": {"finish": "0"}}
                ]
            }
        },
        "size": 10000
    }
    es_results = es_prediction.search(index=index_manage_interfere_task,
                                      doc_type=type_manage_interfere_task,
                                      body=query_body)["hits"]["hits"]
    task_list = []
    if int(RUN_TYPE) == 1:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482681600 + 18 * 3600
    for item in es_results:
        tmp = []
        task_detail = item["_source"]
        task_name = task_detail['pinyin_task_name']
        update_time = task_detail["update_time"]
        sti_during = task_detail["stimulation_during"]
        stop_time = task_detail["stop_time"]
        if int(RUN_TYPE) == 1:
            if stop_time < current_ts:
                # past its stop time: mark the task finished (mirrors the
                # trendline scheduler above) and skip queueing it
                es_prediction.update(index=index_manage_interfere_task,
                                     doc_type=type_manage_interfere_task,
                                     id=task_name,
                                     body={"doc": {"finish": "1"}})
                continue
        tmp.append(task_name)
        tmp.append(task_detail["stop_time"])
        tmp.append(task_detail["scan_text_finish"])
        tmp.append(current_ts)
        if current_ts - update_time >= sti_during:
            r_stimulation.lpush(task_stimulation, json.dumps(tmp))
            # update: processing status
            es_prediction.update(index=index_manage_interfere_task,
                                 doc_type=type_manage_interfere_task,
                                 id=task_name,
                                 body={"doc": {"stimulation_processing_status": "1"}})
    return True

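# The stimulation payload pushed above is a JSON array of the form
# [task_name, stop_time, scan_text_finish, current_ts]. A minimal
# consumer-side sketch under that assumption (the function name is
# illustrative, not from the source):
def pop_stimulation_task():
    raw = r_stimulation.rpop(task_stimulation)
    if not raw:
        return None
    task_name, stop_time, scan_text_finish, current_ts = json.loads(raw)
    return task_name, stop_time, scan_text_finish, current_ts
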
def create_task_list():
    # 1. search tasks from manage_sensing_task
    # 2. push each task onto the redis list "task_name"
    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE)
    else:
        now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = " ".join([file_path, "--start:"])
    print print_log
    query_body = {"query": {"match_all": {}}}
    search_results = es.search(index=index_manage_sensing, doc_type=type_manage_sensing,
                               body=query_body)['hits']['hits']
    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])    # task name
            try:
                task.append(json.loads(item['social_sensors']))  # social sensors
            except:
                task.append(item['social_sensors'])  # already deserialized
            task.append(now_ts)
            r.lpush("task_name", json.dumps(task))
            count += 1
    print 'task_count_sum:', count

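# Both create_task_list variants above serialize a task as the JSON
# array [task_name, social_sensors, timestamp] and LPUSH it onto the
# Redis list "task_name". A matching consumer sketch, assuming the
# same `r` client (the worker function is illustrative, not from the
# source):
def pop_sensing_task():
    raw = r.rpop("task_name")
    if not raw:
        return None
    task_name, social_sensors, task_ts = json.loads(raw)
    return task_name, social_sensors, task_ts
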
def dispose_data(task_name, current_ts):
    es_result = es_prediction.get(index=index_manage_prediction_task,
                                  doc_type=type_manage_prediction_task,
                                  id=task_name)["_source"]
    macro_during = es_result['macro_during']
    start_ts = datehour2ts(ts2datehour(es_result["submit_time"]))
    task_start_ts = start_ts
    end_ts = datehour2ts(ts2datehour(es_result["stop_time"]))
    index_micro = "micro_prediction_" + task_name
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "range": {
                        "update_time": {"lte": current_ts}
                    }
                }
            }
        },
        "size": 10000,
        "sort": {"update_time": {"order": "asc"}}
    }
    micro_results = es_prediction.search(index=index_micro, doc_type="micro_task",
                                         body=query_body)["hits"]["hits"]
    total_list = []
    for item in micro_results:
        total_list.append(item["_source"]["total_count"])  # weibo count per interval
    total_len = (end_ts - start_ts) / macro_during
    times = int(macro_during) / 3600
    length = len(total_list) / times
    # aggregate hourly micro counts into macro_during-sized buckets
    adjust_list = []
    time_list = []
    count = 0
    i = 0
    for item in total_list:
        count += item
        i += 1
        if i % times == 0:
            if start_ts <= current_ts:
                adjust_list.append(count)
                count = 0
                time_list.append(start_ts)
            else:
                break
        start_ts += 3600
    # full expected time axis for the trendline
    total_time_list = []
    for i in range(total_len):
        total_time_list.append(task_start_ts + i * macro_during)
    left_time = list(set(total_time_list) - set(time_list))
    left_time = sorted(left_time)
    return adjust_list, total_len, time_list, left_time

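# A worked sketch of the aggregation in dispose_data, assuming
# macro_during is a whole multiple of 3600: hourly micro counts are
# summed in groups of `times = macro_during / 3600`. The helper below
# is illustrative, not part of the source.
def bucket_hourly_counts(hourly_counts, macro_during):
    times = int(macro_during) / 3600
    # e.g. bucket_hourly_counts([4, 6, 5, 10, 12, 8], 10800) -> [15, 30]
    return [sum(hourly_counts[i:i + times])
            for i in range(0, len(hourly_counts) - times + 1, times)]
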
def trendline_list(mid, total_value, mid_ts):
    if RUN_TYPE:
        ts = time.time()
    else:
        ts = datetime2ts("2016-11-20")
    """
    index_list = []
    for i in range(diffusion_time):
        index_list.append("flow_text_" + ts2datetime(ts - i * 24 * 3600))
    result = dict()
    for iter_index in index_list:
        if not es.indices.exists(index=iter_index):
            continue
        try:
            result = es.get(index=iter_index, doc_type="text", id=mid)["_source"]
            break
        except:
            pass
    if not result:
        return []
    """
    nn = 24 * 3600 / diffusion_time_interval  # intervals per day
    current_list = []
    rising_list = []
    falling_list = []
    exist_time_list = []
    total_time_list = []
    timestamp = mid_ts
    start_ts = mid_ts
    timestamp = datehour2ts(ts2datehour(timestamp))
    for i in range(diffusion_time * nn):
        total_time_list.append(timestamp + i * diffusion_time_interval)
    # if diffusion runs past the window, return the time list as far as it goes
    while 1:
        query_body = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"root_mid": mid}},
                        {"range": {"timestamp": {
                            "gte": timestamp,
                            "lt": timestamp + diffusion_time_interval
                        }}}
                    ]
                }
            }
        }
        index_name = "flow_text_" + ts2datetime(timestamp)
        count = es.count(index=index_name, doc_type="text", body=query_body)["count"]
        current_list.append(count)
        exist_time_list.append(timestamp)
        timestamp += diffusion_time_interval
        if timestamp >= ts:
            break
    left_set = set(total_time_list) - set(exist_time_list)
    left_list = sorted(list(left_set), reverse=False)
    max_value = max(current_list)
    index_exist = len(current_list)
    value = current_list
    expected_value = total_value * 0.8 / (0.2 * nn * diffusion_time)
    if expected_value <= max_value:
        top_value = (max_value + total_value) / 2
    else:
        top_value = expected_value
    # weibo prediction
    k = 5
    h = 0.5
    peak = spd(value, h, k)
    flag = judge(peak, value)
    if len(flag) == 2:
        paras = getTwoBeauties(value, flag[0], flag[1])
        paras[-1] = diffusion_time * nn
        series = bassTwoPeaks(paras)
    else:
        paras = getSingleBeauty(value)
        paras[-1] = diffusion_time * nn
        series = bassOnePeak(paras)
    # predicted peak position
    predict_climax = series.index(max(series))
    if predict_climax > index_exist:
        predict_climax_left = predict_climax - len(current_list)
        rise_trend, fall_trend = get_trend(left_list, predict_climax_left,
                                           value[-1], top_value)
        true_climax = exist_time_list[0] + (
            exist_time_list[1] - exist_time_list[0]) * predict_climax
    else:
        top_value = value[-1]
        rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
        true_climax = exist_time_list[value.index(max(value))]
        top_value = max(value)
    results = dict()
    results["climax"] = [true_climax, top_value]
    results["rise_trend"] = rise_trend
    results["fall_trend"] = fall_trend
    new_list = []
    for i in range(len(exist_time_list)):
        new_list.append([exist_time_list[i], value[i]])
    results["exist_trend"] = new_list
    return results

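# A hedged usage sketch of trendline_list: it returns a dict holding
# the predicted peak, the remaining rise/fall trend, and the observed
# series as [timestamp, count] pairs (the mid value below is made up):
#   results = trendline_list("3600000000000000", total_value=5000,
#                            mid_ts=datetime2ts("2016-11-15"))
#   results["climax"]       # [peak_timestamp, peak_value]
#   results["rise_trend"]   # predicted points up to the peak
#   results["fall_trend"]   # predicted points after the peak
#   results["exist_trend"]  # [[ts1, count1], [ts2, count2], ...]
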
def compute_recommend_subopnion(task_detail):
    print 'start analysis......'
    task_id = task_detail['task_id'].strip('"')
    keywords_string = task_detail['keywords_string']
    keywords_list = keywords_string.split('&')  # split on '&' to get the keyword list
    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']
    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    '''
    ## focus on users followed by the current virtual user (xnr)
    if S_TYPE == 'test':
        # followers_list = get_result['followers_list']
        # nest_query_list.append({'terms': followers_list})
        print 'all users'
    else:
        get_result = es.get(index=weibo_xnr_fans_followers_index_name,
                            doc_type=weibo_xnr_fans_followers_index_type,
                            id=xnr_user_no)['_source']
        followers_list = get_result['followers_list']
        nest_query_list.append({'terms': followers_list})
    '''
    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE)
    else:
        create_time = datehour2ts(ts2datehour(time.time() - 3600))
    #get_flow_text_index_list(create_time)
    #index_name_list_list = get_flow_text_index_list(now_timestamp)
    index_name_list = get_flow_text_index_list(create_time)
    print 'index_name_list::', index_name_list
    es_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                     body={'query': {'bool': {'must': nest_query_list}},
                                           'size': MAX_SEARCH_SIZE})['hits']['hits']
    weibo_list = []  # input for content recommendation and sub-opinion analysis
    if es_results:
        for item in es_results:
            item = item['_source']
            weibo = item['text']
            weibo_list.append(weibo)

    ## content recommendation: obtain the list of recommended sentences
    print 'weibo_list::::::', weibo_list
    print 'start content recommendation......'
    if weibo_list:
        content_results = summary_main(weibo_list)
    else:
        content_results = []
    print 'start saving content recommendation results......'
    mark = save_content_recommendation_results(xnr_user_no, mid,
                                               task_id.encode('utf-8'),
                                               content_results)
    print 'mark_content:::', mark
    if mark == False:
        print 'error saving content recommendation results, push the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'content recommendation results saved......'

    ## sub-opinion analysis
    '''
    input:
        weibo_data: weibo list, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics (default 5)
    output:
        opinion_name: sub-topic name dict, {topic1: name1, topic2: name2, ...}
        word_result: sub-topic keywords, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    print 'start sub-opinion computation......'
    if weibo_list:
        opinion_name, word_result, text_list = opinion_main(weibo_list, k_cluster=5)
        sub_opinion_results = dict()
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
    else:
        sub_opinion_results = {}
    print 'start saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no, mid, task_id, sub_opinion_results)
    print 'mark_opinion:::', mark
    if mark == False:
        print 'error saving sub-opinion results, push the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'sub-opinion results saved......'

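# The sub_opinion_results dict saved above maps each sub-topic name to
# at most SUB_OPINION_WEIBO_LIMIT example texts, e.g. (illustrative
# values, not from the source):
#   {u"topic_name_1": [u"weibo text 1", u"weibo text 2"],
#    u"topic_name_2": [u"weibo text 3"]}
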
            # tail of the news-crawl loop: build a bulk index action per parsed item
            source = item["news_source"]      # news source
            date = item["news_date"].strip()  # date
            #print url, title, content, source, date
            count += 1
            print count
        except:
            continue
        # index-building code starts here
        index_dict = dict()
        index_dict["url"] = url
        index_dict["title"] = title
        index_dict["content"] = content
        index_dict["source"] = source
        try:
            index_dict["timestamp"] = datehour2ts(date)
        except:
            index_dict["timestamp"] = datetime2ts(date)
        bulk_action.extend([{"index": {"_id": url}}, index_dict])
        index_count += 1
        if index_count != 0 and index_count % 100 == 0:
            # flush a batch of 100 documents to the "news" index
            es.bulk(bulk_action, index="news", doc_type="text")
            bulk_action = []
            print "finish index: ", index_count
    if bulk_action:
        # flush the remaining partial batch
        es.bulk(bulk_action, index="news", doc_type="text")
    print "total index: ", index_count
    f.close()

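    # For reference, the Elasticsearch bulk body built above alternates
    # action and document lines; each pair looks like (values illustrative):
    #   {"index": {"_id": "http://news.example.com/123"}},
    #   {"url": "http://news.example.com/123", "title": "...",
    #    "content": "...", "source": "...", "timestamp": 1482861600}
    # Flushing every 100 documents keeps each bulk request small.
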
def task_list():
    create_task()
    if RUN_TYPE:
        current_ts = datehour2ts(ts2datehour(time.time()))
    else:
        current_ts = 1482861600
    while 1:
        task_detail = r_trendline.rpop(task_trendline)
        print task_detail
        if not task_detail:
            break
        task_name = task_detail
        # wait until the micro prediction result for this hour exists
        while 1:
            micro_index = "micro_prediction_" + task_name
            es_exist = es_prediction.exists(index=micro_index,
                                            doc_type="micro_task", id=current_ts)
            if not es_exist:
                time.sleep(60)
            else:
                break
        # obtain time series
        value, total_len, time_list, left_list = dispose_data(task_name, current_ts)
        # macro prediction result
        try:
            es_macro_result = es_prediction.get(index=index_macro_feature_result,
                                                doc_type=type_macro_feature_result,
                                                id=task_name)["_source"]
            prediction_total_value = es_macro_result["predict_weibo_value"]
            top_value = prediction_total_value * 0.8 / (0.2 * total_len)
        except:
            top_value = 0
        # known maximum value and its position
        max_exist = max(value)
        index_exist = len(value)
        if top_value < max_exist:
            top_value = 2 * max_exist
        # weibo prediction
        k = 5
        h = 0.5
        peak = spd(value, h, k)
        flag = judge(peak, value)
        if len(flag) == 2:
            print("Two peaks:")
            paras = getTwoBeauties(value, flag[0], flag[1])
            paras[-1] = total_len
            series = bassTwoPeaks(paras)
        else:
            print("Single peak:")
            paras = getSingleBeauty(value)
            paras[-1] = total_len
            series = bassOnePeak(paras)
        # predicted peak position
        predict_climax = series.index(max(series))
        if predict_climax > index_exist:
            predict_climax_left = predict_climax - len(value)
            # remaining trend: climax position, endpoint value, max value
            rise_trend, fall_trend = get_trend(left_list, predict_climax_left,
                                               value[-1], top_value)
            true_climax = time_list[0] + (time_list[1] - time_list[0]) * predict_climax
        else:
            top_value = value[-1]
            rise_trend, fall_trend = get_trend(left_list, 0, value[-1], 1)
            true_climax = time_list[value.index(max(value))]
        results = dict()
        results["climax"] = [true_climax, top_value]
        results["rise_trend"] = rise_trend
        results["fall_trend"] = fall_trend
        new_list = []
        for i in range(len(time_list)):
            new_list.append([time_list[i], value[i]])
        results["exist_trend"] = new_list
        r_trendline.set("trendline_" + task_name, json.dumps(results))
        print results

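# A minimal entry-point sketch, assuming the scheduler is invoked once
# per hour (e.g. from cron); the source does not show a __main__ block:
if __name__ == "__main__":
    task_list()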