for item in weibo_results:
    iter_uid = item['_source']['uid']
    iter_mid = item['_source']['mid']
    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
    #f.write(str(iter_text)+"\n")
    keywords_dict = json.loads(item['_source']['keywords_dict'])
    personal_keywords_dict = dict()
    for k, v in keywords_dict.iteritems():
        k = k.encode('utf-8', 'ignore')
        personal_keywords_dict[k] = v
    classify_text_dict[iter_mid] = personal_keywords_dict
    classify_uid_list.append(iter_uid)
#f.close()

if classify_text_dict:
    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
    mid_value = dict()
    #print "classify_results: ", classify_results
    for k, v in classify_results.iteritems():  # mid: category
        mid_value[k] = v[0]

for item in weibo_results:
    action = {"index": {"_id": item['_id']}}
    item['_source']['category'] = mid_value[item['_id']]
    bulk_action.extend([action, item["_source"]])
    count += 1
    if count % 1000 == 0:
        es_user_portrait.bulk(bulk_action, index=monitor_index_name, doc_type=monitor_index_type, timeout=600)
        bulk_action = []
if bulk_action:
    # flush the remaining actions (completes the truncated statement, mirroring the bulk call above)
    es_user_portrait.bulk(bulk_action, index=monitor_index_name, doc_type=monitor_index_type, timeout=600)

def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])

    print ts2date(ts)

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibo

    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count["total_count"]  # total number of weibo in the current window
    current_retweeted_count = statistics_count["retweeted"]
    current_comment_count = statistics_count["comment"]

    # PART 2
    # aggregate the distribution of positive / neutral / sad / angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait database using the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(
            index=portrait_index_name,
            doc_type=portrait_index_type,
            body={"ids": important_uid_list}
        )["docs"]
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item["found"]:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item["_id"])

    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if (mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count
                or current_total_count >= len(all_mid_list) * AVERAGE_COUNT):
            # anomaly detected
            if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly

        if (negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT
                or negetive_count >= len(all_mid_list) * AVERAGE_COUNT):
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []

    # start once an event occurs
    # if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)

        if index_list and all_mid_list:
            query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000}
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]

            tmp_sensitive_warning = ""
            text_dict = dict()            # text info
            mid_value = dict()            # topic value per text
            duplicate_dict = dict()       # duplicate mapping
            portrait_dict = dict()        # portrait (background) info
            classify_text_dict = dict()   # texts to classify
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item["_source"]["uid"]
                    iter_mid = item["_source"]["mid"]
                    iter_text = item["_source"]["text"].encode("utf-8", "ignore")
                    iter_sensitive = item["_source"].get("sensitive", 0)

                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item["_source"]["keywords_dict"])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode("utf-8", "ignore")
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

            # deduplication
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item["duplicate"]:
                        duplicate_dict[item["_id"]] = item["same_from"]

            # classification
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                mid_value = dict()
                # print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]

        if tmp_sensitive_warning:
            warning_status = signal_brust
            burst_reason += signal_sensitive_variation
        sensitive_weibo_detail = {}
        if sensitive_words_dict:
            sensitive_mid_list = sensitive_words_dict.keys()
            sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results["mid_topic_value"] = json.dumps(mid_value)
    results["duplicate_dict"] = json.dumps(duplicate_dict)
    results["sensitive_words_dict"] = json.dumps(sensitive_words_dict)
    results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail)
    results["origin_weibo_number"] = len(all_origin_list)
    results["retweeted_weibo_number"] = len(all_retweeted_list)
    results["origin_weibo_detail"] = json.dumps(origin_weibo_detail)
    results["retweeted_weibo_detail"] = json.dumps(retweeted_weibo_detail)
    results["retweeted_weibo_count"] = current_retweeted_count
    results["comment_weibo_count"] = current_comment_count
    results["weibo_total_number"] = current_total_count
    results["sentiment_distribution"] = json.dumps(sentiment_count)
    results["important_users"] = json.dumps(filter_important_list)
    results["unfilter_users"] = json.dumps(important_uid_list)
    results["burst_reason"] = tmp_burst_reason
    results["timestamp"] = ts
    # results['clustering_topic'] = json.dumps(topic_list)

    # store this window's results in ES
    doctype = create_by + "-" + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)["_source"]
        temporal_result["warning_status"] = warning_status
        temporal_result["burst_reason"] = tmp_burst_reason
        temporal_result["finish"] = finish
        temporal_result["processing_status"] = process_status
        history_status = json.loads(temporal_result["history_status"])
        history_status.append([ts, task_name, warning_status])
        temporal_result["history_status"] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"

    return "1"
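
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how the social_sensing(task_detail) above might be invoked,
# assuming the seven-element task_detail layout unpacked at the top of the function
# (task name, sensor uid list, stop time, previous warning status, creator,
# timestamp, "new" flag). All concrete values below are hypothetical.
if __name__ == "__main__":
    example_task_detail = [
        "demo_task",                    # task_name
        ["sensor_uid_1", "sensor_uid_2"],  # social_sensors (hypothetical uids)
        "1482336000",                   # stop_time
        "0",                            # forward_warning_status (hypothetical: no prior event)
        "admin",                        # create_by
        1482249600,                     # ts
        0,                              # new: 0 -> update the existing manage-task record
    ]
    print social_sensing(example_task_detail)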

def social_sensing():
    all_fid_list, end_ts = count_statis()
    if S_TYPE == 'test':
        all_fid_list = ALL_FID_LIST

    index_list = []
    for i in range(7):
        timestamp = end_ts - i * DAY
        flow_text_index_name = flow_text_index_name_pre + ts2datetime(timestamp)
        index_list.append(flow_text_index_name)
    #index_list = [flow_text_index_name_pre+date_1, flow_text_index_name_pre+date_2]
    print 'index_list...', index_list

    # sensed events, all_fid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()            # text info
    fid_value = dict()            # topic value per text
    duplicate_dict = dict()       # duplicate mapping
    portrait_dict = dict()        # portrait (background) info
    classify_text_dict = dict()   # texts to classify
    classify_uid_list = []
    classify_fid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}
    all_text_dict = dict()
    fid_ts_dict = dict()          # publish timestamp of each text

    # start once an event occurs
    #if 1:
    if index_list and all_fid_list:
        query_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "terms": {"fid": all_fid_list}
                    }
                }
            },
            "size": 5000
        }
        search_results = es.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
        print "search fid len: ", len(search_results)

        if search_results:
            for item in search_results:
                iter_uid = item['_source']['uid']
                iter_fid = item['_source']['fid']
                fid_ts_dict[iter_fid] = item["_source"]["timestamp"]
                iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                iter_sensitive = item['_source'].get('sensitive', 0)
                tmp_text = get_weibo(item['_source'])
                all_text_dict[iter_fid] = tmp_text

                duplicate_text_list.append({"_id": iter_fid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})

                if iter_sensitive:
                    tmp_sensitive_warning = signal_sensitive_variation  # posts containing sensitive words
                    sensitive_words_dict[iter_fid] = iter_sensitive

                keywords_dict = json.loads(item['_source']['keywords_dict'])
                personal_keywords_dict = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[iter_fid] = personal_keywords_dict
                #classify_uid_list.append(iter_uid)
                classify_fid_list.append(iter_fid)

        # deduplication
        print "start duplicate"
        if duplicate_text_list:
            dup_results = duplicate(duplicate_text_list)
            for item in dup_results:
                if item['duplicate']:
                    duplicate_dict[item['_id']] = item['same_from']

        # classification
        print "start classify"
        fid_value = dict()
        if classify_text_dict:
            #classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
            classify_results = topic_classfiy(classify_fid_list, classify_text_dict)
            #print "classify_results: ", classify_results
            for k, v in classify_results.iteritems():  # fid:value
                #fid_value[k] = topic_value_dict[v[0]]
                fid_value[k] = v[0]

    # organize data
    fid_list = all_text_dict.keys()
    print "final fid:", len(fid_list)
    print "intersection: ", len(set(fid_list) & set(all_fid_list))

    bulk_action = []
    count = 0
    #social_sensing_index_name = "fb_social_sensing_text_" + ts2datetime(end_ts)
    social_sensing_index_name = "fb_social_sensing_text"
    mappings_social_sensing_text(social_sensing_index_name)
    for fid in fid_list:
        iter_dict = dict()
        if duplicate_dict.has_key(fid):
            iter_dict["duplicate"] = duplicate_dict[fid]
        else:
            iter_dict["duplicate"] = ""
        iter_dict["compute_status"] = 0  # not yet computed
        iter_dict["topic_field"] = fid_value[fid]
        iter_dict["detect_ts"] = end_ts
        #iter_dict["xnr_user_no"] = xnr_user_no
        iter_dict.update(all_text_dict[fid])
        count += 1
        print 'iter_dict:::', iter_dict
        # _id = xnr_user_no + '_' + fid
        bulk_action.extend([{"index": {"_id": fid}}, iter_dict])
        if count % 500 == 0:
            es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600)
            bulk_action = []

    if bulk_action:
        es.bulk(bulk_action, index=social_sensing_index_name, doc_type="text", timeout=600)

    return "1"

def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    create_by = task_detail[3]
    ts = int(task_detail[4])

    print ts2date(ts)

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibo

    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait database using the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list

    # sensing decision
    finish = unfinish_signal  # "0"
    process_status = "1"

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, all_mid_list
    sensitive_text_list = []

    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)

        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']

            tmp_sensitive_warning = ""
            text_dict = dict()            # text info
            mid_value = dict()            # topic value per text
            duplicate_dict = dict()       # duplicate mapping
            portrait_dict = dict()        # portrait (background) info
            classify_text_dict = dict()   # texts to classify
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

            # deduplication
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']

            # classification
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                mid_value = dict()
                #print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]

        sensitive_weibo_detail = {}
        if sensitive_words_dict:
            sensitive_mid_list = sensitive_words_dict.keys()
            sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)

    # store this window's results in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)

    return "1"

def social_sensing(task_detail):
    '''
    with open("prediction_uid.pkl", "r") as f:
        uid_model = pickle.load(f)
    with open("prediction_weibo.pkl", "r") as f:
        weibo_model = pickle.load(f)
    '''
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    #ts = int(task_detail[2])
    ts = float(task_detail[2])
    #xnr_user_no = task_detail[3]

    print ts2date(ts)

    index_list = []
    important_words = []
    datetime_1 = ts2datetime(ts)
    index_name_1 = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index=index_name_1)
    if exist_es:
        index_list.append(index_name_1)
    datetime_2 = ts2datetime(ts-DAY)
    index_name_2 = flow_text_index_name_pre + datetime_2
    exist_es = es_text.indices.exists(index=index_name_2)
    if exist_es:
        index_list.append(index_name_2)
    if es_text.indices.exists(index=flow_text_index_name_pre+ts2datetime(ts-2*DAY)):
        index_list.append(flow_text_index_name_pre+ts2datetime(ts-2*DAY))

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list, forward_1 = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list, current_1 = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list, current_3 = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibo
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # query retweet and comment counts within the current window, aggregated by message_type
    #statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        #origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            retweet_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
            tmp["comment"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)

    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted"] = retweet_count
tmp["comment"] = comment_count retweeted_weibo_detail[mid] = tmp #retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail) #current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 #current_retweeted_count = statistics_count['retweeted'] #current_comment_count = statistics_count['comment'] #all_mid_list = list(set(all_origin_list[:100]) | set(all_retweeted_list[:100])) # 感知到的事, all_mid_list sensitive_text_list = [] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() sensitive_weibo_detail = {} trendline_dict = dict() all_text_dict = dict() # 有事件发生时开始 if 1: print "index_list:", index_list if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 5000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] print "search mid len: ", len(search_results) tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 #classify_uid_list = [] classify_mid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() mid_ts_dict = dict() # 文本发布时间 uid_prediction_dict = dict() weibo_prediction_dict = dict() trendline_dict = dict() feature_prediction_list = [] # feature mid_prediction_list = [] # dui ying mid if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_mid = item['_source']['mid'] mid_ts_dict[iter_mid] = item["_source"]["timestamp"] iter_text = item['_source']['text'].encode('utf-8', 'ignore') iter_sensitive = item['_source'].get('sensitive', 0) tmp_text = get_weibo(item['_source']) all_text_dict[iter_mid] = tmp_text duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text.decode("utf-8",'ignore')}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict #classify_uid_list.append(iter_uid) classify_mid_list.append(iter_mid) # 去重 print "start duplicate" if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 print "start classify" mid_value = dict() if classify_text_dict: #classify_results = topic_classfiy(classify_uid_list, classify_text_dict) classify_results = topic_classfiy(classify_mid_list, classify_text_dict) #print "classify_results: ", classify_results for k,v in classify_results.iteritems(): # mid:value #mid_value[k] = topic_value_dict[v[0]] mid_value[k]=v[0] #feature_list = organize_feature(k, mid_ts_dict[k]) #feature_prediction_list.append(feature_list) # feature list #mid_prediction_list.append(k) # corresponding # prediction """ print "start prediction" weibo_prediction_result = weibo_model.predict(feature_prediction_list) uid_prediction_result = uid_model.predict(feature_prediction_list) for i in range(len(mid_prediction_list)): if i % 
                if i % 100 == 0:
                    print i
                uid_prediction_dict[mid_prediction_list[i]] = uid_prediction_result[i]
                weibo_prediction_dict[mid_prediction_list[i]] = weibo_prediction_result[i]
                tmp_trendline = trendline_list(mid_prediction_list[i], weibo_prediction_result[i], mid_ts_dict[mid_prediction_list[i]])
                trendline_dict[mid_prediction_list[i]] = tmp_trendline
            """

    # organize data
    mid_list = all_text_dict.keys()
    print "final mid:", len(mid_list)
    print "intersection: ", len(set(mid_list) & set(all_mid_list))

    bulk_action = []
    count = 0
    for mid in mid_list:
        iter_dict = dict()
        if origin_weibo_detail.has_key(mid):
            iter_dict.update(origin_weibo_detail[mid])
            iter_dict["type"] = 1
        elif retweeted_weibo_detail.has_key(mid):
            iter_dict.update(retweeted_weibo_detail[mid])
            iter_dict["type"] = 3
        else:
            iter_dict["retweeted"] = 0
            iter_dict["comment"] = 0
            print "mid in all_mid_list: ", mid in set(all_mid_list)
        #iter_dict["trendline"] = json.dumps(trendline_dict[mid])
        if duplicate_dict.has_key(mid):
            iter_dict["duplicate"] = duplicate_dict[mid]
        else:
            iter_dict["duplicate"] = ""
        #iter_dict["uid_prediction"] = uid_prediction_dict[mid]
        #iter_dict["weibo_prediction"] = weibo_prediction_dict[mid]
        iter_dict["compute_status"] = 0  # not yet computed
        iter_dict["topic_field"] = mid_value[mid]
        iter_dict["detect_ts"] = ts
        #iter_dict["xnr_user_no"] = xnr_user_no
        iter_dict.update(all_text_dict[mid])
        count += 1
        #print 'iter_dict:::', iter_dict
        # _id = xnr_user_no + '_' + mid
        _id = mid
        bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
        if count % 500 == 0:
            es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600)
            bulk_action = []

    if bulk_action:
        es_xnr.bulk(bulk_action, index="social_sensing_text", doc_type="text", timeout=600)

    return "1"
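
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of driving the social_sensing(task_detail) variant above,
# assuming the three-element task_detail it unpacks: task name, sensor uid list,
# and a timestamp that is cast to float. All values are hypothetical.
if __name__ == "__main__":
    import time
    demo_task_detail = ["demo_task", ["sensor_uid_1", "sensor_uid_2"], str(int(time.time()))]
    print social_sensing(demo_task_detail)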

def social_sensing(task_detail):
    # task name, sensors, task creation time (start of the sensing window)
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    ts = float(task_detail[2])
    print 'sensing_start_time:', ts2date(ts)

    index_list = ["flow_text_gangdu"]  # index to sense; adjust later as needed

    # original / retweeted weibo lists within the previous two days (excluding the current hour)
    forward_origin_weibo_list, forward_1 = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list, forward_3 = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original / retweeted weibo lists within the past hour
    current_origin_weibo_list, current_1 = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_weibo_list, current_3 = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_origin_weibo_list)
    all_mid_list.extend(current_retweeted_weibo_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_origin_weibo_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_weibo_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root_mid of retweeted weibo
    all_retweeted_list = list(set(all_retweeted_list))

    all_mid_list = filter_mid(all_mid_list)
    all_origin_list = filter_mid(all_origin_list)
    all_retweeted_list = filter_mid(all_retweeted_list)

    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", len(all_origin_list)
    print "all_retweeted_list", len(all_retweeted_list)

    # query retweet and comment counts within the current window, aggregated by message_type
    if all_origin_list:
        origin_weibo_detail = dict()
        for mid in all_origin_list:
            retweet_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted_stat"] = retweet_count
            tmp["comment_stat"] = comment_count
            origin_weibo_detail[mid] = tmp
    else:
        origin_weibo_detail = {}
    print "len(origin_weibo_detail): ", len(origin_weibo_detail)

    if all_retweeted_list:
        retweeted_weibo_detail = dict()
        for mid in all_retweeted_list:
            retweet_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 3}}]}}})["count"]
            comment_count = es_flow_text.count(index=index_list, doc_type="text", body={"query": {"bool": {"must": [{"term": {"root_mid": mid}}, {"term": {"message_type": 2}}]}}})["count"]
            tmp = dict()
            tmp["retweeted_stat"] = retweet_count
            tmp["comment_stat"] = comment_count
            retweeted_weibo_detail[mid] = tmp
    else:
        retweeted_weibo_detail = {}
    print "len(retweeted_weibo_detail): ", len(retweeted_weibo_detail)

    # once an event occurs, query the full all_mid_list (one hour + two days)
    if index_list and all_mid_list:
        query_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "terms": {"mid": all_mid_list}
                    }
                }
            },
            "size": 5000
        }
        search_results = es_flow_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
        print "search mid len: ", len(search_results)

        all_text_dict = dict()        # sensed events, all_mid_list
        mid_value = dict()            # topic value per text
        duplicate_dict = dict()       # duplicate mapping
        classify_text_dict = dict()   # texts to classify
        sensitive_words_dict = dict()
        duplicate_text_list = []
        classify_mid_list = []

        if search_results:
            for item in search_results:
                iter_mid = item['_source']['mid']
                iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                iter_sensitive = item['_source'].get('sensitive', 0)
                tmp_text = get_weibo(item['_source'])
                all_text_dict[iter_mid] = tmp_text

                duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})

                if iter_sensitive:
                    sensitive_words_dict[iter_mid] = iter_sensitive

                keywords_dict = json.loads(item['_source']['keywords_dict'])
                personal_keywords_dict = dict()
                for k, v in keywords_dict.iteritems():
                    k = k.encode('utf-8', 'ignore')
                    personal_keywords_dict[k] = v
                classify_text_dict[iter_mid] = personal_keywords_dict
                classify_mid_list.append(iter_mid)

        # deduplication
        print "start duplicate:", '----'
        if duplicate_text_list:
            dup_results = duplicate(duplicate_text_list)
            for item in dup_results:
                if item['duplicate']:
                    duplicate_dict[item['_id']] = item['same_from']
        print '----', "duplicate finished:"

        # classification
        print "start classify:", '----'
        mid_value = dict()
        if classify_text_dict:
            classify_results = topic_classfiy(classify_mid_list, classify_text_dict)
            for k, v in classify_results.iteritems():  # mid:value
                mid_value[k] = v[0]
        print '----', "classify finished:"

        mid_list = all_text_dict.keys()
        mid_duplicate_list = set(duplicate_dict.keys()) | set(duplicate_dict.values())
        intersection_list = set(mid_list) - (set(duplicate_dict.keys()) | set(duplicate_dict.values()))
        print "final mid:", len(mid_list)
        print "duplicate mid:", len(mid_duplicate_list)
        print "duplicate:", len(set(duplicate_dict.values()))
        print "single: ", len(intersection_list)

        # invert the dict: source mid -> list of duplicate mids
        reverse_duplicate_dict = defaultdict(list)
        for k, v in duplicate_dict.iteritems():
            reverse_duplicate_dict[v].append(k)
        for term in intersection_list:
            reverse_duplicate_dict[term] = [term]

        bulk_action = []
        count = 0
        for id in reverse_duplicate_dict.keys():
            iter_dict = dict()
            inter_mid_list = []
            inter_mid_list.append(id)
            inter_mid_list.extend(reverse_duplicate_dict[id])

            # find the initiator (earliest post in the cluster)
            timestamp_list = []
            for mid in inter_mid_list:
                timestamp_list.append(all_text_dict[mid]['timestamp'])
            mid_initial = inter_mid_list[timestamp_list.index(min(timestamp_list))]

            # find the pusher (most retweeted post in the cluster)
            push_list = []
            for mid in inter_mid_list:
                if origin_weibo_detail.has_key(mid):
                    retweeted_stat = origin_weibo_detail[mid]['retweeted_stat']
                elif retweeted_weibo_detail.has_key(mid):
                    retweeted_stat = retweeted_weibo_detail[mid]['retweeted_stat']
                else:
                    retweeted_stat = 0
                push_list.append(retweeted_stat)
            mid_push = inter_mid_list[push_list.index(max(push_list))]

            mid = mid_push
            if origin_weibo_detail.has_key(mid):
                iter_dict.update(origin_weibo_detail[mid])  # dict.update merges the detail key/value pairs into iter_dict
                iter_dict["type"] = 1
            elif retweeted_weibo_detail.has_key(mid):
                iter_dict.update(retweeted_weibo_detail[mid])
                iter_dict["type"] = 0
            else:
                iter_dict["retweeted_stat"] = 0
                iter_dict["comment_stat"] = 0
                iter_dict["type"] = -1
            # iter_dict["name"] = ''
            # iter_dict["heat"] = iter_dict["retweeted_stat"] + iter_dict["comment_stat"]
            iter_dict["status"] = 0  # whether added to monitoring
            iter_dict["delete"] = 0  # whether deleted
            iter_dict["topic_field"] = eng2chi_dict[mid_value[mid]]  # topic label
            iter_dict["detect_ts"] = ts  # sensing start time
            iter_dict["initiator"] = all_text_dict[mid_initial]['uid']  # initiator
            iter_dict["push"] = all_text_dict[mid_push]['uid']  # pusher
            iter_dict.update(all_text_dict[mid])

            count += 1
            _id = mid
            bulk_action.extend([{"index": {"_id": _id}}, iter_dict])
            if count % 500 == 0:
                es_sensor.bulk(bulk_action, index=index_content_sensing, doc_type=type_content_sensing, timeout=600)
                bulk_action = []

        if bulk_action:
            es_sensor.bulk(bulk_action, index=index_content_sensing, doc_type=type_content_sensing)

    return "1"
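
# --- Illustrative sketch (not part of the original module) ---
# The loop above regroups the deduplication output: duplicate_dict maps each
# duplicate mid to the mid it duplicates, so inverting it gives one cluster per
# source mid; within a cluster the earliest post is taken as the initiator and
# the most-retweeted post as the pusher. The helper below reproduces that idea
# on hypothetical sample data and is only meant as a reading aid.
def _duplicate_cluster_demo():
    from collections import defaultdict
    sample_duplicate_dict = {"m2": "m1", "m3": "m1"}      # m2 and m3 duplicate m1 (hypothetical)
    sample_timestamps = {"m1": 100, "m2": 90, "m3": 120}  # publish timestamps (hypothetical)
    sample_retweets = {"m1": 5, "m2": 40, "m3": 2}        # retweeted_stat per mid (hypothetical)

    clusters = defaultdict(list)
    for dup_mid, src_mid in sample_duplicate_dict.iteritems():
        clusters[src_mid].append(dup_mid)

    for src_mid, members in clusters.iteritems():
        group = [src_mid] + members
        initiator = min(group, key=lambda m: sample_timestamps[m])  # earliest post
        pusher = max(group, key=lambda m: sample_retweets[m])       # most retweeted post
        print "cluster:", group, "initiator:", initiator, "pusher:", pusher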

def social_sensing(task_detail):
    # task name, sensors, stop time, previous status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    ts = int(task_detail[2])

    wb = Workbook()
    ws = wb.create_sheet()

    print ts2date(ts)

    # PART 1
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_origin_list = list(set(all_origin_list))
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibo
    all_retweeted_list = list(set(all_retweeted_list))

    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts within the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']  # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    """
    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait database using the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list
    """

    # sensing decision
    # sensed events, all_mid_list
    sensitive_text_list = []
    tmp_sensitive_warning = ""
    text_dict = dict()            # text info
    mid_value = dict()            # topic value per text
    duplicate_dict = dict()       # duplicate mapping
    portrait_dict = dict()        # portrait (background) info
    classify_text_dict = dict()   # texts to classify
    classify_uid_list = []
    duplicate_text_list = []
    sensitive_words_dict = dict()
    sensitive_weibo_detail = {}

    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)

        if index_list and all_mid_list:
            query_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "terms": {"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']

            tmp_sensitive_warning = ""
            text_dict = dict()            # text info
            mid_value = dict()            # topic value per text
            duplicate_dict = dict()       # duplicate mapping
            portrait_dict = dict()        # portrait (background) info
            classify_text_dict = dict()   # texts to classify
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text.decode("utf-8", 'ignore')})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

            # deduplication
            if duplicate_text_list:
                dup_results = duplicate(duplicate_text_list)
                for item in dup_results:
                    if item['duplicate']:
                        duplicate_dict[item['_id']] = item['same_from']

            # classification
            mid_value = dict()
            if classify_text_dict:
                classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                #print "classify_results: ", classify_results
                for k, v in classify_results.iteritems():  # mid:value
                    mid_value[k] = topic_value_dict[v[0]]

        if sensitive_words_dict:
            sensitive_mid_list = sensitive_words_dict.keys()
            sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['timestamp'] = ts

    # store this window's results in ES
    es_prediction.index(index=index_sensing_task, doc_type=type_sensing_task, id=ts, body=results)

    #print results
    #temp_titles = list(results.keys())
    #temp_results = list(results.values())
    #ws.append(temp_titles)
    #ws.append(temp_results)
    #wb.save('./temp/temp'+str(ts)+'.xlsx')

    # find and display economics-related weibo
    #eco_mid_list = get_economics_mids(mid_value)
    #size = 10
    #get_origin_weibo_detail(ts, size, 'retweeted')
    #print eco_mid_list
    #eco_weibos = get_weibo_content(index_list, eco_mid_list)
    #print eco_weibos
    #eco_content = eco_weibos['_source']['text']
    #weibo_content = ''
    #for aaa in eco_weibos:
    #    weibo_content += aaa['_source']['text']+'\n'
    #save_results(weibo_content, ts)

    return "1"