def export_random_user(): import random query_body={ 'query':{ 'match_all':{} }, 'size':50000 } result=es_user_portrait.search(index=portrait_index_name, doc_type=portrait_index_type, body=query_body)['hits']['hits'] id_list = [user['_id'] for user in result] random.shuffle(id_list) print type(id_list), len(id_list) id_list = id_list[:9000] print len(id_list) final_results = [] for idx, uid in enumerate(id_list): try: user_bci = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type, id=uid)['_source'] user_profile = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type, id=uid)['_source'] hb = dict(user_bci.items() + user_profile.items()) final_results.append(hb) print idx, 'over!!' except: print 'not found', uid print 'final len', len(final_results) fw = file('random_user.json', 'w') fw.write(json.dumps(final_results)) fw.close()
def export_date(): query_body={ 'query':{ 'match_all':{} }, 'size':1000, 'sort':{'influence':{'order':'desc'}} } result=es_user_portrait.search(index=portrait_index_name, doc_type=portrait_index_type, body=query_body)['hits']['hits'] id_list = [user['_id'] for user in result] print len(id_list) final_results = [] for idx, uid in enumerate(id_list): print idx, 'over!!' try: user_bci = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type, id=uid)['_source'] user_profile = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type, id=uid)['_source'] hb = dict(user_bci.items() + user_profile.items()) final_results.append(hb) except: print 'not found', uid print 'final len', len(final_results) fw = file('high_influence_user.json', 'w') fw.write(json.dumps(final_results)) fw.close()
def compute_influence_num(xnr_user_no):
    """Return the xnr user's BCI influence as a percentage of the day's maximum.

    The result is in [0, 100] rounded to two decimals; any lookup failure
    (missing index, missing doc, zero max) yields 0.
    """
    uid = xnr_user_no2uid(xnr_user_no)
    if S_TYPE == 'test':
        current_time = datetime2ts(S_DATE_BCI)
        uid = S_UID
    else:
        current_time = int(time.time()) - DAY
    date_str = ts2datetime(current_time)
    # 'YYYY-MM-DD' -> 'YYYYMMDD' for the daily BCI index suffix
    compact_date = date_str[0:4] + date_str[5:7] + date_str[8:10]
    index_name = weibo_bci_index_name_pre + compact_date
    try:
        bci_xnr = es_user_portrait.get(index=index_name, doc_type=weibo_bci_index_type,
                                       id=uid)['_source']['user_index']
        top_hit = es_user_portrait.search(
            index=index_name,
            doc_type=weibo_bci_index_type,
            body={'query': {'match_all': {}}, 'sort': {'user_index': {'order': 'desc'}}},
        )['hits']['hits'][0]
        bci_max = top_hit['_source']['user_index']
        influence = round(float(bci_xnr) / bci_max * 100, 2)  # keep two decimals
    except:
        influence = 0
    return influence
def save_dg_pr_results(sorted_uids, es_num, flag):
    """Bulk-write ranking results (and their diff vs the previous round) to ES.

    sorted_uids: iterable of (uid, rank) pairs; es_num: round number (0 means
    first round, no diff); flag: metric name prefix ('dg'/'pr' style).
    """
    index_name = "user_portrait_network"
    index_type = "network"
    bulk_action = []
    for uid, rank in sorted_uids:
        if uid == 'global':  # skip the aggregate pseudo-node
            continue
        doc = {'uid': uid}
        doc[flag + '_' + str(es_num)] = rank
        if es_num == 0:
            # first round: plain index, nothing to diff against
            bulk_action.extend([{'index': {'_id': uid}}, doc])
            continue
        try:
            previous = es_user_portrait.get(index=index_name, doc_type=index_type, id=uid)['_source']
            try:
                pr_last = previous[flag + '_' + str(es_num - 1)]
            except:
                pr_last = 0
            doc[flag + '_diff_' + str(es_num)] = rank - pr_last
            bulk_action.extend([{'update': {'_id': uid}}, {'doc': doc}])
        except:
            # no existing document: index fresh, diff against zero
            pr_last = 0
            doc[flag + '_diff_' + str(es_num)] = rank - pr_last
            bulk_action.extend([{'index': {'_id': uid}}, doc])
    #print bulk_action
    es_user_portrait.bulk(bulk_action, index=index_name, doc_type=index_type)
def get_domain_topic(uid):
    """Return (domain, topic) for *uid* from the portrait index, or (None, None).

    NOTE(review): es .get raises when the document is missing; callers appear
    to rely on that propagating -- confirm before adding a try/except here.
    """
    index_name = 'user_portrait'  # was misleadingly named 'index_time'
    index_type = 'user'
    # removed a dead `result = dict()` that was immediately overwritten
    result = es_user_portrait.get(index=index_name, doc_type=index_type, id=uid)['_source']
    if result:
        #print 'domain, topic:', result['domain'], result['topic']
        return result['domain'], result['topic']
    else:
        return None, None
def acquire_user_by_id(uid):
    """Fetch basic profile fields for *uid*.

    Returns a dict with name/location/count1/count2, or None when the user is
    missing, the profile is empty, or any expected field is absent.
    """
    try:
        source = es_user_profile.get(index=profile_index_name, doc_type=profile_index_type, id=uid)['_source']
        if source:
            return {
                'name': source['nick_name'],
                'location': source['user_location'],
                'count1': source['fansnum'],
                'count2': source['friendsnum'],
            }
        # falsy source falls through -> implicit None, matching original behavior
    except:
        return None
def save_count_results(all_uids_count, es_num):
    """Upsert today's network count for period *es_num*, keyed by date.

    If a document for today exists, only the new period field is merged in;
    otherwise a fresh document with start_ts is created.
    """
    index_name = "user_portrait_network_count"
    index_type = "network"
    date = ts2datetime(time.time())
    item = {'period_' + str(es_num): all_uids_count}
    try:
        # existence probe: raises when today's doc is absent
        es_user_portrait.get(index=index_name, doc_type=index_type, id=date)['_source']
        # BUG FIX: the ES update API expects the partial document wrapped in
        # 'doc'; passing the bare item made update() raise, fall into the
        # except branch, and re-index the doc -- wiping earlier periods.
        es_user_portrait.update(index=index_name, doc_type=index_type, id=date, body={'doc': item})
    except:
        item['start_ts'] = date
        es_user_portrait.index(index=index_name, doc_type=index_type, id=date, body=item)
def update_weibo_user_portrait_info(uid):
    """Return the influence/sensitive/topic_string portrait fields for *uid*.

    Falls back to zeros/empty string when the user has no portrait document.
    """
    portrait_info = {'influence': 0, 'sensitive': 0, 'topic_string': ''}
    if es_user_portrait.exists(index=portrait_index_name, doc_type=portrait_index_type, id=uid):
        source = es_user_portrait.get(index=portrait_index_name, doc_type=portrait_index_type, id=uid)['_source']
        portrait_info = {
            'influence': source.get('influence', 0),
            'sensitive': source.get('sensitive', 0),
            'topic_string': source.get('topic_string', ''),
        }
    return portrait_info
def getResult(search_id):
    """Load a finished weibo rank-keyword task and decode its JSON fields.

    Returns the decoded task dict, or [] when any field is missing/undecodable.
    """
    item = es.get(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, id=search_id)
    try:
        source = item['_source']
        return {
            'keyword': json.loads(source['keyword']),
            'sort_scope': source['sort_scope'],
            'sort_norm': source['sort_norm'],
            'start_time': ts2datetime(source['start_time']),
            'end_time': ts2datetime(source['end_time']),
            'result': json.loads(source['result']),
            'text_results': json.loads(source['text_results']),
            'number': source['number'],
        }
    except:
        return []
def get_single_user_portrait(seed_user_dict):
    """Fetch one user's portrait, by uid (direct get) or by uname (term search).

    Returns the portrait _source dict, or {} when the user cannot be found.
    """
    if 'uid' in seed_user_dict:
        uid = seed_user_dict['uid']
        try:
            user_portrait_result = es_user_portrait.get(index=portrait_index_name,
                                                        doc_type=portrait_index_type, id=uid)['_source']
        except:
            user_portrait_result = {}
    else:
        uname = seed_user_dict['uname']
        query = {'term': {'uname': uname}}
        try:
            # BUG FIX: the body referenced a misspelled 'quuery' (NameError), and a
            # search response has no top-level '_source' -- both made this branch
            # always fall into the except and return {}. Take the first hit's source.
            user_portrait_result = es_user_portrait.search(
                index=portrait_index_name, doc_type=portrait_index_type,
                body={'query': {'bool': {'must': query}}})['hits']['hits'][0]['_source']
        except:
            user_portrait_result = {}
    return user_portrait_result
def getResult(search_id):
    """Return the JSON-decoded 'result' payload of a user rank-keyword task.

    Any missing field or decode failure yields [].
    """
    item = es.get(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=search_id)
    try:
        return json.loads(item['_source']['result'])
    except:
        return []
def get_retweeted_top():
    """Return the top-100 portrait users ranked by retweet count of their hottest origin weibo.

    Scans yesterday's BCI index sorted by origin_weibo_retweeted_top_number and
    keeps only uids that exist in the portrait library. Returns None when the
    BCI search itself fails.
    """
    top_results = []
    k = 100000
    count = 0
    now_ts = time.time()
    date = ts2datetime(now_ts - 3600 * 24)
    index_time = ''.join(date.split('-'))
    # test override -- pinned to a fixed day; remove for production use
    index_time = '20130907'
    index_type = 'bci'
    query_body = {
        'query': {
            'match_all': {}
        },
        'size': k,
        'sort': [{'origin_weibo_retweeted_top_number': {'order': 'desc'}}]
    }
    try:
        result = es_cluster.search(index=index_time, doc_type=index_type, body=query_body)['hits']['hits']
    except:
        return None
    #print 'result:', len(result)
    for item in result:
        if count == 100:
            break
        uid = item['_id']
        try:
            exist_result = es.get(index='user_portrait', doc_type='user', id=uid)
            #print 'exist_result:', exist_result
            try:
                source = exist_result['_source']
                count += 1
                #print 'count:', count
                uname = source['uname']
                top_mid = item['_source']['origin_weibo_top_retweeted_id']
                top_retweeted_number = item['_source']['origin_weibo_retweeted_top_number']
                top_results.append([uid, uname, top_mid, top_retweeted_number])
            except:
                continue
        except:
            continue
    # BUG FIX: the return statement was corrupted in the source ('******'
    # redaction garbage); reconstructed from the surviving dict fragment.
    return {'top_retweeted_user': json.dumps(top_results)}
def getResult(search_id):
    """Fetch a user rank-keyword task document and decode its stored result.

    Returns the decoded list/dict, or [] when the field is absent or not JSON.
    """
    item = es.get(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=search_id)
    try:
        raw = item['_source']['result']
    except:
        return []
    try:
        return json.loads(raw)
    except:
        return []
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] forward_warning_status = task_detail[3] create_by = task_detail[4] ts = int(task_detail[5]) new = int(task_detail[6]) print ts2date(ts) # PART 1 forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list) # 被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) # print "all_origin_list", all_origin_list # print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count["total_count"] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count["retweeted"] current_comment_count = statistics_count["comment"] # PART 2 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", 
"1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_key = ["2", "3", "4", "5", "6"] negetive_count = 0 for key in negetive_key: negetive_count += sentiment_count[key] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts - time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget( index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list} )["docs"] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item["found"]: # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item["_id"]) # 判断感知 burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if ( mean_count >= MEAN_COUNT and current_total_count > mean_count + 1.96 * std_count or current_total_count >= len(all_mid_list) * AVERAGE_COUNT ): # 异常点发生 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if ( negetive_count > mean_sentiment + 1.96 * std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list) * AVERAGE_COUNT ): warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if 
forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] sensitive_text_list = [] # 有事件发生时开始 # if warning_status: if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts - DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000} search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() if search_results: for item in search_results: iter_uid = item["_source"]["uid"] iter_mid = item["_source"]["mid"] iter_text = item["_source"]["text"].encode("utf-8", "ignore") iter_sensitive = item["_source"].get("sensitive", 0) duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation # 涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item["_source"]["keywords_dict"]) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode("utf-8", "ignore") personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict classify_uid_list.append(iter_uid) # 去重 if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in 
dup_results: if item["duplicate"]: duplicate_dict[item["_id"]] = item["same_from"] # 分类 if classify_text_dict: classify_results = topic_classfiy(classify_uid_list, classify_text_dict) mid_value = dict() # print "classify_results: ", classify_results for k, v in classify_results.iteritems(): # mid:value mid_value[k] = topic_value_dict[v[0]] if tmp_sensitive_warning: warning_status = signal_brust burst_reason += signal_sensitive_variation sensitive_weibo_detail = {} if sensitive_words_dict: sensitive_mid_list = sensitive_words_dict.keys() sensitivie_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval) results = dict() results["mid_topic_value"] = json.dumps(mid_value) results["duplicate_dict"] = json.dumps(duplicate_dict) results["sensitive_words_dict"] = json.dumps(sensitive_words_dict) results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail) results["origin_weibo_number"] = len(all_origin_list) results["retweeted_weibo_number"] = len(all_retweeted_list) results["origin_weibo_detail"] = json.dumps(origin_weibo_detail) results["retweeted_weibo_detail"] = json.dumps(retweeted_weibo_detail) results["retweeted_weibo_count"] = current_retweeted_count results["comment_weibo_count"] = current_comment_count results["weibo_total_number"] = current_total_count results["sentiment_distribution"] = json.dumps(sentiment_count) results["important_users"] = json.dumps(filter_important_list) results["unfilter_users"] = json.dumps(important_uid_list) results["burst_reason"] = tmp_burst_reason results["timestamp"] = ts # results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + "-" + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 if not new: temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)[ "_source" ] temporal_result["warning_status"] = warning_status temporal_result["burst_reason"] = 
tmp_burst_reason temporal_result["finish"] = finish temporal_result["processing_status"] = process_status history_status = json.loads(temporal_result["history_status"]) history_status.append([ts, task_name, warning_status]) temporal_result["history_status"] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) else: print "test" return "1"
def specific_keywords_burst_dection(task_detail):
    """One detection round for a keyword-specific burst task.

    Gathers the window's weibo matching the keyword list, computes counts,
    sentiment and important users, flags sensitive-word hits, decides the
    warning signal against the moving-average baseline, clusters the texts
    into topics, and persists the round into the sensing ES index.

    task_detail: [task_name, social_sensors, keywords_list, sensitive_words,
                  stop_time, forward_warning_status, _, ts]
    Returns "1" when the round completes.
    """
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = int(task_detail[7])  # NOTE(review): task_detail[6] is skipped -- confirm the tuple layout

    # historical baseline (moving average over previous windows)
    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # origin weibo list from the previous time range
    forward_origin_weibo_list = query_mid_list(ts - time_interval, keywords_list, forward_time_range)
    # origin weibo list from the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    print "all mid list: ", len(all_mid_list)

    # retweet/comment counts in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list)
    current_total_count = statistics_count['total_count']  # total weibo count in the window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # With sensors plus sensitive words given, any sensor weibo mentioning a
    # sensitive word is treated as a warning.
    # sentiment distribution for the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime != datetime_1:
        # window spans midnight: use the previous day's index
        index_name = flow_text_index_name_pre + datetime_1
    else:
        index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list)
        sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']  # sad + anger

    # important users in the current window
    important_uid_list = []
    if exist_es:
        #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
        search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
        important_uid_list = search_results.keys()
        if datetime != datetime_1:
            index_name_1 = flow_text_index_name_pre + datetime_1
            if es_text.indices.exists(index_name_1):
                #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
                search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
                if search_results_1:
                    for item in search_results_1:
                        important_uid_list.append(item['key'])
    # match the uid list against the portrait library
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                    filter_important_list.append(item['_id'])
    print filter_important_list

    # 6. sensitive-word detection: any sensor weibo containing one of the
    # given sensitive words triggers a warning
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0
    if sensitive_words:
        query_sensitive_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"range":{
                                    "timestamp":{
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }}
                                },
                                {"terms": {"keywords_string": sensitive_words}}
                            ]
                        }
                    }
                }
            },
            "aggs":{
                "all_list":{
                    "terms":{"field": "message_type"}
                }
            }
        }
        if social_sensors:
            # restrict to the sensor accounts when given
            query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})
        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                # message_type: 1 = origin, 2 = comment, 3 = retweet
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass
        sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number

    # decide the warning signal
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT:  # sensitive-count anomaly
        print "======================"
        if forward_warning_status == signal_brust:  # event already running: switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # moving-average anomaly detection against the historical baseline
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count+1.96*std_count:  # count anomaly
            print "====================================================="
            if forward_warning_status == signal_brust:  # event already running: switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition
        if negetive_count > mean_sentiment+1.96*std_sentiment:
            # negative-sentiment anomaly; "12" means both anomalies fired
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition
            if forward_warning_status == signal_brust:  # event already running: switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # task deadline reached
        finish = finish_signal
        process_status = "0"

    # 7. perceived events over all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # If sensitive weibo exist, aggregate them first; otherwise aggregate
    # the ordinary origin weibo.
    if burst_reason:  # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "bool":{
                                "must":[
                                    {"range":{
                                        "timestamp":{
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }}
                                    },
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})
            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']['hits']
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict)  # normalized text items: mid, text
                        mid_set.add(iter_mid)
            # NOTE(review): str.replace returns a new string and the result is
            # discarded here, so this line is a no-op; reassignment was
            # presumably intended -- confirm before changing, as it would
            # alter the branch below.
            burst_reason.replace(signal_sensitive_variation, "")

        current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1)
        print "current_origin_mid_list:", len(current_origin_mid_list)
        # NOTE(review): the condition tests current_mid_list but the mget
        # fetches current_origin_mid_list -- verify this mismatch is intended.
        if burst_reason and current_mid_list:
            origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": current_origin_mid_list}, fields=["mid", "text"])["docs"]
            if origin_sensing_text:
                for item in origin_sensing_text:
                    if item["found"]:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)  # normalized text items: mid, text
                            mid_set.add(iter_mid)

        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = ""  # no related weibo: reset the reason
            print "***********************************"
        else:
            # cluster the collected texts into topics
            feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list)  # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            print "========================================================================================"
            print "========================================================================================="
            sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list, ", topic_list

    if not topic_list:
        warning_status = signal_nothing
        tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store this round's snapshot
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing task document
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)
    return "1"
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] create_by = task_detail[3] ts = int(task_detail[4]) print ts2date(ts) # PART 1 #forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) #print "all_origin_list", all_origin_list #print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts-time_interval) index_name = flow_text_index_name_pre + datetime exist_es = 
es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print "filter_important_list", filter_important_list print "important_results", important_uid_list #判断感知 finish = unfinish_signal # "0" process_status = "1" if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list sensitive_text_list = [] # 有事件发生时开始 if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 5000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] tmp_sensitive_warning = "" text_dict = dict() # 文本信息 mid_value = dict() # 文本赋值 duplicate_dict = dict() # 重合字典 portrait_dict = dict() # 背景信息 classify_text_dict = dict() # 分类文本 classify_uid_list = [] duplicate_text_list = [] sensitive_words_dict = dict() if search_results: for item in search_results: iter_uid = item['_source']['uid'] iter_mid = item['_source']['mid'] iter_text = item['_source']['text'].encode('utf-8', 'ignore') iter_sensitive = 
item['_source'].get('sensitive', 0) duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text}) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive keywords_dict = json.loads(item['_source']['keywords_dict']) personal_keywords_dict = dict() for k, v in keywords_dict.iteritems(): k = k.encode('utf-8', 'ignore') personal_keywords_dict[k] = v classify_text_dict[iter_mid] = personal_keywords_dict classify_uid_list.append(iter_uid) # 去重 if duplicate_text_list: dup_results = duplicate(duplicate_text_list) for item in dup_results: if item['duplicate']: duplicate_dict[item['_id']] = item['same_from'] # 分类 if classify_text_dict: classify_results = topic_classfiy(classify_uid_list, classify_text_dict) mid_value = dict() #print "classify_results: ", classify_results for k,v in classify_results.iteritems(): # mid:value mid_value[k] = topic_value_dict[v[0]] sensitive_weibo_detail = {} if sensitive_words_dict: sensitive_mid_list = sensitive_words_dict.keys() sensitivie_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval) results = dict() results['mid_topic_value'] = json.dumps(mid_value) results['duplicate_dict'] = json.dumps(duplicate_dict) results['sensitive_words_dict'] = json.dumps(sensitive_words_dict) results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail) results['origin_weibo_number'] = len(all_origin_list) results['retweeted_weibo_number'] = len(all_retweeted_list) results['origin_weibo_detail'] = json.dumps(origin_weibo_detail) results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail) results['retweeted_weibo_count'] = current_retweeted_count results['comment_weibo_count'] = current_comment_count results['weibo_total_number'] = current_total_count results['important_users'] = json.dumps(filter_important_list) results['unfilter_users'] = json.dumps(important_uid_list) results['timestamp'] = ts #results['clustering_topic'] = 
json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + '-' + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source'] temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append(ts) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) return "1"
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = '' ,time = 7 , isall = False, number = 100): number = int(number) should = [] for key in keyword_list: if search_type == "hashtag": should.append({"prefix":{"text": "#" + key + "#"}}) else: should.append({"wildcard":{"text": "*" +key + "*"}}) index_list = [] date = ts2datetime(start_time) index_name = pre + date while during: if es_flow_text.indices.exists(index=index_name): index_list.append(index_name) start_time = start_time + DAY date = ts2datetime(start_time) index_name = pre + date during -= 1 print index_list uid_set = set() text_results = [] sorted_text_results = [] query_body = { "query":{ "bool":{ "must":should } }, "sort":{"user_fansnum":{"order":"desc"}}, "size":5000 } results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"] id_index = 0 index_list = [] un_uid_list = [] for item in results : if item['fields']['uid'][0] not in uid_set: uid_set.add(item['fields']['uid'][0]) un_uid_list.append(item['fields']['uid'][0]) index_list.append(id_index) id_index += 1 #get_all_filed(sort_norm , time) uid_list = [] print "un_uid_list: ", len(un_uid_list) portrait_list = [] count = 0 in_index = 0 if not isall and un_uid_list : # 库内 portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"] for item in portrait_results: if item["found"]: portrait_list.append(item['_id']) nick_name = item['fields']['uname'][0] if nick_name == 'unknown': nick_name = item['_id'] index = index_list[in_index] weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id']) text_results.extend([results[index]['fields']['uid'][0], 
results[index]['fields']['user_fansnum'][0], results[index]['fields']['text'][0], results[index]['fields']['message_type'][0], results[index]['fields']['sentiment'][0], ts2date(results[index]['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) count += 1 if count == number: break print "portrait_len, ", len(portrait_list) in_index += 1 if portrait_list: uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort for iter_uid in uid_list: iter_index = portrait_list.index(iter_uid) sorted_text_results.append(text_results[i]) elif un_uid_list: profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"] for i in range(len(profile_result)): index = index_list[i] try: nick_name = profile_result[i]['fields']['nick_name'][0] except: nick_name = un_uid_list[i] item = results[index] weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id']) text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url]) if i == number: break uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number) sorted_text_results = [] f = open("small.txt", "wb") for iter_uid in uid_list: iter_index = un_uid_list.index(iter_uid) f.write(str(iter_uid)+"\n") sorted_text_results.append(text_results[iter_index]) f.close() print "filter_uid_list: ", len(uid_list) if uid_list: results = make_up_user_info(uid_list,isall,time,sort_norm) else: results = [] print "results: ", len(results) # 修改状态 task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX 
, doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id) item = task_detail['_source'] item['status'] = 1 item['result'] = json.dumps(results) item['text_results'] = json.dumps(sorted_text_results) item['number'] = len(results) es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id, body=item) return "1"
def sensors_keywords_detection(task_detail): task_name = task_detail[0] social_sensors = task_detail[1] keywords_list = task_detail[2] sensitive_words = task_detail[3] stop_time = task_detail[4] forward_warning_status = task_detail[5] ts = task_detail[7] forward_result = get_forward_numerical_info(task_name, ts, keywords_list) # 1. 聚合前12个小时内传感人物发布的所有与关键词相关的原创微博 forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range, 1, social_sensors) # 2. 聚合当前阶段内的原创微博 current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list = list(set(all_mid_list)) print len(all_mid_list) # 3. 查询当前的原创微博和之前12个小时的原创微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1, social_sensors) current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 print "current all weibo: ", statistics_count current_origin_count = statistics_count['origin'] current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # 4. 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts-time_interval) if datetime == datetime_1: index_name = flow_text_index_name_pre + datetime else: index_name = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index_name) if exist_es: search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_count = sentiment_count['2'] + sentiment_count['3'] # 5. 
那些社会传感器参与事件讨论 query_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts } }}, {"terms":{"uid": social_sensors}} ], "should":[ {"terms": {"root_mid": all_mid_list}}, {"terms": {"mid": all_mid_list}} ] } } } }, "size": 10000 } datetime = ts2datetime(ts) datetime_1 = ts2datetime(ts - time_interval) if datetime == datetime_1: index_name = flow_text_index_name_pre + datetime else: index_name = flow_text_index_name_pre + datetime_1 search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits'] attend_users = [] if search_results: for item in search_results: attend_users.append(item['_source']['uid']) important_users = list(set(attend_users)) print "important users", important_users # 6. 敏感词识别,如果传感器的微博中出现这么一个敏感词,那么就会预警------PS.敏感词是一个危险的设置 sensitive_origin_weibo_number = 0 sensitive_retweeted_weibo_number = 0 sensitive_comment_weibo_number = 0 sensitive_total_weibo_number = 0 if sensitive_words: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}}, {"terms": {"uid": social_sensors}} ] } } } }, "aggs":{ "all_list":{ "terms":{"field": "message_type"} } } } sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"] if sensitive_results: for item in sensitive_results: if int(item["key"]) == 1: sensitive_origin_weibo_number = item['doc_count'] elif int(item["key"]) == 2: sensitive_comment_weibo_number = item['doc_count'] elif int(item["key"]) == 3: sensitive_retweeted_weibo_number = item["doc_count"] else: pass sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal 
# "0" if sensitive_total_weibo_number: # 敏感微博的数量异常 print "======================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason = signal_sensitive_variation if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if current_total_count > mean_count+1.96*std_count: # 异常点发生 print "=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal tmp_burst_reason = burst_reason topic_list = [] # 7. 
感知到的事, all_mid_list if burst_reason: # 有事情发生 text_list = [] mid_set = set() if signal_sensitive_variation in burst_reason: query_sensitive_body = { "query":{ "filtered":{ "filter":{ "bool":{ "must":[ {"range":{ "timestamp":{ "gte": ts - time_interval, "lt": ts }} }, {"terms": {"keywords_string": sensitive_words}} ] } } } }, "size": 10000 } if social_sensors: query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}}) sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']["hits"] if sensitive_results: for item in sensitive_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) # 整理后的文本,mid,text mid_set.add(iter_mid) burst_reason.replace(signal_sensitive_variation, "") if burst_reason and all_mid_list: sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": all_mid_list}, fields=["mid", "text"])["docs"] if sensing_text: for item in sensing_text: if item['found']: iter_mid = item["fields"]["mid"][0] iter_text = item["fields"]["text"][0] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text if iter_mid not in mid_set: text_list.append(temp_dict) mid_set.add(iter_mid) if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = top_word.keys() elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)[0:5] topic_list = [] if sorted_dict: for item in 
sorted_dict: topic_list.append(word_label[item[0]]) print "topic_list:", topic_list if not topic_list: tmp_burst_reason = signal_nothing_variation warning_status = signal_nothing results = dict() results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number results['sensitive_weibo_total_number'] = sensitive_total_weibo_number results['origin_weibo_number'] = current_origin_count results['retweeted_weibo_number'] = current_retweeted_count results['comment_weibo_number'] = current_comment_count results['weibo_total_number'] = current_total_count results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(important_users) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results["clustering_topic"] = json.dumps(topic_list) # es存储当前时段的信息 doctype = task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish history_status = json.loads(temporal_result['history_status']) history_status.append([ts, ' '.join(keywords_list), warning_status]) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result) return "1"
# -*- coding:utf-8 -*- import json import sys reload(sys) sys.path.append('../../') from global_utils import R_SOCIAL_SENSING as r from global_utils import es_user_portrait as es from parameter import INDEX_MANAGE_SOCIAL_SENSING as index_name from parameter import DOC_TYPE_MANAGE_SOCIAL_SENSING as task_doc_type from time_utils import ts2datetime, datetime2ts, ts2date task_name = "两会".decode('utf-8') task_detail = es.get(index="manage_sensing_task", doc_type="task", id=task_name)['_source'] #task_detail['create_at'] = 1456934400 #task_detail['keywords'] = json.dumps(["两会", "人大", "政协"]) #task_detail['sensitive_words'] = json.dumps([]) #task_detail['task_type'] = "2" task_detail['stop_time'] = '1457020800' task_detail['finish'] = '1' task_detail['processing_status'] = "0" es.index(index="manage_sensing_task", doc_type="task", id=task_name, body=task_detail) print task_detail
def get_tweets_distribute(xnr_user_no): topic_distribute_dict = {} topic_distribute_dict['radar'] = {} uid = xnr_user_no2uid(xnr_user_no) if xnr_user_no: es_results = es.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\ id=xnr_user_no)["_source"] followers_list = es_results['followers_list'] if S_TYPE == 'test': uid=PORTRAI_UID followers_list=PORTRAIT_UID_LIST # 关注者topic分布 results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type,\ body={'ids':followers_list})['docs'] topic_list_followers = [] for result in results: if result['found'] == True: result = result['_source'] topic_string_first = result['topic_string'].split('&') topic_list_followers.extend(topic_string_first) topic_list_followers_count = Counter(topic_list_followers) #topic_distribute_dict['topic_follower'] = topic_list_followers_count # 虚拟人topic分布 try: xnr_results = es_user_portrait.get(index=portrait_index_name,doc_type=portrait_index_type,\ id=uid)['_source'] topic_string = xnr_results['topic_string'].split('&') topic_xnr_count = Counter(topic_string) #topic_distribute_dict['topic_xnr'] = topic_xnr_count except: topic_xnr_count = {} #topic_distribute_dict['topic_xnr'] = topic_xnr_count # 整理雷达图数据 # if topic_xnr_count: # for topic, value in topic_xnr_count.iteritems(): # try: # topic_value = float(value)/(topic_list_followers_count[topic]) # except: # continue # topic_distribute_dict['radar'][topic] = topic_value if topic_xnr_count: for topic, value in topic_list_followers_count.iteritems(): try: topic_value = float(topic_xnr_count[topic])/value except: continue topic_distribute_dict['radar'][topic] = topic_value # 整理仪表盘数据 mark = 0 if topic_xnr_count: n_topic = len(topic_list_followers_count.keys()) for topic,value in topic_xnr_count.iteritems(): try: mark += float(value)/(topic_list_followers_count[topic]*n_topic) print topic print mark except: continue topic_distribute_dict['mark'] = mark return topic_distribute_dict
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] forward_warning_status = task_detail[3] create_by = task_detail[4] ts = int(task_detail[5]) new = int(task_detail[6]) print ts2date(ts) # PART 1 forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) #print "all_origin_list", all_origin_list #print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # PART 2 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", 
"1":"positive", "2":"sad", "3": "anger"} sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_key = ["2", "3", "4", "5", "6"] negetive_count = 0 for key in negetive_key: negetive_count += sentiment_count[key] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts-time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = [] filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) #判断感知 burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(all_mid_list)*AVERAGE_COUNT: # 异常点发生 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(all_mid_list)*AVERAGE_COUNT: warning_status = signal_brust burst_reason += signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 
已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] sensitive_text_list = [] # 有事件发生时开始 #if warning_status: if 1: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 2000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] text_list = [] tmp_sensitive_warning = "" sensitive_words_dict = dict() if search_results: for item in search_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] iter_sensitive = item['_source'].get('sensitive', 0) if iter_sensitive: tmp_sensitive_warning = signal_sensitive_variation #涉及到敏感词的微博 sensitive_words_dict[iter_mid] = iter_sensitive temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text text_list.append(temp_dict) if tmp_sensitive_warning: warning_status = signal_brust burst_reason += signal_sensitive_variation sensitive_weibo_detail = {} if sensitive_words_dict: sensitive_mid_list = sensitive_words_dict.keys() sensitivie_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval) """ if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = [top_word.keys()] elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "no relate weibo text" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = 
text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) print "clustering weibo topic" sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True) topic_list = [] if sorted_dict: for item in sorted_dict: if item[0] != "other": topic_list.append(word_label[item[0]]) print "topic list: ", len(topic_list) """ results = dict() if sensitive_weibo_detail: print "sensitive_weibo_detail: ", sensitive_weibo_detail results['sensitive_words_dict'] = json.dumps(sensitive_words_dict) results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail) results['origin_weibo_number'] = len(all_origin_list) results['retweeted_weibo_number'] = len(all_retweeted_list) results['origin_weibo_detail'] = json.dumps(origin_weibo_detail) results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail) results['retweeted_weibo_count'] = current_retweeted_count results['comment_weibo_count'] = current_comment_count results['weibo_total_number'] = current_total_count results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(filter_important_list) results['unfilter_users'] = json.dumps(important_uid_list) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts #results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + '-' + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 if not new: temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append([ts, task_name, warning_status]) temporal_result['history_status'] = 
json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) else: print "test" return "1"
def social_sensing(task_detail): # 任务名 传感器 终止时间 之前状态 创建者 时间 task_name = task_detail[0] social_sensors = task_detail[1] stop_time = task_detail[2] forward_warning_status = task_detail[3] create_by = task_detail[4] ts = int(task_detail[5]) # PART 1 forward_result = get_forward_numerical_info(task_name, ts, create_by) # 之前时间阶段内的原创微博list/retweeted forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range) forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3) # 当前阶段内原创微博list current_mid_list = query_mid_list(ts, social_sensors, time_interval) current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3) all_mid_list = [] all_mid_list.extend(current_mid_list) all_mid_list.extend(current_retweeted_mid_list) all_mid_list.extend(forward_origin_weibo_list) all_mid_list.extend(forward_retweeted_weibo_list) all_origin_list = [] all_origin_list.extend(current_mid_list) all_origin_list.extend(forward_origin_weibo_list) all_retweeted_list = [] all_retweeted_list.extend(current_retweeted_mid_list) all_retweeted_list.extend(forward_retweeted_weibo_list)#被转发微博的mid/root-mid print "all mid list: ", len(all_mid_list) print "all_origin_list", all_origin_list print "all_retweeted_list", all_retweeted_list # 查询微博在当前时间内的转发和评论数, 聚合按照message_type statistics_count = query_related_weibo(ts, all_mid_list, time_interval) if all_origin_list: origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # 原创微博详情 else: origin_weibo_detail = {} if all_retweeted_list: retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # 转发微博详情 else: retweeted_weibo_detail = {} current_total_count = statistics_count['total_count'] # 当前阶段内所有微博总数 current_retweeted_count = statistics_count['retweeted'] current_comment_count = statistics_count['comment'] # PART 2 # 聚合当前时间内积极、中性、悲伤、愤怒情绪分布 # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"} 
sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0} search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval) sentiment_count = search_results print "sentiment_count: ", sentiment_count negetive_key = ["2", "3", "4", "5", "6"] negetive_count = 0 for key in negetive_key: negetive_count += sentiment_count[key] # 聚合当前时间内重要的人 important_uid_list = [] datetime = ts2datetime(ts-time_interval) index_name = flow_text_index_name_pre + datetime exist_es = es_text.indices.exists(index_name) if exist_es: search_results = get_important_user(ts, all_mid_list, time_interval) important_uid_list = search_results.keys() # 根据获得uid_list,从人物库中匹配重要人物 if important_uid_list: important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs'] else: important_results = {} filter_important_list = [] # uid_list if important_results: for item in important_results: if item['found']: #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD: filter_important_list.append(item['_id']) print filter_important_list #判断感知 burst_reason = signal_nothing_variation warning_status = signal_nothing finish = unfinish_signal # "0" process_status = "1" if forward_result[0]: # 根据移动平均判断是否有时间发生 mean_count = forward_result[1] std_count = forward_result[2] mean_sentiment = forward_result[3] std_sentiment = forward_result[4] if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(social_sensors)*0.2*AVERAGE_COUNT: # 异常点发生 print "=====================================================" if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track else: warning_status = signal_brust burst_reason += signal_count_varition # 数量异常 if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(social_sensors)*0.2*AVERAGE_COUNT: warning_status = signal_brust burst_reason += 
signal_sentiment_varition # 负面情感异常, "12"表示两者均异常 if forward_warning_status == signal_brust: # 已有事件发生,改为事件追踪 warning_status = signal_track if int(stop_time) <= ts: # 检查任务是否已经完成 finish = finish_signal process_status = "0" # 感知到的事, all_mid_list tmp_burst_reason = burst_reason topic_list = [] # 有事件发生时开始 if warning_status: index_list = [] important_words = [] datetime_1 = ts2datetime(ts) index_name_1 = flow_text_index_name_pre + datetime_1 exist_es = es_text.indices.exists(index=index_name_1) if exist_es: index_list.append(index_name_1) datetime_2 = ts2datetime(ts-DAY) index_name_2 = flow_text_index_name_pre + datetime_2 exist_es = es_text.indices.exists(index=index_name_2) if exist_es: index_list.append(index_name_2) if index_list and all_mid_list: query_body = { "query":{ "filtered":{ "filter":{ "terms":{"mid": all_mid_list} } } }, "size": 2000 } search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits'] text_list = [] if search_results: for item in search_results: iter_mid = item['_source']['mid'] iter_text = item['_source']['text'] temp_dict = dict() temp_dict["mid"] = iter_mid temp_dict["text"] = iter_text text_list.append(temp_dict) for item in text_list: print item['text'] if len(text_list) == 1: top_word = freq_word(text_list[0]) topic_list = [top_word.keys()] elif len(text_list) == 0: topic_list = [] tmp_burst_reason = "" #没有相关微博,归零 print "***********************************" else: feature_words, input_word_dict = tfidf(text_list) #生成特征词和输入数据 word_label, evaluation_results = kmeans(feature_words, text_list) #聚类 inputs = text_classify(text_list, word_label, feature_words) clustering_topic = cluster_evaluation(inputs) print "===============================================================" print "===============================================================" sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True) topic_list = [] if sorted_dict: for item in sorted_dict: 
topic_list.append(word_label[item[0]]) print "topic_list, ", topic_list #if not topic_list: # warning_status = signal_nothing # tmp_burst_reason = signal_nothing_variation results = dict() results['origin_weibo_number'] = len(all_origin_list) results['retweeted_weibo_number'] = len(all_retweeted_list) results['origin_weibo_detail'] = json.dumps(origin_weibo_detail) results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail) results['retweeted_weibo_count'] = current_retweeted_count results['comment_weibo_count'] = current_comment_count results['weibo_total_number'] = current_total_count results['sentiment_distribution'] = json.dumps(sentiment_count) results['important_users'] = json.dumps(filter_important_list) results['unfilter_users'] = json.dumps(important_uid_list) results['burst_reason'] = tmp_burst_reason results['timestamp'] = ts if tmp_burst_reason: results['clustering_topic'] = json.dumps(topic_list) # es存储当前时段的信息 doctype = create_by + '-' + task_name es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results) # 更新manage social sensing的es信息 temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source'] temporal_result['warning_status'] = warning_status temporal_result['burst_reason'] = tmp_burst_reason temporal_result['finish'] = finish temporal_result['processing_status'] = process_status history_status = json.loads(temporal_result['history_status']) history_status.append([ts, task_name, warning_status]) temporal_result['history_status'] = json.dumps(history_status) es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result) return "1"