def get_event_list(): query_body = { 'query':{ 'match_all':{} }, 'aggs':{ 'all_events':{ 'terms':{'field':'event', "size":1000} } } } es_event_results = es_event.search(index=index_name,doc_type=index_type,body=query_body,request_timeout=99999999)['aggregations']['all_events']['buckets'] event_list = [] event_count = {} for event_item in es_event_results: event_list.append(event_item['key']) event_count[event_item['key']] = event_item['doc_count'] ''' for k,v in event_count.iteritems(): print k.encode('utf-8')+'\t'+str(v) ''' input_list=[] for event in event_list: query_body_start_ts = { 'query':{ 'bool':{ 'must':[ {'term':{'event':event}} ] } }, 'sort':{'timestamp':'asc'}, 'size':10 } es_start_ts_results = es_event.search(index=index_name,doc_type=index_type,body=query_body_start_ts)['hits']['hits'] #获得主函数输入 input_list (event,start_ts,end_ts) start_ts = es_start_ts_results[0]['_source']['timestamp'] end_ts = start_ts + 36000 input_list.append((event,start_ts,end_ts,event_count[event])) print 'input_list done!' return input_list,event_list
def get_event_uid_count(event_list): event_uid_count={} for event in event_list: event_uid_set = set() query_body = { 'query':{ 'bool':{ 'must':[ {'term':{'event':event}} ] } }, 'size':99999999 } es_results = es_event.search(index=index_name,doc_type=index_type,body=query_body)['hits']['hits'] for result in es_results: #print 'result:::',result event_uid_set.add(result['_source']['uid']) try: event_uid_set.add(result['_source']['root_uid']) except: continue try: event_uid_set.add(result['_source']['directed_uid']) except: continue event_uid_count[event] = len(event_uid_set) print 'event_uid_count:::',event_uid_count return event_uid_count
def get_event_trend(input_list): #事件趋势统计 event_trend = defaultdict(list) event_trend_delta = defaultdict(list) trend_input_list = [] for item in input_list: event = item[0] start_ts = item[1] i = 0 while i < 10: end_ts = start_ts + 2 * 3600 trend_input_list.append((event, start_ts, end_ts)) start_ts = end_ts + 1 i = i + 2 #print 'input_list::::',input_list for item in trend_input_list: trend_event = item[0] trend_start_ts = item[1] trend_end_ts = item[2] query_body = { "query": { "bool": { "must": [{ "range": { "timestamp": { "gte": trend_start_ts, "lt": trend_end_ts } } }, { "term": { "event": trend_event } }] } }, "size": 99999999, "sort": { "user_fansnum": "desc" } } es_results = es_event.search(index=index_name, doc_type=index_type, body=query_body)["hits"]["hits"] #前几小时微博数量趋势统计 event_trend[trend_event].append(len(es_results)) for key in event_trend: for i in range(len(event_trend[key]) - 1): #print 'event_trend[event][i+1]::',event_trend[event][i+1] #print 'event_trend[event][i]::',event_trend[event] delta = event_trend[key][i + 1] - event_trend[key][i] print 'delta::::', delta event_trend_delta[key].append(delta) fo_trend.write(str(delta) + '\t') fo_trend.write('\n') print 'event_trend_delta:::', event_trend_delta return event_trend_delta
def feature_extract(event, event_count, start_ts, end_ts, event_trend_delte_event): query_body = { "query": { "bool": { "must": [{ "range": { "timestamp": { "gte": start_ts, "lt": end_ts } } }, { "term": { "event": event } }] } }, "size": 99999999, "sort": { "user_fansnum": "desc" } } es_results = es_event.search(index=index_name, doc_type=index_type, body=query_body)["hits"]["hits"] #前几小时微博数量趋势统计 #event_trend[event].append(len(es_results)) #话题领域 #print 'len(es_re)',len(es_results) field_multi = topic_field(es_results) #参与用户粉丝总数\转发总数\评论总数及平均数统计 total_user_fans = 0 average_user_fans = 0 total_comment = 0 average_comment = 0 total_retweet = 0 average_retweet = 0 #敏感微博数量及比例统计 total_sensitive = 0 total_sensitive_ratio = 0 #负向情绪微博数量及比例统计 total_negtive = 0 total_negtive_ratio = 0 #重要用户数量及比例统计(粉丝数>100000) total_important_user = 0 total_important_user_ratio = 0 #微博总数 total_num = len(es_results) #@层级数统计 at_count = defaultdict(int) for result in es_results: total_user_fans += result['_source']['user_fansnum'] total_comment += result['_source']['comment'] total_retweet += result['_source']['retweeted'] if result['_source']['sensitive'] > 0: total_sensitive += 1 if result['_source']['sentiment'] > 1: total_negtive += 1 if result['_source']['user_fansnum'] > 10000: total_important_user += 1 text = result['_source']['text'] at_list = re.findall('//@', text) #print 'at_list:::',at_list if len(at_list) == 0: at_count['at_0'] += 1 elif len(at_list) == 1: at_count['at_1'] += 1 elif len(at_list) == 2: at_count['at_2'] += 1 else: at_count['at>3'] += 1 #print 'at+count::',at_count average_user_fans = float(total_user_fans) / total_num average_comment = float(total_comment) / total_num average_retweet = float(total_retweet) / total_num total_sensitive_ratio = float(total_sensitive) / total_num total_negtive_ratio = float(total_negtive) / total_num total_important_use_ratio = float(total_important_user) / total_num query_body_type_count = { 'query': { 'bool': { 'must': [{ 'term': { 'event': event } }, { 'range': { 'timestamp': { 'gte': start_ts, 'lt': end_ts } } }] } }, 'size': 999999999, 'aggs': { 'all_weibo': { 'terms': { 'field': 'message_type' } } } } #统计各类型微博数量 es_weibo_type_count = es_event.search( index=index_name, doc_type=index_type, body=query_body_type_count, request_timeout=999999)['aggregations']['all_weibo']['buckets'] total_origin_type = 0 total_retweet_type = 0 total_comment_type = 0 total_type = 0 #print 'es_weibo_type_count:::',es_weibo_type_count weibo_type_count = dict() for item in es_weibo_type_count: if item['key'] == 1: total_origin_type = item['doc_count'] elif item['key'] == 2: total_retweet_type = item['doc_count'] elif item['key'] == 3: total_comment_type = item['doc_count'] total_type = total_origin_type + total_retweet_type + total_comment_type origin_ratio = float(total_origin_type) / total_type retweet_ratio = float(total_retweet_type) / total_type comment_ratio = float(total_comment_type) / total_type #print 'weibo_type_ratio::',origin_ratio,retweet_ratio,comment_ratio,total_type #print 'weibo_type_count::',total_origin_type,total_retweet_type,total_comment_type,total_type #fo_feature.write(field_multi[0]+'\t'+str(total_num)+'\t'+str(total_user_fans)+'\t'+str(total_comment)+'\t'+str(total_retweet)+'\t'+str(total_sensitive)+'\t'+str(total_sensitive_ratio)+'\t'+str(total_negtive)+'\t'+str(total_negtive_ratio)+'\t'+str(total_important_user)+'\t'+str(total_important_user_ratio)+'\t'+str(total_origin_type)+'\t'+str(origin_ratio)+'\t'+str(total_retweet_type)+'\t'+str(retweet_ratio)+'\t'+str(total_comment_type)+'\t'+str(comment_ratio)+'\n') fo_feature.write(field_multi[0] + '\t' + str(total_num) + '\t' + str(total_user_fans) + '\t' + str(total_comment) + '\t' + str(total_retweet) + '\t' + str(total_sensitive) + '\t' + str(total_sensitive_ratio) + '\t' + str(total_negtive) + '\t' + str(total_important_user) + '\t' + str(total_origin_type) + '\t' + str(origin_ratio) + '\t' + str(total_retweet_type) + '\t' + str(retweet_ratio) + '\t' + str(total_comment_type) + '\t' + str(comment_ratio) + '\t' + str(at_count['at_0']) + '\t' + str(at_count['at_1']) + '\t' + str(at_count['at_2']) + '\t' + str(at_count['at>3']) + '\n') #fo_at.write(str(at_count['at_0'])+'\t'+str(at_count['at_1'])+'\t'+str(at_count['at_2'])+'\t'+str(at_count['at>3'])+'\n') fo_truth.write(str(event_count) + '\n')