def get_maker_weibos_sort_es(topic, identifyDate, identifyWindow, sort_item):
    """Collect the opinion-maker weibos stored for an event and sort them.

    :param topic: event ``en_name`` to look up.
    :param identifyDate: unused here; kept for interface compatibility.
    :param identifyWindow: unused here; kept for interface compatibility.
    :param sort_item: ``_source`` field used as the descending sort key.
    :return: deduplicated list of weibo hit dicts, sorted by ``sort_item``.
    """
    query_body = {'query': {'term': {'en_name': topic}}}
    network_results = es.search(index=index_event_analysis_results,
                                doc_type=type_event_analysis_results,
                                body=query_body)['hits']['hits']
    weibos = []
    for network_result in network_results:
        # network_results is stored as a JSON string inside the ES document.
        network_result = json.loads(network_result['_source']['network_results'])
        maker_results = network_result['maker_results']
        for uid, maker_info in maker_results.items():
            user_info = json.loads(maker_info['user_info'])
            weibos_info = json.loads(maker_info['weibo_info'])
            for weibo_info in weibos_info:
                # Attach display fields from the maker's user profile.
                weibo_info['_source']['uname'] = user_info['name']
                weibo_info['_source']['photo_url'] = user_info['profile_image_url']
                if weibo_info not in weibos:  # de-duplicate whole hit dicts
                    weibos.append(weibo_info)
    return sorted(weibos, key=lambda x: x['_source'][sort_item], reverse=True)
def ajax_show_all_task():
    """Return (as JSON) every interfere task, newest submissions first.

    Pulls up to 1000 documents from the interfere-task index sorted by
    ``submit_time`` descending and returns the raw ``_source`` dicts.
    """
    query_body = {
        "query": {"match_all": {}},
        "sort": {"submit_time": {"order": "desc"}},
        "size": 1000,
    }
    es_results = es_prediction.search(index=index_manage_interfere_task,
                                      doc_type=type_manage_interfere_task,
                                      body=query_body)["hits"]["hits"]
    # Return the full _source documents; callers pick the fields they need.
    task_list = [item["_source"] for item in es_results]
    return json.dumps(task_list)
def get_gexf_es(topic, identifyDate, identifyWindow):
    """Fetch the stored long-gexf graph payload for an event.

    :param topic: event ``en_name`` to look up.
    :param identifyDate: unused here; kept for interface compatibility.
    :param identifyWindow: unused here; kept for interface compatibility.
    :return: the last gexf payload found, or None when nothing matches.
    """
    query_body = {'query': {'term': {'en_name': topic}}}
    network_results = es.search(index=index_event_analysis_results,
                                doc_type=type_event_analysis_results,
                                body=query_body)['hits']['hits']
    result = None  # original raised NameError when the search had no hits
    for network_result in network_results:
        # network_results is stored as a JSON string inside the ES document.
        network_result = json.loads(network_result['_source']['network_results'])
        gexf_results = network_result['long_gexf']
        for key, value in gexf_results.items():
            result = value['gexf']
    return result
def ajax_show_analysis_task():
    """Return (as JSON) analysis tasks submitted within the last 20 days,
    newest first (up to 1000)."""
    current_time = time.time()
    query_body = {
        "query": {
            "range": {
                "submit_time": {"gte": current_time - 20 * 24 * 3600}
            }
        },
        "size": 1000,
        "sort": {"submit_time": {"order": "desc"}},
    }
    es_results = es_prediction.search(index=index_manage_event_analysis,
                                      doc_type=type_manage_event_analysis,
                                      body=query_body)["hits"]["hits"]
    results = [item["_source"] for item in es_results]
    return json.dumps(results)
def get_sen_province_count_es_final(en_name, start_ts, end_ts, unit=MinInterval):
    """Group per-province geo counts by sentiment for an event.

    :param en_name: event ``en_name``.
    :param start_ts: unused here; kept for interface compatibility.
    :param end_ts: unused here; kept for interface compatibility.
    :param unit: unused here; kept for interface compatibility.
    :return: list of ``{sentiment: [[province, city_count_dict], ...]}``
        dicts, one per non-empty sentiment per matching document.
    """
    sen_geo_count_results_final = []
    query_body = {'query': {'term': {'en_name': en_name}}}
    es_results = es.search(index=index_event_analysis_results,
                           doc_type=type_event_analysis_results,
                           body=query_body)['hits']['hits']
    for es_result in es_results:
        # sentiment_results is stored as a JSON string in the ES document.
        sen_geo_results_dict = json.loads(es_result['_source']['sentiment_results'])
        geo_counts = sen_geo_results_dict['geo_count']
        for sen, geo_list in geo_counts.items():
            if not geo_list:
                continue  # sentiments with no geo breakdown are skipped
            sen_geo_count_results = {
                sen: [[province, city_dict]
                      for province, city_dict in geo_list.items()]
            }
            sen_geo_count_results_final.append(sen_geo_count_results)
    return sen_geo_count_results_final
def ajax_show_all_task():
    """Return (as JSON) every event-analysis task, newest submissions first.

    NOTE(review): this redefines an ajax_show_all_task declared earlier in
    the file (over the interfere-task index); at import time this later
    definition wins.
    """
    query_body = {
        "query": {"match_all": {}},
        "sort": {"submit_time": {"order": "desc"}},
        "size": 1000,
    }
    es_results = es.search(index=index_manage_event_analysis,
                           doc_type=type_manage_event_analysis,
                           body=query_body)["hits"]["hits"]
    # Return the full _source documents; callers pick the fields they need.
    task_list = [item['_source'] for item in es_results]
    return json.dumps(task_list)
def get_time_count_es(en_name, start_ts, end_ts, unit=MinInterval):
    """Bucket the per-type weibo counts of an event into `unit`-second windows.

    :param en_name: event ``en_name``.
    :param start_ts: window start (unix seconds).
    :param end_ts: window end (unix seconds).
    :param unit: bucket width in seconds.
    :return: ``{bucket_end_ts: {message_type: count}}``.
    """
    mtype_count_results = {}
    query_body = {'query': {'term': {'en_name': en_name}}}
    es_results = es.search(index=index_event_analysis_results,
                           doc_type=type_event_analysis_results,
                           body=query_body)['hits']['hits']
    for es_result in es_results:
        # time_results is stored as a JSON string in the ES document;
        # its 'count' keys are therefore strings, not ints.
        time_results_dict = json.loads(es_result['_source']['time_results'])
        counts = time_results_dict['count']
        upbound = int(math.ceil(end_ts / (unit * 1.0)) * unit)
        if end_ts - start_ts < unit:
            # Single bucket: accumulate everything stamped exactly at upbound.
            # BUGFIX: the original compared the string JSON key to the int
            # upbound, which never matched; convert the key first.
            bucket = mtype_count_results.setdefault(upbound, {})
            for key, value in counts.items():
                if int(key) == upbound:
                    for k, v in value.items():
                        bucket[k] = bucket.get(k, 0) + v
        else:
            lowbound = int(start_ts // unit * unit)
            interval = (upbound - lowbound) // unit
            for i in range(interval, 0, -1):
                begin_ts = upbound - unit * i
                # BUGFIX: the original rebound the end_ts parameter here,
                # corrupting upbound for any subsequent hit.
                bucket_end = begin_ts + unit
                bucket = mtype_count_results.setdefault(bucket_end, {})
                for key, value in counts.items():
                    if begin_ts < int(key) <= bucket_end:
                        for k, v in value.items():
                            bucket[k] = bucket.get(k, 0) + v
    return mtype_count_results
def get_symbol_weibo(topic, start_ts, end_ts, unit=MinInterval):
    """Pick up to 3 representative titled weibos per cluster (fishbone chart).

    :param topic: topic ``name`` in the topics-river index.
    :param start_ts: start of the window the stored analysis must cover.
    :param end_ts: end of that window.
    :param unit: unused here; kept for interface compatibility.
    :return: ``{cluster_label: [weibo_dict, ...]}`` with at most 3 weibos
        per cluster, each carrying a distinct 【...】 headline.
    :raises IndexError: when no topics-river document matches the query.
    """
    weibos = {}
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'name': topic}},
                    {'range': {'start_ts': {'lte': start_ts}}},
                    {'range': {'end_ts': {'gte': end_ts}}},
                ]
            }
        }
    }
    symbol = es.search(index=topics_river_index_name,
                       doc_type=topics_river_index_type,
                       body=query_body)['hits']['hits'][0]['_source']
    # Both fields are stored as JSON strings in the ES document.
    features = json.loads(symbol['features'])
    symbol_weibos = json.loads(symbol['cluster_dump_dict'])
    for clusterid, contents in symbol_weibos.items():
        picked = 0
        seen_titles = set()
        for item in contents:
            ts = full_datetime2ts(item['datetime'])
            # Only weibos carrying a 【...】 headline are candidates.
            title = re.findall(r'【.*】', item['content'].encode('utf8'))
            if not title:
                continue
            title = title[0]
            # NOTE(review): original comment says start_ts should really be
            # end_ts - unit (last 15 minutes); kept at start_ts because that
            # narrower window currently has no data.
            if start_ts <= ts <= end_ts and title not in seen_titles:
                label = features[clusterid][0]
                weibos.setdefault(label, []).append(item)
                seen_titles.add(title)
                picked += 1
            if picked == 3:
                break
    return weibos
def ajax_show_analysis_task():
    """Return analysis + prediction tasks from the last 20 days as JSON.

    Prediction tasks that share a ``pinyin_task_name`` with an analysis
    task are suppressed so each task appears at most once.

    NOTE(review): this redefines an ajax_show_analysis_task declared
    earlier in the file; at import time this later definition wins.
    """
    query_body = {
        "query": {
            "range": {
                "submit_time": {"gte": time.time() - 20 * 24 * 3600}
            }
        },
        "size": 1000,
        "sort": {"submit_time": {"order": "desc"}},
    }
    seen_names = set()
    analysis_hits = es_prediction.search(
        index=index_manage_event_analysis,
        doc_type=type_manage_event_analysis,
        body=query_body)["hits"]["hits"]
    analysis_results = []
    for hit in analysis_hits:
        source = hit["_source"]
        seen_names.add(source["pinyin_task_name"])
        analysis_results.append(source)
    prediction_hits = es_prediction.search(
        index=index_manage_prediction_task,
        doc_type=type_manage_prediction_task,
        body=query_body)["hits"]["hits"]
    prediction_results = []
    for hit in prediction_hits:
        source = hit["_source"]
        name = source["pinyin_task_name"]
        if name not in seen_names:
            prediction_results.append(source)
            seen_names.add(name)
    return json.dumps({
        "event_analysis_task": analysis_results,
        "event_prediction_task": prediction_results,
    })
def province_weibo_count_es(topic, start_ts, end_ts, unit=MinInterval):
    """Fetch the per-city weibo counts stored for an event.

    :param topic: event ``en_name``.
    :param start_ts: unused here; kept for interface compatibility.
    :param end_ts: unused here; kept for interface compatibility.
    :param unit: unused here; kept for interface compatibility.
    :return: the ``geo_cityCount`` mapping of the last matching document,
        or an empty dict when nothing matches (the original raised
        NameError in that case).
    """
    geo_cityCount = {}
    query_body = {'query': {'term': {'en_name': topic}}}
    es_results = es.search(index=index_event_analysis_results,
                           doc_type=type_event_analysis_results,
                           body=query_body)['hits']['hits']
    for es_result in es_results:
        # geo_results is stored as a JSON string in the ES document.
        geo_results_dict = json.loads(es_result['_source']['geo_results'])
        geo_cityCount = geo_results_dict['geo_cityTopic_results']['geo_cityCount']
    return geo_cityCount
def get_predict_count(task_name, start_ts, end_ts):
    """Return [ts, truth, prediction] triples for a micro-prediction task.

    Prediction values are shifted one step right (each stored prediction is
    for the *next* interval); a trailing row carries the last outstanding
    prediction with a truth of 0.

    :param task_name: suffix of the ``micro_prediction_`` index to query.
    :param start_ts: inclusive lower bound on ``update_time``.
    :param end_ts: inclusive upper bound on ``update_time``.
    :return: JSON list of ``[update_time, total_count, prediction_value]``.
    """
    query_body = {
        "query": {
            "range": {"update_time": {"gte": start_ts, "lte": end_ts}}
        },
        "sort": {"update_time": {"order": "asc"}},
        "size": 100000,
    }
    index_name = 'micro_prediction_' + task_name
    results = es_prediction.search(index=index_name, doc_type="micro_task",
                                   body=query_body)["hits"]["hits"]
    if not results:
        # Original raised IndexError on truth_value_list[0] / ts_list[-1].
        return json.dumps([])
    truth_value_list = []
    prediction_value_list = []
    ts_list = []
    for item in results:
        source = item["_source"]
        truth_value_list.append(source["total_count"])
        # Older documents may predate the prediction field; skip them.
        if "prediction_value" in source:
            prediction_value_list.append(source["prediction_value"])
        ts_list.append(source["update_time"])
    # Align predictions with the interval they were made for.
    prediction_value_list.insert(0, truth_value_list[0])
    return_list = []
    for i in range(len(ts_list)):
        return_list.append(
            [ts_list[i], truth_value_list[i], prediction_value_list[i]])
    # Final row: the prediction for the next, not-yet-observed interval.
    return_list.append(
        [ts_list[-1] + minimal_time_interval, 0, prediction_value_list[-1]])
    return json.dumps(return_list)
def get_macro_prediction_count(task_name):
    """Return (weibo_count, user_count, rank) predicted for an event.

    :param task_name: event name matched against the ``event`` field.
    :return: tuple of (predict_weibo_value, predict_user_value,
        predict_rank); defaults to (0, 0, 0.0) when nothing matches, and
        the last matching document wins when several do.
    """
    query_body = {'query': {'term': {'event': task_name}}}
    es_results = es_prediction.search(index=index_macro_feature_result,
                                      doc_type=type_macro_feature_result,
                                      body=query_body)['hits']['hits']
    weibo_count = 0
    user_count = 0
    rank = 0.0
    for es_result in es_results:
        source = es_result['_source']
        weibo_count = source['predict_weibo_value']
        user_count = source['predict_user_value']
        rank = source['predict_rank']
    return weibo_count, user_count, rank
def get_trend_maker_es(topic, identifyDate, identifyWindow):
    """Return the opinion-maker mapping with ``user_info`` decoded in place.

    :param topic: event ``en_name``.
    :param identifyDate: unused here; kept for interface compatibility.
    :param identifyWindow: unused here; kept for interface compatibility.
    :return: ``{uid: maker_info_dict}`` of the last matching document, or
        an empty dict when nothing matches (the original raised NameError
        in that case).
    """
    query_body = {'query': {'term': {'en_name': topic}}}
    network_results = es.search(index=index_event_analysis_results,
                                doc_type=type_event_analysis_results,
                                body=query_body)['hits']['hits']
    maker_results = {}
    for network_result in network_results:
        # network_results is stored as a JSON string in the ES document.
        network_result = json.loads(network_result['_source']['network_results'])
        maker_results = network_result['maker_results']
        for uid, info_dict in maker_results.items():
            # user_info is itself a JSON string; decode it in place.
            maker_results[uid]['user_info'] = json.loads(info_dict['user_info'])
    return maker_results
def get_task_detail_2():
    """Assemble the last 7 days of sensing counts as parallel time series.

    :return: dict with 'time_series' plus origin / retweeted / all weibo
        count lists, in chronological order.
    """
    time_series = []
    origin_counts = []
    retweet_counts = []
    total_counts = []
    query_body = {
        "query": {"match_all": {}},
        "size": 7 * 24,
        "sort": {"timestamp": {"order": "desc"}},
    }
    flow_detail = es_prediction.search(index="social_sensing_task",
                                       doc_type="social_sensing",
                                       body=query_body)["hits"]["hits"]
    flow_detail.reverse()  # newest-first query -> chronological order
    for hit in flow_detail:
        source = hit['_source']
        time_series.append(source['timestamp'])
        origin = source["origin_weibo_number"]
        retweet = source['retweeted_weibo_number']
        origin_counts.append(origin)
        retweet_counts.append(retweet)
        total_counts.append(origin + retweet)
    return {
        'time_series': time_series,
        'all_weibo_list': total_counts,
        'origin_weibo_list': origin_counts,
        'retweeted_weibo_list': retweet_counts,
    }
def get_subopinion(topic):
    """Return the list of sub-opinion feature keyword-lists for a topic.

    :param topic: topic ``name`` in the sub-opinion index.
    :return: list of feature values, or the string 'no results' when no
        document matches (legacy sentinel kept for existing callers).
    """
    query_body = {
        'query': {
            'filtered': {
                'filter': {'term': {'name': topic}}
            }
        }
    }
    features = es.search(index=subopinion_index_name,
                         doc_type=subopinion_index_type,
                         body=query_body)['hits']['hits']
    # BUGFIX: the original printed features[0] before this guard and so
    # raised IndexError whenever the search came back empty.
    if features:
        feature = json.loads(features[0]['_source']['features'])
        return feature.values()
    else:
        return 'no results'
def ajax_show_task():
    """Return every prediction task as JSON, newest submissions first."""
    query_body = {
        "query": {"match_all": {}},
        "size": 10000,
        "sort": {"submit_time": {"order": "desc"}},
    }
    hits = es_prediction.search(index=index_manage_prediction_task,
                                doc_type=type_manage_prediction_task,
                                body=query_body)["hits"]["hits"]
    return json.dumps([hit["_source"] for hit in hits])
def get_topics_river(topic, start_ts, end_ts, unit=MinInterval):
    """Build the topic-river series: per-cluster weibo counts over time.

    :param topic: topic ``name`` in the topics-river index.
    :param start_ts: start of the window the stored analysis must cover.
    :param end_ts: end of that window.
    :param unit: bucket width passed through to the counter.
    :return: ``{first_keyword_of_cluster: [(date, count), ...]}``.
    :raises IndexError: when no topics-river document matches the query.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'name': topic}},
                    {'range': {'start_ts': {'lte': start_ts}}},
                    {'range': {'end_ts': {'gte': end_ts}}},
                ]
            }
        }
    }
    hit = es.search(index=topics_river_index_name,
                    doc_type=topics_river_index_type,
                    body=query_body)['hits']['hits'][0]
    # features is stored as a JSON string: {clusterid: [keyword, ...]}.
    news_topics = json.loads(hit['_source']['features'])
    zhutihe_results = cul_key_weibo_time_count(topic, news_topics,
                                               start_ts, end_ts, unit)
    results = {}
    for clusterid, keywords in news_topics.items():
        # Clusters without keywords have no counts (and no label to use).
        if len(keywords) > 0:
            results[keywords[0]] = zhutihe_results[clusterid]
    return results
def get_during_keywords_es(topic, start_ts, end_ts):
    """Extract the most frequent keywords of a topic's weibos in a window.

    :param topic: index name holding the topic's weibos.
    :param start_ts: window start (inclusive, unix seconds).
    :param end_ts: window end (exclusive).
    :return: JSON list of (word, count) pairs, most frequent first.
    """
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {'gte': start_ts, 'lt': end_ts}
                    }
                }
            }
        },
        'size': MAX_LANGUAGE_WEIBO,
    }
    keyword_weibo = es.search(index=topic, doc_type='text',
                              body=query_body)['hits']['hits']
    weibo_text = [hit['_source']['text'].encode('utf-8')
                  for hit in keyword_weibo]
    # get_weibo tokenizes the texts and returns a {word: frequency} map.
    keywords_dict = get_weibo(weibo_text, n_gram=2, n_count=100)
    word_results = sorted(keywords_dict.items(),
                          key=lambda x: x[1],
                          reverse=True)[:MAX_FREQUENT_WORDS]
    return json.dumps(word_results)
def cul_key_weibo_time_count(topic, news_topics, start_ts, over_ts, during):
    """Count, per cluster and per day, the weibos matching cluster keywords.

    :param topic: index name holding the topic's weibos.
    :param news_topics: ``{clusterid: [keyword, ...]}``.
    :param start_ts: window start (unix seconds).
    :param over_ts: window end (unix seconds).
    :param during: nominal bucket width; NOTE it is immediately overridden
        to ``Day`` below — kept that way to preserve existing behavior.
    :return: ``{clusterid: [(date_string, count), ...]}`` sorted by date,
        only for clusters that have keywords.
    """
    key_weibo_time_count = {}
    during = Day  # buckets are always one day regardless of the parameter
    for clusterid, keywords in news_topics.items():
        if not keywords:
            continue  # clusters without keywords are skipped entirely
        # Fresh per cluster: the original reused one dict across clusters,
        # so stale dates could leak into later clusters' results.
        time_dict = {}
        begin = int(start_ts)
        over = ts2HourlyTime(int(over_ts), during)
        interval = (over - begin) // during
        for i in range(interval, 0, -1):
            bucket_start = over - during * i
            bucket_end = bucket_start + during
            must_list = [
                {'range': {'timestamp': {'gte': bucket_start,
                                         'lt': bucket_end}}},
            ]
            # Any one of the cluster keywords may match (OR semantics).
            should = [{'wildcard': {'keywords_string': '*' + word + '*'}}
                      for word in keywords]
            must_list.append({'bool': {'should': should}})
            query_body = {'query': {'bool': {'must': must_list}}}
            key_weibo = es.search(index=topic, doc_type='text',
                                  body=query_body)
            # Only the bucket total is needed, not the hits themselves.
            time_dict[ts2datetime(bucket_end)] = key_weibo['hits']['total']
        key_weibo_time_count[clusterid] = sorted(time_dict.items(),
                                                 key=lambda x: x[0])
    return key_weibo_time_count
def get_weibo_content_es(topic, start_ts, end_ts, province,
                         sort_item='timestamp', unit=Fifteenminutes):
    """List the weibos of an event posted from one province, sorted.

    :param topic: event ``en_name``.
    :param start_ts: unused here; kept for interface compatibility.
    :param end_ts: unused here; kept for interface compatibility.
    :param province: province name a weibo must come from to be included.
    :param sort_item: weibo field used as the (descending) sort key.
    :param unit: unused here; kept for interface compatibility.
    :return: ``[(mid, weibo_content_dict), ...]`` sorted by ``sort_item``,
        or [] when the sort field is unknown.
    """
    weibo_dict = {}
    query_body = {'query': {'term': {'en_name': topic}}}
    es_results = es.search(index=index_event_analysis_results,
                           doc_type=type_event_analysis_results,
                           body=query_body)['hits']['hits']
    for es_result in es_results:
        # geo_results is stored as a JSON string in the ES document.
        geo_results_dict = json.loads(es_result['_source']['geo_results'])
        geo_weibos = geo_results_dict['geo_cityTopic_results']['geo_weibos']
        for weibo_type, geo_weibo_list in geo_weibos.items():
            for geo_weibo in geo_weibo_list:
                # Each entry is [province, city, weibo_hit].
                if geo_weibo[0] != province:
                    continue
                source = geo_weibo[2]['_source']
                weibo_content = {
                    'text': source['text'],
                    'uid': source['uid'],
                    'timestamp': source['timestamp'],
                    'sentiment': source['sentiment'],
                    'comment': source['comment'],
                    'retweeted': source['retweeted'],
                    'keywords': source['keywords_dict'],
                    'mid': source['mid'],
                }
                # Best effort: the profile lookup may fail for unknown users.
                try:
                    user = es_user_portrait.get(
                        index=profile_index_name,
                        doc_type=profile_index_type,
                        id=weibo_content['uid'])['_source']
                    weibo_content['uname'] = user['nick_name']
                    weibo_content['photo_url'] = user['photo_url']
                except Exception:
                    weibo_content['uname'] = 'unknown'
                    weibo_content['photo_url'] = 'unknown'
                weibo_dict[weibo_content['mid']] = weibo_content
    try:
        results = sorted(weibo_dict.items(),
                         key=lambda x: x[1][sort_item], reverse=True)
    except Exception:
        results = []  # sort_item missing from the weibo dicts
    return results
def get_weibo_by_time_es(topic, start_ts, end_ts, sort_item='timestamp'):
    """Fetch up to 200 weibos of a topic in a window, with user profiles.

    :param topic: index name holding the topic's weibos.
    :param start_ts: inclusive lower timestamp bound.
    :param end_ts: inclusive upper timestamp bound.
    :param sort_item: field used for both the ES sort and the final sort.
    :return: ``[(mid, weibo_content_dict), ...]`` sorted descending.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [{
                    'range': {
                        'timestamp': {'lte': int(end_ts),
                                      'gte': int(start_ts)}
                    }
                }]
            }
        },
        'size': 200,
        'sort': {sort_item: {'order': 'desc'}},
    }
    items = es.search(index=topic, body=query_body)['hits']['hits']
    weibo_dict = {}
    for item in items:
        weibo = item['_source']
        weibo_content = {
            'text': weibo['text'],
            'uid': weibo['uid'],
            'timestamp': weibo['timestamp'],
            # Engagement counters may be missing on older documents.
            'comment': weibo.get('comment', 0),
            'retweeted': weibo.get('retweeted', 0),
            'mid': weibo['mid'],
        }
        # Best effort: the profile lookup may fail for unknown users.
        try:
            user = es_user_portrait.get(index=profile_index_name,
                                        doc_type=profile_index_type,
                                        id=weibo_content['uid'])['_source']
            weibo_content['uname'] = user['nick_name']
            weibo_content['photo_url'] = user['photo_url']
        except Exception:
            weibo_content['uname'] = 'unknown'
            weibo_content['photo_url'] = 'unknown'
        weibo_dict[weibo_content['mid']] = weibo_content
    if weibo_dict:
        return sorted(weibo_dict.items(),
                      key=lambda x: x[1][sort_item], reverse=True)
    return []
def get_sen_province_count_es(en_name, start_ts, end_ts, unit=MinInterval):
    """Aggregate per-sentiment, per-province city counts across all windows.

    :param en_name: event ``en_name``.
    :param start_ts: unused here; kept for interface compatibility.
    :param end_ts: unused here; kept for interface compatibility.
    :param unit: unused here; kept for interface compatibility.
    :return: ``{sentiment: {province: {city_or_'total': count}}}`` summed
        over every time window of every matching document.
    """
    sen_geo_count_results = {}
    query_body = {'query': {'term': {'en_name': en_name}}}
    es_results = es.search(index=index_event_analysis_results,
                           doc_type=type_event_analysis_results,
                           body=query_body)['hits']['hits']
    for es_result in es_results:
        # sentiment_results is stored as a JSON string in the ES document.
        sen_geo_results_dict = json.loads(es_result['_source']['sentiment_results'])
        geo_counts = sen_geo_results_dict['geo_count']
        for window_ts, sen_geo_dict in geo_counts.items():
            for sen, geo_list in sen_geo_dict.items():
                if not geo_list:
                    continue  # sentiment has no geo breakdown in this window
                for province, city_dict in geo_list.items():
                    # The original built this nesting with four levels of
                    # try/except (and identical 'total' / city branches);
                    # setdefault expresses the same accumulation directly.
                    province_counts = sen_geo_count_results.setdefault(
                        sen, {}).setdefault(province, {})
                    for k, v in city_dict.items():
                        province_counts[k] = province_counts.get(k, 0) + v
    return sen_geo_count_results
def get_weibo_content_es(topic, begin_ts, end_ts, sen, sort_item='timestamp'):
    """List a topic's weibos of one sentiment within a window, sorted.

    NOTE(review): this redefines an earlier get_weibo_content_es with
    different argument semantics; at import time this later definition
    wins.

    :param topic: index name holding the topic's weibos.
    :param begin_ts: window start (inclusive).
    :param end_ts: window end (exclusive).
    :param sen: sentiment label to filter on.
    :param sort_item: field used for both the ES sort and the final sort.
    :return: ``[(mid, weibo_content_dict), ...]`` sorted descending.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    # One topic, one sentiment, bounded time range.
                    {'term': {'sentiment': sen}},
                    {'range': {'timestamp': {'gte': begin_ts,
                                             'lt': end_ts}}},
                ]
            }
        },
        'sort': {sort_item: {"order": "desc"}},
    }
    items = es.search(index=topic, doc_type='text',
                      body=query_body)['hits']['hits']
    weibo_dict = {}
    for item in items:
        weibo = item['_source']
        weibo_content = {
            'text': weibo['text'],
            'uid': weibo['uid'],
            'timestamp': weibo['timestamp'],
            # Engagement counters may be missing on older documents.
            'comment': weibo.get('comment', 0),
            'retweeted': weibo.get('retweeted', 0),
            'mid': weibo['mid'],
        }
        # Best effort: the profile lookup may fail for unknown users.
        try:
            user = es_user_portrait.get(index=profile_index_name,
                                        doc_type=profile_index_type,
                                        id=weibo_content['uid'])['_source']
            weibo_content['uname'] = user['nick_name']
            weibo_content['photo_url'] = user['photo_url']
        except Exception:
            weibo_content['uname'] = 'unknown'
            weibo_content['photo_url'] = 'unknown'
        weibo_dict[weibo_content['mid']] = weibo_content
    if weibo_dict:
        return sorted(weibo_dict.items(),
                      key=lambda x: x[1][sort_item], reverse=True)
    return []
def get_weibo_content(topic, start_ts, end_ts, opinion, sort_item='timestamp'):
    """Fetch representative weibos of a sub-opinion cluster, sorted.

    :param topic: topic ``name`` in the sub-opinion index.
    :param start_ts: lower bound on the stored cluster's ``start_ts``.
    :param end_ts: upper bound on the stored cluster's ``end_ts``.
    :param opinion: keyword phrase matched against the cluster's ``keys``.
    :param sort_item: weibo field used as the (descending) sort key.
    :return: ``[(mid, weibo_content_dict), ...]``, or the string
        'no results' when nothing matches (legacy sentinel kept for
        existing callers).
    """
    weibo_dict = {}
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {"match_phrase": {"keys": opinion}},
                    {'term': {'name': topic}},
                    {'range': {'start_ts': {'gte': start_ts}}},
                    {'range': {'end_ts': {'lte': end_ts}}},
                ]
            }
        },
        'size': 1000000,
    }
    weibos = es.search(index=subopinion_index_name,
                       doc_type=subopinion_index_type,
                       body=query_body)['hits']['hits']
    if not weibos:
        return 'no results'
    # cluster_dump_dict is a JSON string mapping clusterid -> [weibo, ...];
    # the first weibo of each cluster serves as its representative.
    clusters = json.loads(weibos[0]['_source']['cluster_dump_dict'])
    for cluster_weibos in clusters.values():
        weibo = cluster_weibos[0]
        weibo_content = {
            'text': weibo['text'],
            'uid': weibo['uid'],
            'timestamp': full_datetime2ts(weibo['datetime']),
            'comment': weibo['comment'],
            'retweeted': weibo['retweeted'],
            'mid': weibo['id'],
        }
        # Best effort: the profile lookup may fail for unknown users.
        try:
            user = es.get(index=profile_index_name,
                          doc_type=profile_index_type,
                          id=weibo_content['uid'])['_source']
            weibo_content['uname'] = user['nick_name']
            weibo_content['photo_url'] = user['photo_url']
        except Exception:
            weibo_content['uname'] = 'unknown'
            weibo_content['photo_url'] = 'unknown'
        weibo_dict[weibo_content['mid']] = weibo_content
    return sorted(weibo_dict.items(),
                  key=lambda x: x[1][sort_item], reverse=True)