def repost_search(topic, startts, endts):
    repost_list = []
    ts_arr = []
    if topic and topic != '':
        query_body = {
            # fetch both original (1) and repost (3) weibos
            # (limited by size; may need to fetch originals and reposts separately?)
            'query': {
                'bool': {
                    'must': [
                        {'terms': {'message_type': [1, 3]}},
                        {'range': {
                            'timestamp': {'gte': startts, 'lt': endts}
                        }}
                    ]
                }
            },
            #'sort':{"message_type":{"order":"desc"}},
            'size': MAX_REPOST_SEARCH_SIZE
        }
        repost_search = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        #print repost_search
        for weibo in repost_search:
            location_dict = geo_list(weibo['_source'], topic)
            if location_dict:
                repost_list.append(location_dict)
                ts_arr.append(weibo['_source']['timestamp'])
        #print len(repost_list)
        save_rt_results(topic, repost_list)
    return sorted(list(set(ts_arr))), repost_list

def compute_mtype_weibo(topic, begin_ts, end_ts, w_limit):
    #print topic
    mtypes = ['1', '2', '3']
    all_mtype_weibo = {}
    results = {}
    for mtype in mtypes:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'message_type': mtype}},
                        {'range': {
                            'timestamp': {'gte': begin_ts, 'lt': end_ts}
                        }}
                    ]
                }
            },
            'sort': {"retweeted": {"order": "desc"}},
            'size': w_limit
        }
        mtype_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        #print mtype_weibo
        if len(mtype_weibo) > 0:
            all_mtype_weibo[mtype] = []
            for i in range(0, len(mtype_weibo)):
                all_mtype_weibo[mtype].append(mtype_weibo[i]['_source'])
        else:
            all_mtype_weibo[mtype] = []
    results[end_ts] = all_mtype_weibo
    #print results
    return results

def getEsIndexName(topic_name):
    #body={"query": {"match_all": {}}}
    query_body = {'query': {'match': {'name': topic_name}}}
    try:
        res = weibo_es.search(index='topics', body=query_body)['hits']['hits']
        return res[0]['_source']['index_name']
    except:
        return -1

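# --- Illustrative usage sketch (not part of the original module) ---
# Shows how the registry lookup above is meant to be combined with a per-topic
# query: resolve the storage index name first, then search it. Assumes the
# module-level `weibo_es` client and the 'topics' registry index are configured.
def example_search_registered_topic(topic_name, start_ts, end_ts):
    index_name = getEsIndexName(topic_name)
    if index_name == -1:  # topic not registered in the 'topics' index
        return None
    return getWeiboByNameStEt(index_name, start_ts, end_ts)
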
def compute_mtype_keywords(topic, begin_ts, end_ts, during, k_limit):
    all_keyword_dict = {}
    mtype_with_keyword = {}
    mtypes = ['1', '2', '3']  # three weibo message types: original, repost, comment
    for mtype in mtypes:
        query_body = {
            'query': {
                'bool': {
                    'must': [
                        {'term': {'message_type': mtype}},
                        {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
                    ]
                }
            },
            'aggs': {
                'all_interests': {
                    'terms': {'field': 'keywords_string', 'size': k_limit}
                }
            }
        }
        show_keywords_dict = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)\
            ['aggregations']['all_interests']['buckets']
        #print 'show_keywords_dict::::::::::::::::::::', show_keywords_dict
        keyword_dict = {}
        for keyword in show_keywords_dict:
            key = keyword['key']
            count = keyword['doc_count']
            try:
                keyword_dict[key] += count
            except:
                keyword_dict[key] = count
        mtype_with_keyword[mtype] = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:k_limit]
    #mtype_with_keyword['limit'] = k_limit
    #mtype_with_keyword['during'] = during
    #all_keyword_dict[end_ts] = mtype_with_keyword
    #all_keyword_dict['limit'] = k_limit
    #results = all_keyword_dict
    #print results
    #return results
    return mtype_with_keyword

def compute_sentiment_count(topic, begin_ts, end_ts, during):
    all_sentiment_dict = {}
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {'field': 'sentiment', 'size': SENTIMENT_TYPE_COUNT}
                #,
                #'aggs': {
                #    'geo': {
                #        'terms': {'field': 'geo'}
                #    }
                #}
            }
        }
    }
    weibo_sentiment_count = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)\
        ['aggregations']['all_interests']['buckets']
    #print weibo_sentiment_count
    iter_sentiment_dict = {}
    for sentiment_item in weibo_sentiment_count:
        sentiment = sentiment_item['key']
        sentiment_count = sentiment_item['doc_count']
        try:
            iter_sentiment_dict[sentiment] += sentiment_count  # e.g. {'1': 4}
        except:
            iter_sentiment_dict[sentiment] = sentiment_count
    #all_sentiment_dict[end_ts] = iter_sentiment_dict  # count of each sentiment per time window
    #results = sorted(all_sentiment_dict.items(), key=lambda x: x[0])  # sort windows by timestamp
    #results = all_sentiment_dict
    #trend_results = {}
    #for sentiment in SENTIMENT_FIRST:
    #    trend_results[sentiment] = [[item[0], item[1][sentiment]] for item in sort_sentiment_dict]
    #results = trend_results
    #save_rt_results('count', topic, results, during)
    #save_rt_results_es('count', topic, results, during)
    #return results
    return iter_sentiment_dict

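# --- Illustrative sketch (assumption, not original code) ---
# The commented-out lines in compute_sentiment_count hint at a per-window
# sentiment trend; this helper rebuilds that idea by calling the function over
# consecutive windows of `during` seconds. The window boundaries are a guess.
def example_sentiment_trend(topic, start_ts, end_ts, during=900):
    trend = {}
    ts = int(start_ts)
    while ts < int(end_ts):
        trend[ts + during] = compute_sentiment_count(topic, ts, ts + during, during)
        ts += during
    return trend
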
def geo_list(r, topic):
    # For each weibo, build a dict of repost flag, mid, topic, timestamp and locations:
    # {original:xx, mid:xx, topic:xx, ts:xx, origin_location:xx, repost_location:xx}
    location_dict = {}
    message_type = r['message_type']
    if message_type == 3:  # repost
        geo = r['geo'].encode('utf8')
        try:
            repost_location = str(split_city(geo))  # the (province, city) tuple is stored as a string
        except:
            return None
        #print r['mid'], r['root_mid']
        if r['root_mid']:
            query_body = {
                'query': {
                    'filtered': {
                        'filter': {
                            'term': {'mid': r['root_mid']}
                        }
                    }
                }
            }
            item = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
            if item != []:
                try:
                    origin_location = str(split_city(item[0]['_source']['geo'].encode('utf8')))
                except:
                    return None
                # str(['unknown','unknown']) puts 'un' at positions 2:4, so this filters unknown locations
                if repost_location[2:4] != 'un' and origin_location[2:4] != 'un':
                    location_dict['original'] = 0
                    location_dict['mid'] = r['mid']
                    location_dict['topic'] = topic
                    location_dict['ts'] = r['timestamp']
                    location_dict['origin_location'] = origin_location
                    location_dict['repost_location'] = repost_location
                    return location_dict
    else:
        geo = r['geo'].encode('utf8')
        try:
            origin_location = str(split_city(geo))
        except:
            return None
        if origin_location[2:4] != 'un':
            location_dict['original'] = 1
            location_dict['mid'] = r['mid']
            location_dict['topic'] = topic
            location_dict['ts'] = r['timestamp']
            location_dict['origin_location'] = origin_location
            location_dict['repost_location'] = None
            return location_dict
    return None

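# --- Illustrative sketch (assumption, not original code) ---
# geo_list() returns None for weibos whose location resolves to 'unknown'; this
# helper shows the intended way to turn a list of ES hits into location dicts,
# mirroring the loop inside repost_search() above.
def example_collect_locations(hits, topic):
    locations = []
    for hit in hits:
        location_dict = geo_list(hit['_source'], topic)
        if location_dict:  # None means the geo field was missing or 'unknown'
            locations.append(location_dict)
    return locations
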
def get_topic_weibo(topic, en_name, start_ts, end_ts):
    query_body = {'query': {'match_all': {}}, 'sort': 'timestamp', 'size': 1}
    try:
        task_exist = weibo_es.search(index=en_name, doc_type=topic_index_type, body=query_body)['hits']['hits']
    except:
        get_mappings(en_name)
    find_flow_texts(start_ts, end_ts, topic, en_name)

def save_trend_maker(topic, date, windowsize, trend_maker):
    #xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id needs preprocessing
    makers = trend_maker
    rank = 0
    user_exist_list = []
    #db.session.execute("DROP TABLE trend_maker")
    #db.session.create(TrendMaker)
    '''
    items_exist = db.session.query(TrendMaker).filter(TrendMaker.topic==topic ,\
                                                      TrendMaker.date==date ,\
                                                      TrendMaker.windowsize==windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    '''
    for maker in makers:
        uid = maker[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_maker_count:
            break
        rank += 1
        mid = maker[1]
        value = maker[2]     # content relevance: number of keyword hits
        key_item = maker[3]  # the keywords that were hit
        user_info = get_user_info(uid)
        query_body = {
            'query': {
                'bool': {
                    'must': {'term': {'mid': mid}}
                }
            },
            'size': 1000000  # result-size cap; to be removed
        }
        weibo_info = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        print 'trend_maker weibo_info:', weibo_info
        #domain = uid2domain(uid)
        domain = 'Unknown'
        timestamp = int(weibo_info[0]['_source']['timestamp'])
        # adjust the model
        item = TrendMaker(topic, date, windowsize, uid, timestamp,
                          json.dumps(user_info), json.dumps(weibo_info),
                          domain, rank, json.dumps(value), json.dumps(key_item))
        print item
        db.session.add(item)
    db.session.commit()
    print 'save_trend_maker success'

def test_weibo():
    query_body = {
        'query': {'match_all': {}},
        'size': 1,
        'sort': {'retweeted': {'order': 'desc'}}
    }
    weibo = weibo_es.search(index='aoyunhui', doc_type=weibo_index_type, body=query_body)['hits']['hits'][0]['_source']
    mid = weibo['mid']
    ts = weibo['timestamp']
    print mid, ts
    find_tree(mid, ts)

def getWeiboByNameStEt(topic, start_date, end_date):
    print weibo_es
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {'timestamp': {'gte': start_date, 'lte': end_date}}
                }
            }
        }
    }
    search_result = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)
    print search_result
    return search_result

def test(topic, begin_ts, end_ts):
    kcounts_dict = {}
    #unit = 900  # PropagateKeywords unit=900
    #limit = 50
    limit = fu_tr_top_keyword
    if MYSQL_TOPIC_LEN == 0:
        topic = topic[:20]
    #print 'get_keywords begin_ts:', begin_ts
    #print 'get_keywords end_ts:', end_ts
    print topic, unit, limit
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results
    query_body = {
        'query': {
            'bool': {
                'must': [{'term': {'en_name': topic}}]
            }
        },
        'size': 1000000  # result-size cap; to be removed
    }
    items = weibo_es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
    for item in items:
        #kcount_dict = parseKcount(item['_source']['kcount'])
        print 'time_results::::::', type(item['_source']['time_results'])
        time_results = json.loads(item['_source']['time_results'])
        print 'time_results::::::', type(time_results)
        kcount_items = time_results['kcount']
        during = time_results['during']
        k_limit = time_results['k_limit']
        print 'kcount_items::::::', kcount_items
        for kcount_item, value in kcount_items.iteritems():
            print 'kcount_item:::::::::::::', kcount_item
            print 'value::::::::::::::::::', value
            for k in value:
                print 'k::::::::::::', k

def subopinion_content(topic, start_ts, end_ts, weibo_limit):
    query_body = {
        'query': {
            'bool': {
                'must_not': [{'wildcard': {'text': '*【*】*'}}],
                'must': [{'range': {'timestamp': {'lt': end_ts, 'gte': start_ts}}}]
            }
        },
        'size': weibo_limit
    }
    subopinion_results = weibo_es.search(index=topic, doc_type=weibo_index_type,
                                         body=query_body)['hits']['hits']  #['_source']
    normal_list = []
    for key_weibo in subopinion_results:
        text_weibo = key_weibo['_source']['text']
        mid_weibo = key_weibo['_source']['mid']
        timestamp = key_weibo['_source']['timestamp']
        try:
            comment = key_weibo['_source']['comment']
        except:
            comment = 0
        try:
            retweeted = key_weibo['_source']['retweeted']
        except:
            retweeted = 0
        uid = key_weibo['_source']['uid']
        normal_list.append({
            'news_id': 'weibo',
            'content': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted,
            'uid': uid
        })
    return normal_list

def weibo_TopicNameTransfer(topicname, start_ts, end_ts):
    # test timestamps; to be removed
    begin_ts = start_ts
    query_body = {
        'query': {
            'bool': {
                'must': [{'term': {'name': topicname}}]
            }
        }
    }
    weibo_pinyin_name = weibo_es.search(index='topics', doc_type=weibo_index_type, body=query_body)['hits']['hits']
    print weibo_pinyin_name
    print weibo_pinyin_name[0]['_source']['en_name']
    return weibo_pinyin_name[0]['_source']['en_name']

def get_topicweibo_byid(uid, topic):
    query_body = {
        'query': {
            'bool': {
                'must': {'term': {'uid': uid}}
            }
        },
        'size': 1000
    }
    es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
    return es_search_weibos

def save_trend_pusher(topic, date, windowsize, trend_pusher):
    #xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id needs preprocessing
    pushers = trend_pusher
    rank = 0
    user_exist_list = []
    items_exist = db.session.query(TrendPusher).filter(TrendPusher.topic == topic,
                                                       TrendPusher.date == date,
                                                       TrendPusher.windowsize == windowsize).all()
    for item_exist in items_exist:
        db.session.delete(item_exist)
    db.session.commit()
    for pusher in pushers:
        uid = pusher[0]
        if uid in user_exist_list:
            continue
        user_exist_list.append(uid)
        if rank >= trend_pusher_count:
            break
        rank += 1
        mid = pusher[1]
        user_info = get_user_info(uid)
        #weibo_info = xapian_search_weibo.search_by_id(wid, fields=weibo_fields_list)
        query_body = {
            'query': {
                'bool': {
                    'must': {'term': {'mid': mid}}
                }
            },
            'size': 1000000  # result-size cap; to be removed
        }
        weibo_info = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        #domain = uid2domain(uid)
        timestamp = int(weibo_info[0]['_source']['timestamp'])
        item = TrendPusher(topic, date, windowsize, uid, timestamp,
                           json.dumps(user_info), json.dumps(weibo_info), 'Unknown', rank)
        db.session.add(item)
    db.session.commit()
    print 'save_trend_pusher success'

def read_long_gexf(topic, identifyDate, identifyWindow):
    name = str(identifyDate) + str(identifyWindow)
    query_body = {
        #"term": {"date": identifyDate}
        "query": {"match_phrase": {"name": name}}
    }
    index_name = topic + '_gexffile'
    try:
        res = es.search(index=index_name, body=query_body)['hits']['hits']
    except:
        return []
    print es, index_name, query_body
    if len(res) > 0:
        #print type(json.loads(res[0]['_source']['gexf']))
        return res[0]['_source']['gexf']
    else:
        return []

def compute_mtype_count(topic, begin_ts, end_ts, during):
    all_mtype_dict = {}
    #print begin_ts, end_ts
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}
                }
            }
        },
        'aggs': {
            'all_interests': {
                'terms': {'field': 'message_type', 'size': MTYPE_COUNT}
            }
        }
    }
    weibo_mtype_count = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)\
        ['aggregations']['all_interests']['buckets']
    print 'weibo_mtype_count:::::::::::::::::', weibo_mtype_count
    print begin_ts, end_ts, len(weibo_mtype_count)
    iter_mtype_dict = {}
    for mtype_item in weibo_mtype_count:
        mtype = mtype_item['key']
        mtype_count = mtype_item['doc_count']
        try:
            iter_mtype_dict[mtype] += mtype_count
        except:
            iter_mtype_dict[mtype] = mtype_count
    #iter_mtype_dict['during'] = during
    #all_mtype_dict[end_ts] = iter_mtype_dict
    #results = all_mtype_dict
    #return results
    return iter_mtype_dict

def find_tree(mid, ts):
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'root_mid': mid}},
                    {'range': {
                        'timestamp': {'gte': ts}
                    }}
                ]
            }
        },
        'size': 1000,
        'sort': {'timestamp': {'order': 'asc'}}
    }
    weibo = weibo_es.search(index='aoyunhui', doc_type=weibo_index_type, body=query_body)['hits']['hits']
    for content in weibo:
        if content['_source'].get('directed_uid'):  # the field lives under '_source' of each hit
            # The original code breaks off here (annotated only with "dict?");
            # handling of hits that carry a directed_uid is left unimplemented.
            pass

def news_content(topic, start_ts, end_ts, news_limit=NEWS_LIMIT):
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'wildcard': {'text': '*【*】*'}},
                    {'range': {'timestamp': {'lt': end_ts, 'gte': start_ts}}}
                ]
            }
        },
        'size': news_limit
    }
    news_results = weibo_es.search(index=topic, doc_type=weibo_index_type,
                                   body=query_body)['hits']['hits']  #['_source']
    #print topic, weibo_index_type, start_ts, end_ts, query_body
    #print news_results
    news_list = []
    for key_weibo in news_results:
        text_weibo = key_weibo['_source']['text']
        mid_weibo = key_weibo['_source']['mid']
        timestamp = key_weibo['_source']['timestamp']
        comment = key_weibo['_source']['comment']
        retweeted = key_weibo['_source']['retweeted']
        uid = key_weibo['_source']['uid']
        news_list.append({
            'news_id': 'news',
            'content168': text_weibo,
            'id': mid_weibo,
            'datetime': ts2datetime_full(timestamp),
            'comment': comment,
            'retweeted': retweeted
        })
    return news_list

def test(topic, start_ts, end_ts):
    print start_ts, end_ts
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}
                }
            }
        }
    }
    weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']  # list of hit dicts
    print weibo

def getEsWeiboByTopic(topic_index_name):
    '''
    query_body = {
        'query':{
            'bool':{
                'must':[
                    {'term':{'sentiment':sentiment}},  # aggregate by keyword per sentiment within the time window
                    {'range':{
                        'timestamp':{'gte': begin_ts, 'lt':end_ts}
                    }}
                ]
            }
        },
        'sort':{"retweeted":{"order":"desc"}},
        'size':w_limit
    }
    '''
    #body={"query": {"match_all": {}}}
    res = weibo_es.search(index=topic_index_name, doc_type=weibo_index_type,
                          body={"query": {"match_all": {}}}, size=1000000)['hits']['hits']
    return res

def counts_aggs(en_name, start_ts, end_ts):
    index_name = en_name
    index_type = 'text'
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'pinyin_task_name': en_name}},
                    {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                ]
            }
        },
        'aggs': {
            'diff_uids': {
                'cardinality': {'field': 'uid'}
            }
        }
    }
    result = weibo_es.search(index=index_name, doc_type=index_type, body=query_body)
    weibo_counts = result['hits']['total']
    uid_counts = result['aggregations']['diff_uids']['value']
    return weibo_counts, uid_counts

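# --- Illustrative usage sketch (assumption, not original code) ---
# counts_aggs() returns (total weibo count, distinct uid count) for a task index
# over a time range, using a cardinality aggregation for the uid estimate.
def example_daily_counts(en_name, day_start_ts):
    day = 24 * 3600
    weibo_counts, uid_counts = counts_aggs(en_name, day_start_ts, day_start_ts + day)
    print 'weibos: %s, distinct users: %s' % (weibo_counts, uid_counts)
    return weibo_counts, uid_counts
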
def cul_key_weibo_time_count(topic, news_topics, start_ts, over_ts, during):
    key_weibo_time_count = {}
    # news_topics maps cluster ids to keyword lists, e.g.
    # {u'd2e97cf7-fc43-4982-8405-2d215b3e1fea': [u'\u77e5\u8bc6', u'\u5e7f\u5dde', u'\u9009\u624b']}
    for clusterid, keywords in news_topics.iteritems():
        time_dict = {}  # reset per cluster so windows are not shared across clusters
        start_ts = int(start_ts)
        over_ts = int(over_ts)
        over_ts = ts2HourlyTime(over_ts, during)
        interval = (over_ts - start_ts) / during
        for i in range(interval, 0, -1):  # walk the range in windows of `during` seconds (e.g. 900s)
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            must_list = []
            must_list.append({'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}})
            temp = []
            for word in keywords:
                sentence = {"wildcard": {"keywords_string": "*" + word + "*"}}
                temp.append(sentence)
            must_list.append({'bool': {'should': temp}})
            query_body = {"query": {"bool": {"must": must_list}}}
            key_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)
            key_weibo_count = key_weibo['hits']['total']  # number of this cluster's weibos per time window
            time_dict[end_ts] = key_weibo_count
        key_weibo_time_count[clusterid] = time_dict
    return key_weibo_time_count

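# --- Illustrative usage sketch (assumption, not original code) ---
# cul_key_weibo_time_count() expects news_topics as {cluster_id: [keyword, ...]}
# and returns {cluster_id: {window_end_ts: hit_count}}; the cluster id and
# keywords below are hypothetical placeholders.
def example_cluster_trend(topic, start_ts, over_ts):
    news_topics = {u'cluster-0': [u'keyword1', u'keyword2']}  # hypothetical input
    return cul_key_weibo_time_count(topic, news_topics, start_ts, over_ts, 900)
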
def get_first_node(topic, start_ts, end_ts, windowsize, date):
    '''
    Get the top 20 earliest users by timestamp (that may mean more than 20 weibos).
    Look up the user info for each weibo; duplicate users may appear, in which case
    only the earliest occurrence is kept. Save the result.
    '''
    if topic and topic != '':
        # topic = topic.encode('utf-8')
        print topic
        # datestr = start_date.replace('-','')
        # xapian_search_weibo = getXapianWeiboByTopic(topic_id=topic_xapian_id)
        query_body = {
            'query': {
                'bool': {
                    'should': [
                        {'term': {'message_type': 1}},
                        {'term': {'message_type': 3}}
                    ],
                    'must':
                        # {'term':{'name': topic}},
                        {'range': {'timestamp': {'gte': start_ts, 'lt': end_ts}}}
                }
            },
            'size': 1000,  # result-size cap; to be removed
            'sort': {"timestamp": {"order": "asc"}}
        }
        es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
        #print es_search_weibos
        user_list = []
        time_top_nodes = es_search_weibos
        if not time_top_nodes:
            print 'search error'
        else:
            # print 'time_top_nodes:', time_top_nodes
            s = 0
            domain_count_list, domain_user_list = init_domain_list()
            print 'start_node:'
            # print time_top_nodes[1]
            uid_package = []
            for node in time_top_nodes:
                # print 'node:', node
                node = node['_source']
                uid = node['uid']
                #print uid
                #user_domain = uid2domain(uid, topic)  # pass the topic to fetch all of the user's weibos on it
                #print user_domain
                timestamp = node['timestamp']
                #print timestamp
                if not uid in uid_package:
                    uid_package.append(uid)
                else:
                    continue
                #print 'start geting user info'
                user_info = get_user_info(uid)  # user info for the earliest weibos
                #print 'end geting user info'
                user_weibos = get_topicweibo_byid(uid, topic)
                save_first_nodes(topic, date, windowsize, uid, timestamp, user_info, user_weibos)

def get_pushers(topic, new_peaks, new_bottom, ts_list):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    # two timestamps: start and end, bottom and peak; currently reversed for unknown reasons
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    print end_ts - begin_ts
    print p_during
    print interval
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        #print '383', begin_ts, over_ts
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic == topic,
                                                        PropagateCount.end <= over_ts,
                                                        PropagateCount.end > begin_ts,
                                                        PropagateCount.range == unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    print results
    print p_ts_list
    try:
        max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # the point with the steepest growth
    except:
        max_k_timestamp = end_ts
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    query_body = {
        'query': {
            'bool': {
                'must':
                    # {'term':{'name': topic}},
                    {'range': {'timestamp': {'gte': end, 'lt': end + 3600}}}
            }
        },
        'size': 1000000,  # result-size cap; to be removed
        'sort': {"timestamp": {"order": "asc"}}
    }
    es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
    #xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    #query_dict = {'timestamp':{'$gt':end, '$lt':end+3600}}
    # Below: among all weibos in the steepest-growth hour, pick the users with the most reposts.
    #results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count'])
    results = es_search_weibos
    print 'pusher_search_count:', len(results)
    #print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results:
        count += 1
        if count > 100:
            break
        wid = result['_source']['mid']
        uid = result['_source']['uid']
        value = result['_source']['retweeted']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list

def sort_makers(keyword_data, begin_ts, end_ts, ts_list, topic):
    begin_ts = begin_ts - Hour
    #query_dict = {'timestamp':{'$gt': begin_ts, '$lt': end_ts}}
    print '323', begin_ts, end_ts, topic
    query_body = {
        'query': {
            'bool': {
                'should': [
                    {'term': {'message_type': 1}},
                    {'term': {'message_type': 3}}  # uncertain; to be removed
                ],
                'must':
                    # {'term':{'name': topic}},
                    {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}}
            }
        },
        'size': 1000000,  # result-size cap; to be removed
        'sort': {"timestamp": {"order": "asc"}}
    }
    es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
    num = 0
    print 'len(es_search_weibos):', len(es_search_weibos)
    if len(es_search_weibos) == 0:
        return []
    weibo_term = {}
    #print es_search_weibos
    for weibo in es_search_weibos:
        #print weibo
        num += 1
        if num > fu_tr_top_keyword:
            break
        uid = weibo['_source']['uid']
        wid = weibo['_source']['mid']
        #terms_list = weibo['_source']['terms']
        terms_dict = json.loads(weibo['_source']['keywords_dict'])
        #print '****************', type(terms_dict)
        key_term_count = []
        key_term = []
        for item in terms_dict:
            key_term.append(item)
            key_term_count.append(terms_dict[item])
        weibo_term[uid] = [wid, key_term_count, key_term]
    sort_weibo_term = sorted(weibo_term.items(), key=lambda x: x[1][1], reverse=True)
    return sort_weibo_term[:fu_tr_top_keyword]

def cityTopic(topic, start_ts, over_ts, during=Fifteenminutes, n_limit=TOP_WEIBOS_LIMIT):
    if topic and topic != '':
        #start_ts = int(start_ts)
        #over_ts = int(over_ts)
        #over_ts = ts2HourlyTime(over_ts, during)
        #interval = (over_ts - start_ts) / during
        geo_cityTopic_results = {}
        geo_cityTopic_results['geo_weibos'] = {}
        geo_cityTopic_results['geo_cityCount'] = {}
        province_dict = {}
        for k, v in mtype_kv.iteritems():  # v is the message type: repost, comment or original
            first_item = {}
            query_body = {
                # fetch weibos of this message_type
                'query': {
                    'bool': {
                        'must': [
                            {'term': {'message_type': v}},
                            {'range': {'timestamp': {'gte': start_ts, 'lt': over_ts}}}
                        ]
                    }
                },
                'sort': {SORT_FIELD: {"order": "desc"}},
                'size': 10000000
            }
            mtype_weibo = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
            #save_ws_results(topic, end_ts, during, n_limit, mtype_weibo)  # save the weibos directly
            if len(mtype_weibo) == 0:
                continue
            first_item = mtype_weibo[0]['_source']
            # count, per location, the number of weibos of each type
            count_i = 0
            for weibo in mtype_weibo:  # for each weibo
                count_i += 1
                try:
                    geo = weibo['_source']['geo'].encode('utf8')
                except:
                    continue
                #print geo, type(geo)
                province, city = split_city(geo)
                #print province, city
                if count_i <= n_limit:
                    try:
                        geo_cityTopic_results['geo_weibos'][v].append([province, city, weibo])
                    except:
                        geo_cityTopic_results['geo_weibos'][v] = [[province, city, weibo]]
                if province != 'unknown':
                    try:
                        province_dict[province][city] += 1
                    except:
                        try:
                            province_dict[province][city] = 1
                        except:
                            province_dict[province] = {city: 1}
                    try:
                        province_dict[province]['total'] += 1
                    except:
                        try:
                            province_dict[province]['total'] = 1
                        except:
                            province_dict[province] = {'total': 1}
            geo_cityTopic_results['geo_cityCount'][v] = province_dict
        return geo_cityTopic_results

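# --- Reference note (added) ---
# Sketch of the structure cityTopic() returns, for readers of the counting code above:
#   {'geo_weibos':    {mtype: [[province, city, hit], ...]},                  # top n_limit weibos per type
#    'geo_cityCount': {mtype: {province: {city: count, 'total': count}}}}
# Note that province_dict is created once outside the message-type loop, so every
# mtype entry in 'geo_cityCount' references the same accumulated dict.
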
def get_interval_count(topic, date, windowsize):
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results
    results = []
    ts_list = []
    start_date = ts2datetime(datetime2ts(date) - windowsize * Day)
    unit = 900
    print 'start_date:', start_date
    start_ts = datetime2ts(start_date)
    ts_list = [start_ts]
    end_ts = datetime2ts(date)
    interval = (end_ts - start_ts) / during
    print 'interval:', interval
    print topic
    '''
    if MYSQL_TOPIC_LEN == 0:
        topic0 = topic[:20]
    else:
        topic0 = topic
    '''
    for i in range(interval, 0, -1):
        #print 'i:', i
        begin_ts = long(end_ts) - during * i
        over_ts = begin_ts + during
        #print 'begin_ts:', ts2date(begin_ts)
        #print 'over_ts:', ts2date(over_ts)
        ts_list.append(over_ts)
        '''
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic0 ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        '''
        '''
        query_body = {
            'query':{
                'bool':{
                    'must':[
                        {'range':{'end_ts':{'gt':begin_ts,'lte':over_ts}}},
                        {'term':{'en_name':topic0}},
                        {'term':{'range':unit}}
                    ]
                }
            },
            'size': 1000000  # result-size cap; to be removed
        }
        items = weibo_es.search(index=index_name,doc_type=index_type,body=query_body)['hits']['hits']
        '''
        query_body = {
            'query': {
                'bool': {
                    'must': [{'term': {'en_name': topic}}]
                }
            },
            'size': 1000000
        }
        es_results = weibo_es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
        #print 'results::::::::::', results
        print 'len_results:::::::::::', len(es_results)
        count = 0
        for result in es_results:
            result = result['_source']
            time_results = json.loads(result['time_results'])
            count_results = time_results['count']
            print 'type_time_results:::::::', type(time_results)
            time_time = time_results.keys()
            time_time.sort()  # sort in place, then print (list.sort() returns None)
            print 'time_results.keys:::::', time_time
            if time_results['during'] == unit:
                print 'count_results.keys():::::', count_results.keys()
                for end_ts_count in count_results.keys():
                    if end_ts_count > begin_ts and end_ts_count <= over_ts:
                        count += 1
        '''
        if items:
            result = len(items)
        else:
            result = 0
        results.append(float(result))
        '''
        '''
        if count:
            result = count
        else:
            result = 0
        '''
        results.append(float(count))
    print 'results::::::::::', results
    print 'detect_peak_bottom_line::::::', results
    new_zeros = detect_peaks(results)    # indices of the time intervals where peaks occur
    new_bottom = detect_bottom(results)  # get the first bottom
    print 'new_zeros:::::::::::::::::', new_zeros
    print 'new_bottom::::::::::::::::', new_bottom
    print 'ts_list:::::::::::::::::::', ts_list
    # save the trend time range
    # save_peak_bottom(new_zeros, new_bottom)
    #trend_maker = get_makers(topic, new_zeros, new_bottom, ts_list, topic_xapian_id)
    trend_maker = get_makers(topic, new_zeros, new_bottom, ts_list)
    print 'trend_makers:', trend_maker
    trend_pusher = get_pushers(topic, new_zeros, new_bottom, ts_list)
    print 'trend_pushers:', trend_pusher
    #save_trend_maker(topic, date, windowsize, trend_maker)
    maker_results = save_trend_maker_es(topic, date, windowsize, trend_maker)
    #save_trend_pusher(topic, date, windowsize, trend_pusher)
    pusher_results = save_trend_pusher_es(topic, date, windowsize, trend_pusher)
    return maker_results, pusher_results

def get_pushers(topic, new_peaks, new_bottom, ts_list):
    #unit = 900
    #p_during = Hour
    counts_dict = {}
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    # two timestamps: start and end, bottom and peak; currently reversed for unknown reasons
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    print end_ts - begin_ts
    print p_during
    print interval
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        #print '383', begin_ts, over_ts
        p_ts_list.append(over_ts)
        '''
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        '''
        index_name = index_event_analysis_results
        index_type = type_event_analysis_results
        '''
        query_body = {
            'query':{
                'bool':{
                    'must':[
                        {'range':{'end_ts':{'gt':begin_ts,'lte':over_ts}}},
                        {'term':{'en_name':topic}},
                        {'term':{'range':unit}}
                    ]
                }
            },
            'size': 1000000  # result-size cap; to be removed
        }
        '''
        query_body = {
            'query': {
                'bool': {
                    'must': [{'term': {'en_name': topic}}]
                }
            },
            'size': 1000000  # result-size cap; to be removed
        }
        items = weibo_es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
        if items:
            for item in items:
                #kcount_dict = parseKcount(item['_source']['kcount'])
                #time_results = json.loads(item['_source']['time_results'])
                time_results = item['_source']['time_results']
                count_items = time_results['count']
                for count_item in count_items:
                    for k, v in count_item.iteritems():
                        if k > begin_ts and k <= end_ts:
                            during = v['during']
                            if during == unit:
                                for key, value in v.iteritems():
                                    if key not in ['during']:
                                        try:
                                            counts_dict[key] += value
                                        except KeyError:
                                            counts_dict[key] = value
        '''
        range = count_items['range']
        if range == unit:
            for k,v in count_items.iteritems():
                if k not in ['range']:
                    if k > begin_ts and k <= end_ts:
                        for key,value in count_items[k].iteritems():
                            try:
                                counts_dict[key] += value
                            except KeyError:
                                counts_dict[key] = value
        '''
        #items = weibo_es.search(index=index_name,doc_type=index_type,body=query_body)['hits']['hits']
        '''
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        '''
        result = 0  # initialize before accumulating (was missing in the original)
        for k, v in counts_dict.iteritems():
            result += v
        results.append(float(result))
    #print 'pusher_line:', results
    print results
    print p_ts_list
    try:
        max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # the point with the steepest growth
    except:
        max_k_timestamp = end_ts
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    query_body = {
        'query': {
            'bool': {
                'must':
                    # {'term':{'name': topic}},
                    {'range': {'timestamp': {'gte': end, 'lt': end + 3600}}}
            }
        },
        'size': 1000000,  # result-size cap; to be removed
        'sort': {"timestamp": {"order": "asc"}}
    }
    es_search_weibos = weibo_es.search(index=topic, doc_type=weibo_index_type, body=query_body)['hits']['hits']
    #xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    #query_dict = {'timestamp':{'$gt':end, '$lt':end+3600}}
    # Below: among all weibos in the steepest-growth hour, pick the users with the most reposts.
    #results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count'])
    results = es_search_weibos
    print 'pusher_search_count:', len(results)
    #print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results:
        count += 1
        if count > 100:
            break
        wid = result['_source']['mid']
        uid = result['_source']['uid']
        value = result['_source']['retweeted']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list

def get_keyword(topic, begin_ts, end_ts, top):
    kcounts_dict = {}
    #unit = 900  # PropagateKeywords unit=900
    #limit = 50
    limit = fu_tr_top_keyword
    '''
    if MYSQL_TOPIC_LEN == 0:
        topic = topic[:20]
    '''
    #print 'get_keywords begin_ts:', begin_ts
    #print 'get_keywords end_ts:', end_ts
    print topic, unit, limit
    '''
    items = db.session.query(PropagateKeywords).filter(PropagateKeywords.end>begin_ts ,\
                                                       PropagateKeywords.end<=end_ts ,\
                                                       PropagateKeywords.topic==topic ,\
                                                       PropagateKeywords.range==unit ,\
                                                       PropagateKeywords.limit==limit).all()
    '''
    index_name = index_event_analysis_results
    index_type = type_event_analysis_results
    '''
    query_body = {
        'query':{
            'bool':{
                'must':[
                    {'range':{'end_ts':{'gt':begin_ts,'lte':end_ts}}},
                    {'term':{'en_name':topic}},
                    {'term':{'range':unit}},
                    {'term':{'limit':limit}}
                ]
            }
        },
        'size': 1000000  # result-size cap; to be removed
    }
    '''
    query_body = {
        'query': {
            'bool': {
                'must': [{'term': {'en_name': topic}}]
            }
        },
        'size': 1000000  # result-size cap; to be removed
    }
    items = weibo_es.search(index=index_name, doc_type=index_type, body=query_body)['hits']['hits']
    if items:
        for item in items:
            #kcount_dict = parseKcount(item['_source']['kcount'])
            print 'time_results::::::', type(item['_source']['time_results'])
            time_results = json.loads(item['_source']['time_results'])
            print 'time_results::::::', type(time_results)
            kcount_items = time_results['kcount']
            during = time_results['during']
            k_limit = time_results['k_limit']
            print 'type(kcount_items)::::::', type(kcount_items)
            for key, kcount_item in kcount_items.iteritems():
                #print 'kcount_item::::::::::::::::::', kcount_item
                if key > begin_ts and key <= end_ts:
                    if during == unit and k_limit == limit:
                        for k, v in kcount_item.iteritems():
                            try:
                                kcounts_dict[k] += v
                            except KeyError:
                                kcounts_dict[k] = v
        '''
        for kcount_item in kcount_items:
            print 'kcount_item::::::::::::::::::', kcount_item
            print 'type(kcount_item)::::::::', type(kcount_item)
            for k,v in kcount_item.iteritems():
                if k > begin_ts and k <= end_ts:
                    #during = v['during']
                    #limit = v['limit']
                    if during == unit and k_limit == limit:
                        for key,value in v.iteritems():
                            #if key not in ['during','limit']:
                            try:
                                kcounts_dict[key] += value
                            except KeyError:
                                kcounts_dict[key] = value
        '''
        '''
        range = kcount_items['range']
        limit = kcount_items['limit']
        if range == unit and limit == limit:
            for k,v in kcount_items.iteritems():
                if k not in ['range','limit']:
                    if k > begin_ts and k <= end_ts:
                        for key,value in kcount_items[k].iteritems():
                            try:
                                kcounts_dict[key] += value
                            except KeyError:
                                kcounts_dict[key] = value
        for k,v in kcount_dict.iteritems():
            try:
                kcounts_dict[k] += v
            except KeyError:
                kcounts_dict[k] = v
        '''
        keyword_data = _top_keywords(kcounts_dict, top)
    else:
        keyword_data = []
    return keyword_data