def get_symbol_weibo(task_source, task_id, start_ts, end_ts, unit=MinInterval):
    """Collect up to three titled posts per topic cluster (fishbone chart data).

    Looks up the document whose ``name`` equals ``task_id`` in the topics
    river index, decodes its ``features`` and ``cluster_dump_dict`` JSON
    fields, and for every cluster keeps at most three posts that

    * contain a title wrapped in full-width square brackets (see the
      pattern used below),
    * have a timestamp inside ``[start_ts, end_ts]`` (both ends inclusive),
    * carry a title not already kept for that cluster.

    Args:
        task_source: Data source name; ``'weibo'`` selects ``S_DATE`` in
            test mode, any other value selects ``S_DATE_FB``.
        task_id: Value matched against the ``name`` field of the index.
        start_ts: Lower bound of the window. Replaced in test mode by
            five days before the configured test date.
        end_ts: Upper bound of the window. Not altered in test mode.
        unit: Currently unused. The original author's note says the lower
            bound should eventually become ``end_ts - unit``; the wider
            window is used because recent data was not available.

    Returns:
        dict mapping ``features[clusterid][0]`` to the list of selected
        post dicts for that cluster. Clusters with no selected post are
        absent.

    Raises:
        IndexError: if the search returns no hit for ``task_id``.
    """
    if S_TYPE == 'test':
        # Test mode pins only the start of the window; end_ts stays as given.
        test_date = S_DATE if task_source == 'weibo' else S_DATE_FB
        start_ts = datetime2ts(test_date) - 5 * 24 * 3600

    query_body = {'query': {'bool': {'must': [{'term': {'name': task_id}}]}}}
    # One round trip is enough: the same query used to be sent twice.
    hits = es_intel.search(index=topics_river_index_name,
                           doc_type=topics_river_index_type,
                           body=query_body)['hits']['hits']
    symbol = hits[0]['_source']
    features = json.loads(symbol['features'])
    symbol_weibos = json.loads(symbol['cluster_dump_dict'])

    weibos = {}
    for clusterid, contents in symbol_weibos.iteritems():
        seen_titles = set()  # titles already kept for this cluster
        for post in contents:
            ts = full_datetime2ts(post['datetime'])
            title_match = re.search(r'【.*】', post['content'].encode('utf8'))
            if title_match is None:
                # Posts without a bracketed title are never selected.
                continue
            title = title_match.group(0)
            # TODO: switch the lower bound to end_ts - unit once data for the
            # most recent interval exists (per the original author's note).
            if title in seen_titles or not (start_ts <= ts <= end_ts):
                continue
            weibos.setdefault(features[clusterid][0], []).append(post)
            seen_titles.add(title)
            if len(seen_titles) == 3:
                # Three distinct titles per cluster is the cap.
                break
    return weibos
def get_topics_river(task_source, task_id, start_ts, end_ts, unit=MinInterval):
    """Build the topic river series: per-cluster hit counts over time.

    Fetches the ``features`` JSON of the document whose ``name`` equals
    ``task_id`` from the topics river index, asks
    ``cul_key_weibo_time_count`` for the time series of every cluster, and
    re-keys that result by the first entry of each cluster's feature list.

    Args:
        task_source: Forwarded unchanged to ``cul_key_weibo_time_count``.
        task_id: Value matched against the ``name`` field; also forwarded.
        start_ts: Start of the requested range, forwarded.
        end_ts: End of the requested range, forwarded.
        unit: Bucket width, forwarded.

    Returns:
        dict mapping ``features[cluster][0]`` to whatever
        ``cul_key_weibo_time_count`` produced for that cluster. Clusters
        with an empty feature list are skipped; if two clusters share the
        same first feature, the one iterated last wins.

    Raises:
        IndexError: if the search returns no hit for ``task_id``.
    """
    lookup = {'query': {'bool': {'must': [{'term': {'name': task_id}}]}}}
    response = es_intel.search(index=topics_river_index_name,
                               doc_type=topics_river_index_type,
                               body=lookup)
    first_hit = response['hits']['hits'][0]
    news_topics = json.loads(first_hit['_source']['features'])

    counts_by_cluster = cul_key_weibo_time_count(
        task_source, task_id, news_topics, start_ts, end_ts, unit)

    return dict(
        (cluster_words[0], counts_by_cluster[cluster_id])
        for cluster_id, cluster_words in news_topics.iteritems()
        if len(cluster_words) > 0
    )
def cul_key_weibo_time_count(task_source, task_id, news_topics, start_ts, over_ts, during):
    """Count keyword-matching documents per time bucket for every cluster.

    For each cluster with a non-empty keyword list, walks consecutive
    buckets of width ``Day`` ending at the floored ``over_ts`` and, for each
    bucket, asks Elasticsearch (index ``task_id``, doc type ``task_source``)
    how many documents have ``timestamp`` in ``[begin_ts, end_ts)`` and a
    ``keywords_string`` matching ``*word*`` for at least one keyword.

    Args:
        task_source: Doc type to search; ``'weibo'`` selects ``S_DATE`` in
            test mode, any other value selects ``S_DATE_FB``.
        task_id: Name of the index to search.
        news_topics: dict mapping cluster id to a list of keyword strings.
        start_ts: Start of the range. Replaced in test mode by five days
            before the configured test date.
        over_ts: End of the range. Replaced in test mode by the configured
            test date; floored with ``ts2HourlyTime`` before use.
        during: Ignored. The bucket width is always overridden with
            ``Day`` (kept as-is from the original implementation).

    Returns:
        dict mapping cluster id to a list of ``(label, count)`` tuples
        sorted by label, where ``label`` is ``ts2datetime(end_ts)`` of the
        bucket and ``count`` is ``hits.total`` of the search. Clusters with
        an empty keyword list are absent.
    """
    if S_TYPE == 'test':
        # Test mode pins the range to the five days before the test date.
        if task_source == 'weibo':
            over_ts = datetime2ts(S_DATE)
        else:
            over_ts = datetime2ts(S_DATE_FB)
        start_ts = over_ts - 5 * 24 * 3600

    # The caller-supplied width is deliberately discarded: buckets are
    # always one Day wide.
    during = Day

    key_weibo_time_count = {}
    for clusterid, keywords in news_topics.iteritems():
        # Example item:
        # {u'd2e97cf7-fc43-4982-8405-2d215b3e1fea': [u'\u77e5\u8bc6', u'\u5e7f\u5dde', u'\u9009\u624b']}
        if len(keywords) == 0:
            continue

        # Normalised per cluster exactly as before; the names are rebound,
        # so later clusters start from the already-floored over_ts.
        start_ts = int(start_ts)
        over_ts = ts2HourlyTime(int(over_ts), during)
        # Whole buckets only; a partial bucket at the start is dropped.
        interval = (over_ts - start_ts) // during

        # The keyword clause does not depend on the bucket, so build it once.
        keyword_clause = {
            'bool': {
                'should': [
                    {'wildcard': {'keywords_string': '*' + word + '*'}}
                    for word in keywords
                ]
            }
        }

        # Fresh per cluster so one cluster's buckets can never leak into the
        # series reported for another.
        time_dict = {}
        # NOTE: the original comment mentioned 900-second windows; the width
        # actually used is `during`, i.e. Day.
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            query_body = {
                'query': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': begin_ts, 'lt': end_ts}}},
                            keyword_clause,
                        ]
                    }
                }
            }
            key_weibo = es_intel.search(index=task_id, doc_type=task_source, body=query_body)
            # Number of matching documents for this cluster in this bucket.
            time_dict[ts2datetime(end_ts)] = key_weibo['hits']['total']

        key_weibo_time_count[clusterid] = sorted(time_dict.items(), key=lambda x: x[0])
    return key_weibo_time_count