def search_topics(keyword):
    """Search the topic index for topics whose name contains *keyword*.

    A ``*keyword*`` wildcard is matched against both the ``en_name`` and the
    ``name`` field; the two clauses are combined with ``should``.

    :param keyword: text concatenated as-is into the wildcard pattern.
    :return: JSON-encoded list with the ``_source`` of every hit.
    """
    pattern = '*' + keyword + '*'
    query_body = {
        'query': {
            'bool': {
                'should': [
                    {'wildcard': {'en_name': pattern}},
                    {'wildcard': {'name': pattern}},
                ]
            }
        }
    }
    print(query_body)
    response = weibo_es.search(index=topic_index_name,
                               doc_type=topic_index_type,
                               body=query_body)
    if not response:
        return json.dumps([])
    return json.dumps([hit['_source'] for hit in response['hits']['hits']])
def get_during_keywords(topic, start_ts, end_ts):
    """Aggregate keyword weights of a topic's weibos (keyword cloud data).

    Every document of index *topic* whose ``timestamp`` lies in
    ``[start_ts, end_ts)`` contributes its JSON-encoded ``keywords_dict``;
    the weights of equal keywords are summed.

    :param topic: name of the index to search.
    :param start_ts: inclusive lower bound for ``timestamp``.
    :param end_ts: exclusive upper bound for ``timestamp``.
    :return: JSON-encoded list of ``[keyword, weight]`` pairs, heaviest
        first, truncated to ``MAX_FREQUENT_WORDS`` entries.
    """
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        'timestamp': {'gte': start_ts, 'lt': end_ts}
                    }
                }
            }
        },
        'size': MAX_LANGUAGE_WEIBO
    }
    hits = weibo_es.search(index=topic,
                           doc_type=weibo_index_type,
                           body=query_body)['hits']['hits']

    totals = {}
    for hit in hits:
        per_weibo = json.loads(hit['_source']['keywords_dict'])
        for word, weight in per_weibo.items():
            # Explicit membership test instead of a bare ``except:`` so that
            # a genuine error while adding is no longer silently turned into
            # a reset of the running total.
            if word in totals:
                totals[word] += weight
            else:
                totals[word] = weight

    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return json.dumps(ranked[:MAX_FREQUENT_WORDS])
def get_subopinion(topic, start_ts, end_ts):
    """Return the sub-opinion features stored for a topic and time window.

    The sub-opinion index is filtered on exact ``name``, ``start_ts`` and
    ``end_ts`` values and the ``features`` JSON of the first hit is decoded.

    NOTE(review): a one-argument ``get_subopinion(topic)`` is defined further
    down in this module. Python binds a name to its latest definition, so as
    long as both exist that later function replaces this one at import time.

    :param topic: topic name to match on the ``name`` field.
    :param start_ts: value the ``start_ts`` field must equal.
    :param end_ts: value the ``end_ts`` field must equal.
    :return: the values of the decoded ``features`` mapping, or the string
        ``'no results'`` when nothing matches.
    """
    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [
                            {'term': {'name': topic}},
                            {'term': {'start_ts': start_ts}},
                            {'term': {'end_ts': end_ts}},
                        ]
                    }
                }
            }
        }
    }
    hits = weibo_es.search(index=subopinion_index_name,
                           doc_type=subopinion_index_type,
                           body=query_body)['hits']['hits']
    if not hits:
        return 'no results'
    return json.loads(hits[0]['_source']['features']).values()
def _group_topic_hits(hits):
    """Group topic hits by ``en_name``.

    Each hit becomes a ``[name, start_ts, end_ts, comput_status]`` row that
    is appended to the list stored under the hit's ``en_name``.
    """
    grouped = {}
    for hit in hits:
        source = hit['_source']
        row = [source['name'], source['start_ts'],
               source['end_ts'], source['comput_status']]
        grouped.setdefault(source['en_name'], []).append(row)
    return grouped


def get_topics(user):
    """List the topics recommended to *user* and the topics *user* submitted.

    ``recommend`` holds topics with ``comput_status == 1`` that were not
    submitted by *user* (newest ``submit_ts`` first, at most 1000 hits);
    ``own`` holds topics whose ``submit_user`` is *user* (at most 1000 hits).

    :param user: value compared against the ``submit_user`` field.
    :return: JSON-encoded ``{'recommend': {...}, 'own': {...}}`` where each
        inner mapping goes from ``en_name`` to a list of
        ``[name, start_ts, end_ts, comput_status]`` rows.
    """
    results = {'recommend': {}, 'own': {}}

    query_recommend = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{'term': {'comput_status': 1}}],
                        'must_not': [{'term': {'submit_user': user}}]
                    }
                }
            }
        },
        'sort': {'submit_ts': {'order': 'desc'}},
        'size': 1000
    }
    recommended = weibo_es.search(index=topic_index_name,
                                  doc_type=topic_index_type,
                                  body=query_recommend)
    if recommended:
        results['recommend'] = _group_topic_hits(recommended['hits']['hits'])

    query_own = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {'submit_user': user}
                }
            }
        },
        'size': 1000
    }
    own_topics = weibo_es.search(index=topic_index_name,
                                 doc_type=topic_index_type,
                                 body=query_own)
    if own_topics:
        results['own'] = _group_topic_hits(own_topics['hits']['hits'])

    return json.dumps(results)
def get_subopinion(topic):
    """Return the sub-opinion features stored for *topic*.

    The sub-opinion index is filtered on an exact ``name`` match and the
    ``features`` JSON of the first hit is decoded.

    :param topic: topic name to match on the ``name`` field.
    :return: the values of the decoded ``features`` mapping, or the string
        ``'no results'`` when nothing matches.
    """
    name_filter = {'term': {'name': topic}}
    query_body = {'query': {'filtered': {'filter': name_filter}}}
    response = weibo_es.search(index=subopinion_index_name,
                               doc_type=subopinion_index_type,
                               body=query_body)
    hits = response['hits']['hits']
    if not hits:
        return 'no results'
    decoded = json.loads(hits[0]['_source']['features'])
    return decoded.values()
def get_weibo_by_time(topic, start_ts, end_ts, sort_item='timestamp'):
    """Fetch up to 200 weibos of a topic inside a time window.

    The search itself is ordered by *sort_item* descending, which decides
    WHICH 200 documents are returned. The returned list, however, is ordered
    by ``retweeted`` ascending, independent of *sort_item*.

    :param topic: name of the index to search.
    :param start_ts: inclusive lower bound for ``timestamp`` (cast to int).
    :param end_ts: inclusive upper bound for ``timestamp`` (cast to int).
    :param sort_item: field the search is sorted on, descending.
    :return: list of ``(mid, weibo_content)`` tuples, or ``[]`` when nothing
        matches. ``weibo_content`` has the keys ``text``, ``uid``,
        ``timestamp``, ``comment``, ``retweeted``, ``uname``, ``photo_url``
        and ``mid``.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'range': {'timestamp': {'lte': int(end_ts),
                                             'gte': int(start_ts)}}}
                ]
            }
        },
        'size': 200,
        'sort': {sort_item: {'order': 'desc'}}
    }
    hits = weibo_es.search(index=topic, body=query_body)['hits']['hits']
    if not hits:
        return []

    weibo_dict = {}
    for hit in hits:
        weibo = hit['_source']
        weibo_content = {
            'text': weibo['text'],
            'uid': weibo['uid'],
            'timestamp': weibo['timestamp'],
            # Counters are optional in the stored document; default to 0.
            'comment': weibo.get('comment', 0),
            'retweeted': weibo.get('retweeted', 0),
        }
        try:
            user = es_user_profile.get(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       id=weibo_content['uid'])['_source']
            weibo_content['uname'] = user['nick_name']
            weibo_content['photo_url'] = user['photo_url']
        except Exception:
            # Best effort: a failed or incomplete profile lookup must not
            # drop the weibo, so both fields fall back to a placeholder.
            weibo_content['uname'] = 'unknown'
            weibo_content['photo_url'] = 'unknown'
        weibo_content['mid'] = weibo['mid']
        # Keyed by mid, so a repeated mid keeps only the last document seen.
        weibo_dict[weibo_content['mid']] = weibo_content

    return sorted(weibo_dict.items(), key=lambda pair: pair[1]['retweeted'])
def get_topics_river(topic, start_ts, end_ts, unit=MinInterval):
    """Build the data for the topic-river chart of *topic*.

    The topics-river index is searched for a document named *topic* whose
    stored window covers ``[start_ts, end_ts]``. Its ``features`` JSON maps a
    cluster id to a keyword list; per-bucket counts for every cluster come
    from ``cul_key_weibo_time_count``.

    :param topic: topic name; also passed on as the index to count in.
    :param start_ts: start of the requested window.
    :param end_ts: end of the requested window.
    :param unit: bucket width handed to ``cul_key_weibo_time_count``.
    :return: mapping from the first keyword of each non-empty cluster to the
        value ``cul_key_weibo_time_count`` produced for that cluster; ``{}``
        when no document matches.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'name': topic}},
                    {'range': {'start_ts': {'lte': start_ts}}},
                    {'range': {'end_ts': {'gte': end_ts}}},
                ]
            }
        }
    }
    # Search once and reuse the hits.
    hits = weibo_es.search(index=topics_river_index_name,
                           doc_type=topics_river_index_type,
                           body=query_body)['hits']['hits']
    if not hits:
        return {}

    news_topics = json.loads(hits[0]['_source']['features'])
    zhutihe_results = cul_key_weibo_time_count(topic, news_topics,
                                               start_ts, end_ts, unit)
    results = {}
    for clusterid, keywords in news_topics.items():
        if len(keywords) > 0:
            results[keywords[0]] = zhutihe_results[clusterid]
    return results
def get_symbol_weibo(topic, start_ts, end_ts, unit=MinInterval):
    """Pick representative weibos per cluster for the fishbone chart.

    The topics-river document of *topic* covering ``[start_ts, end_ts]``
    supplies ``features`` (cluster id -> keyword list) and
    ``cluster_dump_dict`` (cluster id -> list of weibos). For every cluster
    at most three weibos are kept: those dated inside the window whose
    bracketed title has not been seen yet in that cluster.

    :param topic: topic name to match on the ``name`` field.
    :param start_ts: inclusive start of the window.
    :param end_ts: inclusive end of the window.
    :param unit: currently unused, see the TODO below.
    :return: mapping from the first keyword of a cluster to the list of
        selected weibos; ``{}`` when no document matches.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'name': topic}},
                    {'range': {'start_ts': {'lte': start_ts}}},
                    {'range': {'end_ts': {'gte': end_ts}}},
                ]
            }
        }
    }
    hits = weibo_es.search(index=topics_river_index_name,
                           doc_type=topics_river_index_type,
                           body=query_body)['hits']['hits']
    if not hits:
        return {}

    symbol = hits[0]['_source']
    features = json.loads(symbol['features'])
    symbol_weibos = json.loads(symbol['cluster_dump_dict'])

    weibos = {}
    for clusterid, contents in symbol_weibos.items():
        seen_titles = set()
        for item in contents:
            titles = re.findall(r'【.*】', item['content'].encode('utf8'))
            if not titles:
                # No bracketed title: nothing to de-duplicate on, skip it.
                continue
            title = titles[0]
            ts = full_datetime2ts(item['datetime'])
            # TODO: the lower bound was meant to be ``end_ts - unit``; the
            # whole window is used because the most recent interval has no
            # data yet.
            if start_ts <= ts <= end_ts and title not in seen_titles:
                weibos.setdefault(features[clusterid][0], []).append(item)
                seen_titles.add(title)
                if len(seen_titles) == 3:
                    break
    return weibos
def search_topic_by_topic(topic):
    """Look up topic documents whose ``name`` equals *topic*.

    :param topic: exact value for the ``name`` term query.
    :return: JSON-encoded list with the ``_source`` of every hit.
    """
    query_body = {'query': {'term': {'name': topic}}}
    response = weibo_es.search(index=topic_index_name,
                               doc_type=topic_index_type,
                               body=query_body)
    sources = []
    if response:
        sources.extend(hit['_source'] for hit in response['hits']['hits'])
    return json.dumps(sources)
def cul_key_weibo_time_count(topic, news_topics, start_ts, over_ts, during):
    """Count, per cluster and per time bucket, the weibos matching a cluster.

    For every cluster with at least one keyword the window is cut into
    buckets of width ``during`` that end at the normalised *over_ts*. A weibo
    counts for a bucket when its ``timestamp`` is in ``[begin, end)`` and its
    ``keywords_string`` wildcard-matches any keyword of the cluster.

    :param topic: name of the index to count in.
    :param news_topics: mapping cluster id -> list of keywords, e.g.
        ``{u'd2e97cf7-...': [u'...', u'...', u'...']}``.
    :param start_ts: start of the window (cast to int).
    :param over_ts: end of the window (cast to int, then normalised with
        ``ts2HourlyTime``).
    :param during: bucket width. NOTE(review): this argument is ignored, the
        width is always ``Day``. Confirm that is intended.
    :return: mapping cluster id -> list of ``(ts2datetime(bucket_end), count)``
        tuples sorted by the first element. Clusters without keywords are
        left out.
    """
    during = Day

    # Loop-invariant normalisation, done once instead of once per cluster.
    start_ts = int(start_ts)
    over_ts = ts2HourlyTime(int(over_ts), during)
    # Floor division keeps the bucket count an integer on Python 2 and 3.
    interval = (over_ts - start_ts) // during

    key_weibo_time_count = {}
    for clusterid, keywords in news_topics.items():
        if len(keywords) == 0:
            continue

        keyword_clause = {
            'bool': {
                'should': [
                    {'wildcard': {'keywords_string': '*' + word + '*'}}
                    for word in keywords
                ]
            }
        }

        # Fresh mapping per cluster so no bucket can leak between clusters.
        time_dict = {}
        for i in range(interval, 0, -1):
            begin_ts = over_ts - during * i
            end_ts = begin_ts + during
            query_body = {
                'query': {
                    'bool': {
                        'must': [
                            {'range': {'timestamp': {'gte': begin_ts,
                                                     'lt': end_ts}}},
                            keyword_clause,
                        ]
                    }
                }
            }
            key_weibo = weibo_es.search(index=topic,
                                        doc_type=weibo_index_type,
                                        body=query_body)
            # Number of weibos of this cluster inside the bucket.
            time_dict[ts2datetime(end_ts)] = key_weibo['hits']['total']

        key_weibo_time_count[clusterid] = sorted(time_dict.items(),
                                                 key=lambda pair: pair[0])
    return key_weibo_time_count
def get_weibo_content(topic, start_ts, end_ts, opinion, sort_item='timestamp'):
    """Return one representative weibo per cluster of a sub-opinion.

    The sub-opinion index is searched for a document of *topic* covering
    ``[start_ts, end_ts]`` whose ``keys`` field wildcard-matches *opinion*.
    From the ``cluster_dump_dict`` of the first hit, the first weibo of every
    cluster is taken and enriched with the author's profile.

    :param topic: topic name to match on the ``name`` field.
    :param start_ts: start of the requested window.
    :param end_ts: end of the requested window.
    :param opinion: wildcard pattern for the ``keys`` field.
    :param sort_item: key of the weibo content to sort on, descending.
    :return: list of ``(mid, weibo_content)`` tuples sorted by *sort_item*
        descending, or the string ``'no results'`` when nothing matches.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'wildcard': {'keys': opinion}},
                    {'term': {'name': topic}},
                    {'range': {'start_ts': {'lte': start_ts}}},
                    {'range': {'end_ts': {'gte': end_ts}}},
                ]
            }
        }
    }
    # Original author note: no uid could be looked up; the id differs on
    # every run.
    hits = weibo_es.search(index=subopinion_index_name,
                           doc_type=subopinion_index_type,
                           body=query_body)['hits']['hits']
    if not hits:
        return 'no results'

    clusters = json.loads(hits[0]['_source']['cluster_dump_dict'])
    weibo_dict = {}
    for cluster in clusters.values():
        if not cluster:
            # An empty cluster has no representative weibo; skip it.
            continue
        weibo = cluster[0]
        weibo_content = {
            'text': weibo['text'],
            'uid': weibo['uid'],
            'timestamp': full_datetime2ts(weibo['datetime']),
            'comment': weibo['comment'],
            'retweeted': weibo['retweeted'],
            'mid': weibo['id'],
        }
        try:
            user = es_user_profile.get(index=profile_index_name,
                                       doc_type=profile_index_type,
                                       id=weibo_content['uid'])['_source']
            weibo_content['uname'] = user['nick_name']
            weibo_content['photo_url'] = user['photo_url']
        except Exception:
            # Best effort: keep the weibo even when the profile is missing.
            weibo_content['uname'] = 'unknown'
            weibo_content['photo_url'] = 'unknown'
        weibo_dict[weibo_content['mid']] = weibo_content

    return sorted(weibo_dict.items(),
                  key=lambda pair: pair[1][sort_item],
                  reverse=True)
def get_key_topics(keyword):
    """Find computed topics whose name contains *keyword*.

    Only documents with ``comput_status == 1`` whose ``name`` matches the
    wildcard ``*keyword*`` are considered.

    :param keyword: text concatenated as-is into the wildcard pattern.
    :return: JSON-encoded mapping from ``en_name`` to a list of
        ``[name, start_ts, end_ts, comput_status]`` rows.
    """
    query_body = {
        'query': {
            'bool': {
                'must': [
                    {'term': {'comput_status': 1}},
                    {'wildcard': {'name': '*' + keyword + '*'}},
                ]
            }
        }
    }
    response = weibo_es.search(index=topic_index_name,
                               doc_type=topic_index_type,
                               body=query_body)
    grouped = {}
    if response:
        for hit in response['hits']['hits']:
            source = hit['_source']
            row = [source['name'], source['start_ts'],
                   source['end_ts'], source['comput_status']]
            # setdefault replaces the former bare ``except:`` used to create
            # the list on first sight of an en_name.
            grouped.setdefault(source['en_name'], []).append(row)
    return json.dumps(grouped)
def submit(topic, start_ts, end_ts, submit_user):
    """Register a topic computation task unless the same task exists.

    An existing topic document with the same ``name`` donates its
    ``en_name``; otherwise a new one is built from the pinyin of *topic* and
    the current time. The task id is
    ``<start_ts>_<end_ts>_<en_name>_<submit_user>``.

    :param topic: topic name.
    :param start_ts: start of the window; used verbatim in the id.
    :param end_ts: end of the window; used verbatim in the id.
    :param submit_user: submitting user; used verbatim in the id.
    :return: ``'already_have'`` when a document with the task id can be
        fetched, else ``'success'`` after the task has been indexed.
    """
    find_query = {
        'query': {
            'filtered': {
                'filter': {
                    'term': {'name': topic}
                }
            }
        }
    }
    find_topic = weibo_es.search(index=topic_index_name,
                                 doc_type=topic_index_type,
                                 body=find_query)['hits']['hits']
    if find_topic:
        en_name = find_topic[0]['_source']['en_name']
    else:
        en_name = p.get_pinyin(topic) + '-' + str(int(time.time()))

    # Formatting instead of ``+`` so numeric timestamps work as well.
    submit_id = '%s_%s_%s_%s' % (start_ts, end_ts, en_name, submit_user)
    task = {
        'name': topic,
        'en_name': en_name,
        'start_ts': start_ts,
        'end_ts': end_ts,
        'submit_user': submit_user,
        'comput_status': 0,
        'submit_ts': int(time.time())
    }

    try:
        weibo_es.get(index=topic_index_name,
                     doc_type=topic_index_type,
                     id=submit_id)['_source']
    except Exception:
        # Any failed lookup is treated as "not submitted yet".
        weibo_es.index(index=topic_index_name,
                       doc_type=topic_index_type,
                       id=submit_id,
                       body=task)
        # Original author note: push to redis, then change the status, and
        # change it back once the computation has finished.
        # NOTE(review): only new submissions are queued here. Confirm that
        # an already existing task must not be queued again.
        r.lpush(topic_queue_name, json.dumps(task))
        return 'success'
    return 'already_have'