def get_recommend_follows(task_detail):
    recommend_results = dict()
    # daily_interests_list = task_detail['daily_interests'].split(',')
    monitor_keywords_list = task_detail['monitor_keywords'].split(',')
    create_time = time.time()
    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE)
    index_name_list = get_flow_text_index_list(create_time)

    '''# FB flow_text has no daily_interests field
    ## recommendations based on daily interests
    try:
        query_body = {
            'query': {
                'filtered': {
                    'filter': {'terms': {'daily_interests': daily_interests_list}}
                }
            },
            # 'sort': {'user_fansnum': {'order': 'desc'}},
            'size': DAILY_INTEREST_TOP_USER,
            '_source': ['uid']
        }
        es_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                         body=query_body)['hits']['hits']
        daily_interest_uid_set = set()
        for result in es_results:
            daily_interest_uid_set.add(result['_source']['uid'])
        daily_interest_uid_list = list(daily_interest_uid_set)
        es_daily_interests_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                                          body={'ids': daily_interest_uid_list})['docs']
        nick_name_dict = {}
        # keep at most NICK_NAME_TOP profiles
        es_daily_interests_results = es_daily_interests_results[:min(NICK_NAME_TOP, len(es_daily_interests_results))]
        for result in es_daily_interests_results:
            if result['found']:
                result = result['_source']
                nick_name_dict[result['uid']] = result['nick_name']
        recommend_results['daily_interests'] = nick_name_dict
    except Exception as e:
        print e
        print 'no users matching the daily interests were found'
        recommend_results['daily_interests'] = {}
    '''

    ## recommendations based on monitor keywords
    nest_query_list = []
    # the text may contain English or traditional Chinese, so match those variants as well
    monitor_en_keywords_list = trans(monitor_keywords_list, target_language='en')
    for i in range(len(monitor_keywords_list)):
        monitor_keyword = monitor_keywords_list[i]
        monitor_traditional_keyword = simplified2traditional(monitor_keyword)
        if len(monitor_en_keywords_list) == len(monitor_keywords_list):  # make sure the translation did not fail
            monitor_en_keyword = monitor_en_keywords_list[i]
            nest_query_list.append({'wildcard': {'keywords_string': '*' + monitor_en_keyword + '*'}})
        nest_query_list.append({'wildcard': {'keywords_string': '*' + monitor_keyword + '*'}})
        nest_query_list.append({'wildcard': {'keywords_string': '*' + monitor_traditional_keyword + '*'}})

    try:
        query_body_monitor = {
            'query': {
                'bool': {
                    # 'must': nest_query_list
                    'should': nest_query_list
                }
            },
            # 'sort': {'user_fansnum': {'order': 'desc'}},
            'size': MONITOR_TOP_USER,
            '_source': ['uid']
        }
        es_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                         body=query_body_monitor)['hits']['hits']
        monitor_keywords_uid_set = set()
        for result in es_results:
            monitor_keywords_uid_set.add(result['_source']['uid'])
        monitor_keywords_uid_list = list(monitor_keywords_uid_set)
        es_monitor_keywords_results = es_user_profile.mget(index=profile_index_name, doc_type=profile_index_type,
                                                           body={'ids': monitor_keywords_uid_list})['docs']
        nick_name_dict = {}
        # keep at most NICK_NAME_TOP profiles
        es_monitor_keywords_results = es_monitor_keywords_results[:min(NICK_NAME_TOP, len(es_monitor_keywords_results))]
        for result in es_monitor_keywords_results:
            if result['found']:
                result = result['_source']
                nick_name_dict[result['uid']] = result['name']
        recommend_results['monitor_keywords'] = nick_name_dict
    except Exception as e:
        print e
        print 'no users matching the monitor keywords were found'
        recommend_results['monitor_keywords'] = {}

    return recommend_results
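# The simplified/traditional/English wildcard expansion above reappears in several
# functions below. A minimal sketch of how it could be factored into one helper,
# assuming trans() returns one translation per input keyword and
# simplified2traditional() converts a single string, as they are used above.
# This helper is illustrative only and is not wired into the existing functions.
def build_wildcard_should_clauses(keywords_list, field):
    # Build Elasticsearch 'wildcard' should-clauses covering the simplified form,
    # the traditional form and, when translation succeeds, the English form.
    clauses = []
    en_keywords_list = trans(keywords_list, target_language='en')
    translation_ok = len(en_keywords_list) == len(keywords_list)
    for i, keyword in enumerate(keywords_list):
        if translation_ok:
            clauses.append({'wildcard': {field: '*' + en_keywords_list[i] + '*'}})
        clauses.append({'wildcard': {field: '*' + keyword + '*'}})
        clauses.append({'wildcard': {field: '*' + simplified2traditional(keyword) + '*'}})
    return clauses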
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list, opinion_type, intel_type):
    query_item = 'text'
    nest_query_list = []
    tweets_list = []

    if task_source == 'weibo':
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time())
        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'

        for keyword in opinion_keywords_list:
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        uid_list = []
        SHOULD_PERCENT = 1  # at least one keyword clause must match

        if intel_type == 'all':
            query_body = {
                'query': {'bool': {'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT}},
                'sort': {sort_item: {'order': 'desc'}},
                'size': MAX_SEARCH_SIZE
            }
        elif intel_type == 'follow':
            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,
                                            doc_type=weibo_xnr_fans_followers_index_type,
                                            id=xnr_user_no)['_source']
                if follow_results:
                    uid_list = follow_results['followers']
            except:
                uid_list = []
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{'terms': {'uid': uid_list}}]
                    }
                },
                'sort': {sort_item: {'order': 'desc'}},
                'size': MAX_SEARCH_SIZE
            }
        elif intel_type == 'influence':
            date = ts2datetime(current_time - 24 * 3600)
            if S_TYPE == 'test':
                date = S_DATE_BCI
            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[5:7] + date[8:10]
            query_body_bci = {
                'query': {'match_all': {}},
                'sort': {'user_index': {'order': 'desc'}},
                'size': 500
            }
            weibo_bci_results = es_user_portrait.search(index=weibo_bci_index_name,
                                                        doc_type=weibo_bci_index_type,
                                                        body=query_body_bci)['hits']['hits']
            if weibo_bci_results:
                for bci_result in weibo_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{'terms': {'uid': uid_list}}]
                    }
                },
                'sort': {sort_item: {'order': 'desc'}},
                'size': MAX_SEARCH_SIZE
            }
        else:
            query_sensitive = {
                'query': {'match_all': {}},
                'aggs': {
                    'uids': {
                        'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                        'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                    }
                },
                'size': 500000
            }
            es_sensitive_result = es_flow_text.search(index=index_name_list, doc_type='text',
                                                      body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{'terms': {'uid': uid_list}}]
                    }
                },
                'sort': {sort_item: {'order': 'desc'}},
                'size': MAX_SEARCH_SIZE
            }

        # collect tweets_list
        tweets_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                             body=query_body)['hits']['hits']
        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        opinion_keywords_list = [word.encode('utf-8') for word in opinion_keywords_list]
        # the text may contain English or traditional Chinese, so match those variants as well
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)
            if len(en_keywords_list) == len(opinion_keywords_list):  # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})
        SHOULD_PERCENT = 1  # at least one keyword clause must match

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)
            if intel_type == 'all':
                query_body = {
                    'query': {'bool': {'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT}},
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }
            elif intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,
                                                doc_type=fb_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    if follow_results:
                        uid_list = follow_results['fans_list']
                except:
                    uid_list = []
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{'terms': {'uid': uid_list}}]
                        }
                    },
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }
            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(current_time)
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                fb_bci_results = es_xnr.search(index=fb_bci_index_name, doc_type=fb_bci_index_type,
                                               body=query_body_bci)['hits']['hits']
                # print 'fb_bci_results...', len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{'terms': {'uid': uid_list}}]
                        }
                    },
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }
            else:
                query_sensitive = {
                    'query': {'match_all': {}},
                    'aggs': {
                        'uids': {
                            'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                            'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                        }
                    },
                    'size': 500
                }
                es_sensitive_result = es_xnr.search(index=index_name_list, doc_type='text',
                                                    body=query_sensitive)['aggregations']['uids']['buckets']
                # print 'es_sensitive_result...', len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{'terms': {'uid': uid_list}}]
                        }
                    },
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }

            # print 'query_body...', query_body
            tweets_results = es_xnr.search(index=index_name_list, doc_type='text',
                                           body=query_body)['hits']['hits']
            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)
            if intel_type == 'all':
                query_body = {
                    'query': {'bool': {'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT}},
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }
            elif intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,
                                                doc_type=tw_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    if follow_results:
                        uid_list = follow_results['followers_list']
                except:
                    uid_list = []
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{'terms': {'uid': uid_list}}]
                        }
                    },
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }
            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(current_time)
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                tw_bci_results = es_xnr.search(index=tw_bci_index_name, doc_type=tw_bci_index_type,
                                               body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{'terms': {'uid': uid_list}}]
                        }
                    },
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }
            else:
                query_sensitive = {
                    'query': {'match_all': {}},
                    'aggs': {
                        'uids': {
                            'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                            'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                        }
                    },
                    'size': 500
                }
                es_sensitive_result = es_xnr.search(index=index_name_list, doc_type='text',
                                                    body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{'terms': {'uid': uid_list}}]
                        }
                    },
                    'sort': {sort_item: {'order': 'desc'}},
                    'size': MAX_SEARCH_SIZE
                }

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list, doc_type='text',
                                           body=query_body)['hits']['hits']
            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list, k_cluster=5)
        sub_opinion_results = dict()
        topic_keywords_list = []
        summary_text_list = []
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)
        # try:
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        # summary = summary_main(summary_text_list)
        # except:
        #     summary = ''
    else:
        sub_opinion_results = {}
        summary = ''

    print 'saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results, summary, intel_type)
    return mark
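# Usage sketch for get_opinions(): the task id, xnr_user_no and keyword below are
# hypothetical placeholders, and an 'all' weibo task is assumed; only the function
# signature above is taken from the real code.
def _example_get_opinions_call():
    example_keywords = [u'example_keyword']  # hypothetical opinion keywords
    return get_opinions(task_source='weibo', task_id='example_task_001',
                        xnr_user_no='WXNR0001', opinion_keywords_list=example_keywords,
                        opinion_type='', intel_type='all')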
def detect_by_keywords(keywords, datetime_list):
    keywords_list = []
    model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)
    # expand each seed keyword with its 20 most similar words from the word2vec model
    for word in keywords:
        simi_list = model.most_similar(word, topn=20)
        for simi_word in simi_list:
            keywords_list.append(simi_word[0])

    group_uid_list = set()
    if datetime_list == []:
        return []
    query_item = 'text'
    flow_text_index_name_list = []
    for datetime in datetime_list:
        flow_text_index_name = flow_text_index_name_pre + datetime
        flow_text_index_name_list.append(flow_text_index_name)

    nest_query_list = []
    # the text may contain English or traditional Chinese, so match those variants as well
    en_keywords_list = trans(keywords_list, target_language='en')
    for i in range(len(keywords_list)):
        keyword = keywords_list[i]
        traditional_keyword = simplified2traditional(keyword)
        if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
            en_keyword = en_keywords_list[i]
            nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})

    count = MAX_DETECT_COUNT
    if len(nest_query_list) == 1:
        SHOULD_PERCENT = 1    # absolute threshold: the single keyword clause must match
    else:
        SHOULD_PERCENT = '3'  # with multiple expanded keywords, require at least 3 clauses to match

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT,
                # 'must_not': {'terms': {'uid': white_uid_list}}
            }
        },
        'aggs': {
            'all_uids': {
                'terms': {'field': 'uid', 'order': {'_count': 'desc'}, 'size': count}
            }
        }
    }
    es_results = es_flow_text.search(index=flow_text_index_name_list, doc_type=flow_text_index_type,
                                     body=query_body, request_timeout=999999)['aggregations']['all_uids']['buckets']
    for bucket in es_results:
        group_uid_list.add(bucket['key'])
    group_uid_list = list(group_uid_list)
    return group_uid_list
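# Usage sketch for detect_by_keywords(): the seed keywords and the date suffixes are
# hypothetical; the date strings are assumed to match whatever suffix format
# flow_text_index_name_pre expects (the surrounding code suggests 'YYYY-MM-DD').
def _example_detect_by_keywords_call():
    seed_keywords = [u'example_keyword']          # hypothetical seed words for word2vec expansion
    datetime_list = ['2018-05-13', '2018-05-14']  # hypothetical index-date suffixes
    return detect_by_keywords(seed_keywords, datetime_list)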
def find_flow_texts(task_source, task_id, event_keywords):
    # build nest_query_list
    nest_query_list = []
    keywords_list = event_keywords.split('&')
    keywords_list = [word.encode('utf-8') for word in keywords_list]
    query_item = 'text'

    if task_source != 'weibo':
        # the text may contain English or traditional Chinese, so match those variants as well
        en_keywords_list = trans(keywords_list, target_language='en')
        for i in range(len(keywords_list)):
            keyword = keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)
            if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})
    else:
        for keyword in keywords_list:
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})

    SHOULD_PERCENT = 1  # at least one keyword clause must match

    # match texts for each platform
    if task_source == 'weibo':
        sort_item = 'retweeted'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time() + 24 * 3600)
        # test
        # current_time = int(datetime2ts("2018-05-13"))
        index_name_list = get_flow_text_index_list(current_time, days=2)
        es_name = es_flow_text
    elif task_source == 'facebook':
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = fb_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr
    else:
        sort_item = 'share'
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_TW)
        else:
            current_time = int(time.time() + 24 * 3600)
        index_name_list = tw_get_flow_text_index_list(current_time, days=2)
        es_name = es_xnr

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'minimum_should_match': SHOULD_PERCENT
            }
        },
        'sort': {sort_item: {'order': 'desc'}},
        'size': 100000
    }
    print 'es_name...', es_name
    print 'index_name_list..', index_name_list
    search_results = es_name.search(index=index_name_list, doc_type='text',
                                    body=query_body)['hits']['hits']
    print 'len..search_results..', len(search_results)
    save2topic_es(task_source, task_id, search_results)
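# Usage sketch for find_flow_texts(): event keywords are passed as one '&'-joined
# unicode string, mirroring the split('&') above; the task id and keywords are
# hypothetical placeholders.
def _example_find_flow_texts_call():
    event_keywords = u'example_keyword_a&example_keyword_b'
    find_flow_texts(task_source='facebook', task_id='example_task_002',
                    event_keywords=event_keywords)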
    except Exception as e:
        print u'keyword-expansion Exception:', str(e)

    group_uid_list = set()
    if datetime_list == []:
        return []
    query_item = 'text'
    flow_text_index_name_list = []
    for datetime in datetime_list:
        flow_text_index_name = flow_text_index_name_pre + datetime
        flow_text_index_name_list.append(flow_text_index_name)

    nest_query_list = []
    # the text may contain English or traditional Chinese, so match those variants as well
    en_keywords_list = trans(keywords_list, target_language='en')
    for i in range(len(keywords_list)):
        keyword = keywords_list[i]
        traditional_keyword = simplified2traditional(keyword)
        if len(en_keywords_list) == len(keywords_list):  # make sure the translation did not fail
            en_keyword = en_keywords_list[i]
            nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})