def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list, opinion_type, intel_type):
    query_item = 'text'
    nest_query_list = []
    tweets_list = []

    if task_source == 'weibo':
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time())
        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
        uid_list = []
        # at least one keyword wildcard has to match
        SHOULD_PERCENT = 1

        if intel_type == 'follow':
            # restrict to the users followed by this virtual user (xnr)
            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,
                                            doc_type=weibo_xnr_fans_followers_index_type,
                                            id=xnr_user_no)['_source']
                if follow_results:
                    uid_list = follow_results['followers']
            except Exception:
                uid_list = []
        elif intel_type == 'influence':
            # restrict to the 500 most influential users of the previous day (BCI index)
            date = ts2datetime(current_time - 24 * 3600)
            if S_TYPE == 'test':
                date = S_DATE_BCI
            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[5:7] + date[8:10]
            query_body_bci = {
                'query': {'match_all': {}},
                'sort': {'user_index': {'order': 'desc'}},
                'size': 500
            }
            weibo_bci_results = es_user_portrait.search(index=weibo_bci_index_name,
                                                        doc_type=weibo_bci_index_type,
                                                        body=query_body_bci)['hits']['hits']
            if weibo_bci_results:
                for bci_result in weibo_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)
        elif intel_type != 'all':
            # restrict to the users with the highest average sensitivity
            query_sensitive = {
                'query': {'match_all': {}},
                'aggs': {
                    'uids': {
                        'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                        'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                    }
                },
                'size': 500000
            }
            es_sensitive_result = es_flow_text.search(index=index_name_list, doc_type='text',
                                                      body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)

        # build the final query: keyword wildcards, filtered by uid_list unless intel_type is 'all'
        bool_query = {'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT}
        if intel_type != 'all':
            bool_query['must'] = [{'terms': {'uid': uid_list}}]
        query_body = {
            'query': {'bool': bool_query},
            'sort': {sort_item: {'order': 'desc'}},
            'size': MAX_SEARCH_SIZE
        }

        # collect tweets_list
        tweets_results = es_flow_text.search(index=index_name_list, doc_type='text',
                                             body=query_body)['hits']['hits']
        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        # facebook / twitter
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        # build wildcards for the original keyword, its traditional-Chinese form
        # and its English translation
        opinion_keywords_list = [word.encode('utf-8') for word in opinion_keywords_list]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)
            if len(en_keywords_list) == len(opinion_keywords_list):  # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append({'wildcard': {query_item: '*' + en_keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
            nest_query_list.append({'wildcard': {query_item: '*' + traditional_keyword + '*'}})
        # at least one keyword wildcard has to match
        SHOULD_PERCENT = 1

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)
            if intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,
                                                doc_type=fb_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    if follow_results:
                        uid_list = follow_results['fans_list']
                except Exception:
                    uid_list = []
            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(current_time)
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                fb_bci_results = es_xnr.search(index=fb_bci_index_name,
                                               doc_type=fb_bci_index_type,
                                               body=query_body_bci)['hits']['hits']
                # print 'fb_bci_results...', len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)
            elif intel_type != 'all':
                query_sensitive = {
                    'query': {'match_all': {}},
                    'aggs': {
                        'uids': {
                            'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                            'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                        }
                    },
                    'size': 500
                }
                es_sensitive_result = es_xnr.search(index=index_name_list, doc_type='text',
                                                    body=query_sensitive)['aggregations']['uids']['buckets']
                # print 'es_sensitive_result...', len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)
        else:
            # twitter
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)
            if intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,
                                                doc_type=tw_xnr_fans_followers_index_type,
                                                id=xnr_user_no)['_source']
                    if follow_results:
                        uid_list = follow_results['followers_list']
                except Exception:
                    uid_list = []
            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(current_time)
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                tw_bci_results = es_xnr.search(index=tw_bci_index_name,
                                               doc_type=tw_bci_index_type,
                                               body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)
            elif intel_type != 'all':
                query_sensitive = {
                    'query': {'match_all': {}},
                    'aggs': {
                        'uids': {
                            'terms': {'field': 'uid', 'order': {'avg_sensitive': 'desc'}},
                            'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
                        }
                    },
                    'size': 500
                }
                es_sensitive_result = es_xnr.search(index=index_name_list, doc_type='text',
                                                    body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

        # build the final query: keyword wildcards, filtered by uid_list unless intel_type is 'all'
        bool_query = {'should': nest_query_list, 'minimum_should_match': SHOULD_PERCENT}
        if intel_type != 'all':
            bool_query['must'] = [{'terms': {'uid': uid_list}}]
        query_body = {
            'query': {'bool': bool_query},
            'sort': {sort_item: {'order': 'desc'}},
            'size': MAX_SEARCH_SIZE
        }

        print 'index_name_list...', index_name_list
        print 'query_body........', query_body
        tweets_results = es_xnr.search(index=index_name_list, doc_type='text',
                                       body=query_body)['hits']['hits']
        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    if tweets_list:
        # cluster the collected texts into 5 sub-opinions and generate a summary
        opinion_name, word_result, text_list = opinion_main(tweets_list, k_cluster=5)
        sub_opinion_results = dict()
        topic_keywords_list = []
        summary_text_list = []
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        # summary = summary_main(summary_text_list)
    else:
        sub_opinion_results = {}
        summary = ''

    print 'Start saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results, summary, intel_type)
    return mark
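
# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal, hypothetical call to get_opinions(). Only the accepted branches are
# taken from the function itself: task_source is 'weibo', 'facebook' or anything
# else for Twitter, and intel_type is 'all', 'follow', 'influence' or anything
# else for the sensitivity-ranked user set. All concrete values below (keywords,
# task id, xnr_user_no) are made up; opinion_type is accepted but not used
# inside the function.
def _example_get_opinions_call():
    example_keywords = [u'keyword_a', u'keyword_b']      # hypothetical keywords
    mark = get_opinions(task_source='weibo',
                        task_id='task_0001',             # hypothetical task id
                        xnr_user_no='WXNR0001',          # hypothetical virtual-user number
                        opinion_keywords_list=example_keywords,
                        opinion_type='',
                        intel_type='all')
    print 'save mark:', mark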
def compute_recommend_subopnion(task_detail):
    print 'Start analysis and computation......'
    task_id = task_detail['task_id'].strip('"')
    keywords_string = task_detail['keywords_string']
    keywords_list = keywords_string.split('&')  # split the keyword string on '&' to get a list
    xnr_user_no = task_detail['xnr_user_no']
    mid = task_detail['mid']
    query_item = 'keywords_string'
    nest_query_list = []
    for keyword in keywords_list:
        nest_query_list.append({'wildcard': {query_item: '*' + keyword + '*'}})
    '''
    ## Focus on the users followed by the current virtual user (xnr)
    if S_TYPE == 'test':
        # followers_list = get_result['followers_list']
        # nest_query_list.append({'terms': followers_list})
        print 'all users'
    else:
        get_result = es.get(index=tw_xnr_fans_followers_index_name,
                            doc_type=tw_xnr_fans_followers_index_type,
                            id=xnr_user_no)['_source']
        followers_list = get_result['followers_list']
        nest_query_list.append({'terms': followers_list})
    '''
    if S_TYPE == 'test':
        create_time = datetime2ts(S_DATE_FB)
    else:
        create_time = datehour2ts(ts2datehour(time.time() - 3600))
    # tw_get_flow_text_index_list(create_time)
    # index_name_list_list = tw_get_flow_text_index_list(now_timestamp)
    index_name_list = tw_get_flow_text_index_list(create_time)
    print 'index_name_list::', index_name_list

    es_results = es.search(index=index_name_list, doc_type='text',
                           body={'query': {'bool': {'must': nest_query_list}},
                                 'size': MAX_SEARCH_SIZE})['hits']['hits']

    tw_list = []  # input for content recommendation and sub-opinion analysis
    if es_results:
        for item in es_results:
            item = item['_source']
            tw = item['text']
            tw_list.append(tw)

    ## Content recommendation: build the list of recommended sentences
    # print 'tw_list::::::', tw_list
    # print 'Start content recommendation......'
    # if tw_list:
    #     content_results = summary_main(tw_list)
    # else:
    #     content_results = []
    # print 'Start saving content recommendation results......'
    # mark = save_content_recommendation_results(xnr_user_no, mid, task_id.encode('utf-8'), content_results)
    # print 'mark_content:::', mark
    # if mark == False:
    #     print 'Error while saving content recommendation results, pushing the task back onto the queue'
    #     add_task_2_queue(keyword_task_queue_name, task_detail)
    # else:
    #     print 'Content recommendation results saved......'

    ## Sub-opinion analysis
    '''
    Input:
        tw_data: list of tweets, [tw1, tw2, ...]
        k_cluster: number of sub-topics (default 5)
    Output:
        opinion_name: sub-topic name dict, {topic1: name1, topic2: name2, ...}
        word_result: sub-topic keywords, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts of each sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    print 'Start sub-opinion computation......'
    if tw_list:
        opinion_name, word_result, text_list = opinion_main(tw_list, k_cluster=5)
        sub_opinion_results = dict()
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
    else:
        sub_opinion_results = {}
    print 'Start saving sub-opinion results......'
    mark = save_subopnion_results(xnr_user_no, mid, task_id, sub_opinion_results)
    print 'mark_opinion:::', mark
    if mark == False:
        print 'Error while saving sub-opinion results, pushing the task back onto the queue'
        add_task_2_queue(keyword_task_queue_name, task_detail)
    else:
        print 'Sub-opinion results saved......'
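
# --- Usage sketch (illustrative only) ----------------------------------------
# The expected shape of task_detail is read off the field lookups in
# compute_recommend_subopnion(); every value below is hypothetical.
def _example_compute_recommend_subopnion_call():
    task_detail = {
        'task_id': '"task_0002"',                    # surrounding quotes are stripped by the function
        'keywords_string': u'keyword_a&keyword_b',   # '&'-separated keyword string
        'xnr_user_no': 'TXNR0001',                   # hypothetical virtual-user number
        'mid': '1234567890',                         # hypothetical message id
    }
    compute_recommend_subopnion(task_detail)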
def comments_rubbish_clustering_calculation(comments, cluster_num,
                                            cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE,
                                            version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Comment rubbish filtering and clustering.
    input:
        comments: each comment contains news_id and news_content
    output:
        cluster_infos: cluster information
        item_infos: per-item list with the fields clusterid, weight, same_from, duplicate
    """
    # clusterid for meaningless items: ads found by ad_filter, rubbish found by the
    # SVM classifier, and news picked out by the subjectivity classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the "other" class
    OTHER_CLUSTER_ID = 'other'
    # clusterid for items shown directly
    DIRECT_CLUSTER_ID = 'direct'
    DIRECT_CLUSTER_FEATURE = [u'聚簇']
    # minimum number of input items required for clustering
    MIN_CLUSTERING_INPUT = 20
    # cluster info, mainly the feature words of each cluster
    clusters_infos = {'features': dict()}
    # per-item list; each item stores clusterid, weight and sentiment fields
    items_infos = []

    # preprocess data fields
    print('\tData preprocess...')
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content']  # .encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content']  # .encode('utf-8')
        else:
            r['news_content'] = ''
        # simple rule-based ad filtering
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
    print('\tAd filter %d data, data list have: %d' % (len(inputs), len(items_infos)))

    # remove rubbish with the SVM classifier
    print('\tSvm rubbish classify...')
    if len(inputs) == 0:
        items = []
    else:
        items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)
    print('\tSvm rubbish classify %d data, data list have: %d' % (len(inputs), len(items_infos)))

    # start clustering
    print('\tStart clustering opinion...')
    opinion_name, word_result, text_list, word_main = opinion_main(inputs, cluster_num)
    # if len(inputs) >= 500:
    #     opinion_name, word_result, text_list = opinion_main(inputs, 10)
    # else:
    #     opinion_name, word_result, text_list = opinion_main(inputs, 5)
    print('\tEnd clustering opinion...')
    for k, v in word_result.items():
        # name = opinion_name[k]
        clusters_infos['features'][k] = v
    clusters_infos['word_main'] = word_main

    final_inputs = []
    for k, v in text_list.items():
        for item in v:
            row = copy.deepcopy(item)
            row['clusterid'] = k
            final_inputs.append(row)

    # deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.items():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
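
# --- Usage sketch (illustrative only) ----------------------------------------
# A hedged example of calling the comment-clustering entry point. The comment
# fields mirror the preprocessing loop above ('content' is required, 'news_id'
# and 'news_content' are optional); the texts and ids are invented, and
# ad_filter / rubbish_classifier / opinion_main / duplicate must be importable
# in this module for the call to actually run.
def _example_comments_clustering_call():
    comments = [
        {'news_id': 'n1', 'content': u'first comment text', 'news_content': u'news body'},
        {'news_id': 'n1', 'content': u'second comment text', 'news_content': ''},
    ]
    result = comments_rubbish_clustering_calculation(comments, cluster_num=5)
    print(result['cluster_infos']['features'].keys())
    print(len(result['item_infos']))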