# -*- coding: utf-8 -*-
import csv
import time

# The ES clients (es_intel, es_xnr, es_flow_text, es_user_portrait), index
# name constants, and the text_generation/opinion/translation helpers used
# below are assumed to come from the project's own modules; their exact
# import paths are not shown here.


def main(file_name):
    # Read the second column of ./text_data/<file_name>.csv as the corpus,
    # generate a summary for it, and write the summary to ./result/.
    text_list = []
    reader = csv.reader(open('./text_data/%s.csv' % file_name, 'rb'))
    for line in reader:
        text = line[1]
        text_list.append(text)
    print 'text_list..', text_list
    print ''
    # Demo keywords: '父亲' (father), '拒绝' (refuse).
    keywords = ['父亲', '拒绝']
    summary = text_generation_main(text_list, keywords)
    print 'summary...', summary
    # The with-block closes the file; no explicit close() is needed.
    with open('./result/%s.csv' % file_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow([summary])
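# Usage sketch for main() (the file name 'demo' is hypothetical; it assumes
# ./text_data/demo.csv exists with the text in its second column and that
# ./result/ is writable):
#
#     main('demo')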
def get_models_text(task_id, task_source, opinion_keywords_list):
    # Pull the most-forwarded positive, negative and news-style posts for a
    # task and generate a model text for each of the three groups.
    if task_source == 'weibo':
        sort_item = 'retweeted'
    else:
        sort_item = 'share'

    query_body_pos = {
        'query': {'terms': {'sentiment': SENTIMENT_POS}},
        'sort': {sort_item: {'order': 'desc'}},
        'size': MAX_SEARCH_SIZE
    }
    query_body_neg = {
        'query': {'terms': {'sentiment': SENTIMENT_NEG}},
        'sort': {sort_item: {'order': 'desc'}},
        'size': MAX_SEARCH_SIZE
    }
    # News-style posts carry a 【...】 headline marker in the text.
    query_body_news = {
        'query': {'bool': {'must': [{'wildcard': {'text': '*【*】*'}}]}},
        'sort': {sort_item: {'order': 'desc'}},
        'size': MAX_SEARCH_SIZE
    }

    results_pos = es_intel.search(index=task_id, doc_type=task_source,
                                  body=query_body_pos)['hits']['hits']
    results_neg = es_intel.search(index=task_id, doc_type=task_source,
                                  body=query_body_neg)['hits']['hits']
    results_news = es_intel.search(index=task_id, doc_type=task_source,
                                   body=query_body_news)['hits']['hits']

    text_list_pos = [hit['_source']['text'] for hit in results_pos]
    text_list_neg = [hit['_source']['text'] for hit in results_neg]
    text_list_news = [hit['_source']['text'] for hit in results_news]

    model_text_dict = {
        'model_text_pos': text_generation_main(text_list_pos,
                                               opinion_keywords_list),
        'model_text_neg': text_generation_main(text_list_neg,
                                               opinion_keywords_list),
        'model_text_news': text_generation_main(text_list_news,
                                                opinion_keywords_list)
    }
    print 'model_text_dict..', model_text_dict
    save2models_text(task_id, model_text_dict)
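# Usage sketch for get_models_text() (all arguments hypothetical; it needs a
# reachable es_intel cluster holding an index named after the task):
#
#     get_models_text('task_001', 'weibo', [u'父亲', u'拒绝'])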
def build_query_body(nest_query_list, should_percent, sort_item,
                     uid_list=None):
    # Keyword query shared by every source branch in get_opinions; when
    # uid_list is given, the search is restricted to those users.
    bool_query = {
        'should': nest_query_list,
        'minimum_should_match': should_percent
    }
    if uid_list is not None:
        bool_query['must'] = [{'terms': {'uid': uid_list}}]
    return {
        'query': {'bool': bool_query},
        'sort': {sort_item: {'order': 'desc'}},
        'size': MAX_SEARCH_SIZE
    }


def get_sensitive_uid_list(es_client, index_name_list, size):
    # Return uids ranked by their average 'sensitive' score, descending.
    query_sensitive = {
        'query': {'match_all': {}},
        'aggs': {
            'uids': {
                'terms': {'field': 'uid',
                          'order': {'avg_sensitive': 'desc'}},
                'aggs': {'avg_sensitive': {'avg': {'field': 'sensitive'}}}
            }
        },
        'size': size
    }
    buckets = es_client.search(
        index=index_name_list, doc_type='text',
        body=query_sensitive)['aggregations']['uids']['buckets']
    return [bucket['key'] for bucket in buckets]


def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):
    # Collect recent posts matching the opinion keywords, cluster them into
    # sub-opinions, generate an overall summary, and save both for the task.
    # (opinion_type is accepted for interface compatibility; it is unused.)
    query_item = 'text'
    nest_query_list = []
    tweets_list = []

    if task_source == 'weibo':
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)
        else:
            current_time = int(time.time())
        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'

        for keyword in opinion_keywords_list:
            nest_query_list.append(
                {'wildcard': {query_item: '*' + keyword + '*'}})

        uid_list = []
        SHOULD_PERCENT = 1  # at least one keyword wildcard has to match

        if intel_type == 'all':
            query_body = build_query_body(nest_query_list, SHOULD_PERCENT,
                                          sort_item)
        elif intel_type == 'follow':
            # Restrict the search to posts by the virtual user's followers.
            try:
                follow_results = es_xnr.get(
                    index=weibo_xnr_fans_followers_index_name,
                    doc_type=weibo_xnr_fans_followers_index_type,
                    id=xnr_user_no)['_source']
                if follow_results:
                    # _source is a single document, so read the uid list
                    # directly rather than iterating it like a hit list.
                    uid_list = follow_results['followers']
            except:
                uid_list = []
            query_body = build_query_body(nest_query_list, SHOULD_PERCENT,
                                          sort_item, uid_list)
        elif intel_type == 'influence':
            # Restrict the search to the 500 highest-influence users from
            # the previous day's BCI index.
            date = ts2datetime(current_time - 24 * 3600)
            if S_TYPE == 'test':
                date = S_DATE_BCI
            weibo_bci_index_name = (weibo_bci_index_name_pre +
                                    date[:4] + date[5:7] + date[8:10])
            query_body_bci = {
                'query': {'match_all': {}},
                'sort': {'user_index': {'order': 'desc'}},
                'size': 500
            }
            weibo_bci_results = es_user_portrait.search(
                index=weibo_bci_index_name, doc_type=weibo_bci_index_type,
                body=query_body_bci)['hits']['hits']
            for bci_result in weibo_bci_results:
                uid_list.append(bci_result['_source']['user'])
            query_body = build_query_body(nest_query_list, SHOULD_PERCENT,
                                          sort_item, uid_list)
        else:
            # Restrict the search to the users with the highest average
            # sensitivity scores.
            uid_list = get_sensitive_uid_list(es_flow_text, index_name_list,
                                              500000)
            query_body = build_query_body(nest_query_list, SHOULD_PERCENT,
                                          sort_item, uid_list)

        # Build tweets_list.
        tweets_results = es_flow_text.search(
            index=index_name_list, doc_type='text',
            body=query_body)['hits']['hits']
        tweets_list = [hit['_source']['text'] for hit in tweets_results]

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'

        # Facebook/Twitter text may be in simplified Chinese, traditional
        # Chinese or English, so expand each keyword into three wildcards.
        opinion_keywords_list = [word.encode('utf-8')
                                 for word in opinion_keywords_list]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)
            if len(en_keywords_list) == len(opinion_keywords_list):
                # make sure the translation did not fail
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {query_item: '*' + en_keyword + '*'}})
            nest_query_list.append(
                {'wildcard': {query_item: '*' + keyword + '*'}})
            nest_query_list.append(
                {'wildcard': {query_item: '*' + traditional_keyword + '*'}})

        SHOULD_PERCENT = 1  # at least one keyword wildcard has to match

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time,
                                                          days=5)
            if intel_type == 'all':
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item)
            elif intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(
                        index=fb_xnr_fans_followers_index_name,
                        doc_type=fb_xnr_fans_followers_index_type,
                        id=xnr_user_no)['_source']
                    if follow_results:
                        uid_list = follow_results['fans_list']
                except:
                    uid_list = []
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item,
                                              uid_list)
            elif intel_type == 'influence':
                fb_bci_index_name = (fb_bci_index_name_pre +
                                     ts2datetime(current_time))
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                fb_bci_results = es_xnr.search(
                    index=fb_bci_index_name, doc_type=fb_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                for bci_result in fb_bci_results:
                    uid_list.append(bci_result['_source']['uid'])
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item,
                                              uid_list)
            else:
                uid_list = get_sensitive_uid_list(es_xnr, index_name_list,
                                                  500)
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item,
                                              uid_list)

            tweets_results = es_xnr.search(
                index=index_name_list, doc_type='text',
                body=query_body)['hits']['hits']
            tweets_list = [hit['_source']['text'] for hit in tweets_results]
        else:
            index_name_list = tw_get_flow_text_index_list(current_time,
                                                          days=5)
            if intel_type == 'all':
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item)
            elif intel_type == 'follow':
                try:
                    follow_results = es_xnr.get(
                        index=tw_xnr_fans_followers_index_name,
                        doc_type=tw_xnr_fans_followers_index_type,
                        id=xnr_user_no)['_source']
                    if follow_results:
                        uid_list = follow_results['followers_list']
                except:
                    uid_list = []
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item,
                                              uid_list)
            elif intel_type == 'influence':
                tw_bci_index_name = (tw_bci_index_name_pre +
                                     ts2datetime(current_time))
                query_body_bci = {
                    'query': {'match_all': {}},
                    'sort': {'influence': {'order': 'desc'}},
                    'size': 500
                }
                tw_bci_results = es_xnr.search(
                    index=tw_bci_index_name, doc_type=tw_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                for bci_result in tw_bci_results:
                    uid_list.append(bci_result['_source']['uid'])
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item,
                                              uid_list)
            else:
                uid_list = get_sensitive_uid_list(es_xnr, index_name_list,
                                                  500)
                query_body = build_query_body(nest_query_list,
                                              SHOULD_PERCENT, sort_item,
                                              uid_list)

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(
                index=index_name_list, doc_type='text',
                body=query_body)['hits']['hits']
            tweets_list = [hit['_source']['text'] for hit in tweets_results]

    if tweets_list:
        # Cluster the posts into 5 sub-opinions, keep the top posts of each
        # cluster, and generate one summary over all clustered texts.
        opinion_name, word_result, text_list = opinion_main(tweets_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()
        topic_keywords_list = []
        summary_text_list = []
        for topic, text in text_list.iteritems():
            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]
            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list,
                                       topic_keywords_list)
    else:
        sub_opinion_results = {}
        summary = ''

    print 'Saving sub-opinion results......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results,
                                            summary, intel_type)
    return mark
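if __name__ == '__main__':
    # Demo driver, a sketch only: 'demo', 'task_001' and 'WXNR0001' are
    # hypothetical values, and the calls below need live ES clusters plus
    # the project's text_generation/opinion helpers to actually run.
    main('demo')
    get_opinions(task_source='weibo', task_id='task_001',
                 xnr_user_no='WXNR0001',
                 opinion_keywords_list=[u'父亲', u'拒绝'],
                 opinion_type='', intel_type='all')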