Example #1
0
def main(file_name):

    text_list = []
    reader = csv.reader(file('./text_data/%s.csv' % file_name, 'rb'))
    for line in reader:
        text = line[1]
        text_list.append(text)
    print 'text_list..', text_list
    print ''
    keywords = ['父亲', '拒绝']
    summary = text_generation_main(text_list, keywords)
    print 'summary...', summary
    with open('./result/%s.csv' % file_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow([summary])
    f.close()
def get_models_text(task_id, task_source, opinion_keywords_list):

    if task_source == 'weibo':
        sort_item = 'retweeted'
    else:
        sort_item = 'share'

    query_body_pos = {
        'query': {
            'terms': {
                'sentiment': SENTIMENT_POS
            }
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': MAX_SEARCH_SIZE
    }

    query_body_neg = {
        'query': {
            'terms': {
                'sentiment': SENTIMENT_NEG
            }
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': MAX_SEARCH_SIZE
    }

    query_body_news = {
        'query': {
            'bool': {
                'must': [{
                    'wildcard': {
                        'text': '*【*】*'
                    }
                }]
            }
        },
        'sort': {
            sort_item: {
                'order': 'desc'
            }
        },
        'size': MAX_SEARCH_SIZE
    }

    results_pos = es_intel.search(index=task_id,
                                  doc_type=task_source,
                                  body=query_body_pos)['hits']['hits']
    results_neg = es_intel.search(index=task_id,
                                  doc_type=task_source,
                                  body=query_body_neg)['hits']['hits']
    results_news = es_intel.search(index=task_id,
                                   doc_type=task_source,
                                   body=query_body_news)['hits']['hits']

    text_list_pos = []
    text_list_neg = []
    text_list_news = []

    for result_pos in results_pos:
        text_list_pos.append(result_pos['_source']['text'])

    for result_neg in results_neg:
        text_list_neg.append(result_neg['_source']['text'])

    for result_news in results_news:
        text_list_news.append(result_news['_source']['text'])

    model_text_dict = {}

    model_text_pos = text_generation_main(text_list_pos, opinion_keywords_list)
    model_text_neg = text_generation_main(text_list_neg, opinion_keywords_list)
    model_text_news = text_generation_main(text_list_news,
                                           opinion_keywords_list)

    model_text_dict['model_text_pos'] = model_text_pos
    model_text_dict['model_text_neg'] = model_text_neg
    model_text_dict['model_text_news'] = model_text_news

    print 'model_text_dict..', model_text_dict

    save2models_text(task_id, model_text_dict)
def get_opinions(task_source, task_id, xnr_user_no, opinion_keywords_list,
                 opinion_type, intel_type):

    query_item = 'text'
    nest_query_list = []
    tweets_list = []
    if task_source == 'weibo':

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE)

        else:
            current_time = int(time.time())

        index_name_list = get_flow_text_index_list(current_time, days=5)
        sort_item = 'retweeted'
        for keyword in opinion_keywords_list:
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
        uid_list = []

        if len(nest_query_list) == 1:
            SHOULD_PERCENT = 1
        else:
            SHOULD_PERCENT = 1

        if intel_type == 'all':
            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'follow':

            try:
                follow_results = es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,\
                    id=xnr_user_no)['_source']

                if follow_results:
                    for follow_result in follow_results:
                        uid_list = follow_result['_source']['followers']
            except:
                uid_list = []

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        elif intel_type == 'influence':
            date = ts2datetime(current_time - 24 * 3600)

            if S_TYPE == 'test':
                date = S_DATE_BCI

            weibo_bci_index_name = weibo_bci_index_name_pre + date[:4] + date[
                5:7] + date[8:10]

            query_body_bci = {
                'query': {
                    'match_all': {}
                },
                'sort': {
                    'user_index': {
                        'order': 'desc'
                    }
                },
                'size': 500
            }

            weino_bci_results = es_user_portrait.search(
                index=weibo_bci_index_name,
                doc_type=weibo_bci_index_type,
                body=query_body_bci)['hits']['hits']
            if weino_bci_results:
                for bci_result in weino_bci_results:
                    uid = bci_result['_source']['user']
                    uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        else:

            query_sensitive = {
                'query': {
                    'match_all': {}
                },
                "aggs": {
                    "uids": {
                        "terms": {
                            "field": "uid",
                            "order": {
                                "avg_sensitive": "desc"
                            }
                        },
                        "aggs": {
                            "avg_sensitive": {
                                "avg": {
                                    "field": "sensitive"
                                }
                            }
                        }
                    }
                },
                'size': 500000
            }

            es_sensitive_result = es_flow_text.search(index=index_name_list,doc_type='text',\
                    body=query_sensitive)['aggregations']['uids']['buckets']
            for item in es_sensitive_result:
                uid = item['key']
                uid_list.append(uid)

            query_body = {
                'query': {
                    'bool': {
                        'should': nest_query_list,
                        'minimum_should_match': SHOULD_PERCENT,
                        'must': [{
                            'terms': {
                                'uid': uid_list
                            }
                        }]
                    }
                },
                'sort': {
                    sort_item: {
                        'order': 'desc'
                    }
                },
                'size': MAX_SEARCH_SIZE
            }

        # 得到tweets_list

        tweets_results = es_flow_text.search(index=index_name_list,
                                             doc_type='text',
                                             body=query_body)['hits']['hits']

        if tweets_results:
            for item in tweets_results:
                item = item['_source']
                weibo = item['text']
                tweets_list.append(weibo)

    else:
        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
        else:
            current_time = int(time.time())
        uid_list = []
        sort_item = 'share'
        opinion_keywords_list = [
            word.encode('utf-8') for word in opinion_keywords_list
        ]
        en_keywords_list = trans(opinion_keywords_list, target_language='en')
        for i in range(len(opinion_keywords_list)):
            keyword = opinion_keywords_list[i].decode('utf-8')
            traditional_keyword = simplified2traditional(keyword)

            if len(en_keywords_list) == len(opinion_keywords_list):  #确保翻译没出错
                en_keyword = en_keywords_list[i]
                nest_query_list.append(
                    {'wildcard': {
                        query_item: '*' + en_keyword + '*'
                    }})

            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + keyword + '*'
                }})
            nest_query_list.append(
                {'wildcard': {
                    query_item: '*' + traditional_keyword + '*'
                }})

        if len(nest_query_list) == 1:
            SHOULD_PERCENT = 1
        else:
            SHOULD_PERCENT = 1

        if task_source == 'facebook':
            index_name_list = fb_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=fb_xnr_fans_followers_index_name,doc_type=fb_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

                    if follow_results:
                        for follow_result in follow_results:
                            uid_list = follow_result['_source']['fans_list']
                except:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                fb_bci_index_name = fb_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                fb_bci_results = es_xnr.search(
                    index=fb_bci_index_name,
                    doc_type=fb_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                #print 'fb_bci_results...',len(fb_bci_results)
                if fb_bci_results:
                    for bci_result in fb_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                #print 'es_sensitive_result...',len(es_sensitive_result)
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            #print 'query_body...',query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

        else:
            index_name_list = tw_get_flow_text_index_list(current_time, days=5)

            if intel_type == 'all':
                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'follow':

                try:
                    follow_results = es_xnr.get(index=tw_xnr_fans_followers_index_name,doc_type=tw_xnr_fans_followers_index_type,\
                        id=xnr_user_no)['_source']

                    if follow_results:
                        for follow_result in follow_results:
                            uid_list = follow_result['_source'][
                                'followers_list']
                except:
                    uid_list = []

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            elif intel_type == 'influence':
                tw_bci_index_name = tw_bci_index_name_pre + ts2datetime(
                    current_time)
                query_body_bci = {
                    'query': {
                        'match_all': {}
                    },
                    'sort': {
                        'influence': {
                            'order': 'desc'
                        }
                    },
                    'size': 500
                }

                tw_bci_results = es_xnr.search(
                    index=tw_bci_index_name,
                    doc_type=tw_bci_index_type,
                    body=query_body_bci)['hits']['hits']
                if tw_bci_results:
                    for bci_result in tw_bci_results:
                        uid = bci_result['_source']['uid']
                        uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            else:

                query_sensitive = {
                    'query': {
                        'match_all': {}
                    },
                    "aggs": {
                        "uids": {
                            "terms": {
                                "field": "uid",
                                "order": {
                                    "avg_sensitive": "desc"
                                }
                            },
                            "aggs": {
                                "avg_sensitive": {
                                    "avg": {
                                        "field": "sensitive"
                                    }
                                }
                            }
                        }
                    },
                    'size': 500
                }

                es_sensitive_result = es_xnr.search(index=index_name_list,doc_type='text',\
                        body=query_sensitive)['aggregations']['uids']['buckets']
                for item in es_sensitive_result:
                    uid = item['key']
                    uid_list.append(uid)

                query_body = {
                    'query': {
                        'bool': {
                            'should': nest_query_list,
                            'minimum_should_match': SHOULD_PERCENT,
                            'must': [{
                                'terms': {
                                    'uid': uid_list
                                }
                            }]
                        }
                    },
                    'sort': {
                        sort_item: {
                            'order': 'desc'
                        }
                    },
                    'size': MAX_SEARCH_SIZE
                }

            print 'index_name_list...', index_name_list
            print 'query_body........', query_body
            tweets_results = es_xnr.search(index=index_name_list,
                                           doc_type='text',
                                           body=query_body)['hits']['hits']

            if tweets_results:
                for item in tweets_results:
                    item = item['_source']
                    weibo = item['text']
                    tweets_list.append(weibo)

    if tweets_list:
        opinion_name, word_result, text_list = opinion_main(tweets_list,
                                                            k_cluster=5)
        sub_opinion_results = dict()

        topic_keywords_list = []
        summary_text_list = []

        for topic, text in text_list.iteritems():

            topic_name = opinion_name[topic]
            sub_opinion_results[topic_name] = text[:SUB_OPINION_WEIBO_LIMIT]

            topic_keywords_list.extend(topic_name.split('&'))
            summary_text_list.extend(text)

        #try:
        print 'summary_text_list..', len(summary_text_list)
        print 'topic_keywords_list..', topic_keywords_list
        summary = text_generation_main(summary_text_list, topic_keywords_list)
        #summary = summary_main(summary_text_list)
        #except:
        #    summary = ''

    else:
        sub_opinion_results = {}
        summary = ''

    print '开始保存子观点计算结果......'
    print 'summary....', summary
    mark = save_intelligent_opinion_results(task_id, sub_opinion_results,
                                            summary, intel_type)

    return mark