Example #1
def save2opinion_corpus(task_id, opinion_results):
    """Save the opinion-corpus results to ES, then mark the writing task's compute status."""
    # Relies on module-level globals defined elsewhere in this project:
    # the es_intel / es_xnr clients and the *_index_name / *_index_type constants.
    item_exist = dict()
    item_exist['task_id'] = task_id
    item_exist['corpus_results'] = json.dumps(opinion_results)

    es_intel.index(index=opinion_corpus_results_index_name, doc_type=opinion_corpus_results_index_type,
                   id=task_id, body=item_exist)

    item_task = dict()
    item_task['compute_status'] = 3  # opinion corpus results saved; update the compute status
    es_xnr.update(index=writing_task_index_name, doc_type=writing_task_index_type,
                  id=task_id, body={'doc': item_task})
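
A minimal call-site sketch, assuming the ES clients and index names above are already configured at module level; the task id and payload are invented for illustration:

# Hypothetical usage; the task id and opinion payload are made up.
opinion_results = {'support': ['post A', 'post B'], 'oppose': ['post C']}
save2opinion_corpus('task_20180101_001', opinion_results)
# The writing-task document's compute_status is now 3.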
Example #2
def save2models_text(task_id, model_text_dict):
    """Save the smart-posting template texts to ES, then mark the writing task's compute status."""
    item_exist = dict()
    item_exist['task_id'] = task_id
    item_exist['model_text_pos'] = model_text_dict['model_text_pos']
    item_exist['model_text_neg'] = model_text_dict['model_text_neg']
    item_exist['model_text_news'] = model_text_dict['model_text_news']

    # Save the smart-posting template text results
    print 'item_exist...', item_exist

    es_intel.index(index=intel_models_text_index_name, doc_type=intel_models_text_index_type,
                   id=task_id, body=item_exist)

    item_task = dict()
    item_task['compute_status'] = 2  # template texts saved; update the compute status
    es_xnr.update(index=writing_task_index_name, doc_type=writing_task_index_type,
                  id=task_id, body={'doc': item_task})
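
A minimal call-site sketch; the template texts are placeholders invented for illustration:

# Hypothetical usage; the texts are placeholders.
model_text_dict = {
    'model_text_pos': 'positive template text...',
    'model_text_neg': 'negative template text...',
    'model_text_news': 'news-style template text...',
}
save2models_text('task_20180101_002', model_text_dict)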
Example #3
def save_intelligent_opinion_results(task_id, sub_opinion_results, summary, intel_type):
    """Save the sub-opinion results and summary to ES; return True on success, False on failure."""
    try:
        item_exist = dict()
        item_exist['task_id'] = task_id
        item_exist['subopinion_tweets'] = json.dumps(sub_opinion_results)
        item_exist['summary'] = summary
        # Save the sub-opinion results
        es_intel.index(index=intel_opinion_results_index_name, doc_type=intel_type,
                       id=task_id, body=item_exist)

        item_task = dict()
        item_task['compute_status'] = 2  # sub-opinion results saved; update the compute status
        es_xnr.update(index=writing_task_index_name, doc_type=writing_task_index_type,
                      id=task_id, body={'doc': item_task})

        mark = True

    except Exception:
        mark = False

    return mark
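
A minimal call-site sketch; the payload is invented, and the doc type passed as intel_type is assumed to be valid under intel_opinion_results_index_name:

# Hypothetical usage; payload and doc type are placeholders.
sub_opinions = {'opinion_1': ['tweet A', 'tweet B'], 'opinion_2': ['tweet C']}
ok = save_intelligent_opinion_results('task_20180101_003', sub_opinions,
                                      'one-line summary of the topic', 'weibo')
if not ok:
    print 'saving sub-opinion results failed'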
Example #4
def news_comments_list(task_source,
                       taskid,
                       weibo_list,
                       cluster_num=-1,
                       cluster_eva_min_size=default_cluster_eva_min_size,
                       vsm=default_vsm,
                       calculation_label=1):  # weibo_list carries the weibo posts to process
    """Compute the pie-chart (cluster and sentiment ratio) data, deduplicate the
    recommended texts, and index the results into Elasticsearch.
    """

    print 'weibo_list..len...', len(weibo_list)
    # Snapshot of the call parameters (not used further in this function)
    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size,
              "vsm": vsm, "calculation_label": calculation_label}

    comments = weibo_list
    logfile = os.path.join(LOG_FOLDER, taskid + '.log')

    cal_results = comments_calculation_v2(comments, logfile=logfile, cluster_num=cluster_num, \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    #print cal_results
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']
    cluster_ratio = dict()      # clusterid -> comment count
    senti_ratio = dict()        # sentiment -> comment count
    sentiment_results = dict()  # sentiment -> [comments]
    cluster_results = dict()    # clusterid -> [comments]
    rub_results = []            # comments whose cluster is 'nonsense'

    # text count before filtering
    before_filter_count = len(item_infos)
    # text count after filtering
    after_filter_count = 0

    download_items = []
    for comment in item_infos:
        # print comment
        download_item = {}
        download_item["id"] = comment["id"]
        download_item["title"] = comment["title"]
        download_item["text"] = comment["text"]
        # download_item["timestamp"] = comment["timestamp"]
        download_item["datetime"] = comment["datetime"]
        download_item["clusterid"] = comment["clusterid"]
        download_item["sentiment"] = comment["sentiment"]
        download_item["ad_label"] = comment["ad_label"]
        if (comment["clusterid"][:8] != 'nonsense') and (comment["clusterid"]
                                                         != 'other'):
            download_item["duplicate"] = comment["duplicate"]
            download_item["same_from"] = comment["same_from"]
        download_items.append(download_item)
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']

            try:
                cluster_ratio[clusterid] += 1
            except KeyError:
                cluster_ratio[clusterid] = 1
            try:
                cluster_results[clusterid].append(comment)
            except KeyError:
                cluster_results[clusterid] = [comment]

        if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']

            try:
                senti_ratio[sentiment] += 1
            except KeyError:
                senti_ratio[sentiment] = 1
            try:
                sentiment_results[sentiment].append(comment)
            except KeyError:
                sentiment_results[sentiment] = [comment]

            after_filter_count += 1

        if comment['clusterid'][:8] == 'nonsense':
            rub_results.append(comment)

    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, ratio in cluster_ratio.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                ratio_results[','.join(
                    feature[:3])] = float(ratio) / float(ratio_total_count)

    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, ratio in senti_ratio.iteritems():
        if sentiment in emotions_vk_v1:
            label = emotions_vk_v1[sentiment]
            if label and len(label):
                sentiratio_results[label] = float(ratio) / float(
                    sentiratio_total_count)

    # Deduplicate within each sentiment class (group by same_from_sentiment)
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            same_from_sentiment = comment["same_from_sentiment"]
            try:
                dump_dict[same_from_sentiment].append(comment)
            except KeyError:
                dump_dict[same_from_sentiment] = [comment]
        sentiment_dump_dict[sentiment] = dump_dict

    # Deduplicate within each sub-opinion cluster (group by same_from)
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                dump_dict = dict()
                for comment in contents:
                    same_from_cluster = comment["same_from"]
                    try:
                        dump_dict[same_from_cluster].append(comment)
                    except KeyError:
                        dump_dict[same_from_cluster] = [comment]
                # Once grouping is complete, sort each duplicate group by
                # weight (highest first) and concatenate the groups
                sort_dump_dict = []
                for v in dump_dict.itervalues():
                    sort_dump_dict.extend(
                        sorted(v, key=lambda x: x['weight'], reverse=True))
                cluster_dump_dict[clusterid] = sort_dump_dict

    #task = taskid.split('_')
    index_body = {
        'name': taskid,
        'features': json.dumps(features),
        'cluster_dump_dict': json.dumps(cluster_dump_dict)
    }
    es_intel.index(index=topics_river_index_name,
                   doc_type=topics_river_index_type,
                   id=taskid,
                   body=index_body)

    return json.dumps({
        "features": features,
        "cluster_dump_dict": cluster_dump_dict
    })
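
A minimal invocation sketch, assuming comments_calculation_v2 and the ES/index globals are configured at module level; the records below are invented and only sketch the raw input fields, since comments_calculation_v2 appears to attach clusterid, sentiment, weight, and the dedup fields downstream:

# Hypothetical usage; the weibo records are placeholders.
weibo_list = [
    {'id': '1', 'title': 't1', 'text': 'some weibo text', 'datetime': '2018-01-01'},
    {'id': '2', 'title': 't2', 'text': 'another weibo text', 'datetime': '2018-01-01'},
]
result = json.loads(news_comments_list('weibo', 'task_20180101_004', weibo_list))
print result['features']           # clusterid -> top feature words
print result['cluster_dump_dict']  # clusterid -> weight-sorted deduplicated comments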