Esempio n. 1
0
def ratio():
    """Compute sub-opinion (cluster) and sentiment ratio pie-chart data.

    Reads query parameters from the Flask ``request``, clusters the
    comments of one news item, and returns a JSON string with the
    cluster-ratio and sentiment-ratio data.  As a side effect the
    per-cluster / per-sentiment deduplication dictionaries are dumped to
    ``temp_file``.

    Returns:
        str: JSON with keys ``ratio`` and ``sentiratio``; or
        ``{"status": "fail"}`` when the news item has no comments.
    """
    # Remove a stale result file left over from a previous request.
    if os.path.exists(temp_file):
        os.remove(temp_file)

    topic_name = request.args.get('query', default_weibo_topic_name)  # topic name
    news_id = request.args.get('news_id', default_weibo_news_id)
    topicid = em.getEventIDByName(topic_name)
    cluster_num = request.args.get('cluster_num', default_cluster_num)
    if cluster_num == default_cluster_num:
        cluster_num = -1  # -1 lets the clustering module choose the count
    cluster_eva_min_size = request.args.get('cluster_eva_min_size', default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)
    # 1 (default) means recompute; 0 means load existing result data.
    # NOTE(review): read but never used in this function — confirm intent.
    calculation_label = int(request.args.get('calcu', 1))

    eventcomment = EventComments(topicid)
    comments = eventcomment.getNewsComments(news_id)
    if not comments:
        return json.dumps({"status": "fail"})

    cal_results = comments_calculation_v2(comments, cluster_num=cluster_num,
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    # Group comments by cluster id and by sentiment, counting as we go.
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    for comment in item_infos:
        if ('clusterid' in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

        if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) \
                and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']
            senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
            sentiment_results.setdefault(sentiment, []).append(comment)

    # Cluster share of all clustered comments, keyed by top-3 feature words.
    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            ratio_results[','.join(feature[:3])] = float(count) / float(ratio_total_count)

    # Sentiment share, keyed by the human-readable emotion label.
    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.iteritems():
        label = emotions_vk_v1.get(sentiment)
        if label and len(label):
            sentiratio_results[label] = float(count) / float(sentiratio_total_count)

    # Sentiment deduplication: group each sentiment's comments by source.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Sub-opinion (cluster) deduplication: group by "same_from" source.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            cluster_dump_dict[clusterid] = dump_dict

    # ``with`` guarantees the file handle is closed even on an exception.
    with open(temp_file, 'w') as dump_file:
        dump_file.write(json.dumps({"features": features,
                "senti_dump_dict": sentiment_dump_dict,
                "cluster_dump_dict": cluster_dump_dict}))

    return json.dumps({"ratio": ratio_results, "sentiratio": sentiratio_results})
Esempio n. 2
0
def news_comments_list(taskid,
                       start_ts,
                       over_ts,
                       weibo_list,
                       cluster_num=-1,
                       cluster_eva_min_size=default_cluster_eva_min_size,
                       vsm=default_vsm,
                       calculation_label=1):
    """Compute pie-chart data for a weibo list and index the deduplicated
    recommendation text into Elasticsearch.

    Args:
        taskid: task identifier; also used as the ES document id.
        start_ts: task start timestamp, stored in the indexed document.
        over_ts: task end timestamp, stored in the indexed document.
        weibo_list: iterable of weibo/comment dicts to cluster.
        cluster_num: desired cluster count; -1 lets the module decide.
        cluster_eva_min_size: minimum cluster size kept after evaluation.
        vsm: vector-space-model version passed to the clustering module.
        calculation_label: 1 (default) recomputes; 0 would load cached
            results.  Recorded in ``params`` but not otherwise used here.

    Returns:
        str: JSON with keys ``features`` and ``cluster_dump_dict``.
    """
    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size, \
            "vsm": vsm, "calculation_label": calculation_label}

    comments = weibo_list
    logfile = os.path.join(LOG_FOLDER, taskid + '.log')

    cal_results = comments_calculation_v2(comments, logfile=logfile, cluster_num=cluster_num, \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    rub_results = []  # "nonsense" (rubbish) items, kept for inspection

    # Text counts before / after filtering.
    before_filter_count = len(item_infos)
    after_filter_count = 0

    # NOTE(review): download_items is built but only consumed by an export
    # that has been disabled; kept to preserve behavior — confirm if needed.
    download_items = []
    for comment in item_infos:
        download_item = {}
        download_item["id"] = comment["id"]
        download_item["title"] = comment["title"]
        download_item["text"] = comment["text"]
        download_item["datetime"] = comment["datetime"]
        download_item["clusterid"] = comment["clusterid"]
        download_item["sentiment"] = comment["sentiment"]
        download_item["ad_label"] = comment["ad_label"]
        if (comment["clusterid"][:8] != 'nonsense') and (comment["clusterid"]
                                                         != 'other'):
            download_item["duplicate"] = comment["duplicate"]
            download_item["same_from"] = comment["same_from"]
        download_items.append(download_item)

        # Group by cluster id (skipping "nonsense" clusters).
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

        # Group by sentiment (only known sentiments on non-nonsense clusters).
        if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']
            senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
            sentiment_results.setdefault(sentiment, []).append(comment)
            after_filter_count += 1

        if comment['clusterid'][:8] == 'nonsense':
            rub_results.append(comment)

    # Cluster share of all clustered comments, keyed by top-3 feature words.
    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            ratio_results[','.join(
                feature[:3])] = float(count) / float(ratio_total_count)

    # Sentiment share, keyed by the human-readable emotion label.
    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.iteritems():
        label = emotions_vk_v1.get(sentiment)
        if label and len(label):
            sentiratio_results[label] = float(count) / float(
                sentiratio_total_count)

    # Sentiment deduplication: group each sentiment's comments by source.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Sub-opinion (cluster) deduplication.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            # The sort was hoisted out of the per-comment loop above (it was
            # quadratic there); the resulting value is unchanged.
            # NOTE(review): only the sorted list of the LAST-iterated group
            # survives — the original behaved the same way, but a dict of
            # sorted groups looks like the intent; confirm before changing.
            for k, v in dump_dict.iteritems():
                sort_dump_dict = sorted(v,
                                        key=lambda x: x['weight'],
                                        reverse=True)
            cluster_dump_dict[clusterid] = sort_dump_dict

    index_body = {
        'name': taskid,
        'start_ts': start_ts,
        'end_ts': over_ts,
        'features': json.dumps(features),
        'cluster_dump_dict': json.dumps(cluster_dump_dict)
    }
    weibo_es.index(index=topics_river_index_name,
                   doc_type=topics_river_index_type,
                   id=taskid,
                   body=index_body)

    return json.dumps({
        "features": features,
        "cluster_dump_dict": cluster_dump_dict
    })
Esempio n. 3
0
def news_comments_list(task_source,
                       taskid,
                       weibo_list,
                       cluster_num=-1,
                       cluster_eva_min_size=default_cluster_eva_min_size,
                       vsm=default_vsm,
                       calculation_label=1):  #weibo_list把微博读进来
    """计算饼图数据,并将饼图数据和去重后的推荐文本写到文件
    """

    print 'weibo_list..len...', len(weibo_list)
    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size, \
            "vsm": vsm, "calculation_label": calculation_label}

    comments = weibo_list
    logfile = os.path.join(LOG_FOLDER, taskid + '.log')

    cal_results = comments_calculation_v2(comments, logfile=logfile, cluster_num=cluster_num, \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    #print cal_results
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    rub_results = []

    # 过滤前文本数
    before_filter_count = len(item_infos)
    # 过滤后文本数
    after_filter_count = 0

    download_items = []
    for comment in item_infos:
        # print comment
        download_item = {}
        download_item["id"] = comment["id"]
        download_item["title"] = comment["title"]
        download_item["text"] = comment["text"]
        # download_item["timestamp"] = comment["timestamp"]
        download_item["datetime"] = comment["datetime"]
        download_item["clusterid"] = comment["clusterid"]
        download_item["sentiment"] = comment["sentiment"]
        download_item["ad_label"] = comment["ad_label"]
        if (comment["clusterid"][:8] != 'nonsense') and (comment["clusterid"]
                                                         != 'other'):
            download_item["duplicate"] = comment["duplicate"]
            download_item["same_from"] = comment["same_from"]
        download_items.append(download_item)
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']

            try:
                cluster_ratio[clusterid] += 1
            except KeyError:
                cluster_ratio[clusterid] = 1
            try:
                cluster_results[clusterid].append(comment)
            except KeyError:
                cluster_results[clusterid] = [comment]

        if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']

            try:
                senti_ratio[sentiment] += 1
            except KeyError:
                senti_ratio[sentiment] = 1
            try:
                sentiment_results[sentiment].append(comment)
            except KeyError:
                sentiment_results[sentiment] = [comment]

            after_filter_count += 1

        if comment['clusterid'][:8] == 'nonsense':
            rub_results.append(comment)

    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, ratio in cluster_ratio.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                ratio_results[','.join(
                    feature[:3])] = float(ratio) / float(ratio_total_count)

    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, ratio in senti_ratio.iteritems():
        if sentiment in emotions_vk_v1:
            label = emotions_vk_v1[sentiment]
            if label and len(label):
                sentiratio_results[label] = float(ratio) / float(
                    sentiratio_total_count)

    # 情感分类去重
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            same_from_sentiment = comment["same_from_sentiment"]
            try:
                dump_dict[same_from_sentiment].append(comment)
            except KeyError:
                dump_dict[same_from_sentiment] = [comment]
        sentiment_dump_dict[sentiment] = dump_dict

    # 子观点分类去重
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        if clusterid in features:
            feature = features[clusterid]
            if feature and len(feature):
                dump_dict = dict()
                for comment in contents:
                    same_from_cluster = comment["same_from"]
                    try:
                        dump_dict[same_from_cluster].append(comment)
                    except KeyError:
                        dump_dict[same_from_cluster] = [comment]
                    for k, v in dump_dict.iteritems():
                        sort_dump_dict = sorted(v,
                                                key=lambda x: x['weight'],
                                                reverse=True)
                    cluster_dump_dict[clusterid] = sort_dump_dict

    #task = taskid.split('_')
    index_body = {
        'name': taskid,
        'features': json.dumps(features),
        'cluster_dump_dict': json.dumps(cluster_dump_dict)
    }
    es_intel.index(index=topics_river_index_name,
                   doc_type=topics_river_index_type,
                   id=taskid,
                   body=index_body)

    return json.dumps({
        "features": features,
        "cluster_dump_dict": cluster_dump_dict
    })
Esempio n. 4
0
def comments_list():
    """Compute cluster/sentiment pie-chart data for a topic or sub-event.

    Reads parameters from the Flask ``request``, clusters the comments,
    writes the deduplication dictionaries to ``temp_file``, and returns
    the ratio data as a JSON string (or ``{"status": "fail"}`` when no
    comments are found).
    """
    # Remove a stale result file left over from a previous request.
    if os.path.exists(temp_file):
        os.remove(temp_file)
    AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           '../../public/')
    sys.path.append(AB_PATH)
    from comment_module import comments_calculation_v2

    topicid = request.args.get('topicid', default_topic_id)
    subeventid = request.args.get('subeventid', 'global')
    min_cluster_num = request.args.get('min_cluster_num',
                                       default_min_cluster_num)
    max_cluster_num = request.args.get('max_cluster_num',
                                       default_max_cluster_num)
    cluster_eva_min_size = request.args.get('cluster_eva_min_size',
                                            default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)

    ec = EventComments(topicid)
    if subeventid == 'global':
        comments = ec.getAllNewsComments()
    else:
        comments = ec.getCommentsBySubeventid(subeventid)

    if not comments:
        return json.dumps({"status": "fail"})

    cal_results = comments_calculation_v2(comments, int(min_cluster_num),
                                          int(max_cluster_num),
                                          int(cluster_eva_min_size), vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    # sentiment code -> label: neutral / positive / angry / sad
    senti_dict = {0: '中性', 1: '积极', 2: '愤怒', 3: '悲伤'}

    # Group comments by cluster id and by sentiment, counting as we go.
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    for comment in item_infos:
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

        if ('sentiment' in comment) and (comment['sentiment'] in senti_dict) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']
            senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
            sentiment_results.setdefault(sentiment, []).append(comment)

    # Cluster share of all clustered comments, keyed by top-3 feature words.
    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            ratio_results[','.join(
                feature[:3])] = float(count) / float(ratio_total_count)

    # Sentiment share, keyed by the human-readable label.
    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.iteritems():
        label = senti_dict.get(sentiment)
        if label and len(label):
            sentiratio_results[label] = float(count) / float(
                sentiratio_total_count)

    # Sentiment deduplication: group each sentiment's comments by source.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Sub-opinion (cluster) deduplication: group by "same_from" source.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            cluster_dump_dict[clusterid] = dump_dict

    # ``with`` guarantees the file handle is closed even on an exception.
    with open(temp_file, 'w') as dump_file:
        dump_file.write(json.dumps({"features": features,
                "senti_dump_dict": sentiment_dump_dict,
                "cluster_dump_dict": cluster_dump_dict}))

    return json.dumps({
        "ratio": ratio_results,
        "sentiratio": sentiratio_results,
    })
Esempio n. 5
0
def comments_list():
    """Compute cluster/sentiment pie-chart data for one task's comments.

    Reads parameters from the Flask ``request``, clusters all comments of
    the task, writes the full result set (dedup dictionaries plus ratios)
    to a per-task temp file, and returns the ratio data as a JSON string.
    """
    taskid = request.args.get('taskid', default_task_id)
    cluster_num = request.args.get('cluster_num', '')  # empty -> fall back to default
    if cluster_num == '':
        cluster_num = default_cluster_num
    # NOTE(review): cluster_num is parsed but never passed to
    # comments_calculation_v2 below — confirm whether that is intended.
    cluster_eva_min_size = request.args.get('cluster_eva_min_size',
                                            default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)

    # Per-task temp file; remove a stale one from a previous run.
    temp_file = taskid + temp_file_post
    if os.path.exists(temp_file):
        os.remove(temp_file)

    ec = EventComments(taskid)
    comments = ec.getAllNewsComments()

    if not comments:
        return json.dumps({"status": "fail"})

    cal_results = comments_calculation_v2(
        comments, cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    # sentiment code -> label: neutral / positive / angry / sad
    senti_dict = {0: '中性', 1: '积极', 2: '愤怒', 3: '悲伤'}

    # Group comments by cluster id and by sentiment, counting as we go.
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    for comment in item_infos:
        if ('clusterid'
                in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

        if ('sentiment' in comment) and (comment['sentiment'] in senti_dict) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']
            senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
            sentiment_results.setdefault(sentiment, []).append(comment)

    # Cluster share of all clustered comments, keyed by top-3 feature words.
    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            ratio_results[','.join(
                feature[:3])] = float(count) / float(ratio_total_count)

    # Sentiment share, keyed by the human-readable label.
    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.iteritems():
        label = senti_dict.get(sentiment)
        if label and len(label):
            sentiratio_results[label] = float(count) / float(
                sentiratio_total_count)

    # Sentiment deduplication: group each sentiment's comments by source.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Sub-opinion (cluster) deduplication: group by "same_from" source.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            cluster_dump_dict[clusterid] = dump_dict

    # ``with`` guarantees the file handle is closed even on an exception.
    with open(temp_file, 'w') as dump_file:
        dump_file.write(json.dumps({"features": features,
                "senti_dump_dict": sentiment_dump_dict,
                "cluster_dump_dict": cluster_dump_dict,
                "ratio": ratio_results,
                "sentiratio": sentiratio_results}))

    return json.dumps({
        "ratio": ratio_results,
        "sentiratio": sentiratio_results
    })
Esempio n. 6
0
def comments_list():
    """Compute cluster/sentiment pie-chart data for a topic or sub-event.

    Reads parameters from the Flask ``request``, clusters the comments,
    writes the deduplication dictionaries to ``temp_file``, and returns
    the ratio data as a JSON string (or ``{"status": "fail"}`` when no
    comments are found).
    """
    # Remove a stale result file left over from a previous request.
    if os.path.exists(temp_file):
        os.remove(temp_file)
    AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../public/')
    sys.path.append(AB_PATH)
    from comment_module import comments_calculation_v2

    topicid = request.args.get('topicid', default_topic_id)
    subeventid = request.args.get('subeventid', 'global')
    min_cluster_num = request.args.get('min_cluster_num', default_min_cluster_num)
    max_cluster_num = request.args.get('max_cluster_num', default_max_cluster_num)
    cluster_eva_min_size = request.args.get('cluster_eva_min_size', default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)

    ec = EventComments(topicid)
    if subeventid == 'global':
        comments = ec.getAllNewsComments()
    else:
        comments = ec.getCommentsBySubeventid(subeventid)

    if not comments:
        return json.dumps({"status": "fail"})

    cal_results = comments_calculation_v2(comments, int(min_cluster_num), int(max_cluster_num), int(cluster_eva_min_size), vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    # sentiment code -> label: neutral / positive / angry / sad
    senti_dict = {
            0: '中性',
            1: '积极',
            2: '愤怒',
            3: '悲伤'
        }

    # Group comments by cluster id and by sentiment, counting as we go.
    cluster_ratio = dict()
    senti_ratio = dict()
    sentiment_results = dict()
    cluster_results = dict()
    for comment in item_infos:
        if ('clusterid' in comment) and (comment['clusterid'][:8] != 'nonsense'):
            clusterid = comment['clusterid']
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

        if ('sentiment' in comment) and (comment['sentiment'] in senti_dict) and ('clusterid' in comment) \
                and (comment['clusterid'][:8] != 'nonsense'):
            sentiment = comment['sentiment']
            senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
            sentiment_results.setdefault(sentiment, []).append(comment)

    # Cluster share of all clustered comments, keyed by top-3 feature words.
    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            ratio_results[','.join(feature[:3])] = float(count) / float(ratio_total_count)

    # Sentiment share, keyed by the human-readable label.
    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.iteritems():
        label = senti_dict.get(sentiment)
        if label and len(label):
            sentiratio_results[label] = float(count) / float(sentiratio_total_count)

    # Sentiment deduplication: group each sentiment's comments by source.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.iteritems():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Sub-opinion (cluster) deduplication: group by "same_from" source.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.iteritems():
        feature = features.get(clusterid)
        if feature and len(feature):
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            cluster_dump_dict[clusterid] = dump_dict

    # ``with`` guarantees the file handle is closed even on an exception.
    with open(temp_file, 'w') as dump_file:
        dump_file.write(json.dumps({"features": features,
                "senti_dump_dict": sentiment_dump_dict,
                "cluster_dump_dict": cluster_dump_dict}))

    return json.dumps({"ratio": ratio_results, "sentiratio": sentiratio_results})
Esempio n. 7
0
def comments_list():
    """Compute pie-chart data (cluster and sentiment ratios) for an uploaded
    task and write both the chart data and the deduplicated recommended
    texts to the task's result file.

    Request args:
        taskid               -- task id; also names the input/result files
        cluster_num          -- desired cluster count; the default sentinel
                                maps to -1 (let the calculator decide)
        cluster_eva_min_size -- minimum cluster size kept after evaluation
        vsm                  -- vector-space-model version for the calculator
        calcu                -- 1 (default) recompute; 0 reuse cached results

    Returns:
        JSON string with "ratio", "sentiratio", "before_filter_count" and
        "after_filter_count", or {"status": "fail"} when no comments exist.
    """
    taskid = request.args.get('taskid', default_task_id)
    cluster_num = request.args.get('cluster_num', default_cluster_num)
    if cluster_num == default_cluster_num:
        cluster_num = -1  # sentinel: let the calculator choose the count
    cluster_eva_min_size = request.args.get('cluster_eva_min_size', default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)
    calculation_label = int(request.args.get('calcu', 1))  # 0 = load cached result file

    params = {"taskid": taskid, "cluster_num": cluster_num, "cluster_eva_min_size": cluster_eva_min_size, \
            "vsm": vsm, "calculation_label": calculation_label}

    task_result_file = os.path.join(RESULT_FOLDER, taskid)
    if os.path.exists(task_result_file) and calculation_label == 0:
        # Serve previously computed results straight from the dump file.
        with open(task_result_file) as dump_file:
            dump_dict = json.loads(dump_file.read())

        return json.dumps({"ratio": dump_dict["ratio"], "sentiratio": dump_dict["sentiratio"], \
                "before_filter_count": dump_dict["before_filter_count"], \
                "after_filter_count": dump_dict["after_filter_count"]})

    # Load the input comments: one JSON object per line.
    comments = []
    task_input_file = os.path.join(UPLOAD_FOLDER, taskid)
    if os.path.exists(task_input_file):
        with open(task_input_file) as f:
            for line in f:
                comments.append(json.loads(line.strip()))

    if not comments:
        return json.dumps({"status": "fail"})

    logfile = os.path.join(LOG_FOLDER, taskid + '.log')

    cal_results = comments_calculation_v2(comments, logfile=logfile, cluster_num=cluster_num, \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    cluster_ratio = dict()      # clusterid -> comment count
    senti_ratio = dict()        # sentiment -> comment count
    sentiment_results = dict()  # sentiment -> [comments]
    cluster_results = dict()    # clusterid -> [comments]
    rub_results = []            # comments classified as nonsense ("rubbish")

    before_filter_count = len(item_infos)  # text count before filtering
    after_filter_count = 0                 # texts surviving both filters

    for comment in item_infos:
        # BUG FIX: the original indexed comment['clusterid'] unconditionally in
        # the nonsense branch, raising KeyError for comments without the key.
        clusterid = comment.get('clusterid')

        if clusterid is not None and clusterid[:8] != 'nonsense':
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

            # Sentiment stats only count comments that also survived the
            # nonsense filter (same condition as the original's second if).
            if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1):
                sentiment = comment['sentiment']
                senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
                sentiment_results.setdefault(sentiment, []).append(comment)
                after_filter_count += 1

        if clusterid is not None and clusterid[:8] == 'nonsense':
            rub_results.append(comment)

    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.items():
        feature = features.get(clusterid)
        if feature:
            # Label each pie slice with the cluster's top-3 feature words.
            ratio_results[','.join(feature[:3])] = float(count) / float(ratio_total_count)

    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.items():
        label = emotions_vk_v1.get(sentiment)
        if label:
            sentiratio_results[label] = float(count) / float(sentiratio_total_count)

    # Deduplicate sentiment texts: group by their "same_from_sentiment" id.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.items():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Deduplicate sub-opinion texts: group by their "same_from" id.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.items():
        feature = features.get(clusterid)
        if feature:
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            # Assign once after grouping (the original reassigned the same
            # dict on every inner-loop iteration).
            cluster_dump_dict[clusterid] = dump_dict

    # Persist the full result set so later calcu=0 requests can reuse it;
    # `with` guarantees the file is closed even if json.dumps fails.
    with open(task_result_file, 'w') as dump_file:
        dump_file.write(json.dumps({"params": params, "features": features, \
                "senti_dump_dict": sentiment_dump_dict, \
                "cluster_dump_dict": cluster_dump_dict, "ratio": ratio_results, \
                "sentiratio": sentiratio_results, \
                "before_filter_count": before_filter_count, \
                "after_filter_count": after_filter_count, \
                "rub_results": rub_results}))

    return json.dumps({"ratio": ratio_results, "sentiratio": sentiratio_results, \
            "before_filter_count": before_filter_count, "after_filter_count": after_filter_count})
# Esempio n. 8
# 0
def comments_list():
    """Compute pie-chart data (cluster and sentiment ratios) for a topic's
    comments and write the features plus deduplicated recommended texts to
    the shared temp file.

    Request args:
        topicid              -- event/topic identifier
        subeventid           -- sub-event id, or 'global' for all comments
        cluster_num          -- desired cluster count; the default sentinel
                                maps to -1 (let the calculator decide)
        cluster_eva_min_size -- minimum cluster size kept after evaluation
        vsm                  -- vector-space-model version for the calculator

    Returns:
        JSON string with "ratio" and "sentiratio" dicts, or
        {"status": "fail"} when the topic has no comments.
    """
    topicid = request.args.get('topicid', default_topic_id)
    subeventid = request.args.get('subeventid', 'global')
    cluster_num = request.args.get('cluster_num', default_cluster_num)
    if cluster_num == default_cluster_num:
        cluster_num = -1  # sentinel: let the calculator choose the count
    cluster_eva_min_size = request.args.get('cluster_eva_min_size', default_cluster_eva_min_size)
    vsm = request.args.get('vsm', default_vsm)

    ec = EventComments(topicid)
    if subeventid == 'global':
        comments = ec.getAllNewsComments()
    else:
        comments = ec.getCommentsBySubeventid(subeventid)

    if not comments:
        return json.dumps({"status": "fail"})

    cal_results = comments_calculation_v2(comments, cluster_num=int(cluster_num), \
            cluster_eva_min_size=int(cluster_eva_min_size), version=vsm)
    features = cal_results['cluster_infos']['features']
    item_infos = cal_results['item_infos']

    cluster_ratio = dict()      # clusterid -> comment count
    senti_ratio = dict()        # sentiment -> comment count
    sentiment_results = dict()  # sentiment -> [comments]
    cluster_results = dict()    # clusterid -> [comments]

    for comment in item_infos:
        clusterid = comment.get('clusterid')
        if clusterid is not None and clusterid[:8] != 'nonsense':
            cluster_ratio[clusterid] = cluster_ratio.get(clusterid, 0) + 1
            cluster_results.setdefault(clusterid, []).append(comment)

            # Sentiment stats only count comments that also survived the
            # nonsense filter (same condition as the original's second if).
            if ('sentiment' in comment) and (comment['sentiment'] in emotions_vk_v1):
                sentiment = comment['sentiment']
                senti_ratio[sentiment] = senti_ratio.get(sentiment, 0) + 1
                sentiment_results.setdefault(sentiment, []).append(comment)

    ratio_results = dict()
    ratio_total_count = sum(cluster_ratio.values())
    for clusterid, count in cluster_ratio.items():
        feature = features.get(clusterid)
        if feature:
            # Label each pie slice with the cluster's top-3 feature words.
            ratio_results[','.join(feature[:3])] = float(count) / float(ratio_total_count)

    sentiratio_results = dict()
    sentiratio_total_count = sum(senti_ratio.values())
    for sentiment, count in senti_ratio.items():
        label = emotions_vk_v1.get(sentiment)
        if label:
            sentiratio_results[label] = float(count) / float(sentiratio_total_count)

    # Deduplicate sentiment texts: group by their "same_from_sentiment" id.
    sentiment_dump_dict = dict()
    for sentiment, contents in sentiment_results.items():
        dump_dict = dict()
        for comment in contents:
            dump_dict.setdefault(comment["same_from_sentiment"], []).append(comment)
        sentiment_dump_dict[sentiment] = dump_dict

    # Deduplicate sub-opinion texts: group by their "same_from" id.
    cluster_dump_dict = dict()
    for clusterid, contents in cluster_results.items():
        feature = features.get(clusterid)
        if feature:
            dump_dict = dict()
            for comment in contents:
                dump_dict.setdefault(comment["same_from"], []).append(comment)
            # Assign once after grouping (the original reassigned the same
            # dict on every inner-loop iteration).
            cluster_dump_dict[clusterid] = dump_dict

    # `with` guarantees the dump file is closed even if serialization fails.
    with open(temp_file, 'w') as dump_file:
        dump_file.write(json.dumps({"features": features, \
                "senti_dump_dict": sentiment_dump_dict, \
                "cluster_dump_dict": cluster_dump_dict}))

    return json.dumps({"ratio": ratio_results, "sentiratio": sentiratio_results})