Example #1
def text_kmeans_clustering():
    # number of top-frequency words (TOPK_FREQ_WORD) used during cluster evaluation
    TOPK_FREQ_WORD = 50

    # minimum cluster size during cluster evaluation
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # kmeans clustering and evaluation
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, \
                top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)

            for item in items:
                news = News(item['news_id'])

                if label == 'other':
                    label = news.otherClusterId

                comment = Comment(item['_id'])
                comment.update_comment_label(label)

            eventcomment.save_cluster(label, news_id, int(time.time()))

        # compute feature words for each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # compute each comment's weight within its cluster
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            comment = Comment(r['_id'])
            comment.update_comment_weight(weight)
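
A minimal invocation sketch for the function above. `topicid` is read inside the function but never defined there, so it is assumed to be a module-level variable in the original source; the id below is illustrative only.

topicid = '54916b0d955230e752f2a94e'  # hypothetical topic id (module-level, as the function assumes)
text_kmeans_clustering()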
Example #2
def comments_rubbish_clustering_calculation(comments, logger, cluster_num=COMMENT_WORDS_CLUSTER_NUM, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """评论垃圾过滤、聚类
       input: comments
           comment中包含news_id, news_content
       cluster_infos: 聚簇信息
       item_infos:单条信息列表, 数据字段:clusterid、weight、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm classifier, news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for the 'other' category
    OTHER_CLUSTER_ID = 'other'

    # minimum number of items required for clustering; below this, skip it
    MIN_CLUSTERING_INPUT = 30

    # cluster-level info, mainly each cluster's feature words
    clusters_infos = {'features': dict()}

    # strip stale fields: label, ad_label, subob_label, rub_label, weight
    clear_keys = ['label', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

        inputs.append(r)
    comments = inputs

    # per-item list; each item stores clusterid, weight and sentiment fields
    items_infos = []

    # data field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except Exception:
            # fall back to the raw text when content168 is missing or already encoded
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'].encode('utf-8')
        else:
            r['news_content'] = ''

        # simple rule-based ad filtering
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)

    # svm rubbish removal
    items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)

    # group comments by news article
    results = comment_news(inputs)

    final_inputs = []
    for news_id, _inputs in results.iteritems():
        # filter comments against their news article
        _inputs = filter_comment(_inputs)
        inputs = [r for r in _inputs if r['rub_label'] == 0]
        inputs_rubbish = [r for r in _inputs if r['rub_label'] == 1]
        for r in inputs_rubbish:
            r['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(r)

        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, \
                    cluster_num=cluster_num, version=version)

            # comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation: weights and cluster labels
            recommend_text = cluster_evaluation(evaluation_inputs, min_size=cluster_eva_min_size)
            for label, items in recommend_text.iteritems():
                if label != OTHER_CLUSTER_ID:
                    clusters_infos['features'][label] = results[label]
                    logger.debug('cluster %s features: %s' % (label, results[label]))
                    for item in items:
                        item['clusterid'] = label

                    final_inputs.extend(items)
                else:
                    for item in items:
                        item['clusterid'] = OTHER_CLUSTER_ID
                    items_infos.extend(items)
        else:
            # fewer items than MIN_CLUSTERING_INPUT: show the item list directly
            tfidf_word, input_dict = tfidf_v2(inputs)
            uuid_label = str(uuid.uuid4())
            clusters_infos['features'][uuid_label] = [kw for kw, count in tfidf_word]
            logger.debug('direct cluster %s features: %s' % (uuid_label, clusters_infos['features'][uuid_label]))
            for r in inputs:
                r['clusterid'] = uuid_label
                r['weight'] = global_weight_cal_tfidf(tfidf_word, r)

            final_inputs.extend(inputs)

    # deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]

    for clusterid, items in cluster_items.iteritems():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
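
A hedged sketch of consuming the return value, based only on the contract stated in the docstring; `comments` and `logger` are assumed to be prepared by the caller.

result = comments_rubbish_clustering_calculation(comments, logger)
for clusterid, fwords in result['cluster_infos']['features'].iteritems():
    print clusterid, fwords                      # feature words per cluster
for item in result['item_infos']:
    print item['clusterid'], item.get('weight')  # per-item cluster and (optional) weight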
Example #3
def comments_sentiment_rubbish_calculation(comments, logger):
    """输入为一堆comments, 字段包括title、content168
       输出:
           item_infos:单条信息列表, 数据字段:sentiment、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm classifier, news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'

    # per-item list; each item stores clusterid, weight and sentiment fields
    items_infos = []

    # strip stale fields: sentiment, label, clusterid, ad_label, subob_label, rub_label, weight
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

        inputs.append(r)
    comments = inputs

    # data field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except Exception:
            # fall back to the raw text when content168 is missing or already encoded
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']

        inputs.append(r)

    # first the neutral classifier, then the 3-class sentiment classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)

        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0 # neutral

        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # news classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # subjective/objective classifier marked it as news
            r['sentiment'] = NON_CLUSTER_ID + '_news' # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # rubbish removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # svm rubbish classifier
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filtering
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'

        items_infos.append(item)

    # deduplicate texts within each sentiment category
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
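
The group-by idiom used above (append, catch KeyError, create the list) can be written more directly with collections.defaultdict; a behavior-equivalent sketch for the deduplication step:

from collections import defaultdict

sentiment_dict = defaultdict(list)
for item in items_infos:
    if 'sentiment' in item:
        sentiment_dict[item['sentiment']].append(item)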
Example #4
def one_topic_calculation_comments_v2(topicid):
    """对评论进行聚类
    """
    import math  # math.ceil is used below
    import time  # time.time is used below
    from comment_clustering_tfidf_v2 import kmeans, tfidf_v2, text_classify, cluster_evaluation, global_text_weight

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        tfidf_word = tfidf_v2(inputs)

        # number of clusters = ceil(filtered text count / 5), clamped to [5, 10]
        kmeans_cluster_number = int(math.ceil(float(len(inputs)) / 5.0))
        if kmeans_cluster_number > 10:
            kmeans_cluster_number = 10
        if kmeans_cluster_number < 5:
            kmeans_cluster_number = 5

        # cluster the comment words
        word_label = kmeans(tfidf_word, inputs, k=kmeans_cluster_number)

        # compute global text weight
        for r in inputs:
            gweight = global_text_weight(r['content'], tfidf_word)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_global_weight(gweight)

        # assign comment texts to clusters
        results = text_classify(inputs, word_label, tfidf_word)

        # cluster evaluation
        reserved_num = int(math.ceil(float(kmeans_cluster_number) / 2.0))
        LEAST_CLUSTER_SIZE = 3 # minimum cluster size
        TOPK_FREQ = 10
        TOPK_WEIGHT = 5
        LEAST_FREQ = 0
        final_cluster_results = cluster_evaluation(results, top_num=reserved_num, topk_freq=TOPK_FREQ, \
                least_freq=LEAST_FREQ, least_size=LEAST_CLUSTER_SIZE, topk_weight=TOPK_WEIGHT)
        for label, items in final_cluster_results.iteritems():
            if label == 'other':
                label = news.otherClusterId

            if len(items):
                eventcomment.save_cluster(label, news_id, int(time.time()))

            if label != news.otherClusterId:
                fwords = word_label[label]
                eventcomment.update_feature_words(label, fwords)

            for item in items:
                comment = Comment(item['_id'], topicid)
                comment.update_comment_label(label)
                comment.update_comment_weight(item['weight'])
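
The cluster-count rule above is easy to misread, so here is a tiny self-contained restatement (pick_cluster_number is our name, not the codebase's):

import math

def pick_cluster_number(n_texts, lo=5, hi=10):
    # ceil(n / 5), clamped to [lo, hi] -- the same rule as kmeans_cluster_number above
    return max(lo, min(hi, int(math.ceil(n_texts / 5.0))))

assert pick_cluster_number(12) == 5   # ceil(12/5) = 3, clamped up to 5
assert pick_cluster_number(33) == 7   # ceil(33/5) = 7
assert pick_cluster_number(80) == 10  # ceil(80/5) = 16, capped at 10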
Example #5
def one_topic_calculation_comments_v7(topicid):
    """对评论进行聚类
    """
    import time  # time.time is used below
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
            cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # data field preprocessing
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # remove rubbish and news texts
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            if subob_rub_neu_label not in [1, 0]:
                # 1 means rubbish text, 0 means news text
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10
        # TODO: tfidf words, automatic cluster-number selection and the vsm attributes should also be configurable
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId

                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))

                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)

                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])

        # sentiment calculation
        for r in inputs:
            sentiment = 0  # default: neutral, guards against unexpected labels
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0 # 0 neutral
            elif r['subob_rub_neu_label'] == -1:
                sentiment = triple_classifier(r) # 1 happy, 2 angry, 3 sad, 0 no emotion
                if sentiment == 0:
                    sentiment = mid_sentiment_classify(r['text'])

                if sentiment == -1:
                    sentiment = 0 # neutral

            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
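
The sentiment fallback chain at the end is the subtlest part, so here is a self-contained restatement with the two classifier outputs passed in as plain values (resolve_sentiment is our name, not the codebase's):

def resolve_sentiment(subob_rub_neu_label, triple, mid):
    sentiment = 0                  # default: neutral (covers label 2)
    if subob_rub_neu_label == -1:
        sentiment = triple         # 1 happy, 2 angry, 3 sad, 0 none
        if sentiment == 0:
            sentiment = mid        # second-stage classifier
        if sentiment == -1:
            sentiment = 0          # map 'undecided' back to neutral
    return sentiment

assert resolve_sentiment(2, 3, 3) == 0    # pre-classified neutral wins
assert resolve_sentiment(-1, 1, 0) == 1   # first stage decides
assert resolve_sentiment(-1, 0, 2) == 2   # second stage supplies the emotion
assert resolve_sentiment(-1, 0, -1) == 0  # both stages undecided -> neutral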
Example #6
def one_topic_calculation_comments_v4(topicid):
    """对评论进行聚类
    """
    import time  # time.time is used below
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
            cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # for k, v in results.iteritems():
            #     print k, len(v)

            # comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId

                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))

                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)

                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])
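
The three versions above share one calling convention; a hedged driver sketch, where the topic id is illustrative only:

topicid = '54916b0d955230e752f2a94e'        # hypothetical topic id
one_topic_calculation_comments_v2(topicid)  # fixed-formula kmeans over tfidf words
one_topic_calculation_comments_v7(topicid)  # adds subjective/rubbish/neutral pre-filtering
one_topic_calculation_comments_v4(topicid)  # auto cluster count in 2..15, needs >= 50 comments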
Example #7
def comments_rubbish_clustering_calculation(comments, cluster_num, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """评论垃圾过滤、聚类
       input: comments
           comment中包含news_id, news_content
       cluster_infos: 聚簇信息
       item_infos:单条信息列表, 数据字段:clusterid、weight、same_from、duplicate
    """
    import copy  # copy.deepcopy is used below

    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm classifier, news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for the 'other' category
    OTHER_CLUSTER_ID = 'other'

    # clusterid for items shown directly
    DIRECT_CLUSTER_ID = 'direct'
    DIRECT_CLUSTER_FEATURE = [u'聚簇']  # literal feature word, meaning 'cluster'

    # minimum number of items required for clustering; below this, skip it
    MIN_CLUSTERING_INPUT = 20

    # cluster-level info, mainly each cluster's feature words
    clusters_infos = {'features': dict()}

    # per-item list; each item stores clusterid, weight and sentiment fields
    items_infos = []

    # data field preprocessing
    print('\tData preprocessing...')
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content']  #.encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content']  #.encode('utf-8')
        else:
            r['news_content'] = ''

        # simple rule-based ad filtering
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
    print('\tAd filter kept %d items; %d set aside' %
          (len(inputs), len(items_infos)))

    # svm rubbish removal
    print('\tSvm rubbish classify...')
    if len(inputs) == 0:
        items = []
    else:
        items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)
    print('\tSvm rubbish classifier kept %d items; %d set aside' %
          (len(inputs), len(items_infos)))

    # start opinion clustering
    print('\tStart clustering opinions...')
    opinion_name, word_result, text_list, word_main = opinion_main(
        inputs, cluster_num)
    # if len(inputs) >= 500:
    #     opinion_name,word_result,text_list = opinion_main(inputs,10)
    # else:
    #     opinion_name,word_result,text_list = opinion_main(inputs,5)
    print('\tEnd clustering opinions...')

    for k, v in word_result.items():
        #name = opinion_name[k]
        clusters_infos['features'][k] = v
    clusters_infos['word_main'] = word_main

    final_inputs = []
    for k, v in text_list.items():
        for item in v:
            row = copy.deepcopy(item)
            row['clusterid'] = k
            final_inputs.append(row)

    # deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]

    for clusterid, items in cluster_items.items():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
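
A hedged call sketch for this variant (python 3 style, matching the print() calls above); `comments` is assumed to be a list of dicts carrying at least a 'content' field:

result = comments_rubbish_clustering_calculation(comments, cluster_num=8)
for clusterid, fwords in result['cluster_infos']['features'].items():
    print(clusterid, fwords)                 # feature words per opinion cluster
print(result['cluster_infos']['word_main']) # summary words from opinion_main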
Example #8
def comments_rubbish_clustering_calculation(comments, logger, cluster_num=COMMENT_WORDS_CLUSTER_NUM, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """评论垃圾过滤、聚类
       input: comments
           comment中包含news_id, news_content
       cluster_infos: 聚簇信息
       item_infos:单条信息列表, 数据字段:clusterid、weight、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm classifier, news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for the 'other' category
    OTHER_CLUSTER_ID = 'other'

    # minimum number of items required for clustering; below this, skip it
    MIN_CLUSTERING_INPUT = 30

    # cluster-level info, mainly each cluster's feature words
    clusters_infos = {'features': dict()}

    # strip stale fields: label, ad_label, subob_label, rub_label, weight
    clear_keys = ['label', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

        inputs.append(r)
    comments = inputs

    # per-item list; each item stores clusterid, weight and sentiment fields
    items_infos = []

    # data field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'].encode('utf-8')
        else:
            r['news_content'] = ''

        # simple rule-based ad filtering
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)

    # svm rubbish removal
    items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)

    # group comments by news article
    results = comment_news(inputs)

    final_inputs = []
    for news_id, _inputs in results.iteritems():
        # filter comments against their news article
        _inputs = filter_comment(_inputs)
        inputs = [r for r in _inputs if r['rub_label'] == 0]
        inputs_rubbish = [r for r in _inputs if r['rub_label'] == 1]
        for r in inputs_rubbish:
            r['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(r)

        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, \
                    cluster_num=cluster_num, version=version)

            # comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation: weights and cluster labels
            recommend_text = cluster_evaluation(evaluation_inputs, min_size=cluster_eva_min_size)
            for label, items in recommend_text.iteritems():
                if label != OTHER_CLUSTER_ID:
                    clusters_infos['features'][label] = results[label]

                    for item in items:
                        item['clusterid'] = label

                    final_inputs.extend(items)
                else:
                    for item in items:
                        item['clusterid'] = OTHER_CLUSTER_ID
                    items_infos.extend(items)
        else:
            # fewer items than MIN_CLUSTERING_INPUT: show the item list directly
            tfidf_word, input_dict = tfidf_v2(inputs)
            uuid_label = str(uuid.uuid4())
            clusters_infos['features'][uuid_label] = [kw for kw, count in tfidf_word]
            for r in inputs:
                r['clusterid'] = uuid_label
                r['weight'] = global_weight_cal_tfidf(tfidf_word, r)

            final_inputs.extend(inputs)

    # deduplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]

    for clusterid, items in cluster_items.iteritems():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
Example #9
def comments_sentiment_rubbish_calculation(comments, logger):
    """输入为一堆comments, 字段包括title、content168
       输出:
           item_infos:单条信息列表, 数据字段:sentiment、same_from、duplicate
    """
    # clusterid for meaningless items: ads from ad_filter, rubbish from the svm classifier, news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'

    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'

    # per-item list; each item stores clusterid, weight and sentiment fields
    items_infos = []

    # strip stale fields: sentiment, label, clusterid, ad_label, subob_label, rub_label, weight
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

        inputs.append(r)
    comments = inputs

    # data field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']

        inputs.append(r)

    # first the neutral classifier, then the 3-class sentiment classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)

        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0 # neutral

        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # news classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # subjective/objective classifier marked it as news
            r['sentiment'] = NON_CLUSTER_ID + '_news' # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # rubbish removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # svm rubbish classifier
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filtering
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'

        items_infos.append(item)

    # deduplicate texts within each sentiment category
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
Example #10
from sta_ad import test, choose_ad

def rubbish_filter(items, topicid):
    """svm 垃圾过滤器
    """
    texts = [item['content168'] for item in items]
    test(texts, topicid)
    labels = choose_ad(topicid)

    return labels

if __name__ == "__main__":
    topic = "APEC2014"
    topicid = "54916b0d955230e752f2a94e"
    mongo = _default_mongo(usedb=MONGO_DB_NAME)
    results = mongo[COMMENT_COLLECTION + topicid].find()
    results = list(results)

    from ad_filter import ad_filter

    rubbish_filter_inputs = []
    for r in results:
        r['content168'] = r['content168'].encode('utf-8')
        text, label = ad_filter(r)
        if label == 0:
            rubbish_filter_inputs.append(r)

    # libsvm rubbish filtering
    results = rubbish_filter(rubbish_filter_inputs, topicid)
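
The script stops at the raw label list; a hedged follow-up sketch, under the assumption stated in the comment below:

    # assumption: choose_ad returns one label per input, positionally aligned
    kept = [r for r, label in zip(rubbish_filter_inputs, results) if label == 0]
    print '%d of %d comments kept after libsvm filtering' % (len(kept), len(rubbish_filter_inputs))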