def text_kmeans_clustering():
    """Cluster comments per news item with k-means, then persist sentiment,
    cluster labels, feature words and text weights.

    NOTE(review): `topicid` is read from an enclosing/global scope — confirm it
    is bound before this function runs, otherwise this raises NameError.
    """
    # Number of top-frequency words used during cluster evaluation
    TOPK_FREQ_WORD = 50
    # Minimum cluster size kept during evaluation
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()
    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        # Field preprocessing + rule-based ad filtering; only non-ads are kept.
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation, persisted per comment.
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # K-means clustering and cluster evaluation.
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, \
                top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=None)
        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)
            for item in items:
                news = News(item['news_id'])
                # NOTE(review): once `label` is reassigned to news.otherClusterId
                # here, later iterations of this inner loop no longer compare
                # equal to 'other' — confirm this is intended.
                if label == 'other':
                    label = news.otherClusterId
                comment = Comment(item['_id'])
                comment.update_comment_label(label)
                eventcomment.save_cluster(label, news_id, int(time.time()))

        # Feature words per cluster.
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # Text weight per comment within its cluster.
        for input in inputs:
            weight = text_weight_cal(input, cluster_feature[input['label']])
            comment = Comment(input['_id'])
            comment.update_comment_weight(weight)
def comments_rubbish_clustering_calculation(comments, logger, cluster_num=COMMENT_WORDS_CLUSTER_NUM, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Rubbish-filter and cluster a batch of comments.

    Input:
        comments: list of comment dicts; each carries news_id and news_content.
    Returns a dict with:
        cluster_infos: cluster information (feature words per cluster id)
        item_infos: per-item list; items gain clusterid, weight, same_from,
                    duplicate fields.

    Fixes vs. original: removed leftover debug `print` statements (the sibling
    definition of this function has none), removed the no-op
    `item['weight'] = item['weight']`, stopped rebinding the `results` dict
    while iterating it, and narrowed the bare `except:`.
    """
    # clusterid for meaningless items (ads from ad_filter, svm rubbish,
    # news texts from the subjectivity classifier)
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the catch-all "other" cluster
    OTHER_CLUSTER_ID = 'other'
    # Minimum number of items required to run clustering at all
    MIN_CLUSTERING_INPUT = 30

    # Cluster info, mainly feature words per cluster
    clusters_infos = {'features': dict()}

    # Strip stale labels left over from a previous run
    clear_keys = ['label', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # Per-item result list; each item stores clusterid / weight / sentiment
    items_infos = []

    # Field preprocessing: normalize text fields to utf-8 byte strings (Py2),
    # then rule-based ad filtering.
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except Exception:
            # content168 missing or not encodable — fall back to `text`
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'].encode('utf-8')
        else:
            r['news_content'] = ''
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)

    # SVM rubbish removal
    items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)

    # Group comments by their news item
    news_comments = comment_news(inputs)
    final_inputs = []
    for news_id, _inputs in news_comments.iteritems():
        # Filter comments against the news text
        _inputs = filter_comment(_inputs)
        inputs = [r for r in _inputs if r['rub_label'] == 0]
        inputs_rubbish = [r for r in _inputs if r['rub_label'] == 1]
        for r in inputs_rubbish:
            r['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(r)

        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            cluster_results = choose_cluster(tfidf_word, inputs, \
                    cluster_num=cluster_num, version=version)

            # Comment text clustering
            cluster_text = text_classify(inputs, cluster_results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation: weights and cluster labels
            recommend_text = cluster_evaluation(evaluation_inputs, min_size=cluster_eva_min_size)
            for label, items in recommend_text.iteritems():
                if label != OTHER_CLUSTER_ID:
                    clusters_infos['features'][label] = cluster_results[label]
                    for item in items:
                        item['clusterid'] = label
                    final_inputs.extend(items)
                else:
                    for item in items:
                        item['clusterid'] = OTHER_CLUSTER_ID
                    items_infos.extend(items)
        else:
            # Too few items to cluster — show them directly under one uuid label
            tfidf_word, input_dict = tfidf_v2(inputs)
            uuid_label = str(uuid.uuid4())
            clusters_infos['features'][uuid_label] = [kw for kw, count in tfidf_word]
            for r in inputs:
                r['clusterid'] = uuid_label
                r['weight'] = global_weight_cal_tfidf(tfidf_word, r)
            final_inputs.extend(inputs)

    # De-duplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.iteritems():
        dedup_results = duplicate(items)
        items_infos.extend(dedup_results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
def comments_sentiment_rubbish_calculation(comments, logger):
    """Sentiment-classify a batch of comments while filtering rubbish/news.

    Input: comment dicts carrying title and content168.
    Output: {'item_infos': [...]} where each item gains sentiment, same_from
    and duplicate fields.
    """
    # sentiment prefix for meaningless items (ads from ad_filter, svm rubbish,
    # news texts from the subjectivity classifier)
    NON_CLUSTER_ID = 'nonsense'
    # sentiment prefix for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'
    # Per-item result list; each item stores clusterid / weight / sentiment
    items_infos = []

    # Strip stale labels left over from a previous run
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # Field preprocessing: normalize text fields to utf-8 byte strings (Py2)
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except:
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        inputs.append(r)

    # Neutral classifier first, then the 3-way sentiment classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # Sentiment adjustment for items still classified as neutral
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral
        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # News classification via the subjective/objective classifier
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # objective text -> treated as news
            r['sentiment'] = NON_CLUSTER_ID + '_news'
            items_infos.append(r)
        else:
            inputs.append(r)

    # Rubbish removal: svm first, then rule-based ad filter
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # svm-detected rubbish
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # De-duplicate within each sentiment category
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]
    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
def one_topic_calculation_comments_v2(topicid):
    """Cluster the comments of every news item under one topic (v2 pipeline).

    Persists per-comment sentiment, global weight, cluster label and weight,
    plus per-cluster feature words.
    """
    from comment_clustering_tfidf_v2 import kmeans, tfidf_v2, text_classify, cluster_evaluation, global_text_weight

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()
    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing + rule-based ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation, persisted per comment
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        tfidf_word = tfidf_v2(inputs)

        # Cluster count = ceil(filtered_count / 5), clamped to [5, 10].
        # (The original comment claimed "/2"; the code actually divides by 5.)
        kmeans_cluster_number = int(math.ceil(float(len(inputs)) / 5.0))
        if kmeans_cluster_number > 10:
            kmeans_cluster_number = 10
        if kmeans_cluster_number < 5:
            kmeans_cluster_number = 5

        # Cluster comment words
        word_label = kmeans(tfidf_word, inputs, k=kmeans_cluster_number)

        # Global text weight, persisted per comment
        for r in inputs:
            gweight = global_text_weight(r['content'], tfidf_word)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_global_weight(gweight)

        # Assign comment texts to word clusters
        results = text_classify(inputs, word_label, tfidf_word)

        # Cluster evaluation
        reserved_num = int(math.ceil(float(kmeans_cluster_number) / 2.0))
        LEAST_CLUSTER_SIZE = 3  # minimum accepted cluster size
        TOPK_FREQ = 10
        TOPK_WEIGHT = 5
        LEAST_FREQ = 0
        final_cluster_results = cluster_evaluation(results, top_num=reserved_num, topk_freq=TOPK_FREQ, \
                least_freq=LEAST_FREQ, least_size=LEAST_CLUSTER_SIZE, topk_weight=TOPK_WEIGHT)

        for label, items in final_cluster_results.iteritems():
            if label == 'other':
                label = news.otherClusterId
            if len(items):
                eventcomment.save_cluster(label, news_id, int(time.time()))
                if label != news.otherClusterId:
                    fwords = word_label[label]
                    eventcomment.update_feature_words(label, fwords)
                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])
def one_topic_calculation_comments_v7(topicid):
    """Cluster the comments of every news item under one topic (v7 pipeline).

    Uses the weibo subjective/rubbish/neutral classifier to drop rubbish and
    news texts before clustering, then persists labels, weights, feature words
    and sentiment.
    """
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
            cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()
    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing + rule-based ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Drop rubbish and news texts
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            if not subob_rub_neu_label in [1, 0]:
                # 1 = rubbish text, 0 = news text
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10
        # TFIDF words and automatic cluster-count selection; the vsm attributes
        # could also be made parameters.
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # Comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k,v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                    if label != news.otherClusterId:
                        fwords = results[label]
                        eventcomment.update_feature_words(label, fwords)
                    for item in items:
                        comment = Comment(item['_id'], topicid)
                        comment.update_comment_label(label)
                        comment.update_comment_weight(item['weight'])

        # Sentiment calculation
        # NOTE(review): if subob_rub_neu_label is neither 2 nor -1, `sentiment`
        # keeps its value from the previous iteration (or is unbound on the
        # first) — confirm the classifier only emits {2, -1} here.
        for r in inputs:
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0  # 0 = neutral
            elif r['subob_rub_neu_label'] == -1:
                sentiment = triple_classifier(r)  # 1 happy, 2 angry, 3 sad, 0 no emotion
            if sentiment == 0:
                sentiment = mid_sentiment_classify(r['text'])
            if sentiment == -1:
                sentiment = 0  # neutral
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
def one_topic_calculation_comments_v4(topicid):
    """Cluster the comments of every news item under one topic (v4 pipeline)."""
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
            cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()
    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing + rule-based ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation, persisted per comment
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)
            # for k, v in results.iteritems():
            #     print k, len(v)

            # Comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k,v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                    if label != news.otherClusterId:
                        fwords = results[label]
                        eventcomment.update_feature_words(label, fwords)
                    for item in items:
                        comment = Comment(item['_id'], topicid)
                        comment.update_comment_label(label)
                        comment.update_comment_weight(item['weight'])
def one_topic_calculation_comments_v7(topicid):
    """Cluster the comments of every news item under one topic (v7 pipeline).

    NOTE(review): duplicate re-definition — an earlier, essentially identical
    `one_topic_calculation_comments_v7` exists in this file; this later one
    shadows it at import time. Confirm which copy is intended and remove the
    other.
    """
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
            cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()
    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing + rule-based ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Drop rubbish and news texts
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            if not subob_rub_neu_label in [1, 0]:
                # 1 = rubbish text, 0 = news text
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10
        # TFIDF words and automatic cluster-count selection; the vsm attributes
        # could also be made parameters.
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # Comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                    if label != news.otherClusterId:
                        fwords = results[label]
                        eventcomment.update_feature_words(label, fwords)
                    for item in items:
                        comment = Comment(item['_id'], topicid)
                        comment.update_comment_label(label)
                        comment.update_comment_weight(item['weight'])

        # Sentiment calculation
        # NOTE(review): if subob_rub_neu_label is neither 2 nor -1, `sentiment`
        # keeps its value from the previous iteration (or is unbound on the
        # first) — confirm the classifier only emits {2, -1} here.
        for r in inputs:
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0  # 0 = neutral
            elif r['subob_rub_neu_label'] == -1:
                sentiment = triple_classifier(r)  # 1 happy, 2 angry, 3 sad, 0 no emotion
            if sentiment == 0:
                sentiment = mid_sentiment_classify(r['text'])
            if sentiment == -1:
                sentiment = 0  # neutral
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
def one_topic_calculation_comments_v4(topicid):
    """Cluster the comments of every news item under one topic (v4 pipeline).

    NOTE(review): duplicate re-definition — an earlier, essentially identical
    `one_topic_calculation_comments_v4` exists in this file; this later one
    shadows it at import time. Confirm which copy is intended and remove the
    other.
    """
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
            cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()
    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # Field preprocessing + rule-based ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # Sentiment calculation, persisted per comment
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)
            # for k, v in results.iteritems():
            #     print k, len(v)

            # Comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                    if label != news.otherClusterId:
                        fwords = results[label]
                        eventcomment.update_feature_words(label, fwords)
                    for item in items:
                        comment = Comment(item['_id'], topicid)
                        comment.update_comment_label(label)
                        comment.update_comment_weight(item['weight'])
def comments_rubbish_clustering_calculation(comments, cluster_num, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Rubbish-filter and cluster comments via the opinion_main pipeline.

    Input:
        comments: comment dicts carrying news_id and news_content.
    Returns:
        cluster_infos: cluster info (feature words, word_main)
        item_infos: per-item list with clusterid, weight, same_from, duplicate.

    NOTE(review): this re-defines `comments_rubbish_clustering_calculation`
    with a different signature (positional `cluster_num`, no `logger`) and
    shadows/ is shadowed by the other definitions of the same name in this
    file — confirm which variant callers expect. Unlike the rest of the file,
    this variant uses print()/.items() (Python-3 compatible) and does not
    encode text fields.
    """
    # clusterid for meaningless items (ads from ad_filter, svm rubbish,
    # news texts from the subjectivity classifier)
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the catch-all "other" cluster
    OTHER_CLUSTER_ID = 'other'
    # clusterid for directly-displayed items
    DIRECT_CLUSTER_ID = 'direct'
    DIRECT_CLUSTER_FEATURE = [u'聚簇']
    # Minimum number of items required to run clustering
    MIN_CLUSTERING_INPUT = 20

    # Cluster info, mainly feature words per cluster
    clusters_infos = {'features': dict()}
    # Per-item result list; each item stores clusterid / weight / sentiment
    items_infos = []

    # Field preprocessing
    print('\tData preprocess...')
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content'] #.encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'] #.encode('utf-8')
        else:
            r['news_content'] = ''
        # Simple rule-based ad filter
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
    print('\tAd filter %d data, data list have: %d' % (len(inputs), len(items_infos)))

    # SVM rubbish removal
    print('\tSvm rubbish classify...')
    if len(inputs) == 0:
        items = []
    else:
        items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)
    print('\tSvm rubbish classify %d data, data list have: %d' % (len(inputs), len(items_infos)))

    # Opinion clustering
    print('\tStart clustring opinion...')
    opinion_name, word_result, text_list, word_main = opinion_main(
        inputs, cluster_num)
    # if len(inputs) >= 500:
    #     opinion_name,word_result,text_list = opinion_main(inputs,10)
    # else:
    #     opinion_name,word_result,text_list = opinion_main(inputs,5)
    print('\tEnd clustring opinion...')

    # Record feature words per cluster
    for k, v in word_result.items():
        #name = opinion_name[k]
        clusters_infos['features'][k] = v
    clusters_infos['word_main'] = word_main

    # Flatten clustered texts, tagging each row with its cluster id
    final_inputs = []
    for k, v in text_list.items():
        for item in v:
            row = copy.deepcopy(item)
            row['clusterid'] = k
            final_inputs.append(row)

    # De-duplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.items():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
def comments_rubbish_clustering_calculation(comments, logger, cluster_num=COMMENT_WORDS_CLUSTER_NUM, \
        cluster_eva_min_size=CLUSTER_EVA_MIN_SIZE, \
        version=COMMENT_CLUSTERING_PROCESS_FOR_CLUTO_VERSION):
    """Rubbish-filter and cluster a batch of comments.

    Input:
        comments: comment dicts carrying news_id and news_content.
    Returns:
        cluster_infos: cluster info (feature words per cluster id)
        item_infos: per-item list with clusterid, weight, same_from, duplicate.

    NOTE(review): this re-defines `comments_rubbish_clustering_calculation`;
    other definitions of the same name exist in this file — confirm which one
    callers should see.
    """
    # clusterid for meaningless items (ads from ad_filter, svm rubbish,
    # news texts from the subjectivity classifier)
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for the catch-all "other" cluster
    OTHER_CLUSTER_ID = 'other'
    # Minimum number of items required to run clustering
    MIN_CLUSTERING_INPUT = 30

    # Cluster info, mainly feature words per cluster
    clusters_infos = {'features': dict()}

    # Strip stale labels left over from a previous run
    clear_keys = ['label', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # Per-item result list; each item stores clusterid / weight / sentiment
    items_infos = []

    # Field preprocessing: normalize text fields to utf-8 byte strings (Py2)
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        if 'news_content' in r and r['news_content']:
            r['news_content'] = r['news_content'].encode('utf-8')
        else:
            r['news_content'] = ''
        # Simple rule-based ad filter
        item = ad_filter(r)
        if item['ad_label'] == 0:
            inputs.append(item)
        else:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)

    # SVM rubbish removal
    items = rubbish_classifier(inputs)
    inputs = []
    for item in items:
        if item['rub_label'] == 1:
            item['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(item)
        else:
            inputs.append(item)

    # Group comments by their news item
    results = comment_news(inputs)
    final_inputs = []
    for news_id, _inputs in results.iteritems():
        # Filter comments against the news text
        _inputs = filter_comment(_inputs)
        inputs = [r for r in _inputs if r['rub_label'] == 0]
        inputs_rubbish = [r for r in _inputs if r['rub_label'] == 1]
        for r in inputs_rubbish:
            r['clusterid'] = NON_CLUSTER_ID + '_rub'
            items_infos.append(r)

        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            # NOTE(review): this rebinds `results` while its iteritems()
            # iterator is live; iteration continues over the original dict.
            results = choose_cluster(tfidf_word, inputs, \
                    cluster_num=cluster_num, version=version)

            # Comment text clustering
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # Cluster evaluation: weights and cluster labels
            recommend_text = cluster_evaluation(evaluation_inputs, min_size=cluster_eva_min_size)
            for label, items in recommend_text.iteritems():
                if label != OTHER_CLUSTER_ID:
                    clusters_infos['features'][label] = results[label]
                    for item in items:
                        item['clusterid'] = label
                        item['weight'] = item['weight']
                    final_inputs.extend(items)
                else:
                    for item in items:
                        item['clusterid'] = OTHER_CLUSTER_ID
                    items_infos.extend(items)
        else:
            # Too few items to cluster — show them directly under one uuid label
            tfidf_word, input_dict = tfidf_v2(inputs)
            uuid_label = str(uuid.uuid4())
            clusters_infos['features'][uuid_label] = [kw for kw, count in tfidf_word]
            for r in inputs:
                r['clusterid'] = uuid_label
                r['weight'] = global_weight_cal_tfidf(tfidf_word, r)
            final_inputs.extend(inputs)

    # De-duplicate within each sub-opinion cluster
    cluster_items = dict()
    for r in final_inputs:
        clusterid = r['clusterid']
        try:
            cluster_items[clusterid].append(r)
        except KeyError:
            cluster_items[clusterid] = [r]
    for clusterid, items in cluster_items.iteritems():
        results = duplicate(items)
        items_infos.extend(results)

    return {'cluster_infos': clusters_infos, 'item_infos': items_infos}
def comments_sentiment_rubbish_calculation(comments, logger):
    """Sentiment-classify a batch of comments while filtering rubbish/news.

    Input: comment dicts carrying title and content168.
    Output: {'item_infos': [...]} with sentiment, same_from, duplicate fields.

    NOTE(review): this re-defines `comments_sentiment_rubbish_calculation`;
    an earlier near-identical definition exists in this file (the earlier one
    wraps the encode in try/except, this one does not) — confirm which copy is
    intended.
    """
    # sentiment prefix for meaningless items (ads from ad_filter, svm rubbish,
    # news texts from the subjectivity classifier)
    NON_CLUSTER_ID = 'nonsense'
    # sentiment prefix for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'
    # Per-item result list; each item stores clusterid / weight / sentiment
    items_infos = []

    # Strip stale labels left over from a previous run
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # Field preprocessing: normalize text fields to utf-8 byte strings (Py2)
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        inputs.append(r)

    # Neutral classifier first, then the 3-way sentiment classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # Sentiment adjustment for items still classified as neutral
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral
        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # News classification via the subjective/objective classifier
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:
            # objective text -> treated as news
            r['sentiment'] = NON_CLUSTER_ID + '_news'
            items_infos.append(r)
        else:
            inputs.append(r)

    # Rubbish removal: svm first, then rule-based ad filter
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:
            # svm-detected rubbish
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # De-duplicate within each sentiment category
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]
    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
from sta_ad import test, choose_ad


def rubbish_filter(items, topicid):
    """SVM-based rubbish filter.

    Feeds the items' content168 texts through the sta_ad classifier and
    returns the predicted labels for the topic.
    """
    texts = [item['content168'] for item in items]
    test(texts, topicid)
    labels = choose_ad(topicid)
    return labels


if __name__=="__main__":
    # Ad-hoc driver: pull one topic's comments from Mongo, ad-filter them,
    # then run the libsvm rubbish filter.
    topic = "APEC2014"
    topicid = "54916b0d955230e752f2a94e"
    mongo = _default_mongo(usedb=MONGO_DB_NAME)
    results = mongo[COMMENT_COLLECTION + topicid].find()
    results = [r for r in results]

    from ad_filter import ad_filter
    rubbish_filter_inputs = []
    for r in results:
        r['content168'] = r['content168'].encode('utf-8')
        # NOTE(review): here ad_filter is unpacked as (text, label), but the
        # functions above treat its return as a dict with an 'ad_label' key —
        # confirm which ad_filter variant this script imports.
        text, label = ad_filter(r)
        if label == 0:
            rubbish_filter_inputs.append(r)

    # libsvm rubbish filtering
    results = rubbish_filter(rubbish_filter_inputs, topicid)