def weibo_subob_rub_neu_classifier(items, batch=RUBBISH_BATCH_COUNT):
    """Label each weibo item as rubbish / news / neutral-cut / polarized.

    Args:
        items: list of weibo dicts; each must carry the fields the
            underlying classifiers read (e.g. 'content168').
        batch: batch size forwarded to rubbish_classifier.

    Returns:
        The same items, each with a 'subob_rub_neu_label' field added:
            1  -> rubbish text
            0  -> news (objective) text
            -1 -> polarized (subjective, non-neutral) text
        For neutral subjective text the label is whatever
        cut_mid_weibo(item['content168']) returns.
    """
    results = []
    items = rubbish_classifier(items, batch=batch)
    for item in items:
        if item['rub_label'] == 1:
            label = 1  # rubbish
        else:
            item = subob_classifier(item)
            if item['subob_label'] == 1:
                label = 0  # objective -> news
            else:
                sentiment = triple_classifier(item)
                if sentiment == 0:
                    # neutral subjective text: delegate to the mid-weibo cutter
                    # (an earlier revision used a fixed label 2 here)
                    label = cut_mid_weibo(item['content168'])
                else:
                    label = -1  # polarized
        item['subob_rub_neu_label'] = label
        results.append(item)
    return results
def comments_sentiment_rubbish_calculation(comments, logger):
    """Sentiment-label a batch of comments and mark meaningless text.

    NOTE(review): this file defines comments_sentiment_rubbish_calculation
    twice; the later definition shadows this one at import time — confirm
    which version is intended before relying on either.

    Args:
        comments: list of dicts; each is expected to carry 'content168'
            (falling back to 'text') holding the comment body.
        logger: accepted for interface compatibility; unused here.

    Returns:
        {'item_infos': [...]} where every retained item has a 'sentiment'
        field — either a numeric polarity or a 'nonsense_*' marker for
        news/rubbish/ads — plus the same_from/duplicate fields added by
        duplicate().
    """
    # cluster-id prefix for meaningless text: ads (ad_filter),
    # svm rubbish (rubbish_classifier) and news (subob_classifier)
    NON_CLUSTER_ID = 'nonsense'
    # accumulated per-item results; each ends up with a 'sentiment' field
    items_infos = []

    # Strip any labels left over from a previous classification run.
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label',
                  'subob_label', 'rub_label', 'weight']
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]

    # Normalize the text fields the downstream classifiers read
    # (Python 2: classifiers expect utf-8 encoded byte strings).
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except (KeyError, AttributeError, UnicodeError):
            # 'content168' missing or not encodable: fall back to 'text'
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']

    # Stage 1: neutral classifier, then the 3-way classifier; only items
    # still judged neutral continue to the svm adjustment stage.
    svm_inputs = []
    for r in comments:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # Stage 2: sentiment adjustment; -1 from the mid classifier means neutral.
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # treat -1 as neutral
        r['sentiment'] = sentiment
        if sentiment != 0:
            items_infos.append(r)
        else:
            senti_modify_inputs.append(r)

    # Stage 3: news (objective text) detection.
    rubbish_inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:  # objective text classifier
            r['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            items_infos.append(r)
        else:
            rubbish_inputs.append(r)

    # Stage 4: rubbish / ad filtering; every item is kept, nonsense ones
    # just get the rubbish marker instead of a polarity.
    items = rubbish_classifier(rubbish_inputs)
    for item in items:
        if item['rub_label'] == 1:  # svm rubbish filter
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # Stage 5: deduplicate within each sentiment bucket.
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment_dict.setdefault(item['sentiment'], []).append(item)
    items_infos = []
    for sentiment, bucket in sentiment_dict.iteritems():
        items_infos.extend(duplicate(bucket))

    return {'item_infos': items_infos}
def comments_sentiment_rubbish_calculation(comments, logger):
    """Pipeline that sentiment-labels comments and flags nonsense text.

    Input records carry 'title' and 'content168'. Output is
    {'item_infos': [...]}: every retained record gets a 'sentiment'
    field (numeric polarity, or a 'nonsense_*' marker for news/rubbish)
    plus same_from/duplicate fields added by duplicate().
    The `logger` argument is kept for interface compatibility.
    """
    NON_CLUSTER_ID = 'nonsense'    # cluster id for meaningless text (ads, rubbish, news)
    MEAN_CLUSTER_ID = 'sentiment'  # cluster id for meaningful text
    labelled = []                  # records that received a final 'sentiment'

    # Step 1: wipe any labels left over from a previous run.
    stale = ['sentiment', 'label', 'clusterid', 'ad_label',
             'subob_label', 'rub_label', 'weight']
    for rec in comments:
        for key in stale:
            if key in rec:
                del rec[key]

    # Step 2: normalize the text fields read by the classifiers.
    for rec in comments:
        rec['title'] = ''
        body = rec['content168'].encode('utf-8')
        rec['content168'] = body
        rec['content'] = body
        rec['text'] = body

    # Step 3: neutral classifier first, then the 3-way classifier;
    # only still-neutral records move on to the svm stage.
    to_svm = []
    for rec in comments:
        polarity = neutral_classifier(rec)
        if polarity == 0:
            to_svm.append(rec)
            continue
        polarity = triple_classifier(rec)
        if polarity == 0:
            to_svm.append(rec)
        else:
            rec['sentiment'] = polarity
            labelled.append(rec)

    # Step 4: sentiment adjustment; -1 is mapped back to neutral (0).
    still_neutral = []
    for rec in to_svm:
        polarity = mid_sentiment_classify(rec['text'])
        if polarity == -1:
            polarity = 0  # neutral
        rec['sentiment'] = polarity
        if polarity != 0:
            labelled.append(rec)
        else:
            still_neutral.append(rec)

    # Step 5: objective-text (news) detection.
    non_news = []
    for rec in still_neutral:
        rec = subob_classifier(rec)
        if rec['subob_label'] == 1:
            rec['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            labelled.append(rec)
        else:
            non_news.append(rec)

    # Step 6: rubbish / ad filtering; every record is kept, nonsense
    # ones are marked instead of carrying a polarity.
    for rec in rubbish_classifier(non_news):
        if rec['rub_label'] == 1:  # svm rubbish
            rec['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            rec = ad_filter(rec)   # simple rule-based ad filter
            if rec['ad_label'] == 1:
                rec['sentiment'] = NON_CLUSTER_ID + '_rub'
        labelled.append(rec)

    # Step 7: deduplicate within each sentiment bucket.
    buckets = {}
    for rec in labelled:
        if 'sentiment' in rec:
            buckets.setdefault(rec['sentiment'], []).append(rec)
    deduped = []
    for polarity, bucket in buckets.iteritems():
        deduped.extend(duplicate(bucket))

    return {'item_infos': deduped}