def weibo_subob_rub_neu_classifier(items, batch=RUBBISH_BATCH_COUNT):
    """Tag every weibo item with a combined rubbish/objective/sentiment label.

    The items are first run through ``rubbish_classifier`` (``batch`` is
    forwarded to it unchanged). Each returned item is then labelled as
    follows and the label is stored under ``'subob_rub_neu_label'``:

    * ``1``  -- ``item['rub_label'] == 1`` (rubbish text).
    * ``0``  -- ``subob_classifier`` set ``item['subob_label'] == 1``
      (objective / news text).
    * result of ``cut_mid_weibo(item['content168'])`` -- ``triple_classifier``
      returned ``0`` (neutral sentiment). The former fixed label ``2`` for
      neutral text is no longer assigned here.
    * ``-1`` -- any other sentiment (polar text).

    Args:
        items: weibo items accepted by ``rubbish_classifier``. Each item it
            returns must support string-key access (``'rub_label'``,
            ``'content168'``, ...).
        batch: batch parameter handed to ``rubbish_classifier``.

    Returns:
        list: the labelled items, in the order ``rubbish_classifier``
        yielded them.
    """
    labelled = []
    for entry in rubbish_classifier(items, batch=batch):
        if entry['rub_label'] == 1:
            verdict = 1  # rubbish
        else:
            # subob_classifier returns the item that is used from here on.
            entry = subob_classifier(entry)
            if entry['subob_label'] == 1:
                verdict = 0  # objective
            elif triple_classifier(entry) == 0:
                # Neutral sentiment: the final label is decided by rules.
                verdict = cut_mid_weibo(entry['content168'])
            else:
                verdict = -1  # polar
        entry['subob_rub_neu_label'] = verdict
        labelled.append(entry)
    return labelled
def classify(weibo, flag):
    """Main classification entry point for a batch of weibo records.

    Pipeline:
      1. ``start_ad(weibo, flag)`` produces the initial label mapping
         (rubbish filtering).
      2. Records whose initial label is ``0`` are passed to ``cut_weibo``
         (rule-based classification).
      3. Every such record whose rule label is ``0`` is passed to
         ``triple_classifier``. If the sentiment is ``0`` (neutral) the
         label becomes ``cut_mid_weibo(text)``; otherwise it becomes ``-1``.

    Timing information for each stage is printed to stdout.

    Args:
        weibo: list of records, e.g. ``[[mid, text, ...], ...]``. Element 0
            is used as the message id and element 1 as the text.
        flag: marker value forwarded unchanged to ``start_ad``.

    Returns:
        The mapping returned by ``start_ad``, keyed by ``str(mid)``, with
        the entries refined in step 3 overwritten. Per the original notes:
        ``1`` is rubbish text, ``0`` is news text and ``-1`` is polar text.
        The fixed label ``2`` for neutral text is no longer assigned
        directly; neutral text gets whatever ``cut_mid_weibo`` returns.
    """
    start = time.time()
    label_data = start_ad(weibo, flag)  # rubbish filtering
    end = time.time()
    print(end - start)

    # Keep only the records that the rubbish filter labelled 0.
    news_weibo = [record for record in weibo
                  if label_data[str(record[0])] == 0]

    start = time.time()
    label = cut_weibo(news_weibo)  # rule-based classification
    end = time.time()
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print('cutting weibo by rules takes %s' % (end - start))

    start = time.time()
    # label[i] is the rule label of news_weibo[i]; keep them index-aligned.
    for i in range(len(label)):
        if label[i] != 0:
            continue
        mid = news_weibo[i][0]
        text = news_weibo[i][1]
        sentiment = triple_classifier(text)  # neutral-vs-polar classifier
        if sentiment == 0:
            label_data[str(mid)] = cut_mid_weibo(text)
        else:
            label_data[str(mid)] = -1
    end = time.time()
    print('classifying weibo takes %s' % (end - start))

    return label_data
def classify(weibo, flag):
    """Main classification entry point for a batch of weibo records.

    Pipeline:
      1. ``start_ad(weibo, flag)`` produces the initial label mapping
         (rubbish filtering).
      2. Records whose initial label is ``0`` are passed to ``cut_weibo``
         (rule-based classification).
      3. Every such record whose rule label is ``0`` is passed to
         ``triple_classifier``. If the sentiment is ``0`` (neutral) the
         label becomes ``cut_mid_weibo(text)``; otherwise it becomes ``-1``.

    Timing information for each stage is printed to stdout.

    Args:
        weibo: list of records, e.g. ``[[mid, text, ...], ...]``. Element 0
            is used as the message id and element 1 as the text.
        flag: marker value forwarded unchanged to ``start_ad``.

    Returns:
        The mapping returned by ``start_ad``, keyed by ``str(mid)``, with
        the entries refined in step 3 overwritten. Per the original notes:
        ``1`` is rubbish text, ``0`` is news text and ``-1`` is polar text.
        The fixed label ``2`` for neutral text is no longer assigned
        directly; neutral text gets whatever ``cut_mid_weibo`` returns.
    """
    start = time.time()
    label_data = start_ad(weibo, flag)  # rubbish filtering
    end = time.time()
    print(end - start)

    # Keep only the records that the rubbish filter labelled 0.
    news_weibo = [record for record in weibo
                  if label_data[str(record[0])] == 0]

    start = time.time()
    label = cut_weibo(news_weibo)  # rule-based classification
    end = time.time()
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print('cutting weibo by rules takes %s' % (end - start))

    start = time.time()
    # label[i] is the rule label of news_weibo[i]; keep them index-aligned.
    for i in range(len(label)):
        if label[i] != 0:
            continue
        mid = news_weibo[i][0]
        text = news_weibo[i][1]
        sentiment = triple_classifier(text)  # neutral-vs-polar classifier
        if sentiment == 0:
            label_data[str(mid)] = cut_mid_weibo(text)
        else:
            label_data[str(mid)] = -1
    end = time.time()
    print('classifying weibo takes %s' % (end - start))

    return label_data