def test_classification(feature, incr=False):
    clf = get_classification(feature, incr)
    # Load the test set
    if feature.subjective:
        test = Load.load_test_balance()
    else:
        test = Load.load_test_objective_balance()
    test_datas, c_true, _ = feature.get_key_words(test)
    test = test_datas
    # Build a data set suitable for Bayes classification
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, c_true)

    c_pred_unknow = clf.predict_unknow(test)
    print c_pred_unknow
    print "precision:", clf.metrics_precision(c_true, c_pred_unknow)
    print "recall:", clf.metrics_recall(c_true, c_pred_unknow)
    print "f1:", clf.metrics_f1(c_true, c_pred_unknow)
    print "origin accuracy:", clf.metrics_accuracy(c_true, c_pred_unknow)
    print "zero_one_loss:", clf.metrics_zero_one_loss(c_true, c_pred_unknow)
    test_proba = clf.predict_max_proba(test)
    print "my_zero_one_loss:", clf.metrics_my_zero_one_loss(test_proba)
    print clf.metrics_correct(c_true, c_pred_unknow)
def get_classification(feature, incr=False):
    """
    Build and return the trained classifier.
    :param feature: feature extractor providing get_key_words/cal_weight_improve
    :param incr: whether to build the incremental Bayes classifier
    :return: the trained Classification instance
    """
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    # Build a data set suitable for Bayes classification
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    if incr:
        bayes = IncrBayes()
    else:
        bayes = Bayes()
    clf = Classification(bayes=bayes, subjective=feature.subjective)
    clf.get_classificator(train, class_label)
    if incr:
        incr_train_datas = Load.load_incr_datas()
        incr_train, incr_class_label, _ = feature.get_key_words(incr_train_datas)
        # Build the incremental data set suitable for Bayes classification
        if not sp.issparse(incr_train):
            incr_train = feature.cal_weight_improve(incr_train, incr_class_label)
        clf.get_incr_classificator(incr_train, incr_class_label, train, class_label, method="five")
    return clf
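# A minimal usage sketch (illustrative, not part of the original module): it
# simply wires get_classification/test_classification together for the
# CHIFeature extractor used in the __main__ blocks below; the helper name
# run_demo is hypothetical.
def run_demo():
    feature = CHIFeature()
    # Plain Bayes classifier
    test_classification(feature, incr=False)
    # Incremental Bayes classifier (also folds in Load.load_incr_datas())
    test_classification(feature, incr=True)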
def _get_splited_train(self):
    """
    Load the segmented training set from file when available,
    otherwise segment it and cache the result.
    :return:
    """
    dir_ = os.path.join(TEXT_OUT, "split")
    if self.subjective:
        split_txt = os.path.join(dir_, self.__class__.__name__ + ".txt")
        training_datas = Load.load_training_balance()
    else:
        split_txt = os.path.join(dir_, self.__class__.__name__ + "_objective.txt")
        training_datas = Load.load_training_objective_balance()

    if self.f or not FileUtil.isexist(split_txt) or FileUtil.isempty(split_txt):
        # Segment the training set; each sentence keeps its class label
        splited_words_list = Feature.__split(flatten(training_datas))
        # splited_words_list = Feature.__del_low_frequency_word(splited_words_list)
        FileUtil.write(split_txt, splited_words_list)
    else:
        splited_words_list = FileUtil.read(split_txt)

    return splited_words_list
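# A minimal sketch (illustrative, not from the original module) of the
# cache-or-recompute pattern _get_splited_train follows, reusing the same
# FileUtil helpers; load_or_compute and its parameters are hypothetical names.
def load_or_compute(cache_path, compute):
    # Recompute and cache when the file is missing or empty, otherwise reuse it.
    if not FileUtil.isexist(cache_path) or FileUtil.isempty(cache_path):
        result = compute()
        FileUtil.write(cache_path, result)
    else:
        result = FileUtil.read(cache_path)
    return result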
    # l = [sentence]
    # fit_sentence = Feature_Hasher.transform(l).toarray()
    b_matrix = sentence.toarray()
    # np.power(x, 1, out) writes x ** 1 (i.e. a copy of x) into out in place
    np.power(correct_row + b_matrix, 1, correct_row)
    np.power(copy_feature_count, 1, out)
    return copy_feature_count


if __name__ == "__main__":
    # Load the emotion classification data set
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # Build a data set suitable for Bayes classification
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    crossvalidate = False
    # If not cross-validating, remember to adjust the train loading ratio in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf0 = Classification()
            clf0.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        test = train[test_index]
    if len(c_true) != len(c_pred):
        raise ValueError("the two lists have different size!")
    # Drop pairs predicted as "unknow" before computing the metrics
    l = filter(lambda x: x[1] != "unknow", zip(c_true, c_pred))
    temp = zip(*l)
    return temp[0], temp[1]


if __name__ == "__main__":
    print
    # Load the emotion classification data set
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # Build a data set suitable for Bayes classification
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    clf = Classification()
    crossvalidate = False
    # If not cross-validating, remember to adjust the train loading ratio in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        test = train[test_index]