Example #1
0
def test_classification(feature, incr=False):
    clf = get_classification(feature, incr)

    # 加载测试数据集
    if feature.subjective:
        test = Load.load_test_balance()
    else:
        test = Load.load_test_objective_balance()
    test_datas, c_true, _ = feature.get_key_words(test)

    test = test_datas
    # 构建适合 bayes 分类的数据集
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, c_true)

    c_pred_unknow = clf.predict_unknow(test)
    print c_pred_unknow
    print "precision:", clf.metrics_precision(c_true, c_pred_unknow)
    print "recall:", clf.metrics_recall(c_true, c_pred_unknow)
    print "f1:", clf.metrics_f1(c_true, c_pred_unknow)
    print "origin accuracy:", clf.metrics_accuracy(c_true, c_pred_unknow)
    print "zero_one_loss:", clf.metrics_zero_one_loss(c_true, c_pred_unknow)
    test_proba = clf.predict_max_proba(test)
    print "my_zero_one_loss:", clf.metrics_my_zero_one_loss(test_proba)
    print
    clf.metrics_correct(c_true, c_pred_unknow)
def get_classification(feature, incr=False):
    """
    Build a ``Classification`` object and fit it on the feature's training data.

    :param feature: feature extractor supplying the training samples and
        labels (``get_key_words``), the weighting step
        (``cal_weight_improve``) and the ``subjective`` flag handed to
        ``Classification``
    :param incr: when truthy, ``IncrBayes`` is used and the classifier is
        additionally updated with the incremental data set; otherwise
        ``Bayes`` is used
    :return: the ``Classification`` instance
    """
    samples, labels, _ = feature.get_key_words()

    # Build a data set suitable for bayes classification; sparse input is
    # already in the right form.
    base_train = samples if sp.issparse(samples) else feature.cal_weight_improve(samples, labels)

    model = IncrBayes() if incr else Bayes()
    classifier = Classification(bayes=model, subjective=feature.subjective)
    classifier.get_classificator(base_train, labels)
    if not incr:
        return classifier

    extra_samples, extra_labels, _ = feature.get_key_words(Load.load_incr_datas())
    # Build an incremental set suitable for bayes classification.
    if not sp.issparse(extra_samples):
        extra_samples = feature.cal_weight_improve(extra_samples, extra_labels)

    classifier.get_incr_classificator(extra_samples, extra_labels, base_train, labels, method="five")
    return classifier
Example #3
0
def get_classification(feature, incr=False):
    """
    Create the classifier for ``feature`` and fit it on the training data.

    :param feature: provides the training data via ``get_key_words()``, the
        weighting step ``cal_weight_improve`` and the ``subjective`` flag
    :param incr: truthy selects ``IncrBayes`` plus an extra pass over the
        incremental data; falsy selects plain ``Bayes``
    :return: the resulting ``Classification`` object
    """
    key_words, class_label, _ = feature.get_key_words()

    # Build a data set suitable for bayes classification.
    if sp.issparse(key_words):
        train_set = key_words
    else:
        train_set = feature.cal_weight_improve(key_words, class_label)

    estimator = IncrBayes() if incr else Bayes()
    result = Classification(bayes=estimator, subjective=feature.subjective)
    result.get_classificator(train_set, class_label)

    if incr:
        raw_incr = Load.load_incr_datas()
        incr_set, incr_label, _ = feature.get_key_words(raw_incr)
        # Build an incremental set suitable for bayes classification.
        incr_is_sparse = sp.issparse(incr_set)
        if not incr_is_sparse:
            incr_set = feature.cal_weight_improve(incr_set, incr_label)

        result.get_incr_classificator(
            incr_set, incr_label, train_set, class_label, method="five")

    return result
Example #4
0
    def _get_splited_train(self):
        """
        Return the word-segmented training set, preferring the cached file.

        The cache lives under ``TEXT_OUT/split`` and is named after the
        concrete class, with an ``_objective`` suffix when ``self.subjective``
        is falsy.

        :return: the segmented result, either freshly produced by
            ``Feature.__split`` (and written to the cache) or read back from
            the cache file
        """
        if self.subjective:
            file_name = self.__class__.__name__ + ".txt"
            load_training = Load.load_training_balance
        else:
            file_name = self.__class__.__name__ + "_objective.txt"
            load_training = Load.load_training_objective_balance
        split_txt = os.path.join(os.path.join(TEXT_OUT, "split"), file_name)

        # NOTE(review): the training set is loaded even when the cached file
        # ends up being used below — confirm whether that is intentional.
        training_datas = load_training()

        # self.f presumably forces regeneration — TODO confirm its meaning.
        cache_usable = (not self.f
                        and FileUtil.isexist(split_txt)
                        and not FileUtil.isempty(split_txt))
        if cache_usable:
            return FileUtil.read(split_txt)

        # Segment the flattened training set and store it for the next run.
        # Low-frequency word removal (Feature.__del_low_frequency_word) is
        # not applied here.
        words = Feature.__split(flatten(training_datas))
        FileUtil.write(split_txt, words)
        return words
Example #5
0
#        l = [sentence]
#        fit_sentence = Feature_Hasher.transform(l).toarray()
        b_matrix = sentence.toarray()
        np.power(correct_row + b_matrix, 1, correct_row)
        np.power(copy_feature_count, 1, out)
        return copy_feature_count

if __name__ == "__main__":
    # Load the emotion classification data set
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    # Non-sparse training data is passed through the weighting step first
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # Build a data set suitable for bayes classification
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    crossvalidate = False
    # When not cross-validating, remember to change the proportion of train
    # data loaded in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        # Only run cross validation when the index file is missing or empty.
        # NOTE(review): cross_validation presumably writes `out`, since it is
        # read right below — confirm.
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf0 = Classification()
            clf0.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        # Replace the test set with the training rows selected by the indices
        test = train[test_index]
            raise ValueError("the two lists have different size!")

        l = filter(lambda x: x[1] != "unknow", zip(c_true, c_pred))
        temp = zip(*l)
        return temp[0], temp[1]

if __name__ == "__main__":
    print
    # Load the emotion classification data set
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    # Non-sparse training data is passed through the weighting step first
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # Build a data set suitable for bayes classification. The decision to
    # weight the test data depends on the test matrix itself, not on the
    # training matrix.
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    clf = Classification()
    crossvalidate = False
    # When not cross-validating, remember to change the proportion of train
    # data loaded in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        # Only run cross validation when the index file is missing or empty.
        # NOTE(review): cross_validation presumably writes `out`, since it is
        # read right below — confirm.
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        # Replace the test set with the training rows selected by the indices
        test = train[test_index]
            raise ValueError("the two lists have different size!")

        l = filter(lambda x: x[1] != "unknow", zip(c_true, c_pred))
        temp = zip(*l)
        return temp[0], temp[1]

if __name__ == "__main__":
    print
    # Load the emotion classification data set
    feature = CHIFeature()
    train_datas, class_label, _ = feature.get_key_words()
    train = train_datas
    # Non-sparse training data is passed through the weighting step first
    if not sp.issparse(train_datas):
        train = feature.cal_weight_improve(train_datas, class_label)

    test = Load.load_test_balance()
    test_datas, test_label, _ = feature.get_key_words(test)
    test = test_datas
    # Build a data set suitable for bayes classification. Whether the test
    # data needs weighting is decided from the test matrix, not from the
    # training matrix.
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, test_label)

    clf = Classification()
    crossvalidate = False
    # When not cross-validating, remember to change the proportion of train
    # data loaded in load_sample.py
    if crossvalidate:
        out = os.path.join(TEXT_OUT, "best_train_test_index/test_index.txt")
        # Only run cross validation when the index file is missing or empty.
        # NOTE(review): cross_validation presumably writes `out`, since it is
        # read right below — confirm.
        if not FileUtil.isexist(out) or FileUtil.isempty(out):
            clf.cross_validation(train, class_label, score="recall")
        test_index = np.loadtxt(out, dtype=int)
        # Replace the test set with the training rows selected by the indices
        test = train[test_index]