Esempio n. 1
0
    def test_ratio(self):
        '''
        Print the accuracy of the Prior estimator as the ratio of the positive
        class in the test set changes.

        A LogisticRegression model is fit on a 100-example subset of the
        training data returned by rcv1_binary_reader.toNumpy(). For every
        ratio produced by np.arange(0.05, 1.0, 0.05) a new test set is
        generated, the Prior estimator is refit with that test set's class
        distribution, and one line "<ratio> <accuracy>" is printed. We use a
        binary class dataset for ease of interpretation.
        '''
        dataset = rcv1_binary_reader.toNumpy()
        set_size = 100

        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size

        test_set_original = (X_test, y_test)

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        p = Prior(clf)

        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, r, pos_label=1)

            # Entry 1 of the distribution is used as the positive-class share
            # (presumably keyed by label; confirm against DE.arrayToDist).
            true_pos = DE.arrayToDist(y_test_new)[1]

            # The class distribution passed to Prior is keyed by -1 / 1.
            p.fit(X_train, y_train, {-1: 1 - true_pos, 1: true_pos})
            y_pred = p.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc = self.accuracy(cm)

            # Same output as the statement form `print r, acc`, but also
            # valid when the file is run under Python 3.
            print("%s %s" % (r, acc))
Esempio n. 2
0
    def run_ratio(self, dataset, set_size):
        '''
        Print the accuracy of an ENMLT-wrapped LinearSVC next to a plain
        LinearSVC as the ratio of the positive class in the test set changes.

        dataset is unpacked as (X_train_full, y_train_full, X_test, y_test);
        both models are fit on a training subset of size set_size. For every
        ratio produced by numpy.arange(0.05, 1.0, 0.05) one line
        "<ratio>, <ENMLT accuracy>, <LinearSVC accuracy>" is printed. We use a
        binary class dataset for ease of interpretation.
        '''
        X_train_full, y_train_full, X_test, y_test = dataset
        X_sub, y_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        base_test_set = (X_test, y_test)

        wrapped_svc = ENMLT(LinearSVC)
        wrapped_svc.fit(X_sub, y_sub)

        plain_svc = LinearSVC()
        plain_svc.fit(X_sub, y_sub)

        # The tuple order fixes the column order of the printed report.
        models = (wrapped_svc, plain_svc)

        for ratio in numpy.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_shifted, y_shifted = SetGen.with_pos_ratio(
                base_test_set, ratio, pos_label=1)

            scores = []
            for model in models:
                matrix = confusion_matrix(y_shifted, model.predict(X_shifted))
                scores.append(self.accuracy(matrix))

            print("%.2f, %f, %f" % (ratio, scores[0], scores[1]))
Esempio n. 3
0
    def compare_based(self, clf_class, data_set):
        '''
        Compare a plain classifier with several wrappers built around the
        same classifier class while the positive ratio of the test set
        changes.

        clf_class is called with no arguments to build the baseline and is
        handed to each wrapper (CDEAC, CDEITR, CDEBAC, MLAAC, BACMLT).
        data_set is unpacked as (X_train, y_train, X_test, y_test).

        For every ratio produced by np.arange(0.2, 1.0, 0.2) a tab-separated
        row of accuracies is printed, in the column order given by the header
        line.

        Returns (acc_matrix, f1_matrix): two lists with one row per ratio,
        each row holding one value per estimator in the order
        [baseline, CDEAC, CDEITR, CDEBAC, MLAAC, BACMLT].
        '''
        X_train, y_train, X_test, y_test = data_set
        full_test_set = [X_test, y_test]

        bac_mlt = BACMLT(clf_class)
        cde_ac = CDEAC(clf_class)
        cde_it = CDEITR(clf_class)
        cde_bac = CDEBAC(clf_class)
        mla_ac = MLAAC(clf_class)
        basic_c = clf_class()
        ests = [basic_c, cde_ac, cde_it, cde_bac, mla_ac, bac_mlt]

        # Explicit loop: fitting is a side effect, and a lazy map() (Python 3)
        # would silently skip it.
        for est in ests:
            est.fit(X_train, y_train)

        acc_matrix = []
        f1_matrix = []

        print("Ratio\tBase\tAC+Cost\tEM+Cost\tBAC+Cost\tAC+MLA\tBACMLA")
        for r in np.arange(0.2, 1.0, 0.2):
            # Generate a new test set with desired positive proportions.
            test_set = SetGen.with_pos_ratio(full_test_set, r, pos_label=1)
            cms = [self.run_for_estimator(est, test_set) for est in ests]

            # Real lists are required below: they are concatenated with [r].
            acc = [self.accuracy(cm) for cm in cms]
            acc_matrix.append(acc)
            f1 = [self.f1(cm) for cm in cms]
            f1_matrix.append(f1)

            print(("%.2f" + "\t%.4f" * len(acc)) % tuple([r] + acc))

        return acc_matrix, f1_matrix
Esempio n. 4
0
def test_ratio():
    '''
    Export a training subset and one ratio-adjusted test set in libsvm format.

    A 100-example training subset and a 10000-example test subset are drawn
    from the data returned by rcv1_binary_reader.toNumpy(). The training
    subset is written to 'rcv_train_100.libsvm'; a test set regenerated with
    positive ratio 0.05 is written to 'rcv_test_0.05.libsvm'.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100

    X_train_full, y_train_full, X_test_full, y_test_full = dataset
    X_train, y_train = get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    X_test, y_test = get_sub_set_with_size([X_test_full, y_test_full], 10000)

    test_set_original = (X_test, y_test)

    save_libsvm(X_train, y_train, 'rcv_train_%d.libsvm' % set_size)

    # Only a single ratio is exported; a sweep over
    # np.arange(0.05, 1.0, 0.05) was previously disabled here.
    r = 0.05
    # Generate a new test set with desired positive proportions.
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, r, pos_label=1)

    save_libsvm(X_test_new, y_test_new, 'rcv_test_%.2f.libsvm' % r)
Esempio n. 5
0
    def test_ratio(self):
        '''
        Print, for SVM and random forest models with and without
        class-distribution weighting, entry 1 of the binary distribution of
        their predictions as the positive ratio of the test set changes.

        A LinearSVC and a 400-tree RandomForestClassifier are fit once on a
        100-example training subset. For every ratio produced by
        np.arange(0.05, 1.0, 0.05) a new test set is generated from a
        10000-example test subset, the weighted estimators (SVMWeights,
        RFWeights) are refit with that test set's class distribution, and a
        tab-separated row "ratio, SVM, SVMW, RF, RFW" is printed. We use a
        binary class dataset for ease of interpretation.
        '''
        dataset = rcv1_binary_reader.toNumpy()
        set_size = 100

        X_train_full, y_train_full, X_test_full, y_test_full = dataset
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size
        X_test, y_test = self.get_sub_set_with_size(
            [X_test_full, y_test_full], 10000)

        test_set_original = (X_test, y_test)

        rfw = RFWeights()
        svmw = SVMWeights()

        rf = RandomForestClassifier(n_estimators=400)
        svm = LinearSVC()

        # The random forest is given a dense array; the SVM takes X_train
        # as-is.
        rf.fit(X_train.toarray(), y_train)
        svm.fit(X_train, y_train)

        print("Ratio\tSVM\tSVMW\tRF\tRFW")
        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, r, pos_label=1)

            # Entry 1 of the binary distribution is used as the positive
            # share (presumably keyed by label; confirm against
            # DE.to_bin_dist).
            true_pos = DE.to_bin_dist(y_test_new)[1]
            new_class_dist = {0: 1 - true_pos, 1: true_pos}

            # NOTE(review): rfw is fit on X_train without toarray(), unlike
            # rf above -- confirm RFWeights handles the conversion itself.
            rfw.fit(X_train, y_train, new_class_dist)
            svmw.fit(X_train, y_train, new_class_dist)

            # Convert once and share between both forest models instead of
            # densifying the test matrix twice per iteration.
            X_test_dense = X_test_new.toarray()

            preds = [
                svm.predict(X_test_new),
                svmw.predict(X_test_new),
                rf.predict(X_test_dense),
                rfw.predict(X_test_dense),
            ]
            # A real list is required: it is concatenated with [r] below.
            pos_ratios = [DE.to_bin_dist(pred)[1] for pred in preds]

            print(("%.2f" + "\t%.2f" * len(pos_ratios))
                  % tuple([r] + pos_ratios))
Esempio n. 6
0
File: mla.py Progetto: pyongjoo/ende
    def test_ratio(self):
        '''
        Print the accuracy of the MLA estimator as the ratio of the positive
        class in the test set changes.

        An SVMLight classifier is fit on the full training data returned by
        sentiment_reader.toNumpy() and wrapped in MLA. Only the first 1000
        test examples are kept. For every ratio produced by
        np.arange(0.05, 1.0, 0.05) a new test set is generated, MLA is refit
        with that test set's class distribution, and one line
        "<ratio> <accuracy>" is printed. We use a binary class dataset for
        ease of interpretation.
        '''
        # Alternatives tried here before: rcv1_binary_reader and
        # snippet_reader (optionally with a 200-example training subset),
        # and LinearSVC in place of SVMLight.
        dataset = sentiment_reader.toNumpy()

        X_train, y_train, X_test, y_test = dataset

        # Keep only the first 1000 test examples.
        X_test = X_test[:1000]
        y_test = y_test[:1000]

        test_set_original = (X_test, y_test)

        clf = SVMLight()
        clf.fit(X_train, y_train)

        mla = MLA(clf, verbose=1)

        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, r, pos_label=1)

            dist_dict = DE.arrayToDistDict(y_test_new)

            mla.fit(X_train, y_train, dist_dict)
            y_pred = mla.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc = self.accuracy(cm)

            # Same output as the statement form `print r, acc`, but also
            # valid when the file is run under Python 3.
            print("%s %s" % (r, acc))
Esempio n. 7
0
    def compare_svm_based_repeat(self, data_set, repeat_num=20):
        '''
        Repeatedly compare a LinearSVC baseline with SVMWeights and MLT while
        the positive ratio of the test set changes.

        data_set is unpacked as (X_train, y_train, X_test, y_test).
        repeat_num is the number of test sets generated per ratio (default
        20, the value previously hard-coded in the loop).

        For every ratio produced by np.arange(0.1, 1.0, 0.1), and for each
        repetition, a new test set is generated, SVMWeights and MLT are refit
        with that test set's class distribution, and the ratio, accuracies
        and F1 scores are printed followed by a blank line.

        Returns (acc_matrix, f1_matrix, auc_matrix): three lists with one row
        per (ratio, repetition), each row holding one value per estimator in
        the order [LinearSVC, SVMWeights, MLT].
        '''
        X_train, y_train, X_test, y_test = data_set

        prob_estimator = LinearSVC()
        prob_estimator.fit(X_train, y_train)

        w = SVMWeights()
        m = MLT(prob_estimator)

        ests = [w, m]

        acc_matrix = []
        f1_matrix = []
        auc_matrix = []

        for r in np.arange(0.1, 1.0, 0.1):
            for _ in range(repeat_num):
                # Generate a new test set with desired positive proportions.
                X_test_new, y_test_new = SetGen.with_pos_ratio(
                    [X_test, y_test], r, pos_label=1)

                class_dist = DE.arrayToDistDict(y_test_new)

                # Explicit loop: fitting is a side effect, and a lazy map()
                # (Python 3) would silently skip it.
                for est in ests:
                    est.fit(X_train, y_train, class_dist)

                y_preds = [est.predict(X_test_new)
                           for est in [prob_estimator] + ests]
                cms = [confusion_matrix(y_test_new, y_pred)
                       for y_pred in y_preds]

                accs = [self.accuracy(cm) for cm in cms]
                f1s = [self.f1(cm) for cm in cms]
                auc = [self.auc(cm) for cm in cms]
                acc_matrix.append(accs)
                f1_matrix.append(f1s)
                auc_matrix.append(auc)

                print(r)
                print(accs)
                print(f1s)
                # A bare `print` is a no-op expression on Python 3; this
                # emits the same blank line on both versions.
                print("")

        return acc_matrix, f1_matrix, auc_matrix
Esempio n. 8
0
    def compare_rf_based(self, data_set):
        '''
        Compare a RandomForestClassifier baseline with RFWeights, Prior and
        MLT while the positive ratio of the test set changes.

        data_set is unpacked as (X_train, y_train, X_test, y_test).

        For every ratio produced by np.arange(0.2, 1.0, 0.2) a new test set
        is generated and RFWeights, Prior and MLT are refit with that test
        set's class distribution before all four estimators predict on it.

        Returns (acc_matrix, f1_matrix, auc_matrix): three lists with one row
        per ratio, each row holding one value per estimator in the order
        [RandomForestClassifier, RFWeights, Prior, MLT].
        '''
        X_train, y_train, X_test, y_test = data_set

        # TODO: We actually need to convert to dense array using toarray(),
        # both for fitting here and for fit/predict inside the loop below.
        # TODO: Satimage data is the only exception.
        prob_estimator = RandomForestClassifier(n_estimators=200)
        prob_estimator.fit(X_train, y_train)

        w = RFWeights(n_estimators=200)
        p = Prior(prob_estimator)
        m = MLT(prob_estimator)

        ests = [w, p, m]

        acc_matrix = []
        f1_matrix = []
        auc_matrix = []

        for r in np.arange(0.2, 1.0, 0.2):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                [X_test, y_test], r, pos_label=1)

            class_dist = DE.arrayToDistDict(y_test_new)

            # Explicit loop: fitting is a side effect, and a lazy map()
            # (Python 3) would silently skip it.
            for est in ests:
                est.fit(X_train, y_train, class_dist)

            y_preds = [est.predict(X_test_new)
                       for est in [prob_estimator] + ests]
            cms = [confusion_matrix(y_test_new, y_pred) for y_pred in y_preds]

            # Comprehensions keep each row a real list on Python 2 and 3.
            accs = [self.accuracy(cm) for cm in cms]
            f1s = [self.f1(cm) for cm in cms]
            auc = [self.auc(cm) for cm in cms]
            acc_matrix.append(accs)
            f1_matrix.append(f1s)
            auc_matrix.append(auc)

        return acc_matrix, f1_matrix, auc_matrix
Esempio n. 9
0
    def compare_maxent_based(self, data_set):
        '''
        Compare a LogisticRegression baseline with MaxentWeights, Prior and
        MLT while the positive ratio of the test set changes.

        data_set is unpacked as (X_train, y_train, X_test, y_test).

        For every ratio produced by np.arange(0.2, 1.0, 0.2) a new test set
        is generated and MaxentWeights, Prior and MLT are refit with that
        test set's class distribution before all four estimators predict on
        it.

        Returns (acc_matrix, f1_matrix, auc_matrix): three lists with one row
        per ratio, each row holding one value per estimator in the order
        [LogisticRegression, MaxentWeights, Prior, MLT].
        '''
        X_train, y_train, X_test, y_test = data_set

        prob_estimator = LogisticRegression()
        prob_estimator.fit(X_train, y_train)

        w = MaxentWeights()
        p = Prior(prob_estimator)
        m = MLT(prob_estimator)

        ests = [w, p, m]

        acc_matrix = []
        f1_matrix = []
        auc_matrix = []

        for r in np.arange(0.2, 1.0, 0.2):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                [X_test, y_test], r, pos_label=1)

            class_dist = DE.arrayToDistDict(y_test_new)

            # Explicit loop: fitting is a side effect, and a lazy map()
            # (Python 3) would silently skip it.
            for est in ests:
                est.fit(X_train, y_train, class_dist)

            y_preds = [est.predict(X_test_new)
                       for est in [prob_estimator] + ests]
            cms = [confusion_matrix(y_test_new, y_pred) for y_pred in y_preds]

            # Comprehensions keep each row a real list on Python 2 and 3.
            accs = [self.accuracy(cm) for cm in cms]
            f1s = [self.f1(cm) for cm in cms]
            auc = [self.auc(cm) for cm in cms]
            acc_matrix.append(accs)
            f1_matrix.append(f1s)
            auc_matrix.append(auc)

        return acc_matrix, f1_matrix, auc_matrix