Example #1
0
    def test_ratio(self):
        '''
        Compare several competing methods while varying the ratio of the
        positive class in the test set. A binary-class dataset is used for
        ease of interpretation.

        A LinearSVC and a 400-tree RandomForestClassifier are trained once on
        a 100-sample training subset. Then, for each target ratio from 0.05
        up to (but excluding) 1.0 in steps of 0.05, a test set with that
        positive ratio is generated, the weighted variants (SVMWeights and
        RFWeights) are refit using the class distribution of that test set,
        and one tab-separated row is printed: the target ratio followed by
        the share of label 1 (as reported by DE.to_bin_dist) among the
        predictions of SVM, SVMW, RF and RFW, in that order.

        Nothing is returned; apart from the training-set size check, the
        results are only printed.
        '''
        dataset = rcv1_binary_reader.toNumpy()
        set_size = 100

        X_train_full, y_train_full, X_test_full, y_test_full = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        assert(len(y_train) == set_size)
        X_test, y_test = self.get_sub_set_with_size([X_test_full, y_test_full], 10000)

        test_set_original = (X_test, y_test)

        rfw = RFWeights()
        svmw = SVMWeights()

        rf = RandomForestClassifier(n_estimators=400)
        svm = LinearSVC()

        # The unweighted baselines do not depend on the test distribution, so
        # they are fit once, outside the loop.
        rf.fit(X_train.toarray(), y_train)
        svm.fit(X_train, y_train)

        # Single-argument, parenthesised print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print("Ratio\tSVM\tSVMW\tRF\tRFW")
        for r in np.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)

            # Use the ratio actually realised in the generated labels rather
            # than the requested r when building the class distribution.
            true_pos = DE.to_bin_dist(y_test_new)[1]
            new_class_dist = {0: 1 - true_pos, 1: true_pos}

            # The weighted variants are refit for every test distribution.
            # NOTE(review): rfw.fit is given X_train as-is while rf.fit above
            # is given X_train.toarray() -- confirm RFWeights handles this
            # input itself.
            rfw.fit(X_train, y_train, new_class_dist)
            svmw.fit(X_train, y_train, new_class_dist)

            # Convert once and share between both random-forest predictors.
            X_test_dense = X_test_new.toarray()

            svm_pred = svm.predict(X_test_new)
            svmw_pred = svmw.predict(X_test_new)
            rf_pred = rf.predict(X_test_dense)
            rfw_pred = rfw.predict(X_test_dense)

            preds = [svm_pred, svmw_pred, rf_pred, rfw_pred]
            # A real list (not a lazy map object) is required for len() and
            # for the list concatenation below.
            pos_ratios = [DE.to_bin_dist(pred)[1] for pred in preds]

            row_format = "%.2f" + "\t%.2f" * len(pos_ratios)
            print(row_format % tuple([r] + pos_ratios))