def test_ratio(self):
    '''
    Compare several competing methods while varying the ratio of the
    positive class in the test set.

    A binary-class dataset is used for ease of interpretation. The
    unweighted models (SVM, RF) are fitted once on a small training
    subset; the weighted models (SVMW, RFW) are refitted for every
    target ratio with the class distribution of the resampled test set.
    For each ratio one tab-separated row is printed: the requested ratio
    followed by the positive proportion predicted by SVM, SVMW, RF and
    RFW (as reported by ``DE.to_bin_dist(...)[1]``).

    Nothing is returned; the only check is that the training subset has
    the requested size.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100
    X_train_full, y_train_full, X_test_full, y_test_full = dataset

    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size

    X_test, y_test = self.get_sub_set_with_size(
        [X_test_full, y_test_full], 10000)
    test_set_original = (X_test, y_test)

    # NOTE(review): RFWeights / SVMWeights are presumably the
    # class-distribution-aware counterparts of the two baselines below;
    # confirm against their definitions.
    rfw = RFWeights()
    svmw = SVMWeights()
    rf = RandomForestClassifier(n_estimators=400)
    svm = LinearSVC()

    # The baselines do not depend on the test distribution, so they are
    # fitted a single time. The forest is given a dense matrix, the SVM
    # the original (sparse) one.
    rf.fit(X_train.toarray(), y_train)
    svm.fit(X_train, y_train)

    # Single parenthesised argument: prints the same on Python 2 and 3.
    print("Ratio\tSVM\tSVMW\tRF\tRFW")

    # Nominally 0.05, 0.10, ..., 0.95 (the stop value is exclusive).
    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with the desired positive proportion.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)

        # Use the proportion actually realised in the resampled labels
        # rather than the requested ``r``.
        true_pos = DE.to_bin_dist(y_test_new)[1]
        new_class_dist = {0: 1 - true_pos, 1: true_pos}

        rfw.fit(X_train, y_train, new_class_dist)
        svmw.fit(X_train, y_train, new_class_dist)

        # Densify once and share the result between both forest models
        # instead of converting the test matrix twice per iteration.
        X_test_dense = X_test_new.toarray()

        svm_pred = svm.predict(X_test_new)
        svmw_pred = svmw.predict(X_test_new)
        rf_pred = rf.predict(X_test_dense)
        rfw_pred = rfw.predict(X_test_dense)

        preds = [svm_pred, svmw_pred, rf_pred, rfw_pred]
        # A real list (not a lazy ``map`` object) so it can be
        # concatenated with ``[r]`` on Python 3 as well.
        pos_ratios = [DE.to_bin_dist(pred)[1] for pred in preds]

        print("\t".join("%.2f" % value for value in [r] + pos_ratios))