def _test_size(self):
    '''
    Report how the estimation error varies with the training-set size.

    For every set_size in numpy.arange(100, 1000, 100) a training subset
    of that size is taken from the nc_rna_reader data, an
    MS2(LogisticRegression) estimator is fitted on it, and its predicted
    distribution is compared with the distribution of a test set produced
    by SetGen.with_pos_ratio(..., r, pos_label=1) with r fixed at 0.05.
    We use a binary class dataset for ease of interpretation.

    For each size the method prints the estimated distribution followed
    by a "size: ..., err: ..." line, where err is the value returned by
    rms(dist_est, dist_true).

    NOTE(review): the leading underscore presumably keeps this method out
    of automatic test discovery -- confirm against the test runner.
    '''
    # The dataset does not depend on the loop variable, so it is loaded
    # once instead of being re-read for every training-set size.
    X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
    test_set_original = (X_test, y_test)
    r = 0.05

    for set_size in numpy.arange(100, 1000, 100):
        X_train, y_train = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size
        train_set = (X_train, y_train)

        ms = MS2(LogisticRegression)
        ms.fit(train_set)

        # Deliberately left inside the loop: SetGen.with_pos_ratio may draw
        # its sample randomly (not visible here), so every size keeps
        # getting its own test set exactly as before.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        dist_true = DE.arrayToDist(y_test_new)
        dist_est = ms.predict(X_test_new)
        err = rms(dist_est, dist_true)

        # Single-argument parenthesised form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print(dist_est)
        print("size: %d, err: %f" % (set_size, err))
def _test_ac_forest(self):
    '''
    Repeat an AC2(LogisticRegression) estimate over many training subsets.

    A training subset of 200 examples is taken from the rcv1_binary_reader
    data, and a test set is produced once by
    SetGen.with_pos_ratio(..., 0.8, pos_label=1). Then, 500 times, a
    sub-sample of the training subset is requested with size argument 0.5,
    a fresh AC2(LogisticRegression) is fitted on it, and element 1 of its
    prediction for the test set is recorded.

    Prints the list of recorded values, then their numpy mean and median.

    NOTE(review): element 1 is presumably the estimated share of the
    positive class, and 0.5 presumably means "half of the subset" --
    confirm against DE / get_sub_set_with_size.
    '''
    n_train = 200  # an arbitrary number
    n_rounds = 500
    target_ratio = 0.8

    full_X, full_y, held_out_X, held_out_y = rcv1_binary_reader.toNumpy()
    sub_X, sub_y = self.get_sub_set_with_size([full_X, full_y], n_train)
    assert len(sub_y) == n_train
    train_pair = (sub_X, sub_y)

    shifted_X, shifted_y = SetGen.with_pos_ratio(
        (held_out_X, held_out_y), target_ratio, pos_label=1)
    # The true distribution is computed here but not reported by this method.
    true_dist = DE.arrayToDist(shifted_y)

    def estimate_once():
        # One round: sub-sample, fit a new estimator, keep prediction[1].
        sample = self.get_sub_set_with_size(train_pair, 0.5)
        estimator = AC2(LogisticRegression)
        estimator.fit(sample)
        return estimator.predict(shifted_X)[1]

    pos_estimates = [estimate_once() for _ in range(n_rounds)]

    print(pos_estimates)
    print(numpy.mean(pos_estimates))
    print(numpy.median(pos_estimates))
def test_class_ratio(self):
    '''
    Sweep the positive-class ratio of the test set for one fitted estimator.

    An MSHI(LinearSVC) estimator is fitted once on a training subset
    (requested with size 1000) of the news_20_reader data. For each r in
    numpy.arange(0.05, 1.0, 0.05) a test set is produced by
    SetGen.with_pos_ratio(..., r, pos_label=1), the estimator predicts a
    distribution for it, and one "r: ..., pos: ..." line is printed with r
    and element 1 of the prediction. We use a binary class dataset for
    ease of interpretation.

    The rms(...) value against the true distribution is computed on every
    step but is not printed.
    '''
    # Alternative data source used previously: nc_rna_reader.toNumpy()
    full_X, full_y, held_out_X, held_out_y = news_20_reader.toNumpy()

    n_train = 1000
    sub_X, sub_y = self.get_sub_set_with_size([full_X, full_y], n_train)
    train_pair = (sub_X, sub_y)
    held_out_pair = (held_out_X, held_out_y)

    estimator = MSHI(LinearSVC)
    estimator.fit(train_pair)
    print('Done training')

    # For a quick single-ratio run, iterate over [0.05] instead.
    ratios = numpy.arange(0.05, 1.0, 0.05)
    for r in ratios:
        shifted_X, shifted_y = SetGen.with_pos_ratio(
            held_out_pair, r, pos_label=1)
        shifted_pair = [shifted_X, shifted_y]
        true_dist = DE.arrayToDist(shifted_y)
        est_dist = estimator.predict(shifted_X)
        # Computed as in the other experiments, but not reported here.
        err = rms(est_dist, true_dist)
        print("r: %f, pos: %f" % (r, est_dist[1]))
def test_ratio(self):
    '''
    Tabulate true versus estimated element-1 shares as the test ratio varies.

    Data come from synthetic_reader.toNumpy(0.3, n_class=2). A training
    subset of 500 examples is used to fit CC2(KNeighborsClassifier) once.
    For each r in numpy.arange(0.05, 1.0, 0.05) a test set is produced by
    SetGen.with_pos_ratio(..., r, pos_label=1) and one tab-separated line
    is printed: element 1 of the true distribution, then element 1 of the
    predicted distribution.

    The rms(...) value is computed on every step but is not printed.
    '''
    # Alternative data sources used previously:
    #   nc_rna_reader.toNumpy()
    #   rcv1_binary_reader.toNumpy()
    full_X, full_y, held_out_X, held_out_y = synthetic_reader.toNumpy(
        0.3, n_class=2)

    n_train = 500  # an arbitrary number
    sub_X, sub_y = self.get_sub_set_with_size([full_X, full_y], n_train)
    assert len(sub_y) == n_train
    train_pair = (sub_X, sub_y)
    held_out_pair = (held_out_X, held_out_y)

    counter = CC2(KNeighborsClassifier)
    counter.fit(train_pair)

    for r in numpy.arange(0.05, 1.0, 0.05):
        shifted_X, shifted_y = SetGen.with_pos_ratio(
            held_out_pair, r, pos_label=1)
        shifted_pair = [shifted_X, shifted_y]
        true_dist = DE.arrayToDist(shifted_y)
        est_dist = counter.predict(shifted_X)
        # Computed as in the other experiments, but not reported here.
        err = rms(est_dist, true_dist)
        # Debug aid: print(est_dist)
        print("%f\t%f" % (true_dist[1], est_dist[1]))