def run_training_size(self, pos_ratio):
    """Print estimator error as a function of training-set size.

    The test split from ``synthetic_reader.toNumpy()`` is passed once through
    ``SetGen.with_pos_ratio`` (``pos_label=1``). Then, for each training size
    (50..90 in steps of 10, followed by 100..1000 in steps of 100), seven
    newly constructed estimators are fitted on a subset obtained from
    ``self.get_sub_set_with_size`` and scored with ``self.run_for_estimator``.
    One tab-separated row is printed per size, under a header row.

    Args:
        pos_ratio: Positive-class ratio handed to ``SetGen.with_pos_ratio``;
            it is also echoed in the output with ``%f`` formatting.
    """
    X_train_full, y_train_full, X_test, y_test = synthetic_reader.toNumpy()
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        [X_test, y_test], pos_ratio, pos_label=1)
    test_set = [X_test_new, y_test_new]

    # Single-argument parenthesised prints behave the same under the
    # Python 2 print statement and the Python 3 print function.
    print("We compare performance as chaning the training set size.")
    print("Positive class ratio is %f" % pos_ratio)
    print("size\tcc\tac\tms\tra\trc\trb\trd")

    set_sizes = (numpy.arange(50, 100, 10).tolist()
                 + numpy.arange(100, 1100, 100).tolist())
    for set_size in set_sizes:
        # Column order must match the header printed above.
        ests = [
            CC2(LinearSVC),                   # cc
            AC2(LinearSVC),                   # ac
            MS2(LogisticRegression),          # ms
            RA(LinearSVC, ac_method='ac'),    # ra
            RA(LinearSVC, ac_method='cac'),   # rc
            RA(LinearSVC, ac_method='bac'),   # rb
            RA(LinearSVC, ac_method='dac'),   # rd
        ]

        X_train_sub, y_train_sub = self.get_sub_set_with_size(
            [X_train_full, y_train_full], set_size)
        train_set = [X_train_sub, y_train_sub]

        # Explicit loop instead of map(): fitting is a side effect, and a
        # lazy map() would silently skip it.
        for est in ests:
            est.fit(train_set)
        errs = [self.run_for_estimator(est, test_set) for est in ests]

        row_format = "%d" + "\t%.4f" * len(errs)
        print(row_format % tuple([set_size] + errs))
def test_ratio(self):
    """Print true vs. estimated distribution entries over a sweep of ratios.

    A ``CC2(KNeighborsClassifier)`` estimator is fitted once on a 500-sample
    training subset of the synthetic data. For each ratio ``r`` in
    ``numpy.arange(0.05, 1.0, 0.05)`` the test split is passed through
    ``SetGen.with_pos_ratio`` (``pos_label=1``), and one line is printed
    holding index 1 of the true distribution (``DE.arrayToDist``) and index 1
    of ``cc.predict``'s output, tab-separated.

    NOTE(review): another ``test_ratio`` is defined later in this file; if
    both belong to the same class, the later definition replaces this one.
    """
    # Alternative data sources that were tried here: nc_rna_reader,
    # rcv1_binary_reader.
    X_train_full, y_train_full, X_test, y_test = synthetic_reader.toNumpy(
        0.3, n_class=2)

    set_size = 500  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size

    train_set = (X_train, y_train)
    test_set_original = (X_test, y_test)

    cc = CC2(KNeighborsClassifier)
    cc.fit(train_set)

    for r in numpy.arange(0.05, 1.0, 0.05):
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        dist_true = DE.arrayToDist(y_test_new)
        dist_est = cc.predict(X_test_new)
        # NOTE(review): the RMS error is computed but never reported; the
        # call is kept because rms() is defined outside this view.
        err = rms(dist_est, dist_true)
        print("%f\t%f" % (dist_true[1], dist_est[1]))
def _test_basic(self):
    """Fit a plain ``LinearSVC`` on synthetic data and dump diagnostics.

    Draws a 1000-sample training subset, fits ``LinearSVC`` on it, and prints
    the confusion matrix on the test split followed by the fitted ``coef_``
    and ``intercept_``.

    Side effects:
        Writes the test features and labels with ``numpy.savetxt`` to the
        relative paths ``synthetic_X_test`` and ``synthetic_y_test``.
    """
    X_train_full, y_train_full, X_test, y_test = synthetic_reader.toNumpy(
        0.3, n_class=2)

    set_size = 1000  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size

    clf = LinearSVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(confusion_matrix(y_test, y_pred))
    print(clf.coef_)
    print(clf.intercept_)

    # Persist the test split so it can be inspected outside this run.
    numpy.savetxt('synthetic_X_test', X_test)
    numpy.savetxt('synthetic_y_test', y_test)
def test_class_ratio(self):
    '''
    Set up the class-ratio experiment on the binary synthetic dataset.

    The stated goal is to compare several competing methods while changing
    the ratio of the positive class in the dataset; a binary-class dataset
    is used for ease of interpretation.

    As written, this method only draws a 100-sample training subset and fits
    a single ``AC2(SyntheticClassifier)`` estimator on it; the test split is
    loaded but not evaluated here.
    '''
    # Alternative data sources that were tried here: nc_rna_reader,
    # rcv1_binary_reader, snippet_reader.
    X_train_full, y_train_full, X_test, y_test = synthetic_reader.toNumpy(
        0, n_class=2)

    set_size = 100  # an arbitrary number
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    assert len(y_train) == set_size
    train_set = (X_train, y_train)

    # AC2(LogisticRegression) was the earlier choice of base classifier.
    ac = AC2(SyntheticClassifier)
    ac.fit(train_set)
def test_ratio(self):
    """Invoke ``self.run_ratio`` on the binary synthetic dataset per size.

    The dataset is loaded once with ``err=0`` and ``n_class=2``; an empty
    line is printed after each ``run_ratio`` call.

    NOTE(review): an earlier ``test_ratio`` is defined in this file; if both
    belong to the same class, this later definition is the one that is bound.
    """
    dataset = synthetic_reader.toNumpy(err=0, n_class=2)

    # Only one size is exercised for now; a wider sweep would be
    # numpy.arange(100, 2100, 100).
    training_sizes = [100]
    for size in training_sizes:
        self.run_ratio(dataset, size)
        print("")