def _test_size(self):
    """Sweep the training-set size and report MS2's RMS error.

    Uses the binary cod-RNA dataset at a fixed positive-class test
    ratio (0.05) so results across sizes are directly comparable.
    """
    for set_size in numpy.arange(100, 1000, 100):
        X_tr_full, y_tr_full, X_te, y_te = nc_rna_reader.toNumpy()
        X_tr, y_tr = self.get_sub_set_with_size([X_tr_full, y_tr_full], set_size)
        assert len(y_tr) == set_size
        estimator = MS2(LogisticRegression)
        estimator.fit((X_tr, y_tr))
        ratio = 0.05
        X_new, y_new = SetGen.with_pos_ratio((X_te, y_te), ratio, pos_label=1)
        dist_true = DE.arrayToDist(y_new)
        dist_est = estimator.predict(X_new)
        err = rms(dist_est, dist_true)
        print(dist_est)
        print("size: %d, err: %f" % (set_size, err))
def run_for_estimator(self, estimator, test_set):
    """Return the RMS error of *estimator*'s distribution estimate on *test_set* (X, y)."""
    features, labels = test_set
    estimated = estimator.predict(features)
    # Binary task: the estimate must be a distribution over two classes.
    assert len(estimated) == 2
    return DE.rms(labels, estimated)
def _test_ac_forest(self):
    """Bootstrap AC2 500 times on rcv1-binary at test ratio 0.8 and
    print the spread (all values, mean, median) of the estimated
    positive-class proportion."""
    X_tr_full, y_tr_full, X_te, y_te = rcv1_binary_reader.toNumpy()
    sample_size = 200  # an arbitrary training-set size
    X_tr, y_tr = self.get_sub_set_with_size([X_tr_full, y_tr_full], sample_size)
    assert len(y_tr) == sample_size
    train_set = (X_tr, y_tr)
    X_new, y_new = SetGen.with_pos_ratio((X_te, y_te), 0.8, pos_label=1)
    dist_true = DE.arrayToDist(y_new)  # computed for parity with sibling tests
    pos_estimates = []
    for _ in range(500):
        # NOTE(review): 0.5 here looks like a fraction rather than a
        # count as used elsewhere — confirm get_sub_set_with_size's contract.
        sub = self.get_sub_set_with_size(train_set, 0.5)
        counter = AC2(LogisticRegression)
        counter.fit(sub)
        pos_estimates.append(counter.predict(X_new)[1])
    print(pos_estimates)
    print(numpy.mean(pos_estimates))
    print(numpy.median(pos_estimates))
def test_class_ratio(self):
    """Train MSHI once on 20-newsgroups, then sweep the positive-class
    ratio of the test set and print the estimated positive proportion.

    Binary data is used for ease of interpretation.
    """
    # Alternative dataset: nc_rna_reader.toNumpy()
    X_tr_full, y_tr_full, X_te, y_te = news_20_reader.toNumpy()
    X_tr, y_tr = self.get_sub_set_with_size([X_tr_full, y_tr_full], 1000)
    model = MSHI(LinearSVC)
    model.fit((X_tr, y_tr))
    print('Done training')
    for ratio in numpy.arange(0.05, 1.0, 0.05):
        X_new, y_new = SetGen.with_pos_ratio((X_te, y_te), ratio, pos_label=1)
        dist_true = DE.arrayToDist(y_new)
        dist_est = model.predict(X_new)
        err = rms(dist_est, dist_true)  # kept though unused, matches sibling tests
        print("r: %f, pos: %f" % (ratio, dist_est[1]))
def test_ratio(self):
    """Sweep the positive-class ratio on a synthetic binary dataset and
    print true vs. CC2-estimated positive proportions (tab-separated)."""
    # Alternative datasets: nc_rna_reader, rcv1_binary_reader
    X_tr_full, y_tr_full, X_te, y_te = synthetic_reader.toNumpy(0.3, n_class=2)
    sample_size = 500  # an arbitrary training-set size
    X_tr, y_tr = self.get_sub_set_with_size([X_tr_full, y_tr_full], sample_size)
    assert len(y_tr) == sample_size
    counter = CC2(KNeighborsClassifier)
    counter.fit((X_tr, y_tr))
    for ratio in numpy.arange(0.05, 1.0, 0.05):
        X_new, y_new = SetGen.with_pos_ratio((X_te, y_te), ratio, pos_label=1)
        dist_true = DE.arrayToDist(y_new)
        dist_est = counter.predict(X_new)
        err = rms(dist_est, dist_true)  # kept though unused, matches sibling tests
        print("%f\t%f" % (dist_true[1], dist_est[1]))
def adjust_count(y_pred, cm, min_diff=0.25):
    """Adjusted-count correction of a binary class-distribution estimate.

    Turns raw predictions into an estimated class distribution, correcting
    the observed positive proportion via the classifier's confusion matrix
    (rows = true class, columns = predicted class, order [neg, pos]):

        corrected_pos = (observed_pos - fpr) / (tpr - fpr)

    Parameters:
        y_pred   -- predicted labels for the population.
        cm       -- 2x2 confusion matrix of the classifier.
        min_diff -- minimum required tpr - fpr separation; below this the
                    correction is numerically unreliable. Defaults to 0.25,
                    preserving the previously hard-coded threshold.

    Returns:
        length-2 numpy array [p_neg, p_pos], clipped to [0, 1] and
        renormalised to sum to 1.

    Raises:
        TooLittleDifferenceException -- when tpr - fpr < min_diff.
    """
    # Row-normalise so each row is a rate distribution over predictions.
    ncm = cm / cm.sum(1, dtype=numpy.float64)[:, numpy.newaxis]
    tpr = ncm[1, 1]
    fpr = ncm[0, 1]
    pp = DE.to_bin_dist(y_pred)[1]  # observed positive proportion
    if tpr - fpr < min_diff:
        raise TooLittleDifferenceException
    new_pos = (pp - fpr) / float(tpr - fpr)
    dist_est = numpy.array([1 - new_pos, new_pos])
    # Clip into the valid probability range before renormalising.
    dist_est[dist_est < 0] = 0
    dist_est[dist_est > 1] = 1
    return dist_est / dist_est.sum()
def adjust_count(self, y_pred, cm):
    """Adjusted-count correction of the positive proportion.

    When tpr == fpr the classifier carries no information, so the
    estimate falls back to 0.5. If ``self.cap`` is set, the estimate is
    clipped into [0, 1] before renormalising to a distribution.
    """
    normalised = cm / cm.sum(1, dtype=numpy.float64)[:, numpy.newaxis]
    tpr, fpr = normalised[1, 1], normalised[0, 1]
    observed_pos = DE.to_bin_dist(y_pred)[1]
    if tpr == fpr:
        corrected = .5  # degenerate case: avoid division by zero
    else:
        corrected = (observed_pos - fpr) / float(tpr - fpr)
    estimate = numpy.array([1 - corrected, corrected])
    if self.cap:
        estimate[estimate < 0] = 0
        estimate[estimate > 1] = 1
    return estimate / estimate.sum()
def predict_binary(self, X_population, params):
    """Iteratively re-weight the base classifier until its predicted
    class ratio matches the training ratio, then return the length-2
    class distribution for the population."""
    clf, pos2neg, X, y = params
    cost_fp = 1.0
    cost_fn = 1.0  # never adjusted; only the positive-class weight moves
    for _ in range(self.itr_count):
        predicted = clf.predict(X_population)
        counts = Counter(predicted)
        # We add a small prior (1) to prevent divide by zero error. This is
        # not mentioned in the original paper, but we add this for fair
        # comparison.
        new_ratio = (counts[1] + 1) / float(counts[0] + 1)
        # High cost means the examples are important, thus we should weigh
        # them more.
        cost_fp = pos2neg / new_ratio
        clf = self.base_clf_class(class_weight={0: cost_fn, 1: cost_fp})
        clf.fit(X, y)
    return DE.to_bin_dist(clf.predict(X_population))
def predict_binary(self, X_population, params):
    """Predict the binary class distribution for the population.

    *params* is an already-fitted classifier; the return value is a
    length-2 array (distribution over [neg, pos]).
    """
    classifier = params
    labels = classifier.predict(X_population)
    return DE.to_bin_dist(labels)