def test_ratio(self):
    '''
    Print the accuracy of a Prior-adjusted logistic regression on test
    sets re-sampled to different positive-class ratios.

    A LogisticRegression is fitted once on a ``set_size``-example subset
    of the training data returned by ``rcv1_binary_reader.toNumpy()`` and
    wrapped in ``Prior``.  For every ratio ``r`` in
    ``np.arange(0.05, 1.0, 0.05)`` a new test set is produced with
    ``SetGen.with_pos_ratio``, ``Prior`` is re-fitted with the class
    distribution measured on that test set, and ``r`` followed by the
    resulting accuracy is printed on one line.

    We use a binary class dataset for ease of interpretation.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100
    X_train_full, y_train_full, X_test, y_test = dataset
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    # Sanity check on the helper: the subset must have the requested size.
    assert len(y_train) == set_size
    test_set_original = (X_test, y_test)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    p = Prior(clf)

    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with desired positive proportions.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        # Entry 1 of the measured distribution is used as the positive
        # share; the remainder is assigned to label -1.
        true_pos = DE.arrayToDist(y_test_new)[1]
        p.fit(X_train, y_train, {-1: 1 - true_pos, 1: true_pos})
        y_pred = p.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc = self.accuracy(cm)
        # Single-argument form: same output as ``print r, acc`` on
        # Python 2 and also valid on Python 3.
        print("%s %s" % (r, acc))
def run_ratio(self, dataset, set_size):
    '''
    Print the accuracy of ``ENMLT(LinearSVC)`` and of a plain
    ``LinearSVC`` on test sets re-sampled to different positive ratios.

    dataset  -- sequence unpacked as
                (X_train_full, y_train_full, X_test, y_test).
    set_size -- forwarded to ``self.get_sub_set_with_size`` together with
                the full training data (presumably the number of training
                examples to keep -- confirm against that helper).

    Both models are fitted once on the training subset.  For each ratio in
    ``numpy.arange(0.05, 1.0, 0.05)`` one line is printed in the format
    "%.2f, %f, %f": ratio, ENMLT accuracy, LinearSVC accuracy.

    We use a binary class dataset for ease of interpretation.
    '''
    train_features_all, train_labels_all, test_features, test_labels = dataset
    train_features, train_labels = self.get_sub_set_with_size(
        [train_features_all, train_labels_all], set_size)
    base_test_set = (test_features, test_labels)

    wrapped_model = ENMLT(LinearSVC)
    wrapped_model.fit(train_features, train_labels)
    plain_model = LinearSVC()
    plain_model.fit(train_features, train_labels)

    # NOTE(review): this function refers to ``numpy`` while sibling
    # functions use the ``np`` alias -- confirm both names are imported.
    for ratio in numpy.arange(0.05, 1.0, 0.05):
        # Generate a new test set with desired positive proportions.
        resampled_features, resampled_labels = SetGen.with_pos_ratio(
            base_test_set, ratio, pos_label=1)
        # Evaluate the wrapped model first, then the plain one.
        wrapped_acc, plain_acc = [
            self.accuracy(
                confusion_matrix(resampled_labels,
                                 model.predict(resampled_features)))
            for model in (wrapped_model, plain_model)
        ]
        print("%.2f, %f, %f" % (ratio, wrapped_acc, plain_acc))
def compare_based(self, clf_class, data_set):
    '''
    Train six estimators built from ``clf_class`` and report their
    accuracy and F1 on test sets re-sampled to several positive ratios.

    clf_class -- classifier class; instantiated directly for the baseline
                 and passed to BACMLT, CDEAC, CDEITR, CDEBAC and MLAAC.
    data_set  -- sequence unpacked as (X_train, y_train, X_test, y_test).

    For each ratio in ``np.arange(0.2, 1.0, 0.2)`` a tab-separated row of
    accuracies is printed under a header line.

    Returns ``(acc_matrix, f1_matrix)``: two lists holding one row per
    ratio, each row being a list with one value per estimator in the
    order [clf_class(), CDEAC, CDEITR, CDEBAC, MLAAC, BACMLT].
    '''
    X_train, y_train, X_test, y_test = data_set
    full_test_set = [X_test, y_test]

    bac_mlt = BACMLT(clf_class)
    cde_ac = CDEAC(clf_class)
    cde_it = CDEITR(clf_class)
    cde_bac = CDEBAC(clf_class)
    mla_ac = MLAAC(clf_class)
    basic_c = clf_class()
    # Order must match the column header printed below.
    ests = [basic_c, cde_ac, cde_it, cde_bac, mla_ac, bac_mlt]

    # Explicit loop: ``map`` is lazy on Python 3 and would never call fit.
    for est in ests:
        est.fit(X_train, y_train)

    acc_matrix = []
    f1_matrix = []
    print("Ratio\tBase\tAC+Cost\tEM+Cost\tBAC+Cost\tAC+MLA\tBACMLA")
    for r in np.arange(0.2, 1.0, 0.2):
        # Generate a new test set with desired positive proportions.
        test_set = SetGen.with_pos_ratio(full_test_set, r, pos_label=1)
        cms = [self.run_for_estimator(est, test_set) for est in ests]
        acc = [self.accuracy(cm) for cm in cms]
        acc_matrix.append(acc)
        f1 = [self.f1(cm) for cm in cms]
        f1_matrix.append(f1)
        print(("%.2f" + "\t%.4f" * len(acc)) % tuple([r] + acc))
    return acc_matrix, f1_matrix
def test_ratio():
    '''
    Export a small training subset and one re-balanced test set in
    libsvm format.

    The data comes from ``rcv1_binary_reader.toNumpy()``.  A subset
    requested with size 100 is taken from the training data and written
    to 'rcv_train_100.libsvm'.  A subset requested with size 10000 is
    taken from the test data, re-sampled with ``SetGen.with_pos_ratio``
    to a positive ratio of 0.05 and written to 'rcv_test_0.05.libsvm'.

    We use a binary class dataset for ease of interpretation.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100
    X_train_full, y_train_full, X_test_full, y_test_full = dataset
    X_train, y_train = get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    # Sanity check on the helper: the subset must have the requested size.
    assert len(y_train) == set_size
    X_test, y_test = get_sub_set_with_size(
        [X_test_full, y_test_full], 10000)
    test_set_original = (X_test, y_test)

    save_libsvm(X_train, y_train, 'rcv_train_%d.libsvm' % set_size)

    # Only a single ratio is exported; a sweep over
    # np.arange(0.05, 1.0, 0.05) was previously disabled here.
    r = 0.05
    # Generate a new test set with desired positive proportions.
    X_test_new, y_test_new = SetGen.with_pos_ratio(
        test_set_original, r, pos_label=1)
    save_libsvm(X_test_new, y_test_new, 'rcv_test_%.2f.libsvm' % r)
def test_ratio(self):
    '''
    Print, per target positive ratio, the share of positive predictions
    made by four models: LinearSVC, SVMWeights, RandomForestClassifier
    and RFWeights.

    The data comes from ``rcv1_binary_reader.toNumpy()``; a subset
    requested with size 100 is used for training and a subset requested
    with size 10000 as the test pool.  LinearSVC and the random forest are
    fitted once.  For each ratio ``r`` in ``np.arange(0.05, 1.0, 0.05)``
    the two weighted models are re-fitted with the class distribution
    measured on the re-sampled test set, and a tab-separated row is
    printed: ``r`` followed by entry 1 of ``DE.to_bin_dist`` for each
    model's predictions (columns SVM, SVMW, RF, RFW).

    We use a binary class dataset for ease of interpretation.
    '''
    dataset = rcv1_binary_reader.toNumpy()
    set_size = 100
    X_train_full, y_train_full, X_test_full, y_test_full = dataset
    X_train, y_train = self.get_sub_set_with_size(
        [X_train_full, y_train_full], set_size)
    # Sanity check on the helper: the subset must have the requested size.
    assert len(y_train) == set_size
    X_test, y_test = self.get_sub_set_with_size(
        [X_test_full, y_test_full], 10000)
    test_set_original = (X_test, y_test)

    rfw = RFWeights()
    svmw = SVMWeights()
    rf = RandomForestClassifier(n_estimators=400)
    svm = LinearSVC()
    # The random forest is trained on a dense copy of the features.
    rf.fit(X_train.toarray(), y_train)
    svm.fit(X_train, y_train)

    print("Ratio\tSVM\tSVMW\tRF\tRFW")
    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with desired positive proportions.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        true_pos = DE.to_bin_dist(y_test_new)[1]
        new_class_dist = {0: 1 - true_pos, 1: true_pos}

        # NOTE(review): rfw is fitted on X_train as-is but predicts on a
        # dense array, unlike rf which is fitted on X_train.toarray() --
        # confirm RFWeights handles the conversion itself.
        rfw.fit(X_train, y_train, new_class_dist)
        svmw.fit(X_train, y_train, new_class_dist)

        # Densify once and share it between both forest models.
        X_test_dense = X_test_new.toarray()
        svm_pred = svm.predict(X_test_new)
        svmw_pred = svmw.predict(X_test_new)
        rf_pred = rf.predict(X_test_dense)
        rfw_pred = rfw.predict(X_test_dense)

        preds = [svm_pred, svmw_pred, rf_pred, rfw_pred]
        # A real list is required for the concatenation below.
        pos_ratios = [DE.to_bin_dist(pred)[1] for pred in preds]
        print(("%.2f" + "\t%.2f" * len(pos_ratios))
              % tuple([r] + pos_ratios))
def test_ratio(self):
    '''
    Print the accuracy of an MLA-wrapped SVMLight classifier on test sets
    re-sampled to different positive-class ratios.

    The data comes from ``sentiment_reader.toNumpy()``; the full training
    set is used and only the first 1000 test examples are kept.  SVMLight
    is fitted once and wrapped in ``MLA`` (verbose=1).  For each ratio
    ``r`` in ``np.arange(0.05, 1.0, 0.05)`` the MLA model is re-fitted
    with the class distribution of the re-sampled test set
    (``DE.arrayToDistDict``), and ``r`` followed by the accuracy is
    printed on one line.

    We use a binary class dataset for ease of interpretation.
    '''
    # Alternatives previously tried here: rcv1_binary_reader and
    # snippet_reader as data source, a 200-example training subset, and
    # LinearSVC as base classifier.
    dataset = sentiment_reader.toNumpy()
    X_train, y_train, X_test, y_test = dataset
    # Keep only the first 1000 test examples.
    X_test = X_test[:1000]
    y_test = y_test[:1000]
    test_set_original = (X_test, y_test)

    clf = SVMLight()
    clf.fit(X_train, y_train)
    mla = MLA(clf, verbose=1)

    for r in np.arange(0.05, 1.0, 0.05):
        # Generate a new test set with desired positive proportions.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            test_set_original, r, pos_label=1)
        dist_dict = DE.arrayToDistDict(y_test_new)
        mla.fit(X_train, y_train, dist_dict)
        y_pred = mla.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc = self.accuracy(cm)
        # Single-argument form: same output as ``print r, acc`` on
        # Python 2 and also valid on Python 3.
        print("%s %s" % (r, acc))
def compare_svm_based_repeat(self, data_set, repeat_num=20):
    '''
    Repeatedly compare LinearSVC with SVMWeights and MLT on test sets
    re-sampled to different positive-class ratios.

    data_set   -- sequence unpacked as (X_train, y_train, X_test, y_test).
    repeat_num -- number of re-sampled test sets drawn per ratio
                  (default 20, the previously hard-coded value).

    LinearSVC is fitted once.  For each ratio ``r`` in
    ``np.arange(0.1, 1.0, 0.1)`` and each repetition, a test set is drawn
    with ``SetGen.with_pos_ratio``, SVMWeights and MLT are re-fitted with
    that test set's class distribution, and ``r``, the accuracies, the F1
    scores and a blank line are printed.

    Returns ``(acc_matrix, f1_matrix, auc_matrix)``: three lists with one
    row per (ratio, repetition) pair; each row is a list with one value
    per model in the order [LinearSVC, SVMWeights, MLT].
    '''
    X_train, y_train, X_test, y_test = data_set

    prob_estimator = LinearSVC()
    prob_estimator.fit(X_train, y_train)
    w = SVMWeights()
    m = MLT(prob_estimator)
    ests = [w, m]
    # The fixed baseline is evaluated first, then the re-fitted models.
    evaluated = [prob_estimator] + ests

    acc_matrix = []
    f1_matrix = []
    auc_matrix = []
    for r in np.arange(0.1, 1.0, 0.1):
        for _ in range(repeat_num):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                [X_test, y_test], r, pos_label=1)
            class_dist = DE.arrayToDistDict(y_test_new)

            # Explicit loop: ``map`` is lazy on Python 3 and would never
            # call fit.
            for est in ests:
                est.fit(X_train, y_train, class_dist)

            y_preds = [est.predict(X_test_new) for est in evaluated]
            cms = [confusion_matrix(y_test_new, y_pred)
                   for y_pred in y_preds]
            accs = [self.accuracy(cm) for cm in cms]
            f1s = [self.f1(cm) for cm in cms]
            auc = [self.auc(cm) for cm in cms]

            acc_matrix.append(accs)
            f1_matrix.append(f1s)
            auc_matrix.append(auc)

            print(r)
            print(accs)
            print(f1s)
            print("")
    return acc_matrix, f1_matrix, auc_matrix
def compare_rf_based(self, data_set):
    '''
    Compare a random forest with RFWeights, Prior and MLT on test sets
    re-sampled to different positive-class ratios.

    data_set -- sequence unpacked as (X_train, y_train, X_test, y_test).

    A RandomForestClassifier (200 trees) is fitted once.  For each ratio
    in ``np.arange(0.2, 1.0, 0.2)`` a test set is drawn with
    ``SetGen.with_pos_ratio`` and RFWeights, Prior and MLT are re-fitted
    with that test set's class distribution before all four models
    predict on it.

    Returns ``(acc_matrix, f1_matrix, auc_matrix)``: three lists with one
    row per ratio; each row is a list with one value per model in the
    order [RandomForestClassifier, RFWeights, Prior, MLT].
    '''
    X_train, y_train, X_test, y_test = data_set

    # TODO: We actually need to convert to dense array using toarray()
    # TODO: Satimage data is the only exception.
    prob_estimator = RandomForestClassifier(n_estimators=200)
    prob_estimator.fit(X_train, y_train)
    w = RFWeights(n_estimators=200)
    p = Prior(prob_estimator)
    m = MLT(prob_estimator)
    ests = [w, p, m]
    # The fixed baseline is evaluated first, then the re-fitted models.
    evaluated = [prob_estimator] + ests

    acc_matrix = []
    f1_matrix = []
    auc_matrix = []
    for r in np.arange(0.2, 1.0, 0.2):
        # Generate a new test set with desired positive proportions.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            [X_test, y_test], r, pos_label=1)
        class_dist = DE.arrayToDistDict(y_test_new)

        # TODO: We actually need to convert to dense array using toarray()
        # TODO: Satimage data is the only exception.
        # Explicit loop: ``map`` is lazy on Python 3 and would never
        # call fit.
        for est in ests:
            est.fit(X_train, y_train, class_dist)

        y_preds = [est.predict(X_test_new) for est in evaluated]
        cms = [confusion_matrix(y_test_new, y_pred) for y_pred in y_preds]

        acc_matrix.append([self.accuracy(cm) for cm in cms])
        f1_matrix.append([self.f1(cm) for cm in cms])
        auc_matrix.append([self.auc(cm) for cm in cms])
    return acc_matrix, f1_matrix, auc_matrix
def compare_maxent_based(self, data_set):
    '''
    Compare logistic regression with MaxentWeights, Prior and MLT on test
    sets re-sampled to different positive-class ratios.

    data_set -- sequence unpacked as (X_train, y_train, X_test, y_test).

    A LogisticRegression is fitted once.  For each ratio in
    ``np.arange(0.2, 1.0, 0.2)`` a test set is drawn with
    ``SetGen.with_pos_ratio`` and MaxentWeights, Prior and MLT are
    re-fitted with that test set's class distribution before all four
    models predict on it.

    Returns ``(acc_matrix, f1_matrix, auc_matrix)``: three lists with one
    row per ratio; each row is a list with one value per model in the
    order [LogisticRegression, MaxentWeights, Prior, MLT].
    '''
    X_train, y_train, X_test, y_test = data_set

    prob_estimator = LogisticRegression()
    prob_estimator.fit(X_train, y_train)
    w = MaxentWeights()
    p = Prior(prob_estimator)
    m = MLT(prob_estimator)
    ests = [w, p, m]
    # The fixed baseline is evaluated first, then the re-fitted models.
    evaluated = [prob_estimator] + ests

    acc_matrix = []
    f1_matrix = []
    auc_matrix = []
    for r in np.arange(0.2, 1.0, 0.2):
        # Generate a new test set with desired positive proportions.
        X_test_new, y_test_new = SetGen.with_pos_ratio(
            [X_test, y_test], r, pos_label=1)
        class_dist = DE.arrayToDistDict(y_test_new)

        # Explicit loop: ``map`` is lazy on Python 3 and would never
        # call fit.
        for est in ests:
            est.fit(X_train, y_train, class_dist)

        y_preds = [est.predict(X_test_new) for est in evaluated]
        cms = [confusion_matrix(y_test_new, y_pred) for y_pred in y_preds]

        acc_matrix.append([self.accuracy(cm) for cm in cms])
        f1_matrix.append([self.f1(cm) for cm in cms])
        auc_matrix.append([self.auc(cm) for cm in cms])
    return acc_matrix, f1_matrix, auc_matrix