def _test_svm_based(self):
    '''Run self.run_size on the Snippet data with LinearSVC.

    Loads the data through snippet_reader.toNumpy(), prints a header line
    followed by an empty line, and then passes the LinearSVC class, the
    loaded data and n_feature = 1000 to self.run_size.

    The corresponding self.run_ratio(LinearSVC, dataset, n_feature) call is
    disabled in this method.
    '''
    snippet_data = snippet_reader.toNumpy()
    n_feature = 1000

    print("Dataset: Snippet, Classifier: Linear SVM")
    print  # bare print statement: emits an empty line

    self.run_size(LinearSVC, snippet_data, n_feature)
def load_dataset2(self):
    '''Return shuffled training data with a single binary label column.

    The training part of snippet_reader.toNumpy() is shuffled, the labels
    are binarized with a LabelBinarizer fitted on them, and only the first
    column of the binarized label matrix is kept.  The test part of the
    data is loaded but not used.

    Returns:
        A pair (X, y_bin), or None when the binarized label matrix has no
        columns to iterate over.
    '''
    # The four-way unpack is kept so an unexpected return shape still fails.
    features, labels, _, _ = snippet_reader.toNumpy()
    features, labels = shuffle(features, labels)

    binarizer = LabelBinarizer()
    binarizer.fit(labels)

    # Rows of the transpose are the columns of the binarized matrix.
    nothing = object()
    first_column = next(iter(binarizer.transform(labels).T), nothing)
    if first_column is nothing:
        return None
    return features, first_column
def _test_debug(self):
    '''Run a single EN estimator in debug mode on a 500-sample subset.

    A training subset of fixed size is taken with
    self.get_sub_set_with_size, the test split is passed through
    SetGen.with_pos_ratio (ratio 0.5, pos_label=1), and an
    EN(LogisticRegression, debug=True) instance is handed to
    self.run_for_estimator with debug=True.
    '''
    full_X, full_y, held_out_X, held_out_y = snippet_reader.toNumpy()

    subset_size = 500  # an arbitrary number
    sub_X, sub_y = self.get_sub_set_with_size([full_X, full_y], subset_size)
    assert len(sub_y) == subset_size

    # The original test split goes in as a tuple; the derived one is a list.
    derived_X, derived_y = SetGen.with_pos_ratio(
        (held_out_X, held_out_y), 0.5, pos_label=1)

    estimator = EN(LogisticRegression, debug=True)
    self.run_for_estimator(
        estimator, (sub_X, sub_y), [derived_X, derived_y], debug=True)
def _test_svm_based(self):
    '''Print a summary of the Snippet data, then run self.compare_svm_based.

    After a header line, the .shape of the training and test inputs and a
    collections.Counter of each label sequence are printed.  The complete
    object returned by snippet_reader.toNumpy() is then passed to
    self.compare_svm_based; its result is unpacked into three values
    (named acc/f1/auc matrices in the original) that are not used further.

    A variant going through self.run_test_with with n_feature = 1000 is
    disabled in this method.
    '''
    from collections import Counter

    print("Compare Snippet with SVM")
    dataset = snippet_reader.toNumpy()
    train_X, train_y, test_X, test_y = dataset

    # "%s %s" yields the same text as a two-item print statement.
    print("%s %s" % ("training shape", train_X.shape))
    print("%s %s" % ("test shape", test_X.shape))
    print("%s %s" % ("training class dist", Counter(train_y)))
    print("%s %s" % ("test class dist", Counter(test_y)))

    # Unpacking keeps the requirement that exactly three values come back.
    _acc_matrix, _f1_matrix, _auc_matrix = self.compare_svm_based(dataset)
def load_dataset(self):
    '''Load the Snippet data with both label sets reduced to one binary column.

    A LabelBinarizer is fitted on the training labels only.  Training and
    test labels are each transformed with it, and the first column of the
    resulting matrix replaces the original labels.  If a transformed matrix
    has no columns, the corresponding labels are returned unchanged.

    Returns:
        The tuple (X, y, X_test, y_test).
    '''
    train_X, train_y, test_X, test_y = snippet_reader.toNumpy()

    binarizer = LabelBinarizer()
    binarizer.fit(train_y)

    # Rows of the transpose are columns; fall back to the untouched labels.
    train_y = next(iter(binarizer.transform(train_y).T), train_y)
    test_y = next(iter(binarizer.transform(test_y).T), test_y)

    return train_X, train_y, test_X, test_y
def _test_change_training_size(self):
    '''
    Print one row of results per training-set size for five estimators.

    The test split is passed through SetGen.with_pos_ratio once (ratio 0.7,
    pos_label=1) and reused for every size.  For each size a training subset
    is taken with self.get_sub_set_with_size, find_hyperparameter is called
    on the EN estimator with that subset, and every estimator (CC, AC, MS,
    Itr, EN, all built on LogisticRegression) is run through
    self.run_for_estimator.  Output is a tab-separated table.

    NOTE(review): the earlier docstring described this as a comparison on
    the ncRNA dataset from
        Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
        Detection of non-coding RNAs on the basis of predicted secondary
        structure formation free energy change.
        BMC Bioinformatics, 7(173), 2006.
    but the data here is loaded via snippet_reader.toNumpy() -- confirm
    which dataset is intended.
    '''
    full_X, full_y, raw_test_X, raw_test_y = snippet_reader.toNumpy()

    pos_ratio = 0.7  # arbitrary ratio
    test_X, test_y = SetGen.with_pos_ratio(
        [raw_test_X, raw_test_y], pos_ratio, pos_label=1)
    test_set = [test_X, test_y]

    # Built in the column order of the table header: cc, ac, ms, it, en.
    estimators = [make(LogisticRegression) for make in (CC, AC, MS, Itr, EN)]
    en = estimators[-1]

    print("We compare performance as chaning the training set size.")
    print("Fixed positive ratio is %f" % pos_ratio)
    print("size\tcc\tac\tms\tit\ten")

    row_format = "%d" + "\t%.4f" * 5
    for set_size in list(range(100, 1001, 100)) + [2000, 3000]:
        sub_X, sub_y = self.get_sub_set_with_size([full_X, full_y], set_size)
        train_set = [sub_X, sub_y]

        en.find_hyperparameter(train_set)
        errs = [self.run_for_estimator(est, train_set, test_set)
                for est in estimators]
        print(row_format % ((set_size,) + tuple(errs)))
def test_ratio(self):
    '''Call self.run_ratio on the Snippet data for set sizes 100, 200, ..., 1000.

    The data is loaded once through snippet_reader.toNumpy() and reused for
    every size; an empty line is printed after each run.
    '''
    snippet_data = snippet_reader.toNumpy()
    sizes = numpy.arange(100, 1100, 100)

    for size in sizes:
        self.run_ratio(snippet_data, size)
        # NOTE(review): the blank-line print is assumed to belong to the
        # loop body -- confirm against the original layout.
        print
def _test_svm_based(self):
    '''Run self.run_test_with on the Snippet data with LinearSVC.

    Loads the data through snippet_reader.toNumpy(), prints a header line,
    and passes the data, the LinearSVC class and n_feature = 1000 to
    self.run_test_with.
    '''
    snippet_data = snippet_reader.toNumpy()
    n_feature = 1000

    print("Snippet with SVM")

    self.run_test_with(snippet_data, LinearSVC, n_feature)