Esempio n. 1
0
    def _test_size(self):
        '''
        Report the error of the MS2 estimator as the training set size grows.

        For every training size produced by numpy.arange(100, 1000, 100), a
        training subset of that size is drawn with get_sub_set_with_size, an
        MS2 estimator built on LogisticRegression is fitted on it, and its
        prediction on a test set resampled to a 0.05 positive ratio is
        compared (via rms) with the distribution derived from the resampled
        labels. The prediction and the error are printed for each size.
        '''
        # The full dataset does not depend on the loop variable, so it is
        # loaded once instead of on every iteration.
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        test_set_original = (X_test, y_test)
        pos_ratio = 0.05

        for set_size in numpy.arange(100, 1000, 100):
            X_train, y_train = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            assert len(y_train) == set_size

            ms = MS2(LogisticRegression)
            ms.fit((X_train, y_train))

            # Kept inside the loop as in the original: whether
            # with_pos_ratio draws a fresh sample per call is not visible
            # here, so the number of calls is left unchanged.
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                    test_set_original, pos_ratio, pos_label=1)

            dist_true = DE.arrayToDist(y_test_new)
            dist_est = ms.predict(X_test_new)
            err = rms(dist_est, dist_true)

            print(dist_est)
            print("size: %d, err: %f" % (set_size, err))
Esempio n. 2
0
    def test_class_ratio(self):
        '''
        Compare several competing methods while changing the ratio of the
        positive class in the test set. A binary class dataset is used for
        ease of interpretation.

        CC2, AC2, MS2 and EN2 estimators (each built on LogisticRegression)
        are fitted once on a 400-example training subset. Then, for each
        ratio stepping by 0.05 from 0.05 to just below 1.0, the test set is
        resampled with SetGen.with_pos_ratio and the value returned by
        run_for_estimator for every estimator is printed as one
        tab-separated row.
        '''
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        set_size = 400     # an arbitrary number
        X_train, y_train = self.get_sub_set_with_size(
                [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)

        ests = [
            CC2(LogisticRegression),
            AC2(LogisticRegression),
            MS2(LogisticRegression),
            EN2(LogisticRegression),
        ]

        print("We compare the performance as changing the positive class ratio.")
        print("The training set size is %d" % set_size)

        print("Training classifiers")
        # Explicit loop: fitting is a side effect, and a lazy map() (as in
        # Python 3) would never run it.
        for est in ests:
            est.fit(train_set)

        print("ratio\tcc\tac\tms\ten")
        for r in numpy.arange(0.05, 1.0, 0.05):
            X_test_new, y_test_new = SetGen.with_pos_ratio(
                    test_set_original, r, pos_label=1)
            test_set = [X_test_new, y_test_new]
            # A real list, so the results can be formatted regardless of
            # whether map() returns a list or an iterator.
            errs = [self.run_for_estimator(est, test_set) for est in ests]
            row_format = "%.2f" + "\t%.4f" * len(errs)
            print(row_format % ((r,) + tuple(errs)))
Esempio n. 3
0
    def _test_debug(self):
        """
        Run a single EN estimator in debug mode and print the resulting error.

        A 300-example training subset is drawn, the test set is resampled to
        a 0.05 positive ratio, and an EN estimator built on LogisticRegression
        (constructed with debug=True) is passed to run_for_estimator.
        """
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        set_size = 300  # an arbitrary number
        X_train, y_train = self.get_sub_set_with_size(
                [X_train_full, y_train_full], set_size)
        assert len(y_train) == set_size

        train_set = (X_train, y_train)
        test_set_original = (X_test, y_test)
        X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, 0.05, pos_label=1)
        test_set = [X_test_new, y_test_new]

        # The previously constructed CC estimator was never used and has
        # been dropped; only EN is exercised here.
        en = EN(LogisticRegression, debug=True)

        # NOTE(review): this call passes train_set and debug=True, unlike the
        # two-argument run_for_estimator(est, test_set) calls in the sibling
        # tests, and `en` is not fitted in this method -- confirm that
        # run_for_estimator accepts this form and fits the estimator itself.
        err = self.run_for_estimator(en, train_set, test_set, debug=True)
        print("err %s" % (err,))
Esempio n. 4
0
    def _test_avg(self):
        """
        Print the average error of the RA estimator for several split ratios.

        A 200-example training subset and a 10000-example test subset are
        drawn from the data returned by nc_rna_reader.toNumpy(). For each
        split_r produced by numpy.arange(0.1, 1.0, 0.1), an RA estimator
        built on LinearSVC is fitted and the value of compute_avg_error on
        the test subset is printed next to split_r.
        """
        full_train_X, full_train_y, full_test_X, full_test_y = \
            nc_rna_reader.toNumpy()
        n_train = 200

        sub_train_X, sub_train_y = self.get_sub_set_with_size(
                [full_train_X, full_train_y], n_train)
        sub_test_X, sub_test_y = self.get_sub_set_with_size(
                [full_test_X, full_test_y], 10000)

        fit_data = (sub_train_X, sub_train_y)
        eval_data = (sub_test_X, sub_test_y)

        base_clf = LinearSVC

        for frac in numpy.arange(0.1, 1.0, 0.1):
            averager = RA(base_clf, ac_method="ac", subsample_count=200,
                          split_r=frac)
            averager.fit(fit_data)
            avg_err = self.compute_avg_error(averager, eval_data)
            print("%f\t%f" % (frac, avg_err))
Esempio n. 5
0
    def _test_rna_change_training_size(self):
        '''
        Compare CC2, AC2, MS2 and EN2 on the ncRNA dataset while changing the
        training set size.

        The test set is resampled once to a fixed positive ratio (0.8). For
        each training size, fresh estimators built on LogisticRegression are
        fitted on a subset of that size and the value returned by
        run_for_estimator for each of them is printed as one tab-separated
        row.

        The dataset is from
            Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
            Detection of non-coding RNAs on the basis of predicted secondary
            structure formation free energy change.
            BMC Bioinformatics, 7(173), 2006.
        '''
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        test_set_original = [X_test, y_test]
        pos_ratio = 0.8     # arbitrary ratio
        X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, pos_ratio, pos_label=1)
        test_set = [X_test_new, y_test_new]

        print("We compare performance as chaning the training set size.")
        print("Positive class ratio is %f" % pos_ratio)
        # The first column holds the training set size, so it is labelled
        # "size" rather than "ratio".
        print("size\tcc\tac\tms\ten")
        for set_size in [800, 900, 1000, 1500, 2000, 2500, 3000]:
            ests = [
                CC2(LogisticRegression),
                AC2(LogisticRegression),
                MS2(LogisticRegression),
                EN2(LogisticRegression),
            ]

            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]
            # Explicit loop: fitting is a side effect, and a lazy map() (as
            # in Python 3) would never run it.
            for est in ests:
                est.fit(train_set)

            errs = [self.run_for_estimator(est, test_set) for est in ests]
            row_format = "%d" + "\t%.4f" * len(errs)
            print(row_format % ((set_size,) + tuple(errs)))
Esempio n. 6
0
    def run_training_size(self, pos_ratio):
        """
        Compare estimators on the ncRNA dataset while changing the training
        set size, for a given positive class ratio of the test set.

        The test set is resampled once with SetGen.with_pos_ratio. For each
        training size, fresh CC2, AC2, MS2 estimators and four RA estimators
        (ac_method "ac", "cac", "bac", "dac"), all built on
        LogisticRegression, are fitted on a subset of that size; the value
        returned by run_for_estimator for each is printed as one
        tab-separated row.

        The dataset is from
            Andrew V Uzilov, Joshua M Keegan, and David H Mathews.
            Detection of non-coding RNAs on the basis of predicted secondary
            structure formation free energy change.
            BMC Bioinformatics, 7(173), 2006.

        pos_ratio -- positive class ratio passed to SetGen.with_pos_ratio
                     (with pos_label=1) when resampling the test set.
        """
        X_train_full, y_train_full, X_test, y_test = nc_rna_reader.toNumpy()
        test_set_original = [X_test, y_test]
        X_test_new, y_test_new = SetGen.with_pos_ratio(
                test_set_original, pos_ratio, pos_label=1)
        test_set = [X_test_new, y_test_new]

        print("RNA dataset")
        print("We compare performance as chaning the training set size.")
        print("Positive class ratio is %f" % pos_ratio)
        print("size\tcc\tac\tms\tra\trc\trb\trd")

        set_sizes = (numpy.arange(100, 1100, 100).tolist()
                     + [2000, 3000, 4000, 5000, 10000, 20000])
        for set_size in set_sizes:
            # Same construction order as the header columns:
            # cc, ac, ms, ra, rc, rb, rd.
            ests = [
                CC2(LogisticRegression),
                AC2(LogisticRegression),
                MS2(LogisticRegression),
                RA(LogisticRegression, ac_method="ac"),
                RA(LogisticRegression, ac_method="cac"),
                RA(LogisticRegression, ac_method="bac"),
                RA(LogisticRegression, ac_method="dac"),
            ]

            X_train_sub, y_train_sub = self.get_sub_set_with_size(
                    [X_train_full, y_train_full], set_size)
            train_set = [X_train_sub, y_train_sub]
            # Explicit loop: fitting is a side effect, and a lazy map() (as
            # in Python 3) would never run it.
            for est in ests:
                est.fit(train_set)

            errs = [self.run_for_estimator(est, test_set) for est in ests]
            row_format = "%d" + "\t%.4f" * len(errs)
            print(row_format % ((set_size,) + tuple(errs)))
Esempio n. 7
0
 def test_ratio(self):
     """
     Call run_ratio on the data from nc_rna_reader.toNumpy() once for each
     training size produced by numpy.arange(100, 1100, 100), emitting a
     blank line after every run.
     """
     rna_data = nc_rna_reader.toNumpy()
     for n_train in numpy.arange(100, 1100, 100):
         self.run_ratio(rna_data, n_train)
         # Python 2 bare print statement: writes just a newline.
         print