Ejemplo n.º 1
0
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(clf=clf_, #SameSignClassifier(),
                enable_ca=['stats', 'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if not 'non-deterministic' in clf.__tags__:
            self.assertTrue(abs(error-cverror)<0.01,
                    msg="We should get the same error using split classifier as"
                        " using CrossValidation. Got %s and %s"
                        % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                msg="clf should generalize more or less fine. "
                    "Got error %s" % error)
        self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
            msg="Should have 1 confusion per each split")
        self.assertEqual(len(clf.clfs), len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Ejemplo n.º 2
0
    def test_split_clf_on_chainpartitioner(self):
        # pretty much a smoke test for #156
        ds = datasets['uni2small']
        part = ChainNode([
            NFoldPartitioner(cvtype=1),
            Balancer(attr='targets',
                     count=2,
                     limit='partitions',
                     apply_selection=True)
        ])
        partitions = list(part.generate(ds))
        sclf = SplitClassifier(sample_clf_lin,
                               part,
                               enable_ca=['stats', 'splits'])
        sclf.train(ds)
        pred = sclf.predict(ds)
        assert_equal(len(pred), len(ds))  # rudimentary check
        assert_equal(len(sclf.ca.splits), len(partitions))
        assert_equal(len(sclf.clfs), len(partitions))

        # now let's do sensitivity analyzer just in case
        sclf.untrain()
        sensana = sclf.get_sensitivity_analyzer()
        sens = sensana(ds)
        # basic check that sensitivities varied across splits
        from mvpa2.mappers.fx import FxMapper
        sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
        assert_true(np.any(sens_stds != 0))
Ejemplo n.º 3
0
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(
            clf=clf_,  #SameSignClassifier(),
            enable_ca=['stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if not 'non-deterministic' in clf.__tags__:
            self.assertTrue(
                abs(error - cverror) < 0.01,
                msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Ejemplo n.º 4
0
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            enable_ca=['stats', 'training_stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

        self.assertEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                         100,
                         msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples),
                         list(ds.targets),
                         msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Ejemplo n.º 5
0
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(clf=SameSignClassifier(),
                enable_ca=['stats', 'training_stats',
                               'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(error, cverror,
                msg="We should get the same error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (error, cverror))

        self.assertEqual(tr_error, tr_cverror,
                msg="We should get the same training error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix,
                           cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.assertEqual(len(clf.clfs), len(ds.UC),
                             msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                             msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Ejemplo n.º 6
0
 def test_harvesting(self):
     """Basic testing of harvesting based on SplitClassifier
     """
     ds = self.data_bin_1
     clf = SplitClassifier(clf=SameSignClassifier(),
             enable_ca=['stats', 'training_stats'],
             harvest_attribs=['clf.ca.training_time'],
             descr="DESCR")
     clf.train(ds)                   # train the beast
     # Number of harvested items should be equal to number of chunks
     self.assertEqual(
         len(clf.ca.harvested['clf.ca.training_time']), len(ds.UC))
     # if we can blame multiple inheritance and ClassWithCollections.__init__
     self.assertEqual(clf.descr, "DESCR")
Ejemplo n.º 7
0
    def test_split_clf_on_chainpartitioner(self):
        # pretty much a smoke test for #156
        ds = datasets['uni2small']
        part = ChainNode([NFoldPartitioner(cvtype=1),
                          Balancer(attr='targets', count=2,
                                   limit='partitions', apply_selection=True)])
        partitions = list(part.generate(ds))
        sclf = SplitClassifier(sample_clf_lin, part, enable_ca=['stats', 'splits'])
        sclf.train(ds)
        pred = sclf.predict(ds)
        assert_equal(len(pred), len(ds))  # rudimentary check
        assert_equal(len(sclf.ca.splits), len(partitions))
        assert_equal(len(sclf.clfs), len(partitions))

        # now let's do sensitivity analyzer just in case
        sclf.untrain()
        sensana = sclf.get_sensitivity_analyzer()
        sens = sensana(ds)
        # basic check that sensitivities varied across splits
        from mvpa2.mappers.fx import FxMapper
        sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
        assert_true(np.any(sens_stds != 0))
Ejemplo n.º 8
0
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        if not externals.exists('scipy'):
            raise SkipTest
        else:
            from mvpa2.misc.errorfx import corr_error
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
            errorfx=corr_error, enable_ca=['training_stats', 'stats'])
        # check the default
        #self.assertTrue(cve.transerror.errorfx is corr_error)

        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.assertTrue(not np.isnan(corr))
        self.assertTrue(corr == cve.ca.stats.stats['CCe'])

        splitregr = SplitClassifier(
            regr, partitioner=OddEvenPartitioner(),
            enable_ca=['training_stats', 'stats'])
        splitregr.train(ds)
        split_corr = splitregr.ca.stats.stats['CCe']
        split_corr_tr = splitregr.ca.training_stats.stats['CCe']

        for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.assertTrue(stats['CCe'] < 0.5)
                self.assertEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.assertTrue(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
Ejemplo n.º 9
0
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        if not externals.exists('scipy'):
            raise SkipTest
        else:
            from mvpa2.misc.errorfx import corr_error
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
            errorfx=corr_error, enable_ca=['training_stats', 'stats'])
        # check the default
        #self.assertTrue(cve.transerror.errorfx is corr_error)

        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.assertTrue(not np.isnan(corr))
        self.assertTrue(corr == cve.ca.stats.stats['CCe'])

        splitregr = SplitClassifier(
            regr, partitioner=OddEvenPartitioner(),
            enable_ca=['training_stats', 'stats'])
        splitregr.train(ds)
        split_corr = splitregr.ca.stats.stats['CCe']
        split_corr_tr = splitregr.ca.training_stats.stats['CCe']

        for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.assertTrue(stats['CCe'] < 0.5)
                self.assertEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.assertTrue(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)