Esempio n. 1
0
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
                ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
Esempio n. 2
0
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
                ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)
             # update sensitivity at each step (since we're not using the
             # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []
            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e,))
Esempio n. 3
0
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception as e:
            self.fail("Failed with %s" % e)

        if cfg.getboolean('tests', 'labile', default='yes'):
            if nclasses > 2 and \
                   ((clf.descr is not None and 'on 5%(' in clf.descr)
                    or 'regression_based' in clf.__tags__):
                # skip those since they are barely applicable/testable here
                raise SkipTest("Skip testing of cve on %s" % clf)

            self.assertTrue(
                cve < 0.25,  # TODO: use multinom distribution
                msg="Got transfer error %g on %s with %d labels" %
                (cve, ds, len(ds.UT)))
Esempio n. 4
0
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            enable_ca=['stats', 'training_stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

        self.assertEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                         100,
                         msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples),
                         list(ds.targets),
                         msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Esempio n. 5
0
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(
            clf=clf_,  #SameSignClassifier(),
            enable_ca=['stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if not 'non-deterministic' in clf.__tags__:
            self.assertTrue(
                abs(error - cverror) < 0.01,
                msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Esempio n. 6
0
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(clf=clf_, #SameSignClassifier(),
                enable_ca=['stats', 'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if not 'non-deterministic' in clf.__tags__:
            self.assertTrue(abs(error-cverror)<0.01,
                    msg="We should get the same error using split classifier as"
                        " using CrossValidation. Got %s and %s"
                        % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                msg="clf should generalize more or less fine. "
                    "Got error %s" % error)
        self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
            msg="Should have 1 confusion per each split")
        self.assertEqual(len(clf.clfs), len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Esempio n. 7
0
    def test_regression_as_classifier(self, regr):
        """Basic tests of metaclass for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidation(clf, OddEvenPartitioner(),
                    postproc=mean_sample(),
                    enable_ca=['stats', 'training_stats'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.3,
                                msg="Got error %.2f on %s dataset"
                                % (error, dsname))

            # Check if does not puke on repr and str
            self.assertTrue(str(clf) != "")
            self.assertTrue(repr(clf) != "")

            self.assertEqual(clf.ca.distances.shape,
                                 (ds.nsamples / 2, nlabels))
Esempio n. 8
0
    def test_usecase_concordancesl(self):
        import numpy as np
        from mvpa2.base.dataset import vstack
        from mvpa2.mappers.fx import mean_sample

        # Take our sample 3d dataset
        ds1 = datasets['3dsmall'].copy(deep=True)
        ds1.fa['voxel_indices'] = ds1.fa.myspace
        ds1.sa['subject'] = [1]  # not really necessary -- but let's for clarity
        ds1 = mean_sample()(ds1) # so we get just a single representative sample

        def corr12(ds):
            corr = np.corrcoef(ds.samples)
            assert(corr.shape == (2, 2)) # for paranoid ones
            return corr[0, 1]

        for nsc, thr, thr_mean in (
            (0, 1.0, 1.0),
            (0.1, 0.3, 0.8)):   # just a bit of noise
            ds2 = ds1.copy(deep=True)    # make a copy for the 2nd subject
            ds2.sa['subject'] = [2]
            ds2.samples += nsc * np.random.normal(size=ds1.shape)

            # make sure that both have the same voxel indices
            assert(np.all(ds1.fa.voxel_indices == ds2.fa.voxel_indices))
            ds_both = vstack((ds1, ds2))# join 2 images into a single dataset
                                        # with .sa.subject distinguishing both

            sl = sphere_searchlight(corr12, radius=2)
            slmap = sl(ds_both)
            ok_(np.all(slmap.samples >= thr))
            ok_(np.mean(slmap.samples) >= thr)
Esempio n. 9
0
    def test_usecase_concordancesl(self):
        import numpy as np
        from mvpa2.base.dataset import vstack
        from mvpa2.mappers.fx import mean_sample

        # Take our sample 3d dataset
        ds1 = datasets['3dsmall'].copy(deep=True)
        ds1.fa['voxel_indices'] = ds1.fa.myspace
        ds1.sa['subject'] = [1
                             ]  # not really necessary -- but let's for clarity
        ds1 = mean_sample()(
            ds1)  # so we get just a single representative sample

        def corr12(ds):
            corr = np.corrcoef(ds.samples)
            assert (corr.shape == (2, 2))  # for paranoid ones
            return corr[0, 1]

        for nsc, thr, thr_mean in ((0, 1.0, 1.0),
                                   (0.1, 0.3, 0.8)):  # just a bit of noise
            ds2 = ds1.copy(deep=True)  # make a copy for the 2nd subject
            ds2.sa['subject'] = [2]
            ds2.samples += nsc * np.random.normal(size=ds1.shape)

            # make sure that both have the same voxel indices
            assert (np.all(ds1.fa.voxel_indices == ds2.fa.voxel_indices))
            ds_both = vstack((ds1, ds2))  # join 2 images into a single dataset
            # with .sa.subject distinguishing both

            sl = sphere_searchlight(corr12, radius=2)
            slmap = sl(ds_both)
            ok_(np.all(slmap.samples >= thr))
            ok_(np.mean(slmap.samples) >= thr)
Esempio n. 10
0
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)
             # update sensitivity at each step (since we're not using the
             # same CLF as sensitivity analyzer)

        cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['confusion'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception, e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e,))
Esempio n. 11
0
    def test_regression_as_classifier(self, regr):
        """Basic tests of metaclass for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidation(clf,
                                 OddEvenPartitioner(),
                                 postproc=mean_sample(),
                                 enable_ca=['stats', 'training_stats'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.3,
                                msg="Got error %.2f on %s dataset" %
                                (error, dsname))

            # Check if does not puke on repr and str
            self.assertTrue(str(clf) != "")
            self.assertTrue(repr(clf) != "")

            self.assertEqual(clf.ca.distances.shape,
                             (ds.nsamples / 2, nlabels))
Esempio n. 12
0
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets',
                                         count=num_perm,
                                         limit='chunks')
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferMeasure(l_clf,
                               Repeater(count=2),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'),
                               null_dist=MCNullDist(permutator, tail='left'))

        # check reasonable error range
        err = terr(train)
        self.assertTrue(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf,
                               OddEvenPartitioner(),
                               null_dist=MCNullDist(permutator,
                                                    tail='left',
                                                    enable_ca=['dist_samples'
                                                               ]),
                               postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.assertTrue(
                np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)
Esempio n. 13
0
def test_function_ptrs(fname):
    skip_if_no_external('nibabel')
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    h5save(fname, ds)
    ds_loaded = h5load(fname)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh), ds.samples)
Esempio n. 14
0
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets', count=num_perm,
                                         limit='chunks')
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferMeasure(
            l_clf,
            Repeater(count=2),
            postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
            null_dist=MCNullDist(permutator,
                                 tail='left'))

        # check reasonable error range
        err = terr(train)
        self.assertTrue(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf, OddEvenPartitioner(),
            null_dist=MCNullDist(permutator,
                                 tail='left',
                                 enable_ca=['dist_samples']),
            postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                    "(got %f) since we know that the data has signal"
                    % null_prob)

            self.assertTrue(np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                    "(got p(cvte)=%f) since we know that the data has signal"
                    % np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                             num_perm - cvte.null_dist.ca.skipped)
Esempio n. 15
0
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(clf=SameSignClassifier(),
                enable_ca=['stats', 'training_stats',
                               'feature_ids'])
        clf.train(ds)                   # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(error, cverror,
                msg="We should get the same error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (error, cverror))

        self.assertEqual(tr_error, tr_cverror,
                msg="We should get the same training error using split classifier as"
                    " using CrossValidation. Got %s and %s"
                    % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                             100,
                             msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix,
                           cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                             len(ds.UC),
                             msg="Should have 1 confusion per each split")
        self.assertEqual(len(clf.clfs), len(ds.UC),
                             msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                             msg="Should classify correctly")

        # feature_ids must be list of lists, and since it is not
        # feature-selecting classifier used - we expect all features
        # to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Esempio n. 16
0
def test_function_ptrs(fname):
    skip_if_no_external('nibabel')
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    h5save(fname, ds)
    ds_loaded = h5load(fname)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh),
                       ds.samples)
Esempio n. 17
0
def test_function_ptrs():
    if not externals.exists('nibabel'):
        raise SkipTest
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    f = tempfile.NamedTemporaryFile()
    h5save(f.name, ds)
    ds_loaded = h5load(f.name)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh), ds.samples)
Esempio n. 18
0
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
Esempio n. 19
0
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception, e:
            self.fail("Failed with %s" % e)
Esempio n. 20
0
    def test_confusionmatrix_nulldist(self):
        from mvpa2.clfs.gnb import GNB

        class ConfusionMatrixError(object):
            """Custom error "function"
            """
            def __init__(self, labels=None):
                self.labels = labels
            def __call__(self, predictions, targets):
                cm = ConfusionMatrix(labels=list(self.labels),
                                     targets=targets, predictions=predictions)
                #print cm.matrix
                # We have to add a degenerate leading dimension
                # so we could separate them into separate 'samples'
                return cm.matrix[None, :]

        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [0., 2.,]:
            ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                        nonbogus_features=[0,1], nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf, NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(permutator,
                                     tail='right', # because we now look at accuracy not error
                                     enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            #print "Result:\n", cmatrix.samples
            cvnp = cv.ca.null_prob.samples
            #print cvnp
            self.assertTrue(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(0.9,
                                      cvnp[(np.array([0,1]), np.array([1,0]))])
Esempio n. 21
0
def test_function_ptrs():
    if not externals.exists('nibabel'):
        raise SkipTest
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    f = tempfile.NamedTemporaryFile()
    h5save(f.name, ds)
    ds_loaded = h5load(f.name)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh),
                        ds.samples)
Esempio n. 22
0
    def test_james_problem(self):
        percent = 80
        dataset = datasets['uni2small']
        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['confusion'])  # TODO -- it is stats
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))

        assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
        assert (len(cv_storage.storage[0]) == 2)
        assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

        self.assertTrue(error < 0.2)
Esempio n. 23
0
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assesment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm,
                                   NFoldPartitioner(),
                                   postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  fselector=\
                    # go for lower tail selection as data_measure will return
                    # errors -> low is good


                    FixedNElementTailSelector(1, tail='lower', mode='select'),
                  )
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertTrue(ds.nfeatures == orig_nfeatures)

        # check that the features set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertTrue(resds.nfeatures == e.argmin() + 1)

        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
Esempio n. 24
0
    def test_confusionmatrix_nulldist(self):
        from mvpa2.clfs.gnb import GNB
        from mvpa2.clfs.transerror import ConfusionMatrixError
        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [
                0.,
                2.,
        ]:
            ds = normal_feature_dataset(snr=snr,
                                        perlabel=42,
                                        nchunks=3,
                                        nonbogus_features=[0, 1],
                                        nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf,
                NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(
                    permutator,
                    tail='right',  # because we now look at accuracy not error
                    enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            #print "Result:\n", cmatrix.samples
            cvnp = cv.ca.null_prob.samples
            #print cvnp
            self.assertTrue(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(
                        0.9, cvnp[(np.array([0, 1]), np.array([1, 0]))])
Esempio n. 25
0
    def test_ifs(self, svm):

        # measure for feature selection criterion and performance assesment
        # use the SAME clf!
        errorfx = mean_mismatch_error
        fmeasure = CrossValidation(svm, NFoldPartitioner(), postproc=mean_sample())
        pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

        ifs = IFS(fmeasure,
                  pmeasure,
                  Splitter('purpose', attr_values=['train', 'test']),
                  fselector=
                    # go for lower tail selection as data_measure will return
                    # errors -> low is good
                    FixedNElementTailSelector(1, tail='lower', mode='select'),
                  )
        wdata = self.get_data()
        wdata.sa['purpose'] = np.repeat('train', len(wdata))
        tdata = self.get_data()
        tdata.sa['purpose'] = np.repeat('test', len(tdata))
        ds = vstack((wdata, tdata))
        orig_nfeatures = ds.nfeatures

        ifs.train(ds)
        resds = ifs(ds)

        # fail if orig datasets are changed
        self.assertTrue(ds.nfeatures == orig_nfeatures)

        # check that the features set with the least error is selected
        self.assertTrue(len(ifs.ca.errors))
        e = np.array(ifs.ca.errors)
        self.assertTrue(resds.nfeatures == e.argmin() + 1)


        # repeat with dataset where selection order is known
        wsignal = datasets['dumb2'].copy()
        wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
        tsignal = datasets['dumb2'].copy()
        tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
        signal = vstack((wsignal, tsignal))
        ifs.train(signal)
        resds = ifs(signal)
        self.assertTrue((resds.samples[:,0] == signal.samples[:,0]).all())
Esempio n. 26
0
    def test_adhocsearchlight_perm_testing(self):
        # just a smoke test pretty much
        ds = datasets['3dmedium'].copy()
        #ds.samples += np.random.normal(size=ds.samples.shape)*10
        mvpa2.seed()
        ds.fa['voxel_indices'] = ds.fa.myspace
        from mvpa2.mappers.fx import mean_sample
        from mvpa2.clfs.stats import MCNullDist
        permutator = AttributePermutator('targets', count=8,
                                         limit='chunks')
        distr_est = MCNullDist(permutator, tail='left',
                               enable_ca=['dist_samples'])
        slargs = (kNN(1),
                  NFoldPartitioner(0.5,
                                   selection_strategy='random',
                                   count=9))
        slkwargs = dict(radius=1, postproc=mean_sample())

        sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
        skip_if_no_external('scipy')    # needed for null_t
        sl = sphere_m1nnsearchlight(
            *slargs,
            null_dist=distr_est,
            enable_ca=['null_t'],
            reuse_neighbors=True,
            **slkwargs
            )
        mvpa2.seed()
        res_nodistr = sl_nodistr(ds)
        mvpa2.seed()
        res = sl(ds)
        # verify that we at least got the same main result
        # ah (yoh) -- null dist is estimated before the main
        # estimate so we can't guarantee correspondence :-/
        # assert_array_equal(res_nodistr, res)
        # only resemblance (TODO, may be we want to get/setstate
        # for rng before null_dist.fit?)

        # and dimensions correspond
        assert_array_equal(distr_est.ca.dist_samples.shape,
                           (1, ds.nfeatures, 8))
        assert_array_equal(sl.ca.null_t.samples.shape,
                           (1, ds.nfeatures))
Esempio n. 27
0
    def test_adhocsearchlight_perm_testing(self):
        # just a smoke test pretty much
        ds = datasets['3dmedium'].copy()
        #ds.samples += np.random.normal(size=ds.samples.shape)*10
        mvpa2.seed()
        ds.fa['voxel_indices'] = ds.fa.myspace
        from mvpa2.mappers.fx import mean_sample
        from mvpa2.clfs.stats import MCNullDist
        permutator = AttributePermutator('targets', count=8, limit='chunks')
        distr_est = MCNullDist(permutator,
                               tail='left',
                               enable_ca=['dist_samples'])
        slargs = (kNN(1),
                  NFoldPartitioner(0.5, selection_strategy='random', count=9))
        slkwargs = dict(radius=1, postproc=mean_sample())

        sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
        skip_if_no_external('scipy')  # needed for null_t
        sl = sphere_m1nnsearchlight(*slargs,
                                    null_dist=distr_est,
                                    enable_ca=['null_t'],
                                    reuse_neighbors=True,
                                    **slkwargs)
        mvpa2.seed()
        res_nodistr = sl_nodistr(ds)
        mvpa2.seed()
        res = sl(ds)
        # verify that we at least got the same main result
        # ah (yoh) -- null dist is estimated before the main
        # estimate so we can't guarantee correspondence :-/
        # assert_array_equal(res_nodistr, res)
        # only resemblance (TODO, may be we want to get/setstate
        # for rng before null_dist.fit?)

        # and dimensions correspond
        assert_array_equal(distr_est.ca.dist_samples.shape,
                           (1, ds.nfeatures, 8))
        assert_array_equal(sl.ca.null_t.samples.shape, (1, ds.nfeatures))
Esempio n. 28
0
    def test_confusionmatrix_nulldist(self):
        from mvpa2.clfs.gnb import GNB
        from mvpa2.clfs.transerror import ConfusionMatrixError
        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [0., 2.,]:
            ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                        nonbogus_features=[0,1], nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf, NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(permutator,
                                     tail='right', # because we now look at accuracy not error
                                     enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            #print "Result:\n", cmatrix.samples
            cvnp = cv.ca.null_prob.samples
            #print cvnp
            self.assertTrue(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(0.9,
                                      cvnp[(np.array([0,1]), np.array([1,0]))])
Esempio n. 29
0
def do_searchlight(glm_dataset, radius, output_basename, with_null_prob=False):
    clf = LinearCSVMC(space='condition')
    #		clf = RbfCSVMC(C=5.0)
    splt = NFoldPartitioner()
    cv = CrossValidation(clf,
                         splt,
                         errorfx=mean_match_accuracy,
                         enable_ca=['stats'],
                         postproc=mean_sample())
    distr_est = []
    if with_null_prob:
        permutator = AttributePermutator('condition',
                                         count=100,
                                         limit='chunks')
        distr_est = MCNullDist(permutator,
                               tail='left',
                               enable_ca=['dist_samples'])
        """
		repeater   = Repeater(count=100)
		permutator = AttributePermutator('condition', limit={'partitions': 1}, count=1) 
		null_cv = CrossValidation(clf, ChainNode([splt, permutator],space=splt.get_space()),
					  postproc=mean_sample())
		null_sl = sphere_searchlight(null_cv, radius=radius, space='voxel_indices',
					     enable_ca=['roi_sizes'])
		distr_est = MCNullDist(repeater,tail='left', measure=null_sl,
				       enable_ca=['dist_samples'])
		"""
        sl = sphere_searchlight(cv,
                                radius=radius,
                                space='voxel_indices',
                                null_dist=distr_est,
                                enable_ca=['roi_sizes', 'roi_feature_ids'])
    else:

        sl = sphere_searchlight(cv,
                                radius=radius,
                                space='voxel_indices',
                                enable_ca=['roi_sizes', 'roi_feature_ids'])
    #ds = glm_dataset.copy(deep=False,
    #		       sa=['condition','chunks'],
    #		       fa=['voxel_indices'],
    #		       a=['mapper'])
    #debug.active += ["SLC"]
    sl_map = sl(glm_dataset)
    errresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    errresults.to_filename('{}-acc.nii.gz'.format(output_basename))
    sl_map.samples *= -1
    sl_map.samples += 1
    niftiresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    niftiresults.to_filename('{}-err.nii.gz'.format(output_basename))
    #TODO: save p value map
    if with_null_prob:
        nullt_results = map2nifti(sl_map,
                                  data=sl.ca.null_t,
                                  imghdr=glm_dataset.a.imghdr)
        nullt_results.to_filename('{}-t.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map,
                                     data=sl.ca.null_prob,
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-prob.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map,
                                     data=distr_est.cdf(sl_map.samples),
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-cdf.nii.gz'.format(output_basename))
Esempio n. 30
0
    def _call(self, ds):
        if len(ds) > 1:
            # average all samples into one, assuming we got something like one
            # sample per subject as input
            avgr = mean_sample()
            ds = avgr(ds)
        # threshold input; at this point we only have one sample left
        thrd = ds.samples[0] > self._thrmap
        # mapper default
        mapper = IdentityMapper()
        # overwrite if possible
        if hasattr(ds, 'a') and 'mapper' in ds.a:
            mapper = ds.a.mapper
        # reverse-map input
        osamp = mapper.reverse1(thrd)
        # prep output dataset
        outds = ds.copy(deep=False)
        outds.fa['featurewise_thresh'] = self._thrmap
        # determine clusters
        labels, num = measurements.label(osamp)
        area = measurements.sum(osamp,
                                labels,
                                index=np.arange(1, num + 1)).astype(int)
        # for the rest we need the labels flattened
        labels = mapper.forward1(labels)
        # relabel clusters starting with the biggest and increase index with
        # decreasing size
        ordered_labels = np.zeros(labels.shape, dtype=int)
        ordered_area = np.zeros(area.shape, dtype=int)
        for i, idx in enumerate(np.argsort(area)):
            ordered_labels[labels == idx + 1] = num - i
            ordered_area[i] = area[idx]
        area = ordered_area[::-1]
        labels = ordered_labels
        del ordered_labels  # this one can be big
        # store cluster labels after forward-mapping
        outds.fa['clusters_featurewise_thresh'] = labels.copy()
        # update cluster size histogram with the actual result to get a
        # proper lower bound for p-values
        # this will make a copy, because the original matrix is int
        cluster_probs_raw = _transform_to_pvals(
            area, self._null_cluster_sizes.astype('float'))

        if self.params.multicomp_correction is None:
            probs_corr = np.array(cluster_probs_raw)
            rej = probs_corr <= self.params.fwe_rate
            outds.a['clusterstats'] = \
                np.rec.fromarrays(
                    [area, cluster_probs_raw], names=('size', 'prob_raw'))
        else:
            # do a local import as only this tiny portion needs statsmodels
            import statsmodels.stats.multitest as smm
            rej, probs_corr = smm.multipletests(
                cluster_probs_raw,
                alpha=self.params.fwe_rate,
                method=self.params.multicomp_correction)[:2]
            # store corrected per-cluster probabilities
            outds.a['clusterstats'] = \
                np.rec.fromarrays(
                    [area, cluster_probs_raw, probs_corr],
                    names=('size', 'prob_raw', 'prob_corrected'))
            # remove cluster labels that did not pass the FWE threshold
            for i, r in enumerate(rej):
                if not r:
                    labels[labels == i + 1] = 0
            outds.fa['clusters_fwe_thresh'] = labels
        return outds
Esempio n. 31
0
def get_crossvalidation_instance(learner,
                                 partitioner,
                                 errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount,
                            attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(learner_space,
                                         limit={partitioner.get_space(): 1},
                                         count=1)
        # CV with null-distribution estimation that permutes the training data for
        # each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner,
                                  perm_gen_node,
                                  postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater,
                               tail=prob_tail,
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner,
                         gennode,
                         errorfx=errorfx,
                         null_dist=distr_est,
                         postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv
Esempio n. 32
0
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    Repeater(
                        2
                    ),  # give the same full dataset to sens_ana and cvmeasure
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    Repeater(
                        2),  #  we will use the same full cv-training dataset
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    train_pmeasure=
                    False,  # we just extract it from existing confusion
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
def get_linear_svm_measure():
    clf = LinearCSVMC(space="condition")
    splt = ChainNode(
        [NFoldPartitioner(), Balancer(attr="condition", count=1, limit="partitions", apply_selection=True)],
        space="partitions",
    )
    # splt = NFoldPartitioner()
    cv = CrossValidation(clf, splt, errorfx=mean_match_accuracy, enable_ca=["stats"], postproc=mean_sample())
    return cv
Esempio n. 34
0
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        if not externals.exists('scipy'):
            raise SkipTest
        else:
            from mvpa2.misc.errorfx import corr_error
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
            errorfx=corr_error, enable_ca=['training_stats', 'stats'])
        # check the default
        #self.assertTrue(cve.transerror.errorfx is corr_error)

        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.assertTrue(not np.isnan(corr))
        self.assertTrue(corr == cve.ca.stats.stats['CCe'])

        splitregr = SplitClassifier(
            regr, partitioner=OddEvenPartitioner(),
            enable_ca=['training_stats', 'stats'])
        splitregr.train(ds)
        split_corr = splitregr.ca.stats.stats['CCe']
        split_corr_tr = splitregr.ca.training_stats.stats['CCe']

        for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.assertTrue(stats['CCe'] < 0.5)
                self.assertEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.assertTrue(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
Esempio n. 35
0
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4medium']
        # make it simple of the beast -- take only informative ones
        # because classifiers for the tree are selected randomly, so
        # performance varies a lot and we just need to check on
        # correct operation
        ds = ds[:, ds.fa.nonbogus_targets != [None]]

        clfs = clfswh['binary']  # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers but exclude those operating on %s of
        # features since we might not have enough for that
        clfs = [
            clfs[i] for i in np.random.permutation(len(clfs))
            if not '%' in str(clfs[i])
        ]

        # NB: It is necessary that the same classifier was not used at
        # different nodes, since it would be re-trained for a new set
        # of targets, thus leading to incorrect behavior/high error.
        #
        # Clone only those few leading ones which we will use
        # throughout the test
        clfs = [clf.clone() for clf in clfs[:4]]

        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2': (('L0', 'L2'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.assertRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5': (('L0', 'L5'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })
        self.assertRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1': (('L0', 'L1'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])
        })

        # Lets test train/test cycle using CVTE
        cv = CrossValidation(tclf,
                             OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.assertTrue(tclf.clfs['L0+1'] is clfs[1])
        self.assertTrue(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_stats
        cvtc = cv.ca.stats

        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.assertTrue(cvtrc != cvtc)
            self.assertTrue(cverror < 0.3,
                            msg="Got too high error = %s using %s" %
                            (cverror, tclf))

        # Test trailing nodes with no classifier

        # That is why we use separate pool of classifiers here
        # (that is probably old/not-needed since switched to use clones)
        clfs_mc = clfswh['multiclass']  # pool of classifiers
        clfs_mc = [
            clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
            if not '%' in str(clfs_mc[i])
        ]
        clfs_mc = [clf.clone() for clf in clfs_mc[:4]]  # and clones again

        tclf = TreeClassifier(clfs_mc[0], {
            'L0': (('L0', ), None),
            'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])
        })

        cv = CrossValidation(tclf,
                             OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = np.asscalar(cv(ds))
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(cverror < 0.3,
                            msg="Got too high error = %s using %s" %
                            (cverror, tclf))
Esempio n. 36
0
def test_gnbsearchlight_permutations():
    import mvpa2
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.base import  Repeater
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    #import mvpa2.generators.permutation
    #reload(mvpa2.generators.permutation)
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.testing.datasets import datasets
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.testing.tools import assert_raises, ok_, assert_array_less

    # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM']
    # mvpa2.debug.metrics += ['pid']
    count = 10
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    slkwargs = dict(radius=3, space='voxel_indices',  enable_ca=['roi_sizes'],
                    center_ids=[1, 10, 70, 100])

    mvpa2.seed(mvpa2._random_seed)
    clf  = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')

    repeater   = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1)

    null_sl = sphere_gnbsearchlight(clf, ChainNode([splt, permutator], space=splt.get_space()),
                                    postproc=mean_sample(), errorfx=mean_mismatch_error,
                                    **slkwargs)

    distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                           enable_ca=['dist_samples'])
    sl = sphere_gnbsearchlight(clf, splt,
                               reuse_neighbors=True,
                               null_dist=distr_est, postproc=mean_sample(),
                               errorfx=mean_mismatch_error,
                               **slkwargs)
    if __debug__:                         # assert is done only without -O mode
        assert_raises(NotImplementedError, sl, ds)

    # "ad-hoc searchlights can't handle yet varying targets across partitions"
    if False:
        # after above limitation is removed -- enable
        sl_map = sl(ds)
        sl_null_prob = sl.ca.null_prob.samples.copy()

    mvpa2.seed(mvpa2._random_seed)
    ### 'normal' Searchlight
    clf  = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')
    repeater   = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1)
    # rng=np.random.RandomState(0)) # to trigger failure since the same np.random state
    # would be reused across all pprocesses
    null_cv = CrossValidation(clf, ChainNode([splt, permutator], space=splt.get_space()),
                              postproc=mean_sample())
    null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs)
    distr_est_normal = MCNullDist(repeater, tail='left', measure=null_sl_normal,
                           enable_ca=['dist_samples'])

    cv = CrossValidation(clf, splt, errorfx=mean_mismatch_error,
                         enable_ca=['stats'], postproc=mean_sample() )
    sl = sphere_searchlight(cv, nproc=nproc, null_dist=distr_est_normal, **slkwargs)
    sl_map_normal = sl(ds)
    sl_null_prob_normal = sl.ca.null_prob.samples.copy()

    # For every feature -- we should get some variance in estimates In
    # case of failure they are all really close to each other (up to
    # numerical precision), so variance will be close to 0
    assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0],
                              axis=1), -1e-5)
    for s in distr_est_normal.ca.dist_samples.samples[0]:
        ok_(len(np.unique(s)) > 1)
Esempio n. 37
0
    def test_regressions(self, regr):
        """Simple tests on regressions
        """
        if not externals.exists('scipy'):
            raise SkipTest
        else:
            from mvpa2.misc.errorfx import corr_error
        ds = datasets['chirp_linear']
        # we want numeric labels to maintain the previous behavior, especially
        # since we deal with regressions here
        ds.sa.targets = AttributeMap().to_numeric(ds.targets)

        cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
            errorfx=corr_error, enable_ca=['training_stats', 'stats'])
        # check the default
        #self.assertTrue(cve.transerror.errorfx is corr_error)

        corr = np.asscalar(cve(ds).samples)

        # Our CorrErrorFx should never return NaN
        self.assertTrue(not np.isnan(corr))
        self.assertTrue(corr == cve.ca.stats.stats['CCe'])

        splitregr = SplitClassifier(
            regr, partitioner=OddEvenPartitioner(),
            enable_ca=['training_stats', 'stats'])
        splitregr.train(ds)
        split_corr = splitregr.ca.stats.stats['CCe']
        split_corr_tr = splitregr.ca.training_stats.stats['CCe']

        for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
            #TODO: test confusion statistics
            # Part of it for now -- CCe
            for conf in confusion.summaries:
                stats = conf.stats
                if cfg.getboolean('tests', 'labile', default='yes'):
                    self.assertTrue(stats['CCe'] < 0.5)
                self.assertEqual(stats['CCe'], stats['Summary CCe'])

            s0 = confusion.as_string(short=True)
            s1 = confusion.as_string(short=False)

            for s in [s0, s1]:
                self.assertTrue(len(s) > 10,
                                msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a simple "
                            "dataset. Got correlation error of %s " % error)

            # Test access to summary statistics
            # YOH: lets start making testing more reliable.
            #      p-value for such accident to have is verrrry tiny,
            #      so if regression works -- it better has at least 0.5 ;)
            #      otherwise fix it! ;)
            # YOH: not now -- issues with libsvr in SG and linear kernel
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(confusion.stats['CCe'] < 0.5)

        # just to check if it works fine
        split_predictions = splitregr.predict(ds.samples)
def do_searchlight(glm_dataset, radius, output_basename, with_null_prob=False, clf=LinearCSVMC(space='condition')):
    if(len(glob(output_basename+"*")) > 0):
        print "sl already ran"
        return
    splt = ChainNode([NFoldPartitioner(),Balancer(attr='condition',count=1,limit='partitions',apply_selection=True)],space='partitions')
    #splt = NFoldPartitioner()
    cv = CrossValidation(clf, splt,
                         errorfx=mean_match_accuracy,
                         enable_ca=['stats'], postproc=mean_sample())
    distr_est = []
    if with_null_prob:
        permutator = AttributePermutator('condition', count=100,
                                         limit='chunks')
        distr_est = MCNullDist(permutator, tail='left',
                               enable_ca=['dist_samples'])
        """
        repeater   = Repeater(count=100)
        permutator = AttributePermutator('condition', limit={'partitions': 1}, count=1)
        null_cv = CrossValidation(clf, ChainNode([splt, permutator],space=splt.get_space()),
                      postproc=mean_sample())
        null_sl = sphere_searchlight(null_cv, radius=radius, space='voxel_indices',
                         enable_ca=['roi_sizes'])
        distr_est = MCNullDist(repeater,tail='left', measure=null_sl,
                       enable_ca=['dist_samples'])

        sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
                                null_dist=distr_est,
                                enable_ca=['roi_sizes', 'roi_feature_ids']
                                # ,result_fx = _fill_in_scattered_results # average across all spheres
                                )
        """
    else:
        kwa = {'voxel_indices': KNNNeighbourhood(radius, glm_dataset.fa['voxel_indices'])}
        qe = IndexQueryEngine(**kwa)
        # init the searchlight with the queryengine
        sl = Searchlight(cv, queryengine=qe, roi_ids=None,
                       enable_ca=['roi_sizes', 'roi_feature_ids']
                       # ,results_fx = _fill_in_scattered_results # average across all spheres
                          )
       #;v sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',

                                # ,result_fx = _fill_in_scattered_results # average across all spheres
                               # )
    # ds = glm_dataset.copy(deep=False,
    #		       sa=['condition','chunks'],
    #		       fa=['voxel_indices'],
    #		       a=['mapper'])
    from datetime import datetime
    print "starting sl {}".format(datetime.now())
    sl_map = sl(glm_dataset)
    print "finished sl {}".format(datetime.now())
    import pickle
    pickle.dump(sl_map, open("{}_sl_map.p".format(output_basename), "wb"))
#    pickle.dump(sl.ca.roi_feature_ids, open("{}_sl_feature_ids.p".format(output_basename), "wb"))
#    print len(sl.ca.roi_feature_ids[0])
    acc_results = map2nifti(sl_map,
                           imghdr=glm_dataset.a.imghdr)
    acc_nii_filename = '{}-acc.nii.gz'.format(output_basename)
    acc_results.to_filename(acc_nii_filename)
    sl_map.samples *= -1
    sl_map.samples += 1
    niftiresults = map2nifti(sl_map,
                             imghdr=glm_dataset.a.imghdr)
    niftiresults.to_filename('{}-err.nii.gz'.format(output_basename))
    # TODO: check p value map
    if with_null_prob:
        nullt_results = map2nifti(sl_map, data=sl.ca.null_t,
                                  imghdr=glm_dataset.a.imghdr)
        nullt_results.to_filename('{}-t.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=sl.ca.null_prob,
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-prob.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=distr_est.cdf(sl_map.samples),
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-cdf.nii.gz'.format(output_basename))
    return sl_map
Esempio n. 39
0
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'))
        cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                pmeasure,
                Splitter('train'),
                fselector=FixedNElementTailSelector(1),
                train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(
                     0.70,
                     mode='select', tail='upper'),
                train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([ FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),             #  we will use the same full cv-training dataset
                 fselector=FractionTailSelector(
                     0.50,
                     mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,    # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less( imin, len(e) )
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue( 1 < imin < len(e) - 1 )
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue( (nfeatures[::-1] == rfe.ca.nfeatures).all() )

            # check if history has elements for every step
            self.assertTrue(set(rfe.ca.history)
                            == set(range(len(np.array(rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1]
                            == len(np.where(rfe.ca.history
                                           ==max(rfe.ca.history))[0]))
Esempio n. 40
0
def test_multiclass_ties(clf):
    if 'lars' in clf.__tags__:
        raise SkipTest("Known to crash while running this test")
    ds = _dsties1

    # reassign data between ties, so we know that decision is data, not order driven
    ds_ = ds.copy(deep=True)
    ds_.samples[ds.a.ties_idx[1]] = ds.samples[ds.a.ties_idx[0]]
    ds_.samples[ds.a.ties_idx[0]] = ds.samples[ds.a.ties_idx[1]]
    ok_(np.any(ds_.samples != ds.samples))

    clf_ = clf.clone()
    clf = clf.clone()
    clf.ca.enable(['estimates', 'predictions'])
    clf_.ca.enable(['estimates', 'predictions'])
    te = TransferMeasure(clf,
                         Splitter('train'),
                         postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
                         enable_ca=['stats'])
    te_ = TransferMeasure(clf_,
                          Splitter('train'),
                          postproc=BinaryFxNode(mean_mismatch_error,
                                                'targets'),
                          enable_ca=['stats'])

    te = CrossValidation(clf,
                         NFoldPartitioner(),
                         postproc=mean_sample(),
                         enable_ca=['stats'])
    te_ = CrossValidation(clf_,
                          NFoldPartitioner(),
                          postproc=mean_sample(),
                          enable_ca=['stats'])

    error = te(ds)
    matrix = te.ca.stats.matrix

    # if ties were broken randomly we should have got nearly the same
    # number of hits for tied targets
    ties_indices = [te.ca.stats.labels.index(c) for c in ds.a.ties]
    hits = np.diag(te.ca.stats.matrix)[ties_indices]

    # First check is to see if we swap data between tied labels we
    # are getting the same results if we permute labels accordingly,
    # i.e. that tie resolution is not dependent on the labels order
    # but rather on the data
    te_(ds_)
    matrix_swapped = te_.ca.stats.matrix

    if False:  #0 in hits:
        print clf, matrix, matrix_swapped
        print clf.ca.estimates[:, 2] - clf.ca.estimates[:, 0]
        #print clf.ca.estimates

    # TODO: for now disabled all the non-compliant ones to pass the
    #       tests. For visibility decided to skip them instead of just
    #       exclusion and skipping only here to possibly catch crashes
    #       which might happen before
    if len(
            set(('libsvm', 'sg', 'skl', 'gpr',
                 'blr')).intersection(clf.__tags__)):
        raise SkipTest("Skipped %s because it is known to fail")

    ok_(not (np.array_equal(matrix, matrix_swapped) and 0 in hits))

    # this check is valid only if ties are not broken randomly
    # like it is the case with SMLR
    if not ('random_tie_breaking' in clf.__tags__
            or  # since __tags__ would not go that high up e.g. in
            # <knn on SMLR non-0>
            'SMLR' in str(clf)):
        assert_array_equal(hits, np.diag(matrix_swapped)[ties_indices[::-1]])

    # Second check is to just see if we didn't get an obvious bias and
    # got 0 in one of the hits, although it is labile
    if cfg.getboolean('tests', 'labile', default='yes'):
        ok_(not 0 in hits)
    def _call(self, ds):
        if len(ds) > 1:
            # average all samples into one, assuming we got something like one
            # sample per subject as input
            avgr = mean_sample()
            ds = avgr(ds)
        # threshold input; at this point we only have one sample left
        thrd = ds.samples[0] > self._thrmap
        # mapper default
        mapper = IdentityMapper()
        # overwrite if possible
        if hasattr(ds, 'a') and 'mapper' in ds.a:
            mapper = ds.a.mapper
        # reverse-map input
        othrd = _verified_reverse1(mapper, thrd)
        # TODO: what is your purpose in life osamp? ;-)
        osamp = _verified_reverse1(mapper, ds.samples[0])
        # prep output dataset
        outds = ds.copy(deep=False)
        outds.fa['featurewise_thresh'] = self._thrmap
        # determine clusters
        labels, num = measurements.label(othrd,structure=np.ones([3,3,3]))
        area = measurements.sum(othrd,
                                labels,
                                index=np.arange(1, num + 1)).astype(int)
        com = measurements.center_of_mass(
            osamp, labels=labels, index=np.arange(1, num + 1))
        maxpos = measurements.maximum_position(
            osamp, labels=labels, index=np.arange(1, num + 1))
        # for the rest we need the labels flattened
        labels = mapper.forward1(labels)
        # relabel clusters starting with the biggest and increase index with
        # decreasing size
        ordered_labels = np.zeros(labels.shape, dtype=int)
        ordered_area = np.zeros(area.shape, dtype=int)
        ordered_com = np.zeros((num, len(osamp.shape)), dtype=float)
        ordered_maxpos = np.zeros((num, len(osamp.shape)), dtype=float)
        for i, idx in enumerate(np.argsort(area)):
            ordered_labels[labels == idx + 1] = num - i
            # kinda ugly, but we are looping anyway
            ordered_area[i] = area[idx]
            ordered_com[i] = com[idx]
            ordered_maxpos[i] = maxpos[idx]
        labels = ordered_labels
        area = ordered_area[::-1]
        com = ordered_com[::-1]
        maxpos = ordered_maxpos[::-1]
        del ordered_labels  # this one can be big
        # store cluster labels after forward-mapping
        outds.fa['clusters_featurewise_thresh'] = labels.copy()
        # location info
        outds.a['clusterlocations'] = \
            np.rec.fromarrays(
                [com, maxpos], names=('center_of_mass', 'max'))

        # update cluster size histogram with the actual result to get a
        # proper lower bound for p-values
        # this will make a copy, because the original matrix is int
        cluster_probs_raw = _transform_to_pvals(
            area, self._null_cluster_sizes.astype('float'))

        clusterstats = (
            [area, cluster_probs_raw],
            ['size', 'prob_raw']
        )
        # evaluate a bunch of stats for all clusters
        morestats = {}
        for cid in xrange(len(area)):
            # keep clusters on outer loop, because selection is more expensive
            clvals = ds.samples[0, labels == cid + 1]
            for id_, fx in (
                    ('mean', np.mean),
                    ('median', np.median),
                    ('min', np.min),
                    ('max', np.max),
                    ('std', np.std)):
                stats = morestats.get(id_, [])
                stats.append(fx(clvals))
                morestats[id_] = stats

        for k, v in morestats.items():
            clusterstats[0].append(v)
            clusterstats[1].append(k)

        if self.params.multicomp_correction is not None:
            # do a local import as only this tiny portion needs statsmodels
            import statsmodels.stats.multitest as smm
            rej, probs_corr = smm.multipletests(
                cluster_probs_raw,
                alpha=self.params.fwe_rate,
                method=self.params.multicomp_correction)[:2]
            # store corrected per-cluster probabilities
            clusterstats[0].append(probs_corr)
            clusterstats[1].append('prob_corrected')
            # remove cluster labels that did not pass the FWE threshold
            for i, r in enumerate(rej):
                if not r:
                    labels[labels == i + 1] = 0
            outds.fa['clusters_fwe_thresh'] = labels
        outds.a['clusterstats'] = \
            np.rec.fromarrays(clusterstats[0], names=clusterstats[1])
        return outds
Esempio n. 42
0
def test_multiclass_ties(clf):
    ds = _dsties1

    # reassign data between ties, so we know that decision is data, not order driven
    ds_ = ds.copy(deep=True)
    ds_.samples[ds.a.ties_idx[1]] = ds.samples[ds.a.ties_idx[0]]
    ds_.samples[ds.a.ties_idx[0]] = ds.samples[ds.a.ties_idx[1]]
    ok_(np.any(ds_.samples != ds.samples))

    clf_ = clf.clone()
    clf = clf.clone()
    clf.ca.enable(['estimates', 'predictions'])
    clf_.ca.enable(['estimates', 'predictions'])
    te = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'),
                            enable_ca=['stats'])
    te_ = TransferMeasure(clf_, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'),
                            enable_ca=['stats'])

    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                        enable_ca=['stats'])
    te_ = CrossValidation(clf_, NFoldPartitioner(), postproc=mean_sample(),
                        enable_ca=['stats'])

    error = te(ds)
    matrix = te.ca.stats.matrix

    # if ties were broken randomly we should have got nearly the same
    # number of hits for tied targets
    ties_indices = [te.ca.stats.labels.index(c) for c in ds.a.ties]
    hits = np.diag(te.ca.stats.matrix)[ties_indices]

    # First check is to see if we swap data between tied labels we
    # are getting the same results if we permute labels accordingly,
    # i.e. that tie resolution is not dependent on the labels order
    # but rather on the data
    te_(ds_)
    matrix_swapped = te_.ca.stats.matrix

    if False: #0 in hits:
        print clf, matrix, matrix_swapped
        print clf.ca.estimates[:, 2] - clf.ca.estimates[:,0]
        #print clf.ca.estimates

    # TODO: for now disabled all the non-compliant ones to pass the
    #       tests. For visibility decided to skip them instead of just
    #       exclusion and skipping only here to possibly catch crashes
    #       which might happen before
    if len(set(('libsvm', 'sg', 'skl', 'gpr', 'blr')
               ).intersection(clf.__tags__)):
        raise SkipTest("Skipped %s because it is known to fail")

    ok_(not (np.array_equal(matrix, matrix_swapped) and 0 in hits))

    # this check is valid only if ties are not broken randomly
    # like it is the case with SMLR
    if not ('random_tie_breaking' in clf.__tags__
            or  # since __tags__ would not go that high up e.g. in
                # <knn on SMLR non-0>
            'SMLR' in str(clf)):
        assert_array_equal(hits,
                           np.diag(matrix_swapped)[ties_indices[::-1]])

    # Second check is to just see if we didn't get an obvious bias and
    # got 0 in one of the hits, although it is labile
    if cfg.getboolean('tests', 'labile', default='yes'):
        ok_(not 0 in hits)
Esempio n. 43
0
from mvpa2.mappers.fx import mean_sample

"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""
Esempio n. 44
0
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0])) # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]), split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
Esempio n. 45
0
    def test_tree_classifier(self):
        """Basic tests for TreeClassifier
        """
        ds = datasets['uni4medium']
        # make it simple of the beast -- take only informative ones
        # because classifiers for the tree are selected randomly, so
        # performance varies a lot and we just need to check on
        # correct operation
        ds = ds[:, ds.fa.nonbogus_targets != [None]]

        clfs = clfswh['binary']         # pool of classifiers
        # Lets permute so each time we try some different combination
        # of the classifiers but exclude those operating on %s of
        # features since we might not have enough for that
        clfs = [clfs[i] for i in np.random.permutation(len(clfs))
                if not '%' in str(clfs[i])]

        # NB: It is necessary that the same classifier was not used at
        # different nodes, since it would be re-trained for a new set
        # of targets, thus leading to incorrect behavior/high error.
        #
        # Clone only those few leading ones which we will use
        # throughout the test
        clfs = [clf.clone() for clf in clfs[:4]]

        # Test conflicting definition
        tclf = TreeClassifier(clfs[0], {
            'L0+2': (('L0', 'L2'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])})
        self.assertRaises(ValueError, tclf.train, ds)
        """Should raise exception since label 2 is in both"""

        # Test insufficient definition
        tclf = TreeClassifier(clfs[0], {
            'L0+5': (('L0', 'L5'), clfs[1]),
            'L2+3': (('L2', 'L3'),       clfs[2])})
        self.assertRaises(ValueError, tclf.train, ds)
        """Should raise exception since no group for L1"""

        # proper definition now
        tclf = TreeClassifier(clfs[0], {
            'L0+1': (('L0', 'L1'), clfs[1]),
            'L2+3': (('L2', 'L3'), clfs[2])})

        # Lets test train/test cycle using CVTE
        cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
            enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()
        try:
            rtclf = repr(tclf)
        except:
            self.fail(msg="Could not obtain repr for TreeClassifier")

        # Test accessibility of .clfs
        self.assertTrue(tclf.clfs['L0+1'] is clfs[1])
        self.assertTrue(tclf.clfs['L2+3'] is clfs[2])

        cvtrc = cv.ca.training_stats
        cvtc = cv.ca.stats

        if cfg.getboolean('tests', 'labile', default='yes'):
            # just a dummy check to make sure everything is working
            self.assertTrue(cvtrc != cvtc)
            self.assertTrue(cverror < 0.3,
                            msg="Got too high error = %s using %s"
                            % (cverror, tclf))

        # Test trailing nodes with no classifier

        # That is why we use separate pool of classifiers here
        # (that is probably old/not-needed since switched to use clones)
        clfs_mc = clfswh['multiclass']         # pool of classifiers
        clfs_mc = [clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
                   if not '%' in str(clfs_mc[i])]
        clfs_mc = [clf.clone() for clf in clfs_mc[:4]] # and clones again

        tclf = TreeClassifier(clfs_mc[0], {
            'L0': (('L0',), None),
            'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])})

        cv = CrossValidation(tclf,
                             OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = np.asscalar(cv(ds))
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(cverror < 0.3,
                            msg="Got too high error = %s using %s"
                            % (cverror, tclf))
Esempio n. 46
0
def get_crossvalidation_instance(learner, partitioner, errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount, attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(
                        learner_space,
                        limit={partitioner.get_space(): 1},
                        count=1)
        # CV with null-distribution estimation that permutes the training data for
        # each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner,
                                  perm_gen_node,
                                  postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater,
                               tail=prob_tail,
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner,
                         gennode,
                         errorfx=errorfx,
                         null_dist=distr_est,
                         postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv
Esempio n. 47
0
print externals.exists('libsvm')
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets
from mvpa2.mappers.fx import mean_sample
"""For the sake of simplicity, let's use a small artificial dataset."""

# Lets just use our tiny 4D dataset from testing battery
dataset = datasets['3dlarge']
"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup searchlight with 2 voxels radius and measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace', postproc=mean_sample())

# run searchlight on dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)
"""
If this analysis is done on a fMRI dataset using `NiftiDataset` the resulting
searchlight map (`sl_map`) can be mapped back into the original dataspace
and viewed as a brain overlay. :ref:`Another example <example_searchlight>`
shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""