Example #1
    def _test_gideon_weird_case(self):
        """'The utter collapse' -- communicated by Peter J. Kohler

        The desire is to collapse all samples of each category in the
        training and testing sets, resulting in only a single sample per
        category in training and in testing.  As it is now,
        CrossValidation on a MappedClassifier would not work.

        Observations: the chance distribution obviously gets wide, but
        it also gets skewed toward anti-learning with nfolds around 4.
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        clf = kNN()
        print "HERE"
        ds = datasets['uni2large'].copy()
        ds = ds[ds.sa.chunks < 9]
        accs = []
        for i in xrange(10):          # # of random samples
            ds.samples = np.random.randn(*ds.shape)
            if False: # this would have been a native way IF we allowed change of number of samples
                clf2 = MappedClassifier(clf=kNN(), #clf,
                                        mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2, NFoldPartitioner(4), postproc=None,
                                     enable_ca=['stats'])
                print cv(ds)
            else:
                from mvpa2.clfs.transerror import ConfusionMatrix
                partitioner = NFoldPartitioner(6)
                meaner = mean_group_sample(['targets', 'partitions'])
                cm = ConfusionMatrix()
                te = TransferMeasure(clf, Splitter('partitions'),
                                     postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'),
                                     enable_ca = ['stats']
                                     )
                for part in partitioner.generate(ds):
                    ds_meaned = meaner(part)
                    error = np.asscalar(te(ds_meaned))
                    cm += te.ca.stats
                print i, cm.stats['ACC']
                accs.append(cm.stats['ACC'])
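
For orientation, here is a minimal standalone sketch of the collapsing step itself (an illustration, not part of the test above; it assumes a PyMVPA dataset `ds` such as datasets['uni2large'] with 'targets' and 'chunks' sample attributes, where every chunk contains every target): mean_group_sample averages all samples sharing the same attribute values, so each (target, chunk) pair is reduced to a single sample.

from mvpa2.mappers.fx import mean_group_sample

# collapse to one mean sample per unique (target, chunk) combination
collapsed = mean_group_sample(['targets', 'chunks'])(ds)
assert collapsed.nsamples == len(ds.UT) * len(ds.UC)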
Example #2
    def test_adhocsearchlight_perm_testing(self):
        # just a smoke test pretty much
        ds = datasets['3dmedium'].copy()
        #ds.samples += np.random.normal(size=ds.samples.shape)*10
        mvpa2.seed()
        ds.fa['voxel_indices'] = ds.fa.myspace
        from mvpa2.mappers.fx import mean_sample
        from mvpa2.clfs.stats import MCNullDist
        permutator = AttributePermutator('targets', count=8,
                                         limit='chunks')
        distr_est = MCNullDist(permutator, tail='left',
                               enable_ca=['dist_samples'])
        slargs = (kNN(1),
                  NFoldPartitioner(0.5,
                                   selection_strategy='random',
                                   count=9))
        slkwargs = dict(radius=1, postproc=mean_sample())

        sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
        skip_if_no_external('scipy')    # needed for null_t
        sl = sphere_m1nnsearchlight(
            *slargs,
            null_dist=distr_est,
            enable_ca=['null_t'],
            reuse_neighbors=True,
            **slkwargs
            )
        mvpa2.seed()
        res_nodistr = sl_nodistr(ds)
        mvpa2.seed()
        res = sl(ds)
        # verify that we at least got the same main result
        # ah (yoh) -- null dist is estimated before the main
        # estimate so we can't guarantee correspondence :-/
        # assert_array_equal(res_nodistr, res)
        # only resemblance (TODO, may be we want to get/setstate
        # for rng before null_dist.fit?)

        # and dimensions correspond
        assert_array_equal(distr_est.ca.dist_samples.shape,
                           (1, ds.nfeatures, 8))
        assert_array_equal(sl.ca.null_t.samples.shape,
                           (1, ds.nfeatures))
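
For reference, a minimal sketch (under the same assumptions as the test above: a dataset `ds` with sa.targets, sa.chunks and fa.voxel_indices) of attaching a Monte-Carlo null distribution to a plain cross-validated searchlight and reading out per-feature p-values; the argument names follow the calls already used in this test.

from mvpa2.clfs.knn import kNN
from mvpa2.clfs.stats import MCNullDist
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.generators.permutation import AttributePermutator
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight

permutator = AttributePermutator('targets', count=20, limit='chunks')
sl = sphere_searchlight(CrossValidation(kNN(1), NFoldPartitioner()),
                        radius=1,
                        null_dist=MCNullDist(permutator, tail='left'))
res = sl(ds)                 # per-fold error maps
p_values = sl.ca.null_prob   # per-feature probability under H0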
Example #3
    def test_knn_state(self):
        train = pure_multivariate_signal( 40, 3 )
        test = pure_multivariate_signal( 20, 3 )

        clf = kNN(k=10)
        clf.train(train)

        clf.ca.enable(['estimates', 'predictions', 'distances'])

        p = clf.predict(test.samples)

        self.failUnless(p == clf.ca.predictions)
        self.failUnless(np.array(clf.ca.estimates).shape == (80,2))
        self.failUnless(clf.ca.distances.shape == (80,160))

        self.failUnless(not clf.ca.distances.fa is train.sa)
        # Those are deep-copied now by default so they should not be the same
        self.failUnless(not (clf.ca.distances.fa['chunks'] is train.sa['chunks']))
        self.failUnless(not (clf.ca.distances.fa.chunks is train.sa.chunks))
Example #4
    def test_knn_state(self):
        train = pure_multivariate_signal( 40, 3 )
        test = pure_multivariate_signal( 20, 3 )

        clf = kNN(k=10)
        clf.train(train)

        clf.ca.enable(['estimates', 'predictions', 'distances'])

        p = clf.predict(test.samples)

        self.assertTrue(p == clf.ca.predictions)
        self.assertTrue(len(clf.ca.estimates) == 80)
        self.assertTrue(set(clf.ca.estimates[0].keys()) == set(test.targets))
        self.assertTrue(clf.ca.distances.shape == (80,160))

        self.assertTrue(not clf.ca.distances.fa is train.sa)
        # Those are deep-copied now by default so they should not be the same
        self.assertTrue(not (clf.ca.distances.fa['chunks'] is train.sa['chunks']))
        self.assertTrue(not (clf.ca.distances.fa.chunks is train.sa.chunks))
Example #5
    def test_multivariate(self):

        mv_perf = []
        uv_perf = []

        clf = kNN(k=10)
        for i in xrange(20):
            train = pure_multivariate_signal( 20, 3 )
            test = pure_multivariate_signal( 20, 3 )
            clf.train(train)
            p_mv = clf.predict( test.samples )
            mv_perf.append( np.mean(p_mv==test.targets) )

            clf.train(train[:, 0])
            p_uv = clf.predict(test[:, 0].samples)
            uv_perf.append( np.mean(p_uv==test.targets) )

        mean_mv_perf = np.mean(mv_perf)
        mean_uv_perf = np.mean(uv_perf)

        self.assertTrue( mean_mv_perf > 0.9 )
        self.assertTrue( mean_uv_perf < mean_mv_perf )
Example #6
    def test_adhocsearchlight_perm_testing(self):
        # just a smoke test pretty much
        ds = datasets['3dmedium'].copy()
        #ds.samples += np.random.normal(size=ds.samples.shape)*10
        mvpa2.seed()
        ds.fa['voxel_indices'] = ds.fa.myspace
        from mvpa2.mappers.fx import mean_sample
        from mvpa2.clfs.stats import MCNullDist
        permutator = AttributePermutator('targets', count=8, limit='chunks')
        distr_est = MCNullDist(permutator,
                               tail='left',
                               enable_ca=['dist_samples'])
        slargs = (kNN(1),
                  NFoldPartitioner(0.5, selection_strategy='random', count=9))
        slkwargs = dict(radius=1, postproc=mean_sample())

        sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
        skip_if_no_external('scipy')  # needed for null_t
        sl = sphere_m1nnsearchlight(*slargs,
                                    null_dist=distr_est,
                                    enable_ca=['null_t'],
                                    reuse_neighbors=True,
                                    **slkwargs)
        mvpa2.seed()
        res_nodistr = sl_nodistr(ds)
        mvpa2.seed()
        res = sl(ds)
        # verify that we at least got the same main result
        # ah (yoh) -- null dist is estimated before the main
        # estimate so we can't guarantee correspondence :-/
        # assert_array_equal(res_nodistr, res)
        # only resemblance (TODO, may be we want to get/setstate
        # for rng before null_dist.fit?)

        # and dimensions correspond
        assert_array_equal(distr_est.ca.dist_samples.shape,
                           (1, ds.nfeatures, 8))
        assert_array_equal(sl.ca.null_t.samples.shape, (1, ds.nfeatures))
Example #7
# Assumed preamble for this example script (not shown in the excerpt):
import mvpa2
from mvpa2.suite import *

mvpa2.seed(0)  # to reproduce the plot

dataset_kwargs = dict(nfeatures=2,
                      nchunks=10,
                      snr=2,
                      nlabels=4,
                      means=[[0, 1], [1, 0], [1, 1], [0, 0]])

dataset_train = normal_feature_dataset(**dataset_kwargs)
dataset_plot = normal_feature_dataset(**dataset_kwargs)

# make a new figure
pl.figure(figsize=(9, 9))

for i, k in enumerate((1, 3, 9, 20)):
    knn = kNN(k)

    print "Processing kNN(%i) problem..." % k
    pl.subplot(2, 2, i + 1)
    """
    """

    knn.train(dataset_train)

    plot_decision_boundary_2d(dataset_plot, clf=knn, maps='targets')

if cfg.getboolean('examples', 'interactive', True):
    # show all the cool figures
    pl.show()
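
As an optional follow-up (a sketch in the same script context, with numpy imported as np), the trained classifiers can also be scored numerically on the second dataset that is otherwise only used for plotting:

import numpy as np

for k in (1, 3, 9, 20):
    knn = kNN(k)
    knn.train(dataset_train)
    acc = np.mean(knn.predict(dataset_plot.samples) == dataset_plot.targets)
    print "kNN(%i) accuracy on dataset_plot: %.2f" % (k, acc)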
Example #8
class SearchlightTests(unittest.TestCase):
    def setUp(self):
        self.dataset = datasets['3dlarge']
        # give the feature coord a more common name, matching the default of
        # the searchlight
        self.dataset.fa['voxel_indices'] = self.dataset.fa.myspace
        self._tested_pprocess = False

    # https://github.com/PyMVPA/PyMVPA/issues/67
    # https://github.com/PyMVPA/PyMVPA/issues/69
    def test_gnbsearchlight_doc(self):
        # Test whether we excluded nproc from the docstrings
        ok_(not 'nproc' in GNBSearchlight.__init__.__doc__)
        ok_(not 'nproc' in GNBSearchlight.__doc__)
        ok_(not 'nproc' in sphere_gnbsearchlight.__doc__)
        # but present elsewhere
        ok_('nproc' in sphere_searchlight.__doc__)
        ok_('nproc' in Searchlight.__init__.__doc__)

    # https://github.com/PyMVPA/PyMVPA/issues/106
    def test_searchlights_doc_qe(self):
        # queryengine should not be provided to sphere_* helpers
        for sl in (sphere_searchlight, sphere_gnbsearchlight,
                   sphere_m1nnsearchlight):
            for kw in ('queryengine', 'qe'):
                ok_(not kw in sl.__doc__,
                    msg='There should be no %r in %s.__doc__' % (kw, sl))

        # queryengine should be provided in corresponding classes __doc__s
        for sl in (Searchlight, GNBSearchlight, M1NNSearchlight):
            for kw in ('queryengine', ):
                ok_(kw in sl.__init__.__doc__,
                    msg='There should be %r in %s.__init__.__doc__' % (kw, sl))
            for kw in ('qe', ):
                ok_(not kw in sl.__init__.__doc__,
                    msg='There should be no %r in %s.__init__.__doc__' %
                    (kw, sl))

    #def _test_searchlights(self, ds, sls, roi_ids, result_all):  # pragma: no cover

    @sweepargs(
        lrn_sllrn_SL_partitioner=[
            (
                GNB(common_variance=v, descr='GNB'),
                None,
                sphere_gnbsearchlight,
                NFoldPartitioner(cvtype=1),
                0.  # correction for the error range
            ) for v in (True, False)
        ] +
        # Mean 1 NN searchlights
        [
            (ChainMapper(
                [mean_group_sample(['targets', 'partitions']),
                 kNN(1)],
                space='targets',
                descr='M1NN'), kNN(1), sphere_m1nnsearchlight,
             NFoldPartitioner(0.5, selection_strategy='random',
                              count=20), 0.05),
            # the same but with NFold(1) partitioner since it still should work
            (ChainMapper(
                [mean_group_sample(['targets', 'partitions']),
                 kNN(1)],
                space='targets',
                descr='NF-M1NN'), kNN(1), sphere_m1nnsearchlight,
             NFoldPartitioner(1), 0.05),
        ])
    @sweepargs(do_roi=(False, True))
    @sweepargs(results_backend=('native', 'hdf5'))
    @reseed_rng()
    def test_spatial_searchlight(self,
                                 lrn_sllrn_SL_partitioner,
                                 do_roi=False,
                                 results_backend='native'):
        """Tests both generic and ad-hoc searchlights (e.g. GNBSearchlight)
        Test of and adhoc searchlight anyways requires a ground-truth
        comparison to the generic version, so we are doing sweepargs here
        """
        lrn, sllrn, SL, partitioner, correction = lrn_sllrn_SL_partitioner
        ## if results_backend == 'hdf5' and not common_variance:
        ##     # no need for full combination of all possible arguments here
        ##     return

        if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active \
           and  isinstance(lrn, ChainMapper):
            raise SkipTest("Known to fail while trying to enable "
                           "training_stats for the ChainMapper (M1NN here)")

        # e.g. for M1NN we need plain kNN(1) for m1nnsl, but to imitate m1nn
        #      "learner" we must use a chainmapper atm
        if sllrn is None:
            sllrn = lrn
        ds = datasets['3dsmall'].copy()
        # Let's test multiclass here, so boost # of labels
        ds[6:18].T += 2
        ds.fa['voxel_indices'] = ds.fa.myspace

        # To assure that users do not run into incorrect operation due to overflows
        ds.samples += 5000
        ds.samples *= 1000
        ds.samples = ds.samples.astype(np.int16)

        # compute N-1 cross-validation for each sphere
        # YOH: unfortunately sample_clf_lin is not guaranteed
        #      to provide exactly the same results due to inherent
        #      iterative process.  Therefore lets use something quick
        #      and pure Python
        cv = CrossValidation(lrn, partitioner)

        skwargs = dict(
            radius=1,
            enable_ca=['roi_sizes', 'raw_results', 'roi_feature_ids'])

        if do_roi:
            # select some random set of features
            nroi = rnd.randint(1, ds.nfeatures)
            # and let's compute the full one as well so we have a reference;
            # it is itself excluded from comparisons, but its values are
            # compared for the selected roi_ids
            sl_all = SL(sllrn, partitioner, **skwargs)
            result_all = sl_all(ds)
            # select random features
            roi_ids = rnd.permutation(range(ds.nfeatures))[:nroi]
            skwargs['center_ids'] = roi_ids
        else:
            nroi = ds.nfeatures
            roi_ids = np.arange(nroi)
            result_all = None

        if results_backend == 'hdf5':
            skip_if_no_external('h5py')

        sls = [
            sphere_searchlight(cv, results_backend=results_backend, **skwargs),
            #GNBSearchlight(gnb, NFoldPartitioner(cvtype=1))
            SL(sllrn, partitioner, indexsum='fancy', **skwargs)
        ]

        if externals.exists('scipy'):
            sls += [SL(sllrn, partitioner, indexsum='sparse', **skwargs)]

        # Test nproc just once
        if externals.exists('pprocess') and not self._tested_pprocess:
            sls += [sphere_searchlight(cv, nproc=2, **skwargs)]
            self._tested_pprocess = True

        # Provide the dataset and all those searchlights for testing
        #self._test_searchlights(ds, sls, roi_ids, result_all)
        #nroi = len(roi_ids)
        #do_roi = nroi != ds.nfeatures
        all_results = []
        for sl in sls:
            # run searchlight
            mvpa2.seed()  # reseed rng again for m1nnsl
            results = sl(ds)
            all_results.append(results)
            #print `sl`
            # check for correct number of spheres
            self.assertTrue(results.nfeatures == nroi)
            # and measures (one per xfold)
            if partitioner.cvtype == 1:
                self.assertTrue(len(results) == len(ds.UC))
            elif partitioner.cvtype == 0.5:
                # here we had 4 unique chunks, so 6 combinations
                # even though 20 max was specified for NFold
                self.assertTrue(len(results) == 6)
            else:
                raise RuntimeError("Unknown yet type of partitioner to check")
            # check for chance-level performance across all spheres
            # makes sense only if number of features was big enough
            # to get some stable estimate of mean
            if not do_roi or nroi > 20:
                # correction here is for M1NN class which has wider distribution
                self.assertTrue(0.67 - correction < results.samples.mean() <
                                0.85 + correction,
                                msg="Out of range mean result: "
                                "lrn: %s  sllrn: %s  NROI: %d  MEAN: %.3f" % (
                                    lrn,
                                    sllrn,
                                    nroi,
                                    results.samples.mean(),
                                ))

            mean_errors = results.samples.mean(axis=0)
            # that we do get different errors ;)
            self.assertTrue(len(np.unique(mean_errors)) > 3)

            # check reasonable sphere sizes
            self.assertTrue(len(sl.ca.roi_sizes) == nroi)
            self.assertTrue(len(sl.ca.roi_feature_ids) == nroi)
            for i, fids in enumerate(sl.ca.roi_feature_ids):
                self.assertTrue(len(fids) == sl.ca.roi_sizes[i])
            if do_roi:
                # for roi we should relax conditions a bit
                self.assertTrue(max(sl.ca.roi_sizes) <= 7)
                self.assertTrue(min(sl.ca.roi_sizes) >= 4)
            else:
                self.assertTrue(max(sl.ca.roi_sizes) == 7)
                self.assertTrue(min(sl.ca.roi_sizes) == 4)

            # check base-class state
            self.assertEqual(sl.ca.raw_results.nfeatures, nroi)

            # Test if we got results correctly for 'selected' roi ids
            if do_roi:
                assert_array_equal(result_all[:, roi_ids], results)

        if len(all_results) > 1:
            # if we had multiple searchlights, we can check whether they all
            # gave the same result (they should have)
            aresults = np.array([a.samples for a in all_results])
            dresults = np.abs(aresults - aresults.mean(axis=0))
            dmax = np.max(dresults)
            self.assertTrue(dmax <= 1e-13)

        # Test the searchlight's reuse of neighbors
        for indexsum in ['fancy'] + (externals.exists('scipy') and ['sparse']
                                     or []):
            sl = SL(sllrn,
                    partitioner,
                    indexsum=indexsum,
                    reuse_neighbors=True,
                    **skwargs)
            mvpa2.seed()
            result1 = sl(ds)
            mvpa2.seed()
            result2 = sl(ds)  # must be faster
            assert_array_equal(result1, result2)

    def test_adhocsearchlight_perm_testing(self):
        # just a smoke test pretty much
        ds = datasets['3dmedium'].copy()
        #ds.samples += np.random.normal(size=ds.samples.shape)*10
        mvpa2.seed()
        ds.fa['voxel_indices'] = ds.fa.myspace
        from mvpa2.mappers.fx import mean_sample
        from mvpa2.clfs.stats import MCNullDist
        permutator = AttributePermutator('targets', count=8, limit='chunks')
        distr_est = MCNullDist(permutator,
                               tail='left',
                               enable_ca=['dist_samples'])
        slargs = (kNN(1),
                  NFoldPartitioner(0.5, selection_strategy='random', count=9))
        slkwargs = dict(radius=1, postproc=mean_sample())

        sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
        skip_if_no_external('scipy')  # needed for null_t
        sl = sphere_m1nnsearchlight(*slargs,
                                    null_dist=distr_est,
                                    enable_ca=['null_t'],
                                    reuse_neighbors=True,
                                    **slkwargs)
        mvpa2.seed()
        res_nodistr = sl_nodistr(ds)
        mvpa2.seed()
        res = sl(ds)
        # verify that we at least got the same main result
        # ah (yoh) -- null dist is estimated before the main
        # estimate so we can't guarantee correspondence :-/
        # assert_array_equal(res_nodistr, res)
        # only resemblance (TODO, may be we want to get/setstate
        # for rng before null_dist.fit?)

        # and dimensions correspond
        assert_array_equal(distr_est.ca.dist_samples.shape,
                           (1, ds.nfeatures, 8))
        assert_array_equal(sl.ca.null_t.samples.shape, (1, ds.nfeatures))

    def test_partial_searchlight_with_full_report(self):
        ds = self.dataset.copy()
        center_ids = np.zeros(ds.nfeatures, dtype='bool')
        center_ids[[3, 50]] = True
        ds.fa['center_ids'] = center_ids
        # compute N-1 cross-validation for each sphere
        cv = CrossValidation(GNB(), NFoldPartitioner())
        # construct diameter 1 (or just radius 0) searchlight
        # one time give center ids as a list, the other one takes it from the
        # dataset itself
        sls = (
            sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
            sphere_searchlight(None, radius=0, center_ids=[3, 50]),
            sphere_searchlight(cv, radius=0, center_ids='center_ids'),
        )
        for sl in sls:
            # assure that we could set cv post constructor
            if sl.datameasure is None:
                sl.datameasure = cv
            # run searchlight
            results = sl(ds)
            # only two spheres but error for all CV-folds
            self.assertEqual(results.shape, (len(self.dataset.UC), 2))
            # Test if results hold if we "set" a "new" datameasure
            sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
            results2 = sl(ds)
            assert_array_almost_equal(results, results2)

        # test if we gracefully puke if center_ids are out of bounds
        dataset0 = ds[:, :50]  # so we have no 50th feature
        self.assertRaises(IndexError, sls[0], dataset0)
        # but it should be fine on the one that gets the ids from the dataset
        # itself
        results = sl(dataset0)
        assert_equal(results.nfeatures, 1)
        # check whether roi_seeds are correct
        sl = sphere_searchlight(lambda x: np.vstack(
            (x.fa.roi_seed, x.samples)),
                                radius=1,
                                add_center_fa=True,
                                center_ids=[12])
        res = sl(ds)
        assert_array_equal(
            res.samples[1:, res.samples[0].astype('bool')].squeeze(),
            ds.samples[:, 12])

    def test_partial_searchlight_with_confusion_matrix(self):
        ds = self.dataset
        from mvpa2.clfs.stats import MCNullDist
        from mvpa2.mappers.fx import mean_sample, sum_sample

        # compute N-1 cross-validation for each sphere
        cm = ConfusionMatrix(labels=ds.UT)
        cv = CrossValidation(
            sample_clf_lin,
            NFoldPartitioner(),
            # we have to assure that matrix does not get flatted by
            # first vstack in cv and then hstack in searchlight --
            # thus 2 leading dimensions
            # TODO: RF? make searchlight/crossval smarter?
            errorfx=lambda *a: cm(*a)[None, None, :])
        # construct diameter 2 (or just radius 1) searchlight
        sl = sphere_searchlight(cv, radius=1, center_ids=[3, 5, 50])

        # our regular searchlight -- to compare results
        cv_gross = CrossValidation(sample_clf_lin, NFoldPartitioner())
        sl_gross = sphere_searchlight(cv_gross,
                                      radius=1,
                                      center_ids=[3, 5, 50])

        # run searchlights
        res = sl(ds)
        res_gross = sl_gross(ds)

        # only three spheres, but errors for all CV-folds and the complete confusion matrix
        assert_equal(res.shape, (len(ds.UC), 3, len(ds.UT), len(ds.UT)))
        assert_equal(res_gross.shape, (len(ds.UC), 3))

        # briefly inspect the confusion matrices
        mat = res.samples
        # since input dataset is probably balanced (otherwise adjust
        # to be per label): sum within columns (thus axis=-2) should
        # be identical to per-class/chunk number of samples
        samples_per_classchunk = len(ds) / (len(ds.UT) * len(ds.UC))
        ok_(np.all(np.sum(mat, axis=-2) == samples_per_classchunk))
        # and if we compute accuracies manually -- they should
        # correspond to the one from sl_gross
        assert_array_almost_equal(
            res_gross.samples,
            # from accuracies to errors
            1 - (mat[..., 0, 0] + mat[..., 1, 1]).astype(float) /
            (2 * samples_per_classchunk))

        # and now for those who remained seated -- let's perform H0 MC
        # testing of this searchlight... just a silly one with minimal
        # number of permutations
        no_permutations = 10
        permutator = AttributePermutator('targets', count=no_permutations)

        # once again -- need explicit leading dimension to avoid
        # vstacking during cross-validation
        cv.postproc = lambda x: sum_sample()(x)[None, :]

        sl = sphere_searchlight(cv,
                                radius=1,
                                center_ids=[3, 5, 50],
                                null_dist=MCNullDist(
                                    permutator,
                                    tail='right',
                                    enable_ca=['dist_samples']))
        res_perm = sl(ds)
        # XXX all of the res_perm, sl.ca.null_prob and
        #     sl.null_dist.ca.dist_samples carry a degenerate leading
        #     dimension which was probably due to introduced new axis
        #     above within cv.postproc
        assert_equal(res_perm.shape, (1, 3, 2, 2))
        assert_equal(sl.null_dist.ca.dist_samples.shape,
                     res_perm.shape + (no_permutations, ))
        assert_equal(sl.ca.null_prob.shape, res_perm.shape)
        # just to make sure ;)
        ok_(np.all(sl.ca.null_prob.samples >= 0))
        ok_(np.all(sl.ca.null_prob.samples <= 1))

        # we should have got sums of hits across the splits
        assert_array_equal(np.sum(mat, axis=0), res_perm.samples[0])

    def test_chi_square_searchlight(self):
        # only do partial to save time

        # Can't yet do this since test_searchlight isn't yet "under nose"
        #skip_if_no_external('scipy')
        if not externals.exists('scipy'):
            return

        from mvpa2.misc.stats import chisquare

        cv = CrossValidation(sample_clf_lin,
                             NFoldPartitioner(),
                             enable_ca=['stats'])

        def getconfusion(data):
            cv(data)
            return chisquare(cv.ca.stats.matrix)[0]

        sl = sphere_searchlight(getconfusion, radius=0, center_ids=[3, 50])

        # run searchlight
        results = sl(self.dataset)
        self.assertTrue(results.nfeatures == 2)

    def test_1d_multispace_searchlight(self):
        ds = Dataset([np.arange(6)])
        ds.fa['coord1'] = np.repeat(np.arange(3), 2)
        # add a second space to the dataset
        ds.fa['coord2'] = np.tile(np.arange(2), 3)
        measure = lambda x: "+".join([str(x) for x in x.samples[0]])
        # simply select each feature once
        res = Searchlight(measure,
                          IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(0)),
                          nproc=1)(ds)
        assert_array_equal(res.samples, [['0', '1', '2', '3', '4', '5']])
        res = Searchlight(measure,
                          IndexQueryEngine(coord1=Sphere(0), coord2=Sphere(1)),
                          nproc=1)(ds)
        assert_array_equal(res.samples,
                           [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']])
        res = Searchlight(measure,
                          IndexQueryEngine(coord1=Sphere(1), coord2=Sphere(0)),
                          nproc=1)(ds)
        assert_array_equal(res.samples,
                           [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']])

    #@sweepargs(regr=regrswh[:])
    @reseed_rng()
    def test_regression_with_additional_sa(self):
        regr = regrswh[:][0]
        ds = datasets['3dsmall'].copy()
        ds.fa['voxel_indices'] = ds.fa.myspace

        # Create a new sample attribute which will be used along with
        # every searchlight
        ds.sa['beh'] = np.random.normal(size=(ds.nsamples, 2))

        # and now for fun -- let's create custom linear regression
        # targets out of some random feature and beh linearly combined
        rfeature = np.random.randint(ds.nfeatures)
        ds.sa.targets = np.dot(
            np.hstack((ds.sa.beh, ds.samples[:, rfeature:rfeature + 1])),
            np.array([0.3, 0.2, 0.3]))

        class CrossValidationWithBeh(CrossValidation):
            """An adapter for regular CV which would hstack
               sa.beh to the searchlighting ds"""
            def _call(self, ds):
                return CrossValidation._call(
                    self, Dataset(np.hstack((ds, ds.sa.beh)), sa=ds.sa))

        cvbeh = CrossValidationWithBeh(regr,
                                       OddEvenPartitioner(),
                                       errorfx=corr_error)
        # regular cv
        cv = CrossValidation(regr, OddEvenPartitioner(), errorfx=corr_error)

        slbeh = sphere_searchlight(cvbeh, radius=1)
        slmapbeh = slbeh(ds)
        sl = sphere_searchlight(cv, radius=1)
        slmap = sl(ds)

        assert_equal(slmap.shape, (2, ds.nfeatures))
        # The SL which had access to beh should surely have gotten better
        # results, especially in the vicinity of the chosen feature...
        features = sl.queryengine.query_byid(rfeature)
        assert_array_lequal(slmapbeh.samples[:, features],
                            slmap.samples[:, features])

        # elsewhere they should tend to be better but not guaranteed

    @labile(5, 1)
    def test_usecase_concordancesl(self):
        import numpy as np
        from mvpa2.base.dataset import vstack
        from mvpa2.mappers.fx import mean_sample

        # Take our sample 3d dataset
        ds1 = datasets['3dsmall'].copy(deep=True)
        ds1.fa['voxel_indices'] = ds1.fa.myspace
        ds1.sa['subject'] = [1]  # not really necessary -- but let's for clarity
        ds1 = mean_sample()(ds1)  # so we get just a single representative sample

        def corr12(ds):
            corr = np.corrcoef(ds.samples)
            assert (corr.shape == (2, 2))  # for paranoid ones
            return corr[0, 1]

        for nsc, thr, thr_mean in ((0, 1.0, 1.0),
                                   (0.1, 0.3, 0.8)):  # just a bit of noise
            ds2 = ds1.copy(deep=True)  # make a copy for the 2nd subject
            ds2.sa['subject'] = [2]
            ds2.samples += nsc * np.random.normal(size=ds1.shape)

            # make sure that both have the same voxel indices
            assert (np.all(ds1.fa.voxel_indices == ds2.fa.voxel_indices))
            ds_both = vstack((ds1, ds2))  # join 2 images into a single dataset
            # with .sa.subject distinguishing both

            sl = sphere_searchlight(corr12, radius=2)
            slmap = sl(ds_both)
            ok_(np.all(slmap.samples >= thr))
            ok_(np.mean(slmap.samples) >= thr)

    def test_swaroop_case(self):
        """Test hdf5 backend to pass results on Swaroop's usecase
        """
        skip_if_no_external('h5py')
        from mvpa2.measures.base import Measure

        class sw_measure(Measure):
            def __init__(self):
                Measure.__init__(self, auto_train=True)

            def _call(self, dataset):
                # For performance measures -- increase to 50-200
                # np.sum here is just to get some meaningful value in
                # them
                #return np.ones(shape=(2, 2))*np.sum(dataset)
                return Dataset(
                    np.array([{
                        'd': np.ones(shape=(5, 5)) * np.sum(dataset)
                    }],
                             dtype=object))

        results = []
        ds = datasets['3dsmall'].copy(deep=True)
        ds.fa['voxel_indices'] = ds.fa.myspace

        our_custom_prefix = tempfile.mktemp()
        for backend in ['native'] + \
                (externals.exists('h5py') and ['hdf5'] or []):
            sl = sphere_searchlight(sw_measure(),
                                    radius=1,
                                    tmp_prefix=our_custom_prefix,
                                    results_backend=backend)
            t0 = time.time()
            results.append(np.asanyarray(sl(ds)))
            # print "Done for backend %s in %d sec" % (backend, time.time() - t0)
        # because of Swaroop's ad-hoc use case (who could have recommended such
        # a construct?), and in the absence of a working assert_objectarray_equal,
        # let's compare manually
        #assert_objectarray_equal(*results)
        if not externals.exists('h5py'):
            self.assertRaises(RuntimeError,
                              sphere_searchlight,
                              sw_measure(),
                              results_backend='hdf5')
            raise SkipTest('h5py required for test of backend="hdf5"')
        assert_equal(results[0].shape, results[1].shape)
        results = [r.flatten() for r in results]
        for x, y in zip(*results):
            assert_equal(x.keys(), y.keys())
            assert_array_equal(x['d'], y['d'])
        # verify that no junk is left behind
        tempfiles = glob.glob(our_custom_prefix + '*')
        assert_equal(len(tempfiles), 0)

    def test_nblocks(self):
        skip_if_no_external('pprocess')
        # just a basic test to see that we are getting the same
        # results with different nblocks
        ds = datasets['3dsmall'].copy(deep=True)[:, :13]
        ds.fa['voxel_indices'] = ds.fa.myspace
        cv = CrossValidation(GNB(), OddEvenPartitioner())
        res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
        res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
        assert_array_equal(res1, res2)

    def test_custom_results_fx_logic(self):
        # results_fx was introduced for the blow-up-the-memory-Swaroop
        # where keeping all intermediate results of the dark-magic SL
        # hyperalignment is not feasible.  So it is desired to split
        # searchlight computation into more blocks while composing the
        # target result "on-the-fly" from the results available so far.
        #
        # Implementation relies on using generators feeding the
        # results_fx with fresh results whenever those become
        # available.
        #
        # This test/example's "measure" creates files which should be
        # handled by the results_fx function and removed in this case
        # to check that we indeed get the desired high number of blocks
        # with only a limited nproc.
        skip_if_no_external('pprocess')

        tfile = tempfile.mktemp('mvpa', 'test-sl')

        ds = datasets['3dsmall'].copy()[:, :25]  # smaller copy
        ds.fa['voxel_indices'] = ds.fa.myspace
        ds.fa['feature_id'] = np.arange(ds.nfeatures)

        nproc = 3  # it is not about computing -- so we can start
        # more processes than we have CPUs, just to test
        nblocks = nproc * 7
        # figure out max number of features to be given to any proc_block
        # yoh: not sure why I had to +1 here... but it is now more robust and
        # still seems to do what was demanded, so be it
        max_block = int(ceil(ds.nfeatures / float(nblocks)) + 1)

        def print_(s, *args):
            """For local debugging"""
            #print s, args
            pass

        def results_fx(sl=None, dataset=None, roi_ids=None, results=None):
            """It will "process" the results by removing those files
               generated inside the measure
            """
            res = []
            print_("READY")
            for x in results:
                ok_(isinstance(x, list))
                res.append(x)
                print_("R: ", x)
                for r in x:
                    # Can happen if we requested those .ca's enabled
                    # -- then automagically _proc_block would wrap
                    # results in a dataset... Originally detected by
                    # running with MVPA_DEBUG=.* which triggered
                    # enabling all ca's
                    if is_datasetlike(r):
                        r = np.asscalar(r.samples)
                    os.unlink(r)  # remove generated file
                print_("WAITING")

            results_ds = hstack(sum(res, []))

            # store the center ids as a feature attribute since we use
            # them for testing
            results_ds.fa['center_ids'] = roi_ids
            return results_ds

        def results_postproc_fx(results):
            for ds in results:
                ds.fa['test_postproc'] = np.atleast_1d(ds.a.roi_center_ids**2)
            return results

        def measure(ds):
            """The "measure" will check if a run with the same "index" from
               previous block has been processed by now
            """
            f = '%s+%03d' % (tfile, ds.fa.feature_id[0] % (max_block * nproc))
            print_("FID:%d f:%s" % (ds.fa.feature_id[0], f))

            # allow up to a few seconds for the file to
            # disappear -- i.e. its result from previous "block" was
            # processed
            t0 = time.time()
            while os.path.exists(f) and time.time() - t0 < 4.:
                time.sleep(0.5)  # so it does take time to compute the measure
                pass
            if os.path.exists(f):
                print_("ERROR: ", f)
                raise AssertionError(
                    "File %s must have been processed by now" % f)
            open(f, 'w').write('XXX')  # signal that we have computed this measure
            print_("RES: %s" % f)
            return f

        sl = sphere_searchlight(measure,
                                radius=0,
                                nproc=nproc,
                                nblocks=nblocks,
                                results_postproc_fx=results_postproc_fx,
                                results_fx=results_fx,
                                center_ids=np.arange(ds.nfeatures))

        assert_equal(len(glob.glob(tfile + '*')), 0)  # so no junk around
        try:
            res = sl(ds)
            assert_equal(res.nfeatures, ds.nfeatures)
            # verify that we did have results_postproc_fx called
            assert_array_equal(res.fa.test_postproc,
                               np.power(res.fa.center_ids, 2))
        finally:
            # remove those generated left-over files
            for f in glob.glob(tfile + '*'):
                os.unlink(f)
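
Distilled from the fixtures above, a minimal standalone sketch of the most common pattern in this test class: a cross-validated GNB searchlight over a volumetric dataset (`my_dataset` is a placeholder; it is assumed to carry fa.voxel_indices as set up in setUp).

from mvpa2.clfs.gnb import GNB
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight

cv = CrossValidation(GNB(), NFoldPartitioner())
sl = sphere_searchlight(cv, radius=1, enable_ca=['roi_sizes'])
error_maps = sl(my_dataset)   # one error sample per cross-validation fold
print error_maps.shape, max(sl.ca.roi_sizes)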
Example #9
    def test_gideon_weird_case(self):
        """Test if MappedClassifier could handle a mapper altering number of samples

        'The utter collapse' -- communicated by Peter J. Kohler

        The desire is to collapse all samples of each category in the
        training and testing sets, resulting in only a single sample per
        category in training and in testing.

        It is a peculiar scenario which pinpoints the problem that, so
        far, mappers were assumed not to change the number of samples.
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        from mvpa2.mappers.base import ChainMapper
        ds = datasets['uni2large'].copy()
        #ds = ds[ds.sa.chunks < 9]
        accs = []
        k = 1                           # for kNN
        nf = 1                          # for NFoldPartitioner
        for i in xrange(1):          # # of random runs
            ds.samples = np.random.randn(*ds.shape)
            #
            # There are 3 ways to accomplish the needed goal
            #

            # 0. Hard way: overcome the problem by manually
            #    pre-splitting/meaning in a loop
            from mvpa2.clfs.transerror import ConfusionMatrix
            partitioner = NFoldPartitioner(nf)
            meaner = mean_group_sample(['targets', 'partitions'])
            cm = ConfusionMatrix()
            te = TransferMeasure(kNN(k), Splitter('partitions'),
                                 postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'),
                                 enable_ca = ['stats']
                                 )
            errors = []
            for part in partitioner.generate(ds):
                ds_meaned = meaner(part)
                errors.append(np.asscalar(te(ds_meaned)))
                cm += te.ca.stats
            #print i, cm.stats['ACC']
            accs.append(cm.stats['ACC'])


            if False: # not yet working -- see _tent/allow_ch_nsamples
                      # branch for attempt to make it work
                # 1. This is a "native way" IF we allow change of number
                #    of samples via _call to be done by MappedClassifier
                #    while operating solely on the mapped dataset
                clf2 = MappedClassifier(clf=kNN(k), #clf,
                                        mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2, NFoldPartitioner(nf), postproc=None,
                                     enable_ca=['stats'])
                # meaning all should be ok since we should have balanced
                # sets across all chunks here
                errors_native = cv(ds)

                self.assertEqual(np.max(np.abs(errors_native.samples[:,0] - errors)),
                                 0)

            # 2. Work without fixes to MappedClassifier allowing
            #    change of # of samples
            #
            # CrossValidation will operate on a chain mapper which
            # would perform the necessary meaning first before dealing with
            # kNN.  Cons: .stats would not be exposed since ChainMapper
            # doesn't expose them (yet)
            if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
                raise SkipTest("Known to fail while trying to enable "
                               "training_stats for the ChainMapper")
            cv2 = CrossValidation(ChainMapper([mean_group_sample(['targets', 'partitions']),
                                               kNN(k)],
                                              space='targets'),
                                  NFoldPartitioner(nf),
                                  postproc=None)
            errors_native2 = cv2(ds)

            self.assertEqual(np.max(np.abs(errors_native2.samples[:,0] - errors)),
                             0)

            # All of the ways should provide the same results
            #print i, np.max(np.abs(errors_native.samples[:,0] - errors)), \
            #      np.max(np.abs(errors_native2.samples[:,0] - errors))

        if False: # just to investigate the distribution if we have enough iterations
            import pylab as pl
            uaccs = np.unique(accs)
            step = np.asscalar(np.unique(np.round(uaccs[1:] - uaccs[:-1], 4)))
            bins = np.linspace(0., 1., np.round(1./step+1))
            xx = pl.hist(accs, bins=bins, align='left')
            pl.xlim((0. - step/2, 1.+step/2))
Example #10
    def test_gideon_weird_case(self):
        """Test if MappedClassifier could handle a mapper altering number of samples

        'The utter collapse' -- communicated by Peter J. Kohler

        The desire is to collapse all samples of each category in the
        training and testing sets, resulting in only a single sample per
        category in training and in testing.

        It is a peculiar scenario which pinpoints the problem that, so
        far, mappers were assumed not to change the number of samples.
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        from mvpa2.mappers.base import ChainMapper
        ds = datasets['uni2large'].copy()
        #ds = ds[ds.sa.chunks < 9]
        accs = []
        k = 1  # for kNN
        nf = 1  # for NFoldPartitioner
        for i in xrange(1):  # # of random runs
            ds.samples = np.random.randn(*ds.shape)
            #
            # There are 3 ways to accomplish the needed goal
            #

            # 0. Hard way: overcome the problem by manually
            #    pre-splitting/meaning in a loop
            from mvpa2.clfs.transerror import ConfusionMatrix
            partitioner = NFoldPartitioner(nf)
            meaner = mean_group_sample(['targets', 'partitions'])
            cm = ConfusionMatrix()
            te = TransferMeasure(kNN(k),
                                 Splitter('partitions'),
                                 postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'),
                                 enable_ca=['stats'])
            errors = []
            for part in partitioner.generate(ds):
                ds_meaned = meaner(part)
                errors.append(np.asscalar(te(ds_meaned)))
                cm += te.ca.stats
            #print i, cm.stats['ACC']
            accs.append(cm.stats['ACC'])

            if False:  # not yet working -- see _tent/allow_ch_nsamples
                # branch for attempt to make it work
                # 1. This is a "native way" IF we allow change of number
                #    of samples via _call to be done by MappedClassifier
                #    while operating solely on the mapped dataset
                clf2 = MappedClassifier(
                    clf=kNN(k),  #clf,
                    mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2,
                                     NFoldPartitioner(nf),
                                     postproc=None,
                                     enable_ca=['stats'])
                # meaning all should be ok since we should have balanced
                # sets across all chunks here
                errors_native = cv(ds)

                self.assertEqual(
                    np.max(np.abs(errors_native.samples[:, 0] - errors)), 0)

            # 2. Work without fixes to MappedClassifier allowing
            #    change of # of samples
            #
            # CrossValidation will operate on a chain mapper which
            # would perform the necessary meaning first before dealing with
            # kNN.  Cons: .stats would not be exposed since ChainMapper
            # doesn't expose them (yet)
            if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
                raise SkipTest("Known to fail while trying to enable "
                               "training_stats for the ChainMapper")
            cv2 = CrossValidation(ChainMapper(
                [mean_group_sample(['targets', 'partitions']),
                 kNN(k)],
                space='targets'),
                                  NFoldPartitioner(nf),
                                  postproc=None)
            errors_native2 = cv2(ds)

            self.assertEqual(
                np.max(np.abs(errors_native2.samples[:, 0] - errors)), 0)

            # All of the ways should provide the same results
            #print i, np.max(np.abs(errors_native.samples[:,0] - errors)), \
            #      np.max(np.abs(errors_native2.samples[:,0] - errors))

        if False:  # just to investigate the distribution if we have enough iterations
            import pylab as pl
            uaccs = np.unique(accs)
            step = np.asscalar(np.unique(np.round(uaccs[1:] - uaccs[:-1], 4)))
            bins = np.linspace(0., 1., np.round(1. / step + 1))
            xx = pl.hist(accs, bins=bins, align='left')
            pl.xlim((0. - step / 2, 1. + step / 2))
Example #11
            RegressionAsClassifier(_lasso_lars, descr="skl.LassoLars_C()"),
            RegressionAsClassifier(_elastic_net, descr="skl.ElasticNet_C()"),
        ]

    if _skl_version >= '0.10':
        sklLassoLarsIC = _skl_import('linear_model', 'LassoLarsIC')
        _lasso_lars_ic = SKLLearnerAdapter(sklLassoLarsIC(),
                                           tags=_lars_tags,
                                           descr='skl.LassoLarsIC()')
        regrswh += [_lasso_lars_ic]
        clfswh += [
            RegressionAsClassifier(_lasso_lars_ic, descr='skl.LassoLarsIC_C()')
        ]

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
Example #12
        clfswh += [RegressionAsClassifier(_lars, descr="skl.Lars_C()"),
                   RegressionAsClassifier(_lasso_lars, descr="skl.LassoLars_C()"),
                   RegressionAsClassifier(_elastic_net, descr="skl.ElasticNet_C()"),
                   ]

    if _skl_version >= '0.10':
        sklLassoLarsIC = _skl_import('linear_model', 'LassoLarsIC')
        _lasso_lars_ic = SKLLearnerAdapter(sklLassoLarsIC(),
                                           tags=_lars_tags,
                                           descr='skl.LassoLarsIC()')
        regrswh += [_lasso_lars_ic]
        clfswh += [RegressionAsClassifier(_lasso_lars_ic,
                                          descr='skl.LassoLarsIC_C()')]

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C"),
                       postproc=maxofabs_sample()),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
Example #13
from mvpa2.clfs.knn import kNN

def train_kNN(data_set):
    clf = kNN(12)
    clf.train(data_set)
    return clf
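
A hypothetical usage sketch (normal_feature_dataset is just one convenient way to obtain a labeled dataset; any PyMVPA dataset would do):

from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=4, nchunks=5)
clf = train_kNN(ds)
predictions = clf.predict(ds.samples)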
Example #14
import numpy as np

from mvpa2.clfs.knn import kNN
from mvpa2.measures.base import CrossValidation

def cv_kNN(data_set, partitioner):
    clf = kNN(12)
    clf.set_postproc(None)
    cv = CrossValidation(clf, partitioner)
    cv_results = cv(data_set)
    return np.mean(cv_results)
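
A hypothetical usage sketch pairing it with an NFoldPartitioner; with CrossValidation's default error function the returned value is the mean misclassification error across folds:

from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=4, nchunks=5)
mean_error = cv_kNN(ds, NFoldPartitioner())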
Example #15
# Assumed preamble for this example script (the excerpt shows only the last import):
import mvpa2
from mvpa2.suite import *
from mvpa2.misc.plot import *

mvpa2.seed(0)                            # to reproduce the plot

dataset_kwargs = dict(nfeatures=2, nchunks=10,
    snr=2, nlabels=4, means=[ [0,1], [1,0], [1,1], [0,0] ])

dataset_train = normal_feature_dataset(**dataset_kwargs)
dataset_plot = normal_feature_dataset(**dataset_kwargs)


# make a new figure
pl.figure(figsize=(9, 9))

for i,k in enumerate((1, 3, 9, 20)):
    knn = kNN(k)

    print "Processing kNN(%i) problem..." % k
    pl.subplot(2, 2, i+1)

    """
    """

    knn.train(dataset_train)

    plot_decision_boundary_2d(
        dataset_plot, clf=knn, maps='targets')

if cfg.getboolean('examples', 'interactive', True):
    # show all the cool figures
    pl.show()