Example 1
def test_multiclass_classifier_cv(clf, ds):
    # Extending test_clf.py:ClassifiersTests.test_multiclass_classifier
    # Compare performance with our MaximalVote to the one done natively
    # by e.g. LIBSVM
    clf = clf.clone()
    clf.params.C = 1  # so it doesn't auto-adjust
    mclf = MulticlassClassifier(clf=clf.clone())
    part = NFoldPartitioner()
    cv = CrossValidation(clf, part, enable_ca=['stats', 'training_stats'])
    mcv = CrossValidation(mclf, part, enable_ca=['stats', 'training_stats'])

    er = cv(ds)
    mer = mcv(ds)

    # errors should be the same
    assert_array_equal(er, mer)
    assert_equal(str(cv.ca.training_stats), str(mcv.ca.training_stats))
    # if it was a binary task, cv.ca.stats would also have AUC column
    # while mcv would not  :-/  TODO
    if len(ds.UT) == 2:
        # so just compare the matrix and ACC
        assert_array_equal(cv.ca.stats.matrix, mcv.ca.stats.matrix)
        assert_equal(cv.ca.stats.stats['ACC'], mcv.ca.stats.stats['ACC'])
    else:
        assert_equal(str(cv.ca.stats), str(mcv.ca.stats))
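
For reference, a minimal sketch of the basic CrossValidation pattern that the examples on this page elaborate on. It is not taken from the test suite; the GNB classifier and the 'uni2small' testing dataset are just convenient stand-ins that appear in other examples here:

from mvpa2.clfs.gnb import GNB
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.testing.datasets import datasets

ds = datasets['uni2small']
# cross-validate a fast, deterministic classifier across chunks
cv = CrossValidation(GNB(), NFoldPartitioner(), enable_ca=['stats'])
errors = cv(ds)                      # dataset with one mean error per fold
print(cv.ca.stats.stats['ACC'])      # overall accuracy from the confusion matrix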
Example 2
def _test_edmund_chong_20120907():  # pragma: no cover
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets
    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1, # for real data > 1
                                      limit='partitions',
                                      apply_selection=True
                                      )],
                            space='partitions')

    clf = LinearCSVMC() #choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator], space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])
Example 3
    def test_custom_targets(self, lrn):
        """Simple test if a learner could cope with custom sa not targets
        """

        # Since we are comparing performances of two learners, we need
        # to assure that if they depend on some random seed -- they
        # would use the same value.  Currently we have such stochastic
        # behavior in SMLR
        if 'seed' in lrn.params:
            from mvpa2 import _random_seed
            lrn = lrn.clone()  # clone the beast
            lrn.params.seed = _random_seed  # reuse the same seed
        lrn_ = lrn.clone()
        lrn_.set_space('custom')

        te = CrossValidation(lrn, NFoldPartitioner())
        te_ = CrossValidation(lrn_, NFoldPartitioner())
        nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
        dsname = ('uni%dsmall' % nclasses,
                  'sin_modulated')[int(lrn.__is_regression__)]
        ds = datasets[dsname]
        ds_ = ds.copy()
        ds_.sa['custom'] = ds_.sa['targets']
        ds_.sa.pop('targets')
        self.assertTrue('targets' in ds.sa,
                        msg="'targets' should remain in original ds")

        try:
            cve = te(ds)
            cve_ = te_(ds_)
        except Exception as e:
            self.fail("Failed with %r" % e)
Example 4
def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which would take care about standardizing
        all features within each sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None])/np.std(data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])
    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([ZScoreFeaturesMapper(auto_train=True),
                              CrossValidation(sample_clf, NFoldPartitioner())])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)
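
A condensed sketch of the plain (non-chained) variant of the pattern above: wrap a CrossValidation measure in sphere_searchlight and run it over a volumetric testing dataset. This is not part of the original test; the radius and the '3dsmall' dataset are arbitrary choices borrowed from other examples on this page:

from mvpa2.clfs.gnb import GNB
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets

ds = datasets['3dsmall'].copy()
ds.fa['voxel_indices'] = ds.fa.myspace   # searchlight needs voxel coordinates
cv = CrossValidation(GNB(), NFoldPartitioner())
sl = sphere_searchlight(cv, radius=1)    # one CV result per sphere center
res = sl(ds)                             # samples: CV folds, features: centers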
Example 5
    def test_noise_classification(self):
        # get a dataset with a very high SNR
        data = get_mv_pattern(10)

        # do crossval with default errorfx and 'mean' combiner
        cv = CrossValidation(sample_clf_nl, NFoldPartitioner())

        # must return a scalar value
        result = cv(data)
        # must be perfect
        self.assertTrue((result.samples < 0.05).all())

        # do crossval with permuted regressors
        cv = CrossValidation(
            sample_clf_nl,
            ChainNode(
                [NFoldPartitioner(),
                 AttributePermutator('targets', count=10)],
                space='partitions'))
        results = cv(data)

        # results must not be the same
        self.assertTrue(len(np.unique(results.samples)) > 1)

        # must be at chance level
        pmean = np.array(results).mean()
        self.assertTrue(pmean < 0.58 and pmean > 0.42)
Example 6
def test_searchlight_errors_per_trial():
    # To make sure that searchlight can return error/accuracy per trial
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.partition import OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.testing.datasets import datasets
    from mvpa2.misc.errorfx import prediction_target_matches

    dataset = datasets['3dsmall'].copy()
    # randomly permute samples so we break any random correspondence
    # to strengthen tests below
    sample_idx = np.arange(len(dataset))
    dataset = dataset[np.random.permutation(sample_idx)]

    dataset.sa.targets = ['L%d' % l for l in dataset.sa.targets]
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    part = OddEvenPartitioner()
    # only do partial to save time
    cv = CrossValidation(sample_clf, part, errorfx=None) #prediction_target_matches)
    # Just to compare error
    cv_error = CrossValidation(sample_clf, part)

    # Large searchlight radius so we get entire ROI, 2 centers just to make sure
    # that all stacking works correctly
    sl = sphere_searchlight(cv, radius=10, center_ids=[0, 1])
    results = sl(dataset)

    sl_gnb = sphere_gnbsearchlight(sample_clf, part, radius=10, errorfx=None,
                                   center_ids=[0, 1])
    results_gnbsl = sl_gnb(dataset)

    # inspect both results
    # verify that partitioning was done correctly
    partitions = list(part.generate(dataset))
    for res in (results, results_gnbsl):
        assert('targets' in res.sa.keys())  # should carry targets
        assert('cvfolds' in res.sa.keys())  # should carry cvfolds
        for ipart in xrange(len(partitions)):
            assert_array_equal(dataset[partitions[ipart].sa.partitions == 2].targets,
                               res.sa.targets[res.sa.cvfolds == ipart])

    assert_datasets_equal(results, results_gnbsl)

    # one "accuracy" per each trial
    assert_equal(results.shape, (len(dataset), 2))
    # with accuracies the same in both searchlights, since the same
    # features were selected in both cases due to the large radii
    errors_dataset = cv(dataset)
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 0])
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 1])
    # and the error matches (up to precision) the one we get with the default error function
    assert_array_almost_equal(np.mean(results.targets[:, None] != results.samples, axis=0)[0],
                              np.mean(cv_error(dataset)))
Example 7
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # and then do your normal analysis, where the clf uses space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf,
                                   NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf,
                                 npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)
Example 8
    def test_regression_as_classifier(self, regr):
        """Basic tests of metaclass for using regressions as classifiers
        """
        for dsname in 'uni2small', 'uni4small':
            ds = datasets[dsname]

            clf = RegressionAsClassifier(regr, enable_ca=['distances'])
            cv = CrossValidation(clf,
                                 OddEvenPartitioner(),
                                 postproc=mean_sample(),
                                 enable_ca=['stats', 'training_stats'])

            error = cv(ds).samples.squeeze()

            nlabels = len(ds.uniquetargets)
            if nlabels == 2 \
               and cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(error < 0.3,
                                msg="Got error %.2f on %s dataset" %
                                (error, dsname))

            # Check that it does not puke on repr and str
            self.assertTrue(str(clf) != "")
            self.assertTrue(repr(clf) != "")

            self.assertEqual(clf.ca.distances.shape,
                             (ds.nsamples / 2, nlabels))
Example 9
def _test_mcasey20120222():  # pragma: no cover
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    mapper = ChainMapper([mean_group_sample(['targets','chunks']),
                          SVDMapper()])
    clf = MappedClassifier(LinearCSVMC(), mapper)
    cvte = CrossValidation(clf, NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])

    ds = dataset_wizard(
        samples=np.arange(32).reshape((8, -1)),
        targets=[1, 1, 2, 2, 1, 1, 2, 2],
        chunks=[1, 1, 1, 1, 2, 2, 2, 2])

    errors = cvte(ds)
Example 10
    def test_unpartitioned_cv(self):
        data = get_mv_pattern(10)
        # only one big chunk
        data.sa.chunks[:] = 1
        cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
        # needs to fail because it cannot be split into training and testing
        assert_raises(ValueError, cv, data)
Example 11
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception as e:
            self.fail("Failed with %s" % e)

        if cfg.getboolean('tests', 'labile', default='yes'):
            if nclasses > 2 and \
                   ((clf.descr is not None and 'on 5%(' in clf.descr)
                    or 'regression_based' in clf.__tags__):
                # skip those since they are barely applicable/testable here
                raise SkipTest("Skip testing of cve on %s" % clf)

            self.assertTrue(
                cve < 0.25,  # TODO: use multinom distribution
                msg="Got transfer error %g on %s with %d labels" %
                (cve, ds, len(ds.UT)))
Example 12
def test_multiclass_classifier_pass_ds_attributes():
    # TODO: replicate/extend basic testing of pass_attr
    #       in some more "basic" test_*
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(
        clf,
        pass_attr=[
            'ids',
            'sa.chunks',
            'a.bogus_features',
            # 'ca.raw_estimates' # this one is binary_clf x samples list ATM
            # that is why raw_predictions_ds was born
            'ca.raw_predictions_ds',
            'ca.estimates',  # this one is ok
            'ca.predictions',
        ],
        enable_ca=['all'])
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_array_equal(sorted(res.sa.ids), ds.sa.ids)
    assert_array_equal(res.chunks, ds.chunks[res.sa.ids])
    assert_array_equal(res.sa.predictions, res.samples[:, 0])
    assert_array_equal(res.sa.cvfolds,
                       np.repeat(range(len(ds.UC)),
                                 len(ds) / len(ds.UC)))
Example 13
    def test_cv_no_generator_custom_splitter(self):
        ds = Dataset(np.arange(4),
                     sa={
                         'category': ['to', 'to', 'from', 'from'],
                         'targets': ['a', 'b', 'c', 'd']
                     })

        class Measure(Classifier):
            def _train(self, ds_):
                assert_array_equal(ds_.samples, ds.samples[2:])
                assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

            def _predict(self, ds_):
                assert (ds_ is not ds)  # we pass a shallow copy
                # could be called to predict training or testing data
                if np.all(ds_.sa.targets != ['c', 'd']):
                    assert_array_equal(ds_.samples, ds.samples[:2])
                    assert_array_equal(ds_.sa.category, ['to'] * len(ds_))
                else:
                    assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

                return ['c', 'd']

        measure = Measure()
        cv = CrossValidation(measure,
                             splitter=Splitter('category', ['from', 'to']))
        res = cv(ds)
        assert_array_equal(res, [[1]])  # failed perfectly ;-)
Example 14
def test_multiclass_without_combiner():
    # The goal is to obtain all pairwise results as the resultant dataset
    # avoiding even calling any combiner
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(clf, combiner=None)

    # without combining results at all
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_equal(len(res), len(ds))
    assert_equal(res.nfeatures, 3)  # 3 pairs for 3 classes
    assert_array_equal(res.UT, ds.UT)
    assert_array_equal(np.unique(np.array(res.fa.targets.tolist())), ds.UT)
    # TODO -- check that we have all the pairs?
    assert_array_equal(res.sa['cvfolds'].unique, np.arange(len(ds.UC)))
    if mcv.ca.is_enabled('training_stats'):
        # we must have received a dictionary per each pair
        training_stats = mcv.ca.training_stats
        assert_equal(set(training_stats.keys()),
                     set([('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')]))
        for pair, cm in training_stats.iteritems():
            assert_array_equal(cm.labels, ds.UT)
            # we should have no predictions for absent label
            assert_array_equal(cm.matrix[~np.in1d(ds.UT, pair)], 0)
            # while altogether all samples were processed once
            assert_array_equal(cm.stats['P'], len(ds))
            # and number of sets should be equal number of chunks here
            assert_equal(len(cm.sets), len(ds.UC))
Example 15
    def test_split_classifier_extended(self, clf_):
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
        clf = SplitClassifier(
            clf=clf_,  #SameSignClassifier(),
            enable_ca=['stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error

        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if not 'non-deterministic' in clf.__tags__:
            self.assertTrue(
                abs(error - cverror) < 0.01,
                msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Example 16
    def test_split_classifier(self):
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            enable_ca=['stats', 'training_stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        clf2 = clf.clone()
        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds)
        cverror = cverror.samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

        self.assertEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                         100,
                         msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples),
                         list(ds.targets),
                         msg="Should classify correctly")

        # feature_ids must be a list of lists, and since no feature-selecting
        # classifier is used, we expect all features to be utilized
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Example 17
    def __test_matthias_question(self):
        rfe_clf = LinearCSVMC(C=1)

        rfesvm_split = SplitClassifier(rfe_clf)
        clf = FeatureSelectionClassifier(
            clf=LinearCSVMC(C=1),
            feature_selection=RFE(
                sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(
                    combiner=first_axis_mean,
                    transformer=np.abs),
                transfer_error=ConfusionBasedError(
                    rfesvm_split,
                    confusion_state="confusion"),
                stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                feature_selector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                update_sensitivity=True))

        no_permutations = 1000
        permutator = AttributePermutator('targets', count=no_permutations)
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             null_dist=MCNullDist(permutator, tail='left'),
                             enable_ca=['stats'])
        error = cv(datasets['uni2small'])
        self.assertTrue(error < 0.4)
        self.assertTrue(cv.ca.null_prob < 0.05)
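
The Monte-Carlo permutation-testing pattern used here (and in Examples 2 and 19) in its bare form; this is a sketch only, and the import paths for AttributePermutator and MCNullDist are assumed from the usual mvpa2 layout since the excerpts above do not show them:

from mvpa2.clfs.stats import MCNullDist                        # assumed import path
from mvpa2.generators.permutation import AttributePermutator   # assumed import path
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.measures.base import CrossValidation
from mvpa2.testing.datasets import datasets

# permute targets within chunks to build a null distribution of errors
permutator = AttributePermutator('targets', count=100, limit='chunks')
cv = CrossValidation(LinearCSVMC(C=1),
                     NFoldPartitioner(),
                     null_dist=MCNullDist(permutator, tail='left'),
                     enable_ca=['stats'])
error = cv(datasets['uni2small'])
p = cv.ca.null_prob   # probability of the observed error under the null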
Example 18
    def test_partial_searchlight_with_full_report(self):
        ds = self.dataset.copy()
        center_ids = np.zeros(ds.nfeatures, dtype='bool')
        center_ids[[3, 50]] = True
        ds.fa['center_ids'] = center_ids
        # compute N-1 cross-validation for each sphere
        cv = CrossValidation(GNB(), NFoldPartitioner())
        # construct diameter 1 (or just radius 0) searchlights:
        # one gets the center ids as a list, another takes them from the
        # dataset itself
        sls = (
            sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
            sphere_searchlight(None, radius=0, center_ids=[3, 50]),
            sphere_searchlight(cv, radius=0, center_ids='center_ids'),
        )
        for sl in sls:
            # assure that we can set the cv after construction
            if sl.datameasure is None:
                sl.datameasure = cv
            # run searchlight
            results = sl(ds)
            # only two spheres but error for all CV-folds
            self.assertEqual(results.shape, (len(self.dataset.UC), 2))
            # Test if results hold if we "set" a "new" datameasure
            sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
            results2 = sl(ds)
            assert_array_almost_equal(results, results2)

        # test that we gracefully puke if center_ids are out of bounds
        dataset0 = ds[:, :50]  # so we have no 50th feature
        self.assertRaises(IndexError, sls[0], dataset0)
        # but it should be fine on the one that gets the ids from the dataset
        # itself
        results = sl(dataset0)
        assert_equal(results.nfeatures, 1)
        # check whether roi_seeds are correct
        sl = sphere_searchlight(lambda x: np.vstack(
            (x.fa.roi_seed, x.samples)),
                                radius=1,
                                add_center_fa=True,
                                center_ids=[12])
        res = sl(ds)
        assert_array_equal(
            res.samples[1:, res.samples[0].astype('bool')].squeeze(),
            ds.samples[:, 12])
Example 19
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets',
                                         count=num_perm,
                                         limit='chunks')
        # define class to estimate NULL distribution of errors
        # use the left tail of the distribution since we use mean mismatch
        # error as the error function and lower is better
        terr = TransferMeasure(l_clf,
                               Repeater(count=2),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'),
                               null_dist=MCNullDist(permutator, tail='left'))

        # check reasonable error range
        err = terr(train)
        self.assertTrue(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf,
                               OddEvenPartitioner(),
                               null_dist=MCNullDist(permutator,
                                                    tail='left',
                                                    enable_ca=['dist_samples']),
                               postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.assertTrue(
                np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped if the classifier failed
        # to train due to a degenerate situation etc., hence account for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)
Example 20
    def test_nblocks(self):
        skip_if_no_external('pprocess')
        # just a basic test to see that we are getting the same
        # results with different nblocks
        ds = datasets['3dsmall'].copy(deep=True)[:, :13]
        ds.fa['voxel_indices'] = ds.fa.myspace
        cv = CrossValidation(GNB(), OddEvenPartitioner())
        res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
        res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
        assert_array_equal(res1, res2)
Example 21
def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):

    conf = read_configuration(path, conf_file, type)

    for arg in kwargs:
        conf[arg] = kwargs[arg]
        if arg == 'radius':
            radius = kwargs[arg]

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []

    for ds in ds_merged:

        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')

        sl_map = sl(ds)

        sl_map.samples *= -1
        sl_map.samples += 1

        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)

        maps.append(nif)

    datetime = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = datetime + '_' + analysis + '_' + mask + '_' + task
    command = 'mkdir ' + os.path.join(path, '0_results', new_dir)
    os.system(command)

    parent_dir = os.path.join(path, '0_results', new_dir)

    for s, map in zip(subjects, maps):
        name = s
        command = 'mkdir ' + os.path.join(parent_dir, name)
        os.system(command)

        results_dir = os.path.join(parent_dir, name)
        fname = name + '_radius_' + str(radius) + '_searchlight_map.nii.gz'
        map.to_filename(os.path.join(results_dir, fname))

    return maps
Example 22
    def test_cached_kernel_different_datasets(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k  = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidation(clf, NFoldPartitioner())
        cvte_ = CrossValidation(clf_, NFoldPartitioner())

        postproc = BinaryFxNode(mean_mismatch_error, 'targets')
        te = ProxyMeasure(clf, postproc=postproc)
        te_ = ProxyMeasure(clf_, postproc=postproc)

        for r in xrange(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            ok_(~ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(~ck._recomputed)
            assert_array_equal(errs2, errs2_)

            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            te.train(datasets['uni3small'][::2])
            terr = np.asscalar(te(datasets['uni3small'][ssel]))
            te_.train(datasets['uni3small'][::2])
            terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
            ok_(~ck._recomputed)
            ok_(terr == terr_)
Example 23
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    ds = _get_superord_dataset()

    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    # and then do your normal analysis, where the clf uses space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf,
                                   NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf,
                                 npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)
Example 24
    def test_cache_speedup(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidation(ck, NFoldPartitioner())
        cv_s = CrossValidation(sk, NFoldPartitioner())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2,
                                      perlabel=200,
                                      nchunks=10,
                                      means=np.random.randn(2, P),
                                      nfeatures=P)

        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds'%ncv_time
        #print 'Caching time: %s seconds'%cachetime
        #print 'Cached CV time: %s seconds'%ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s'%speedup

        # Speedup ideally should be 10, though it's not purely linear
        self.failIf(speedup < 2, 'Problem caching data - too slow!')
Example 25
def test_confusion_as_node():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0,
                                perlabel=42,
                                nchunks=3,
                                nonbogus_features=[0, 1],
                                nfeatures=2)
    clf = GNB()
    cv = CrossValidation(clf,
                         NFoldPartitioner(),
                         errorfx=None,
                         postproc=Confusion(labels=ds.UT),
                         enable_ca=['stats'])
    res = cv(ds)
    # needs to be identical to CA
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(clf,
                         NFoldPartitioner(),
                         errorfx=None,
                         postproc=ChainNode((Confusion(labels=ds.UT),
                                             BayesConfusionHypothesis())))
    res = cv(ds)
    # only two possible hypotheses with two classes
    assert_equals(len(res), 2)
    # the first hypothesis is that we can't discriminate anything
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and the hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert (np.e**res.samples[0, 0] < np.e**res.samples[1, 0])
Example 26
    def test_james_problem_multiclass(self):
        percent = 80
        dataset = datasets['uni4large']
        #dataset = dataset[:, dataset.a.nonbogus_features]

        rfesvm_split = LinearCSVMC()
        fs = \
            RFE(rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
                ])),
                ProxyMeasure(rfesvm_split,
                             postproc=BinaryFxNode(mean_mismatch_error,
                                                   'targets')),
                Splitter('train'),
                fselector=FractionTailSelector(
                    percent / 100.0,
                    mode='select', tail='upper'), update_sensitivity=True)

        clf = FeatureSelectionClassifier(
            LinearCSVMC(),
            # on features selected via RFE
            fs)

        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)

        class StoreResults(object):
            def __init__(self):
                self.storage = []

            def __call__(self, data, node, result):
                self.storage.append((node.measure.mapper.ca.history,
                                     node.measure.mapper.ca.errors)),

        cv_storage = StoreResults()
        cv = CrossValidation(clf,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             callback=cv_storage,
                             enable_ca=['stats'])
        #cv = SplitClassifier(clf)
        try:
            error = cv(dataset).samples.squeeze()
        except Exception as e:
            self.fail('CrossValidation cannot handle classifier with RFE '
                      'feature selection. Got exception: %s' % (e, ))
Example 27
def _test_gnb_overflow_haxby():  # pragma: no cover
    # example from https://github.com/PyMVPA/PyMVPA/issues/581
    # a heavier version of the above test
    import os
    import numpy as np

    from mvpa2.datasets.sources.native import load_tutorial_data
    from mvpa2.clfs.gnb import GNB
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import HalfPartitioner
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.detrend import poly_detrend
    from mvpa2.datasets.miscfx import remove_invariant_features
    from mvpa2.testing.datasets import *

    datapath = '/usr/share/data/pymvpa2-tutorial/'
    haxby = load_tutorial_data(datapath,
                               roi='vt',
                               add_fa={
                                   'vt_thr_glm':
                                   os.path.join(datapath, 'haxby2001',
                                                'sub001', 'masks', 'orig',
                                                'vt.nii.gz')
                               })
    # poly_detrend(haxby, polyord=1, chunks_attr='chunks')
    haxby = haxby[np.array(
        [
            l in ['rest', 'scrambled']  # 'house', 'face'
            for l in haxby.targets
        ],
        dtype='bool')]
    #zscore(haxby, chunks_attr='chunks', param_est=('targets', ['rest']),
    #       dtype='float32')
    # haxby = haxby[haxby.sa.targets != 'rest']
    haxby = remove_invariant_features(haxby)

    clf = GNB(enable_ca='estimates', logprob=True, normalize=True)

    #clf.train(haxby)
    #clf.predict(haxby)
    # estimates would be a bit "overfit" to judge from a train/predict on the same data

    cv = CrossValidation(clf,
                         HalfPartitioner(attr='chunks'),
                         postproc=None,
                         enable_ca=['stats'])

    cv_results = cv(haxby)
    res1_est = clf.ca.estimates
    print "Estimates:\n", res1_est
    print "Exp(estimates):\n", np.round(np.exp(res1_est), 3)
    assert np.all(np.isfinite(res1_est))
Example 28
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
        # check the default
        #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

        nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = te(ds).samples.squeeze()
        except Exception as e:
            self.fail("Failed with %s" % e)
Example 29
    def test_values(self, clf):
        if isinstance(clf, MulticlassClassifier):
            # TODO: handle those values correctly
            return
        ds = datasets['uni2small']
        clf.ca.change_temporarily(enable_ca = ['estimates'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
            enable_ca=['stats', 'training_stats'])
        _ = cv(ds)
        #print clf.descr, clf.values[0]
        # basic test that we get 1 set of values per sample
        self.assertEqual(len(clf.ca.estimates), ds.nsamples/2)

        clf.ca.reset_changed_temporarily()
Example 30
    def test_perturbation_sensitivity_analyzer(self):
        # compute N-1 cross-validation as datameasure
        cv = CrossValidation(sample_clf_lin, NFoldPartitioner())
        # do perturbation analysis using gaussian noise
        pa = NoisePerturbationSensitivity(cv, noise=np.random.normal)

        # run analysis
        map = pa(self.dataset)

        # check for correct size of map
        self.assertTrue(map.nfeatures == self.dataset.nfeatures)

        # dataset is noise -> mean sensitivity should be zero
        self.assertTrue(-0.2 < np.mean(map) < 0.2)