Beispiel #1
0
    def test_nfold_random_counted_selection_partitioner(self):
        """Check fractional cvtype and random, count-limited partitioning.

        The exhaustive set of partitionings for a cvtype of 0.5 serves as
        the reference which every randomly selected partitioning must
        belong to.
        """
        def collect(partitioner):
            # hashable snapshot of the partitions attribute of each split
            return [tuple(split.sa.partitions)
                    for split in partitioner.generate(self.data)]

        # Somewhat extensive but complete run: 0.5 must correspond to 50%,
        # in our case 5 out of 10 unique chunks
        exhaustive = collect(NFoldPartitioner(0.5))
        # 252 is # of combinations of 5 from 10
        assert_equal(len(exhaustive), 252)

        # and none of them may repeat
        exhaustive_set = set(exhaustive)
        assert_equal(len(exhaustive_set), 252)

        # now limit the query to 10 randomly selected partitionings,
        # requested once by absolute count and once by fraction
        limits = dict(count=10, selection_strategy='random')
        by_count = collect(NFoldPartitioner(5, **limits))
        by_fraction = collect(NFoldPartitioner(0.5, **limits))

        # to make sure that sets of tuples are dealt with correctly:
        assert_equal(len(set(by_count)), 10)
        assert_equal(len(by_count), 10)
        assert_equal(len(by_fraction), 10)
        # the two random draws must differ (identical ones are possible
        # but very very unlikely)
        assert_not_equal(by_count, by_fraction)
        # but every one of them must be within the known exhaustive set
        assert_equal(exhaustive_set.intersection(by_count),
                     set(by_count))
        assert_equal(exhaustive_set.intersection(by_fraction),
                     set(by_fraction))
Beispiel #2
0
    def test_custom_targets(self, lrn):
        """Simple test if a learner could cope with custom sa not targets

        Cross-validates `lrn` on a dataset carrying 'targets', and a clone
        of it (switched to the 'custom' space) on a copy of that dataset
        whose 'targets' attribute was moved to 'custom'.  The test fails
        if either cross-validation raises.
        """

        # Two learners are exercised side by side, so if they depend on
        # some random seed they must use the same value.  Currently we
        # have such stochastic behavior in SMLR
        if 'seed' in lrn.params:
            from mvpa2 import _random_seed
            lrn = lrn.clone()  # clone the beast
            lrn.params.seed = _random_seed  # reuse the same seed
        lrn_ = lrn.clone()
        lrn_.set_space('custom')

        te = CrossValidation(lrn, NFoldPartitioner())
        te_ = CrossValidation(lrn_, NFoldPartitioner())
        # 4 classes for learners tagged 'multiclass', 2 otherwise
        nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
        # regressions get the 'sin_modulated' dataset instead
        dsname = ('uni%dsmall' % nclasses,
                  'sin_modulated')[int(lrn.__is_regression__)]
        ds = datasets[dsname]
        ds_ = ds.copy()
        ds_.sa['custom'] = ds_.sa['targets']
        ds_.sa.pop('targets')
        self.assertTrue('targets' in ds.sa,
                        msg="'targets' should remain in original ds")

        try:
            te(ds)
            te_(ds_)
        except Exception as e:
            # "except ... as" is valid on Python 2.6+ and Python 3, unlike
            # the former "except Exception, e" spelling
            self.fail("Failed with %r" % e)
Beispiel #3
0
    def test_noise_classification(self):
        """Cross-validate on clean data, then again with permuted targets."""
        # get a dataset with a very high SNR
        dataset = get_mv_pattern(10)

        # crossval with default errorfx and 'mean' combiner
        clean_cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
        clean_errors = clean_cv(dataset)
        # must be perfect
        self.assertTrue((clean_errors.samples < 0.05).all())

        # crossval with permuted regressors: every partitioning is
        # followed by 10 permutations of the targets
        permuting_partitioner = ChainNode(
            [NFoldPartitioner(),
             AttributePermutator('targets', count=10)],
            space='partitions')
        permuted_cv = CrossValidation(sample_clf_nl, permuting_partitioner)
        permuted_errors = permuted_cv(dataset)

        # results must not be the same
        n_distinct = len(np.unique(permuted_errors.samples))
        self.assertTrue(n_distinct > 1)

        # must be at chance level
        mean_error = np.array(permuted_errors).mean()
        self.assertTrue(0.42 < mean_error < 0.58)
Beispiel #4
0
def test_sifter_with_balancing():
    """Sifter must keep only partitionings whose partition 2 is balanced."""
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks': [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']})

    # Without sifter -- just to assure that we do get all of them,
    # i.e. 6*5*4*3/(4!) = 15
    unsifted = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(unsifted.generate(ds))), 15)

    # an unknown option in the attribute specification must be refused
    def sift_with_wrong_option(dataset):
        return list(Sifter([('targets', dict(wrong=1))]).generate(dataset))

    assert_raises(ValueError, sift_with_wrong_option, ds)

    # so we will take 4 chunks out of the available 6, but would care only
    # about those partitions where we have balanced number of 'c' and 'p'
    # entries
    sifted = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2),
                ('targets', dict(uvalues=['c', 'p'], balanced=True))]),
    ])
    sifted_dss = list(sifted.generate(ds))
    assert_equal(len(sifted_dss), 9)
    for partitioned in sifted_dss:
        labels = partitioned.sa.partitions
        testing = ds[labels == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[labels == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
def blocked_detection_n_equals_1(mech_vec_list, mech_nm_list):
    """Compute (and plot) Ms for the first leave-one-target-out split.

    A dataset is built from `mech_vec_list` and `mech_nm_list` via
    ``mar.create_blocked_dataset_semantic_classes``, partitioned with an
    ``NFoldPartitioner(cvtype=1, attr='targets')`` and split on the
    'partitions' attribute.  For the first split ``mar.compute_Ms`` is
    called with ``plot=True``; the loop then stops.  Nothing is returned.
    """
    data, _ = mar.create_blocked_dataset_semantic_classes(
        mech_vec_list, mech_nm_list, append_robot=False)
    nfs = NFoldPartitioner(cvtype=1, attr='targets')  # 1-fold ?
    spl = splitters.Splitter(attr='partitions')
    splits = [list(spl.generate(x)) for x in nfs.generate(data)]

    mean_thresh_known_mech_dict = {}
    for l_wdata, l_vdata in splits:
        Ms = mar.compute_Ms(data, l_vdata.targets[0], plot=True)
        # NOTE(review): this unconditional break makes the remainder of
        # the loop body unreachable.  It is kept as found, since removing
        # it would change what the function does -- confirm the intent.
        break

        mechs = l_vdata.uniquechunks
        for m in mechs:
            n_std = 0.
            all_trials = l_vdata.samples[np.where(l_vdata.chunks == m)]
            le = all_trials.shape[1]
            for i in range(all_trials.shape[0]):
                one_trial = all_trials[i, :].reshape(1, le)
                mn_list, std_list = mar.estimate_theta(one_trial, Ms,
                                                       plot=False)
                mn_arr, std_arr = np.array(mn_list), np.array(std_list)
                n_std = max(n_std,
                            np.max(np.abs(all_trials - mn_arr) / std_arr))

            # store on a per mechanism granularity
            mean_thresh_known_mech_dict[m] = (Ms, n_std)
            # single pre-formatted strings print identically on
            # Python 2 and Python 3
            print('n_std for %s : %s' % (m, n_std))
            print('max error force for %s : %s'
                  % (m, np.max(n_std * std_arr[2:])))
Beispiel #6
0
def test_chained_crossvalidation_searchlight():
    """Compare a mapper used via MappedClassifier with one in a ChainMapper.

    Both searchlights must produce identical results.
    """
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper, Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which would take care about standardizing
        all features within each sample separately
        """
        def _forward_data(self, data):
            # per-sample statistics, shaped as columns for broadcasting
            centers = np.mean(data, axis=1)[:, None]
            scales = np.std(data, axis=1)[:, None]
            return (data - centers) / scales

    ds = datasets['3dlarge'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace
    gnb = GNB()  # fast and deterministic

    # only do partial to save time
    searchlight_args = dict(radius=2, center_ids=[3, 50])

    # variant 1: the mapper is wrapped together with the classifier
    mapped_clf = MappedClassifier(gnb, ZScoreFeaturesMapper())
    mapped_cv = CrossValidation(mapped_clf, NFoldPartitioner())
    mapped_sl = sphere_searchlight(mapped_cv, **searchlight_args)
    results_mapped = mapped_sl(ds)

    # variant 2: the mapper precedes the cross-validation in a chain
    chained_cv = ChainMapper(
        [ZScoreFeaturesMapper(auto_train=True),
         CrossValidation(gnb, NFoldPartitioner())])
    chained_sl = sphere_searchlight(chained_cv, **searchlight_args)
    results_chained = chained_sl(ds)

    assert_array_equal(results_mapped, results_chained)
Beispiel #7
0
    def test_slicing(self):
        """Check which partitioned/split datasets are views of their source."""

        def split_each(partitioner, splitter, dataset):
            # split every partitioned dataset generated for `dataset`
            return [list(splitter.generate(partitioned))
                    for partitioned in partitioner.generate(dataset)]

        halver = HalfPartitioner()
        slicing_splitter = Splitter(attr="partitions")
        for partitioned in list(halver.generate(self.data)):
            # partitioned dataset shared the data
            assert_true(partitioned.samples.base is self.data.samples)
        half_splits = split_each(halver, slicing_splitter, self.data)

        # with numpy 1.7.0b1 "chaining" of .base was deprecated, so pick
        # the check appropriate for the numpy at hand
        probe = np.arange(5)
        view_of_view = probe[:4][:3]
        if view_of_view.base is probe:
            # 1.7.0b1
            def shares_base(x, base=self.data.samples):
                return x.base is base

        elif view_of_view.base.base is probe:
            # prior 1.7.0b1
            def shares_base(x, base=self.data.samples):
                return x.base.base is base

        else:
            raise RuntimeError("Uknown handling of .base by numpy")

        for pair in half_splits:
            # we get slicing all the time
            assert_true(shares_base(pair[0].samples))
            assert_true(shares_base(pair[1].samples))

        copying_splitter = Splitter(attr="partitions", noslicing=True)
        for pair in split_each(halver, copying_splitter, self.data):
            # no slicing at all
            assert_false(pair[0].samples.base is self.data.samples)
            assert_false(pair[1].samples.base is self.data.samples)

        nfold = NFoldPartitioner()
        slicing_splitter = Splitter(attr="partitions")
        nfold_splits = split_each(nfold, slicing_splitter, self.data)
        last = len(nfold_splits) - 1
        for index, pair in enumerate(nfold_splits):
            # training only first and last split
            if index in (0, last):
                assert_true(shares_base(pair[0].samples))
            else:
                assert_true(pair[0].samples.base is None)
            # we get slicing all the time
            assert_true(shares_base(pair[1].samples))

        step_ds = Dataset(np.random.randn(20, 2),
                          sa={"chunks": np.tile([0, 1], 10)})
        odd_even = OddEvenPartitioner()
        slicing_splitter = Splitter(attr="partitions")
        for partitioned in list(odd_even.generate(step_ds)):
            # partitioned dataset shared the data
            assert_true(partitioned.samples.base is step_ds.samples)
        step_splits = split_each(odd_even, slicing_splitter, step_ds)
        assert_equal(len(step_splits), 2)
        for pair in step_splits:
            # we get slicing all the time
            assert_true(shares_base(pair[0].samples, step_ds.samples))
            assert_true(shares_base(pair[1].samples, step_ds.samples))
Beispiel #8
0
    def test_slicing(self):
        """Check which partitioned/split datasets are views of their source.

        All "is it a view of the source?" checks on split datasets go
        through ``is_the_same_base``, which is selected according to how
        the running numpy reports ``.base`` for a view of a view.
        """
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(hs.generate(self.data))
        for s in splits:
            # partitioned dataset shared the data
            assert_true(s.samples.base is self.data.samples)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        # with numpy 1.7.0b1 "chaining" was deprecated so let's create
        # check function appropriate for the given numpy version
        _a = np.arange(5)
        __a = _a[:4][:3]
        if __a.base is _a:
            # 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base is base
        elif __a.base.base is _a:
            # prior 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base.base is base
        else:
            raise RuntimeError("Uknown handling of .base by numpy")

        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples))
            assert_true(is_the_same_base(s[1].samples))
        spl = Splitter(attr='partitions', noslicing=True)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        for s in splits:
            # we no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        nfs = NFoldPartitioner()
        spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
        for i, s in enumerate(splits):
            # training only first and last split
            if i == 0 or i == len(splits) - 1:
                assert_true(is_the_same_base(s[0].samples))
            else:
                assert_true(s[0].samples.base is None)
            # we get slicing all the time; use the version-aware check
            # rather than a hard-coded ".base.base" chain
            assert_true(is_the_same_base(s[1].samples))
        step_ds = Dataset(np.random.randn(20, 2),
                          sa={'chunks': np.tile([0, 1], 10)})
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(oes.generate(step_ds))
        for s in splits:
            # partitioned dataset shared the data
            assert_true(s.samples.base is step_ds.samples)
        splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time (version-aware check again)
            assert_true(is_the_same_base(s[0].samples, step_ds.samples))
            assert_true(is_the_same_base(s[1].samples, step_ds.samples))
Beispiel #9
0
def test_sifter_superord_usecase():
    """Cross-validate 'superord' labels with plain and sifted partitioning."""
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    def accuracy(p, t):
        # fraction of predictions matching the targets
        return np.mean(p == t)

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]
    # override original targets just to be sure we aren't relying on them
    ds.targets[:] = 0

    # so we split based on superord
    subord_partitioner = NFoldPartitioner(len(ds.sa['superord'].unique),
                                          attr='subord')
    # so it should select only those splits where we took 1 from
    # each of the superord categories leaving things in balance
    balanced_sifter = Sifter([
        ('partitions', 2),
        ('superord', {'uvalues': ds.sa['superord'].unique,
                      'balanced': True}),
    ])
    npart = ChainNode([subord_partitioner, balanced_sifter],
                      space='partitions')

    # and then do your normal where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=accuracy)
    cvte_super = CrossValidation(clf, npart, errorfx=accuracy)

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert len(accs_super) == 8
    # I don't think that this would ever fail, so not marking it labile
    assert np.mean(accs_regular) > .8
    assert np.mean(accs_super) < .6
Beispiel #10
0
 def test_gnbsearchlight_matchaccuracy(self):
     """Searchlight errors must equal one minus the matching accuracies."""
     # was not able to deal with custom errorfx collapsing samples
     # after 55e147e0bd30fbf4edede3faef3a15c6c65b33ea
     dataset = datasets['3dmedium'].copy()
     dataset.fa['voxel_indices'] = dataset.fa.myspace

     # same searchlight twice: default errorfx vs. explicit accuracy
     error_sl = sphere_gnbsearchlight(
         GNB(), NFoldPartitioner(cvtype=1), radius=0)
     accuracy_sl = sphere_gnbsearchlight(
         GNB(), NFoldPartitioner(cvtype=1), radius=0,
         errorfx=mean_match_accuracy)

     errors = error_sl(dataset)
     accuracies = accuracy_sl(dataset).samples
     assert_array_almost_equal(errors, 1.0 - accuracies)
Beispiel #11
0
    def test_simplest_cv_pat_gen(self):
        """Leave-one-chunk-out must give 10 folds of 90/10 samples."""
        # create the generator
        partitioner = NFoldPartitioner(cvtype=1)
        splitter = Splitter(attr='partitions')
        # now get the xval pattern sets (one-fold CV)
        folds = [list(splitter.generate(partitioned))
                 for partitioned in partitioner.generate(self.data)]

        self.assertTrue(len(folds) == 10)

        for fold_index, fold in enumerate(folds):
            self.assertTrue(len(fold) == 2)
            first_part, second_part = fold
            self.assertTrue(first_part.nsamples == 90)
            self.assertTrue(second_part.nsamples == 10)
            # the i-th fold holds out the chunk labelled i
            self.assertTrue(second_part.chunks[0] == fold_index)
Beispiel #12
0
    def test_simplest_cv_pat_gen(self):
        """Leave-one-chunk-out must give 10 folds of 90/10 samples.

        Uses ``assertEqual`` instead of the ``failUnless`` alias, which
        was deprecated and is no longer available in recent Python
        versions.
        """
        # create the generator
        nfs = NFoldPartitioner(cvtype=1)
        spl = Splitter(attr='partitions')
        # now get the xval pattern sets (one-fold CV)
        xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

        self.assertEqual(len(xvpat), 10)

        for i, p in enumerate(xvpat):
            self.assertEqual(len(p), 2)
            self.assertEqual(p[0].nsamples, 90)
            self.assertEqual(p[1].nsamples, 10)
            # the i-th fold holds out the chunk labelled i
            self.assertEqual(p[1].chunks[0], i)
Beispiel #13
0
def test_exclude_targets_combinations_subjectchunks():
    """Chain subject-wise partitioning with exclusion of chunk combinations.

    Every generated partitioning must test on exactly one (subject, chunk)
    pair, neither of which may show up in the training portion, and all
    9 pairs must be covered.
    """
    partitioner = ChainNode([NFoldPartitioner(attr='subjects'),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=1,
                                 targets_attr='chunks',
                                 space='partitions')],
                            space='partitions')
    # targets do not need even to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so implicit test here: .item() raises unless
        # there is exactly one unique value.  (np.asscalar, used here
        # before, is deprecated and absent from recent NumPy.)
        testing_subj = np.unique(ds_.sa.subjects[testing_partition]).item()
        testing_subjs.append(testing_subj)
        testing_chunk = np.unique(ds_.sa.chunks[testing_partition]).item()
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(testing_subj not in ds_.sa.subjects[training_partition])
        ok_(testing_chunk not in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3, 3))))))
Beispiel #14
0
    def test_classifier_generalization(self, clf):
        """Simple test if classifiers can generalize ok on simple data
        """
        cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())

        # 4 classes for classifiers tagged 'multiclass', 2 otherwise
        nclasses = 4 if 'multiclass' in clf.__tags__ else 2

        ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
        try:
            cve = cv(ds).samples.squeeze()
        except Exception as e:
            self.fail("Failed with %s" % e)

        # the error threshold is only enforced when labile tests are on
        if not cfg.getboolean('tests', 'labile', default='yes'):
            return

        if nclasses > 2:
            descr = clf.descr
            barely_testable = (
                (descr is not None and 'on 5%(' in descr)
                or 'regression_based' in clf.__tags__)
            if barely_testable:
                # skip those since they are barely applicable/testable here
                raise SkipTest("Skip testing of cve on %s" % clf)

        self.assertTrue(
            cve < 0.25,  # TODO: use multinom distribution
            msg="Got transfer error %g on %s with %d labels" %
            (cve, ds, len(ds.UT)))
Beispiel #15
0
def test_splitclf_sensitivities():
    """Sensitivities of a SplitClassifier must follow the data it saw.

    Two datasets with different non-bogus features are analyzed by the
    same analyzer: folds must differ from each other, results must differ
    between the datasets, and the two strongest features must be the
    non-bogus ones.
    """
    # `range` (rather than the Python 2-only `xrange`) keeps this
    # runnable on both Python 2 and Python 3
    datasets = [
        normal_feature_dataset(perlabel=100,
                               nlabels=2,
                               nfeatures=4,
                               nonbogus_features=[0, i + 1],
                               snr=1,
                               nchunks=2) for i in range(2)
    ]

    sclf = SplitClassifier(SMLR(), NFoldPartitioner())
    analyzer = sclf.get_sensitivity_analyzer()

    senses1 = analyzer(datasets[0])
    senses2 = analyzer(datasets[1])

    for senses in senses1, senses2:
        # This should be False when comparing two folds
        assert_false(np.allclose(senses.samples[0], senses.samples[2]))
        assert_false(np.allclose(senses.samples[1], senses.samples[3]))
    # Moreover with new data we should have got different results
    # (i.e. it must retrained correctly)
    for s1, s2 in zip(senses1, senses2):
        assert_false(np.allclose(s1, s2))

    # and we should have "selected" "correct" voxels
    for i, senses in enumerate((senses1, senses2)):
        assert_equal(set(np.argsort(np.max(np.abs(senses), axis=0))[-2:]),
                     set((0, i + 1)))
Beispiel #16
0
    def test_split_clf_on_chainpartitioner(self):
        """Smoke test: SplitClassifier driven by a chained partitioner."""
        # pretty much a smoke test for #156
        ds = datasets['uni2small']
        nfold = NFoldPartitioner(cvtype=1)
        balancer = Balancer(attr='targets',
                            count=2,
                            limit='partitions',
                            apply_selection=True)
        chained = ChainNode([nfold, balancer])
        n_partitions = len(list(chained.generate(ds)))

        sclf = SplitClassifier(sample_clf_lin,
                               chained,
                               enable_ca=['stats', 'splits'])
        sclf.train(ds)
        predictions = sclf.predict(ds)
        assert_equal(len(predictions), len(ds))  # rudimentary check
        # one recorded split and one trained classifier per partitioning
        assert_equal(len(sclf.ca.splits), n_partitions)
        assert_equal(len(sclf.clfs), n_partitions)

        # now let's do sensitivity analyzer just in case
        sclf.untrain()
        sensitivities = sclf.get_sensitivity_analyzer()(ds)
        # basic check that sensitivities varied across splits
        from mvpa2.mappers.fx import FxMapper
        std_mapper = FxMapper('samples', np.std, uattrs=['targets'])
        sens_stds = std_mapper(sensitivities)
        assert_true(np.any(sens_stds != 0))
Beispiel #17
0
    def test_split_classifier_extended(self, clf_):
        """Compare SplitClassifier stats with CrossValidation of a clone."""
        clf2 = clf_.clone()
        ds = datasets['uni2%s' % self._get_clf_ds(clf2)]

        # the split classifier wraps the original learner ...
        clf = SplitClassifier(clf=clf_,
                              enable_ca=['stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error

        # ... while the cross-validation works on its clone
        cv = CrossValidation(clf2,
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()

        if 'non-deterministic' not in clf.__tags__:
            self.assertTrue(
                abs(error - cverror) < 0.01,
                msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.25,
                            msg="clf should generalize more or less fine. "
                            "Got error %s" % error)

        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
Beispiel #18
0
def _test_mcasey20120222():  # pragma: no cover
    """Cross-validate a classifier behind a sample-averaging mapper chain."""
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    # average samples per (target, chunk) group, then apply SVD
    averaging_svd = ChainMapper(
        [mean_group_sample(['targets', 'chunks']), SVDMapper()])
    mapped_clf = MappedClassifier(LinearCSVMC(), averaging_svd)
    cvte = CrossValidation(mapped_clf,
                           NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])

    # 8 samples: two chunks, each holding two samples per target
    ds = dataset_wizard(samples=np.arange(32).reshape((8, -1)),
                        targets=[1, 1, 2, 2, 1, 1, 2, 2],
                        chunks=[1, 1, 1, 1, 2, 2, 2, 2])

    errors = cvte(ds)
Beispiel #19
0
 def test_cached_qe_gnbsearchlight(self):
     """Running a GNB searchlight must populate the cached query engine."""
     dataset = datasets['3dsmall'].copy(deep=True)
     caching_engine = CachedQueryEngine(IndexQueryEngine(myspace=Sphere(2)))
     searchlight = GNBSearchlight(GNB(), NFoldPartitioner(),
                                  qe=caching_engine)
     searchlight(dataset)
     # ids must have been set by the run
     assert_false(caching_engine.ids is None)
Beispiel #20
0
def test_multiclass_classifier_pass_ds_attributes():
    """Attributes listed in `pass_attr` must show up in the CV output.

    Runs a cross-validation without an error function, so that the
    passed-through attributes can be compared against the source dataset.
    """
    # TODO: replicate/extend basic testing of pass_attr
    #       in some more "basic" test_*
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(
        clf,
        pass_attr=[
            'ids',
            'sa.chunks',
            'a.bogus_features',
            # 'ca.raw_estimates' # this one is binary_clf x samples list ATM
            # that is why raw_predictions_ds was born
            'ca.raw_predictions_ds',
            'ca.estimates',  # this one is ok
            'ca.predictions',
        ],
        enable_ca=['all'])
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_array_equal(sorted(res.sa.ids), ds.sa.ids)
    assert_array_equal(res.chunks, ds.chunks[res.sa.ids])
    assert_array_equal(res.sa.predictions, res.samples[:, 0])
    # Floor division keeps the repeat count an integer on Python 3 as
    # well; true division would hand np.repeat a float
    assert_array_equal(res.sa.cvfolds,
                       np.repeat(range(len(ds.UC)),
                                 len(ds) // len(ds.UC)))
Beispiel #21
0
def test_exclude_targets_combinations():
    """Each chunk fold must be paired with every combination of 2 targets."""
    nfold = NFoldPartitioner()
    excluder = ExcludeTargetsCombinationsPartitioner(
        k=2, targets_attr='targets', space='partitions')
    partitioner = ChainNode([nfold, excluder], space='partitions')

    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)

    splitter = Splitter('partitions')
    target_combs = []
    target_chunk_combs = []
    for partitioned in partitions:
        # only the second split dataset is inspected
        _, testing = list(splitter.generate(partitioned))[:2]
        comb = tuple(np.unique(testing.targets))
        target_combs.append(comb)
        target_chunk_combs.append(comb + tuple(np.unique(testing.chunks)))

    # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(target_combs)), 6)
    # all unique
    assert_equal(len(set(target_chunk_combs)), 3 * 6)
Beispiel #22
0
    def test_split_classifier(self):
        """SplitClassifier and CrossValidation must report the same stats."""
        ds = self.data_bin_1
        clf = SplitClassifier(
            clf=SameSignClassifier(),
            enable_ca=['stats', 'training_stats', 'feature_ids'])
        clf.train(ds)  # train the beast
        error = clf.ca.stats.error
        tr_error = clf.ca.training_stats.error

        # cross-validate a clone of the split classifier
        cv = CrossValidation(clf.clone(),
                             NFoldPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])
        cverror = cv(ds).samples.squeeze()
        tr_cverror = cv.ca.training_stats.error

        self.assertEqual(
            error,
            cverror,
            msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

        self.assertEqual(
            tr_error,
            tr_cverror,
            msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

        self.assertEqual(clf.ca.stats.percent_correct,
                         100,
                         msg="Dummy clf should train perfectly")
        # CV and SplitClassifier should get the same confusion matrices
        assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

        self.assertEqual(len(clf.ca.stats.sets),
                         len(ds.UC),
                         msg="Should have 1 confusion per each split")
        self.assertEqual(
            len(clf.clfs),
            len(ds.UC),
            msg="Should have number of classifiers equal # of epochs")
        self.assertEqual(clf.predict(ds.samples),
                         list(ds.targets),
                         msg="Should classify correctly")

        # feature_ids used to be checked here as a list of lists with all
        # features utilized (no feature selection is involved).
        #  NOT ANYMORE -- for BoostedClassifier we have now union of all
        #  used features across slave classifiers. That makes
        #  semantics clear. If you need to get deeper -- use upcoming
        #  harvesting facility ;-)
        # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
        # self.assertTrue(np.array([len(ids)==ds.nfeatures
        #                         for ids in clf.feature_ids]).all())

        # Just check if we get it at all ;-)
        summary = clf.summary()
Beispiel #23
0
def test_permute_superord():
    """Check that the chained generator really permutes ``superord``.

    Every dataset produced by the NFoldPartitioner -> Sifter ->
    AttributePermutator chain must differ from the source dataset in at
    least one ``superord`` value.
    """
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)

    # partition on 'subord', parametrized with the number of unique
    # 'superord' values
    nfold = NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord')
    # sift for the splits where partition 2 holds a balanced set of all
    # 'superord' values
    balance_spec = {'uvalues': ds.sa['superord'].unique, 'balanced': True}
    sifter = Sifter([('partitions', 2), ('superord', balance_spec)])
    # permute 'superord', limited by 'partitions' and 'chunks'
    shuffler = AttributePermutator(['superord'],
                                   limit=['partitions', 'chunks'])
    generator = ChainNode([nfold, sifter, shuffler], space='partitions')

    for permuted in generator.generate(ds):
        n_changed = np.sum(permuted.sa.superord != ds.sa.superord)
        # a real permutation must have moved at least one label
        assert n_changed != 0
Beispiel #24
0
 def test_unpartitioned_cv(self):
     """Cross-validation over a single-chunk dataset must raise ValueError."""
     single_chunk_ds = get_mv_pattern(10)
     # collapse all samples into one and the same chunk
     single_chunk_ds.sa.chunks[:] = 1
     crossval = CrossValidation(sample_clf_nl, NFoldPartitioner())
     # with a single chunk there is no way to obtain both a training and
     # a testing portion, hence the call has to fail
     assert_raises(ValueError, crossval, single_chunk_ds)
Beispiel #25
0
    def test_split_featurewise_dataset_measure(self):
        """Check shapes of SMLR sensitivities computed via RepeatedMeasure."""
        small_ds = datasets['uni3small']
        analyzer = SMLR(fit_all_weights=True).get_sensitivity_analyzer()
        fold_generator = ChainNode(
            [NFoldPartitioner(),
             Splitter('partitions', attr_values=[1])])
        per_fold_measure = RepeatedMeasure(analyzer, fold_generator)

        fold_sens = per_fold_measure(small_ds)
        # one sensitivity per (chunk, label) combination
        n_chunks = len(small_ds.sa['chunks'].unique)
        n_targets = len(small_ds.sa['targets'].unique)
        assert_equal(fold_sens.shape,
                     (n_chunks * n_targets, small_ds.nfeatures))

        # A more complex example: repeated measure on balanced subsets
        medium_ds = datasets['uni3medium']
        medium_ds.init_origids('samples')
        balanced_measure = RepeatedMeasure(
            SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
            Balancer(amount=0.25, count=2, apply_selection=True),
            enable_ca=['datasets', 'repetition_results'])
        balanced_sens = balanced_measure(medium_ds)

        expected_shape = (2 * len(medium_ds.sa['targets'].unique),
                          medium_ds.nfeatures)
        assert_equal(balanced_sens.shape, expected_shape)

        subsets = balanced_measure.ca.datasets
        self.assertEqual(len(subsets), 2)
        # each subset holds a quarter of the samples
        quarter = medium_ds.nsamples // 4
        self.assertTrue(
            np.all([subset.nsamples == quarter for subset in subsets]))
        # the two subsets must not consist of exactly the same samples
        origids_differ = subsets[0].sa.origids != subsets[1].sa.origids
        self.assertTrue(np.any([origids_differ]))
        # and the resulting sensitivities must differ as well
        self.assertTrue(np.any(balanced_sens[0] != balanced_sens[3]))
Beispiel #26
0
def _test_edmund_chong_20120907():  # pragma: no cover
    """Cross-validation with a Monte-Carlo null distribution on 'uni2small'.

    The null distribution is estimated by repeating a cross-validation in
    which 'targets' are permuted within partition 1.
    """
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets
    repetitions = Repeater(count=20)

    # leave-one-out partitioning followed by balancing of the targets
    nfold = NFoldPartitioner(cvtype=1)
    balancer = Balancer(attr='targets',
                        count=1,  # for real data > 1
                        limit='partitions',
                        apply_selection=True)
    balanced_partitioner = ChainNode([nfold, balancer], space='partitions')

    classifier = LinearCSVMC()  # choice of classifier
    target_shuffler = AttributePermutator('targets',
                                          limit={'partitions': 1},
                                          count=1)

    # cross-validation run on permuted targets -- feeds the null distribution
    permuting_generator = ChainNode(
        [balanced_partitioner, target_shuffler],
        space=balanced_partitioner.get_space())
    null_cv = CrossValidation(classifier,
                              permuting_generator,
                              errorfx=mean_mismatch_error)
    null_estimator = MCNullDist(repetitions,
                                tail='left',
                                measure=null_cv,
                                enable_ca=['dist_samples'])

    # the actual cross-validation, assessed against the null distribution
    cvte = CrossValidation(classifier,
                           balanced_partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=null_estimator,
                           enable_ca=['stats'])
    cv_errors = cvte(datasets['uni2small'])
Beispiel #27
0
def test_multiclass_without_combiner():
    """Cross-validate a MulticlassClassifier that has no combiner.

    With ``combiner=None`` and ``errorfx=None`` the cross-validation result
    is expected to carry one feature per pair of classes and one sample
    per input sample.
    """
    # The goal is to obtain all pairwise results as the resultant dataset
    # avoiding even calling any combiner
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(clf, combiner=None)

    # without combining results at all
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_equal(len(res), len(ds))
    assert_equal(res.nfeatures, 3)  # 3 pairs for 3 classes
    assert_array_equal(res.UT, ds.UT)
    assert_array_equal(np.unique(np.array(res.fa.targets.tolist())), ds.UT)
    # TODO -- check that we have all the pairs?
    assert_array_equal(res.sa['cvfolds'].unique, np.arange(len(ds.UC)))
    if mcv.ca.is_enabled('training_stats'):
        # we must have received a dictionary per each pair
        training_stats = mcv.ca.training_stats
        assert_equal(set(training_stats.keys()),
                     {('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')})
        # .items() is available on both Python 2 and 3, unlike .iteritems()
        for pair, cm in training_stats.items():
            assert_array_equal(cm.labels, ds.UT)
            # we should have no predictions for absent label
            assert_array_equal(cm.matrix[~np.in1d(ds.UT, pair)], 0)
            # while altogether all samples were processed once
            assert_array_equal(cm.stats['P'], len(ds))
            # and number of sets should be equal number of chunks here
            assert_equal(len(cm.sets), len(ds.UC))
Beispiel #28
0
def test_multiclass_classifier_cv(clf, ds):
    """Compare cross-validation of *clf* with its MulticlassClassifier wrap.

    Extends test_clf.py:ClassifiersTests.test_multiclass_classifier: the
    errors and training stats obtained with the wrapped classifier must
    match the ones obtained with the classifier used directly.
    """
    native_clf = clf.clone()
    native_clf.params.C = 1  # so it doesn't auto-adjust
    wrapped_clf = MulticlassClassifier(clf=native_clf.clone())

    # one partitioner instance is shared by both cross-validations
    partitioner = NFoldPartitioner()
    native_cv = CrossValidation(native_clf, partitioner,
                                enable_ca=['stats', 'training_stats'])
    wrapped_cv = CrossValidation(wrapped_clf, partitioner,
                                 enable_ca=['stats', 'training_stats'])

    native_errors = native_cv(ds)
    wrapped_errors = wrapped_cv(ds)

    # errors should be the same
    assert_array_equal(native_errors, wrapped_errors)
    assert_equal(str(native_cv.ca.training_stats),
                 str(wrapped_cv.ca.training_stats))

    if len(ds.UT) != 2:
        assert_equal(str(native_cv.ca.stats), str(wrapped_cv.ca.stats))
    else:
        # if it was a binary task, the native stats would also have AUC
        # column while the wrapped ones would not  :-/  TODO
        # so just compare the matrix and ACC
        native_stats = native_cv.ca.stats
        wrapped_stats = wrapped_cv.ca.stats
        assert_array_equal(native_stats.matrix, wrapped_stats.matrix)
        assert_equal(native_stats.stats['ACC'], wrapped_stats.stats['ACC'])
Beispiel #29
0
    def __test_matthias_question(self):
        """Cross-validate an RFE feature-selection classifier with a null dist.

        Asserts an error below 0.4 and a null probability below 0.05 on the
        'uni2small' dataset.
        """
        rfe_clf = LinearCSVMC(C=1)
        rfesvm_split = SplitClassifier(rfe_clf)

        # Pieces are created in the same order in which a single nested
        # constructor call would evaluate them.
        final_clf = LinearCSVMC(C=1)
        sensitivity_analyzer = rfesvm_split.get_sensitivity_analyzer(
            combiner=first_axis_mean,
            transformer=np.abs)
        transfer_error = ConfusionBasedError(rfesvm_split,
                                             confusion_state="confusion")
        stopping_criterion = FixedErrorThresholdStopCrit(0.20)
        feature_selector = FractionTailSelector(0.2,
                                                mode='discard',
                                                tail='lower')
        rfe = RFE(sensitivity_analyzer=sensitivity_analyzer,
                  transfer_error=transfer_error,
                  stopping_criterion=stopping_criterion,
                  feature_selector=feature_selector,
                  update_sensitivity=True)
        selecting_clf = FeatureSelectionClassifier(clf=final_clf,
                                                   feature_selection=rfe)

        n_permutations = 1000
        target_shuffler = AttributePermutator('targets',
                                              count=n_permutations)
        crossval = CrossValidation(
            selecting_clf,
            NFoldPartitioner(),
            null_dist=MCNullDist(target_shuffler, tail='left'),
            enable_ca=['stats'])
        cv_error = crossval(datasets['uni2small'])
        self.assertTrue(cv_error < 0.4)
        self.assertTrue(crossval.ca.null_prob < 0.05)
Beispiel #30
0
    def test_counted_splitting(self):
        """Check ``count``-limited NFoldPartitioner for every strategy.

        For each selection strategy and a range of ``count`` values this
        verifies the number of generated splits, the number of partition
        specs, the ``lastpartitionset`` flag of every split dataset and the
        chunks chosen by the strategy.
        """
        spl = Splitter(attr='partitions')
        # count > #chunks, should result in 10 splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [(nchunks * 2, nchunks), (nchunks, nchunks),
                                  (nchunks - 1, nchunks - 1), (3, 3), (0, 0),
                                  (1, 1)]:
                nfs = NFoldPartitioner(cvtype=1,
                                       count=count,
                                       selection_strategy=strategy)
                splits = [
                    list(spl.generate(p)) for p in nfs.generate(self.data)
                ]
                self.assertEqual(len(splits), target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.assertEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.assertTrue(ds_.a.lastpartitionset)
                    # test all now: the flag must be set for the last
                    # split and only for it.  (This used to be a bare
                    # comparison expression which asserted nothing.)
                    for isplit, split in enumerate(splits):
                        for ds_ in split:
                            self.assertEqual(ds_.a.lastpartitionset,
                                             isplit == nsplits - 1)

                # Check results of different strategies
                if strategy == 'first':
                    # list() keeps the comparison valid where range() is
                    # not a list
                    self.assertEqual(chosenchunks, list(range(target)))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.assertEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.assertTrue(
                        len(set(chosenchunks)) == len(chosenchunks))
                    self.assertTrue(target == len(chosenchunks))
                else:
                    raise RuntimeError("Add unittest for strategy %s"
                                       % strategy)
Beispiel #31
0
    def test_counted_splitting(self):
        """Check ``count``-limited NFoldPartitioner for every strategy.

        For each selection strategy and a range of ``count`` values this
        verifies the number of generated splits, the number of partition
        specs, the ``lastpartitionset`` flag of every split dataset and the
        chunks chosen by the strategy.
        """
        spl = Splitter(attr='partitions')
        # count > #chunks, should result in 10 splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [ (nchunks*2, nchunks),
                                   (nchunks, nchunks),
                                   (nchunks-1, nchunks-1),
                                   (3, 3),
                                   (0, 0),
                                   (1, 1)
                                   ]:
                nfs = NFoldPartitioner(cvtype=1, count=count,
                                       selection_strategy=strategy)
                splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
                # assert* names replace the failUnless* aliases, which
                # newer unittest versions no longer provide
                self.assertEqual(len(splits), target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.assertEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.assertTrue(ds_.a.lastpartitionset)
                    # test all now: the flag must be set for the last
                    # split and only for it.  (This used to be a bare
                    # comparison expression which asserted nothing.)
                    for isplit, split in enumerate(splits):
                        for ds_ in split:
                            self.assertEqual(ds_.a.lastpartitionset,
                                             isplit == nsplits - 1)

                # Check results of different strategies
                if strategy == 'first':
                    # list() keeps the comparison valid where range() is
                    # not a list
                    self.assertEqual(chosenchunks, list(range(target)))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.assertEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.assertTrue(len(set(chosenchunks)) == len(chosenchunks))
                    self.assertTrue(target == len(chosenchunks))
                else:
                    raise RuntimeError("Add unittest for strategy %s"
                                       % strategy)
Beispiel #32
0
    def test_partial_searchlight_with_full_report(self):
        """Run radius-0 searchlights restricted to two center ids.

        Covers center ids given as a list and as a feature attribute name,
        assigning the datameasure after construction, out-of-bounds center
        ids and the ``roi_seed`` feature attribute.
        """
        ds = self.dataset.copy()
        center_mask = np.zeros(ds.nfeatures, dtype='bool')
        center_mask[[3, 50]] = True
        ds.fa['center_ids'] = center_mask
        # compute N-1 cross-validation for each sphere
        cv = CrossValidation(GNB(), NFoldPartitioner())
        # construct diameter 1 (or just radius 0) searchlights: center ids
        # are given as a list for the first two, while the last one takes
        # them from the dataset itself
        searchlights = (
            sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
            sphere_searchlight(None, radius=0, center_ids=[3, 50]),
            sphere_searchlight(cv, radius=0, center_ids='center_ids'),
        )
        expected_shape = (len(self.dataset.UC), 2)
        for searchlight in searchlights:
            # assure that we could set cv post constructor
            if searchlight.datameasure is None:
                searchlight.datameasure = cv
            # run searchlight
            first_run = searchlight(ds)
            # only two spheres but error for all CV-folds
            self.assertEqual(first_run.shape, expected_shape)
            # Test if results hold if we "set" a "new" datameasure
            searchlight.datameasure = CrossValidation(GNB(),
                                                      NFoldPartitioner())
            second_run = searchlight(ds)
            assert_array_almost_equal(first_run, second_run)

        # test if we graciously puke if center_ids are out of bounds
        truncated_ds = ds[:, :50]  # so we have no 50th feature
        self.assertRaises(IndexError, searchlights[0], truncated_ds)
        # but it should be fine on the one that gets the ids from the dataset
        # itself (the last searchlight of the tuple)
        attr_based_result = searchlights[-1](truncated_ds)
        assert_equal(attr_based_result.nfeatures, 1)

        # check whether roi_seeds are correct
        def seed_above_samples(roi_ds):
            # first row: roi_seed flags, remaining rows: the samples
            return np.vstack((roi_ds.fa.roi_seed, roi_ds.samples))

        seed_searchlight = sphere_searchlight(seed_above_samples,
                                              radius=1,
                                              add_center_fa=True,
                                              center_ids=[12])
        seeded = seed_searchlight(ds)
        seed_columns = seeded.samples[0].astype('bool')
        assert_array_equal(
            seeded.samples[1:, seed_columns].squeeze(),
            ds.samples[:, 12])
 def test_incorrect_parameter_error(self):
     """Constructing NFoldPartitioner with an unknown kwarg must raise."""
     # Just a sample class
     from mvpa2.generators.partition import NFoldPartitioner
     try:
         NFoldPartitioner(1, incorrect=None)
     except Exception as e:
         # the message of the raised exception must be convertible to str
         estr = str(e)
     else:
         # Raised outside of the ``try`` body: AssertionError is itself an
         # Exception, so raising it inside ``try`` would be swallowed by
         # the handler above and the test could never fail.
         raise AssertionError("Must have failed with an exception here "
                              "due to incorrect parameter")
Beispiel #34
0
 def test_slicing(self):
     """Check which generated datasets still refer to the source samples.

     The ``.base`` chain of the ``samples`` arrays is inspected for the
     output of Half-, NFold- and OddEven-partitioners, with and without
     a subsequent Splitter.
     """
     def split_all(partitioner, splitter, dataset):
         # all datasets produced by the splitter, per generated partitioning
         return [list(splitter.generate(part))
                 for part in partitioner.generate(dataset)]

     half = HalfPartitioner()
     splitter = Splitter(attr='partitions')
     for part in list(half.generate(self.data)):
         # partitioned dataset shared the data
         assert_true(part.samples.base is self.data.samples)
     for pair in split_all(half, splitter, self.data):
         # we get slicing all the time
         assert_true(pair[0].samples.base.base is self.data.samples)
         assert_true(pair[1].samples.base.base is self.data.samples)

     splitter = Splitter(attr='partitions', noslicing=True)
     for pair in split_all(half, splitter, self.data):
         # no slicing at all
         assert_false(pair[0].samples.base is self.data.samples)
         assert_false(pair[1].samples.base is self.data.samples)

     nfold = NFoldPartitioner()
     splitter = Splitter(attr='partitions')
     nfold_splits = split_all(nfold, splitter, self.data)
     for index, pair in enumerate(nfold_splits):
         # training only first and last split
         if index in (0, len(nfold_splits) - 1):
             assert_true(pair[0].samples.base.base is self.data.samples)
         else:
             assert_true(pair[0].samples.base is None)
         # we get slicing all the time
         assert_true(pair[1].samples.base.base is self.data.samples)

     step_ds = Dataset(np.random.randn(20, 2),
                       sa={'chunks': np.tile([0, 1], 10)})
     odd_even = OddEvenPartitioner()
     splitter = Splitter(attr='partitions')
     for part in list(odd_even.generate(step_ds)):
         # partitioned dataset shared the data
         assert_true(part.samples.base is step_ds.samples)
     odd_even_splits = split_all(odd_even, splitter, step_ds)
     assert_equal(len(odd_even_splits), 2)
     for pair in odd_even_splits:
         # we get slicing all the time
         assert_true(pair[0].samples.base.base is step_ds.samples)
         assert_true(pair[1].samples.base.base is step_ds.samples)
Beispiel #35
0
    def _test_gideon_weird_case(self):
        """'The utter collapse' -- communicated by Peter J. Kohler

        Desire to collapse all samples per each category in training
        and testing sets, thus resulting only in a single
        sample/category per training and per testing.  As it is now,
        CrossValidation on MappedClassifier would not work

        observations: chance distribution obviously gets wide, but
        also gets skewed to anti-learning on nfolds like 4.

        Output is produced with single-argument ``print(...)`` calls so
        that the code parses, and prints the same text, on both Python 2
        and Python 3.
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        clf = kNN()
        print("HERE")
        ds = datasets['uni2large'].copy()
        ds = ds[ds.sa.chunks < 9]
        accs = []
        for i in range(10):          # # of random samples
            # replace the samples with pure noise on every repetition
            ds.samples = np.random.randn(*ds.shape)
            if False: # this would have been a native way IF we allowed change of number of samples
                clf2 = MappedClassifier(clf=kNN(), #clf,
                                        mapper=mean_group_sample(['targets', 'partitions']))
                cv = CrossValidation(clf2, NFoldPartitioner(4), postproc=None,
                                     enable_ca=['stats'])
                print(cv(ds))
            else:
                from mvpa2.clfs.transerror import ConfusionMatrix
                partitioner = NFoldPartitioner(6)
                meaner = mean_group_sample(['targets', 'partitions'])
                cm = ConfusionMatrix()
                te = TransferMeasure(clf, Splitter('partitions'),
                                     postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'),
                                     enable_ca = ['stats']
                                     )
                for part in partitioner.generate(ds):
                    # average the samples per (targets, partitions) group
                    # before training/testing
                    ds_meaned = meaner(part)
                    error = np.asscalar(te(ds_meaned))
                    # accumulate the confusion stats across partitions
                    cm += te.ca.stats
                # same text as the former ``print i, cm.stats['ACC']``
                print("%s %s" % (i, cm.stats['ACC']))
                accs.append(cm.stats['ACC'])
Beispiel #36
0
def _ensure_directory(dirpath):
    """Create *dirpath* (with missing parents) unless it already exists."""
    if not os.path.isdir(dirpath):
        os.makedirs(dirpath)


def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):
    """Run a cross-validated searchlight per dataset and store the maps.

    Parameters
    ----------
    path : str
      Base directory; results are written below ``<path>/0_results``.
    subjects : sequence of str
      Subject names; paired in order with the datasets returned by
      ``get_merged_ds`` when the maps are written.
    conf_file
      Passed on to ``read_configuration`` and ``get_merged_ds``.
    type
      Passed on to ``read_configuration`` and ``get_merged_ds``; also the
      last component of the result directory name.
    **kwargs
      Override the configuration values.  ``radius`` is required and is
      used as the searchlight radius.

    Returns
    -------
    list
      One NIfTI image (as returned by ``map2nifti``) per dataset.

    Raises
    ------
    ValueError
      If no ``radius`` keyword argument is given.
    """
    conf = read_configuration(path, conf_file, type)

    # keyword arguments take precedence over the configuration file
    for arg in kwargs:
        conf[arg] = kwargs[arg]

    # Fail early with a clear message; otherwise ``radius`` would stay
    # unbound and only trigger an UnboundLocalError later on.
    if 'radius' not in kwargs:
        raise ValueError("test_searchlight_cross_decoding() requires a "
                         "'radius' keyword argument")
    radius = kwargs['radius']

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []

    for ds in ds_merged:

        # relabel targets: 'point' -> 'face', 'saccade' -> 'place'
        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')

        sl_map = sl(ds)

        # store 1 - value instead of the value returned by the searchlight
        sl_map.samples *= -1
        sl_map.samples += 1

        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)

        maps.append(nif)

    timestamp = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = timestamp + '_' + analysis + '_' + mask + '_' + task
    parent_dir = os.path.join(path, '0_results', new_dir)
    # os.makedirs instead of a shell 'mkdir' string: robust to spaces and
    # shell metacharacters in the path, and errors are not silently lost
    _ensure_directory(parent_dir)

    for subject_name, nifti_map in zip(subjects, maps):
        results_dir = os.path.join(parent_dir, subject_name)
        _ensure_directory(results_dir)

        fname = (subject_name + '_radius_' + str(radius)
                 + '_searchlight_map.nii.gz')
        nifti_map.to_filename(os.path.join(results_dir, fname))

    return maps
Beispiel #37
0
def test_factorialpartitioner():
    """Compare FactorialPartitioner with the NFold+Sifter chain.

    Also checks the single-superordinate case, the warning and partitions
    obtained for unbalanced subordinate classes, and a tiny dummy dataset.
    """
    def partitions_of(generator, dataset):
        # 'partitions' sample attribute of every generated dataset
        return [p.sa.partitions for p in generator.generate(dataset)]

    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5,
    )
    ds.sa["subord"] = ds.sa.targets.copy()
    # 3 superord categories
    ds.sa["superord"] = [
        "super%d" % (int(label[1]) % 3,) for label in ds.targets
    ]
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa["superord"] = ["super1" for _ in ds_1super.targets]

    # one superordinate category has only one subordinate
    ds_unbalanced = Dataset(
        range(4),
        sa={"subord": [0, 0, 1, 2], "superord": [1, 1, 2, 2]},
    )

    # reference implementation:
    #  - split based on superord
    #  - select only those splits where we took 1 from each of the
    #    superord categories leaving things in balance
    reference_nfold = NFoldPartitioner(len(ds.sa["superord"].unique),
                                       attr="subord")
    balance_spec = {"uvalues": ds.sa["superord"].unique, "balanced": True}
    reference_sifter = Sifter([("partitions", 2),
                               ("superord", balance_spec)])
    npart = ChainNode([reference_nfold, reference_sifter],
                      space="partitions")

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr="subord"),
                                    attr="superord")

    reference_partitions = partitions_of(npart, ds)
    factorial_partitions = partitions_of(factpart, ds)
    assert_array_equal(np.sort(reference_partitions),
                       np.sort(factorial_partitions))

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr="subord")
    nfold_partitions = partitions_of(nfold, ds_1super)
    factorial_partitions = partitions_of(factpart, ds_1super)
    assert_array_equal(np.sort(nfold_partitions),
                       np.sort(factorial_partitions))

    # smoke test for unbalanced subord classes
    warning_msg = (
        "One or more superordinate attributes do not have the same "
        "number of subordinate attributes. "
        "This could yield to unbalanced partitions."
    )
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        factorial_partitions = partitions_of(factpart, ds_unbalanced)

    expected_partitions = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    expected_superord = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    expected_subord = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    expectations = zip(factorial_partitions, expected_partitions,
                       expected_superord, expected_subord)
    for observed, expected, superord_pair, subord_pair in expectations:
        assert_array_equal(observed, expected)
        in_part1 = ds_unbalanced[observed == 1]
        in_part2 = ds_unbalanced[observed == 2]
        assert_array_equal(
            (in_part1.sa.superord.tolist(), in_part2.sa.superord.tolist()),
            superord_pair,
        )
        assert_array_equal(
            (in_part1.sa.subord.tolist(), in_part2.sa.subord.tolist()),
            subord_pair,
        )

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={"subord": range(4), "superord": [1, 2, 1, 2]})
    factorial_partitions = partitions_of(factpart, ds_dummy)
    assert_array_equal(
        factorial_partitions,
        [[2, 2, 1, 1],
         [2, 1, 1, 2],
         [1, 2, 2, 1],
         [1, 1, 2, 2]],
    )
Beispiel #38
0
def test_gnbsearchlight_permutations():
    """Check permutation-based null distributions with searchlights.

    First part: a GNB searchlight configured with a Monte-Carlo null
    distribution is expected to raise NotImplementedError when called
    (checked only when ``__debug__`` is true).

    Second part: the same setup built with a generic ``sphere_searchlight``
    around ``CrossValidation`` is run, and the samples of the estimated null
    distribution are required to vary (variance above 1e-5 and more than
    one unique value per row).
    """
    import mvpa2
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.base import  Repeater
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    #import mvpa2.generators.permutation
    #reload(mvpa2.generators.permutation)
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.testing.datasets import datasets
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.testing.tools import assert_raises, ok_, assert_array_less

    # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM']
    # mvpa2.debug.metrics += ['pid']
    count = 10
    # 2 processes if 'pprocess' is reported as available, otherwise 1
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    # keyword arguments shared by all searchlights constructed below
    slkwargs = dict(radius=3, space='voxel_indices',  enable_ca=['roi_sizes'],
                    center_ids=[1, 10, 70, 100])

    # reset the random seed before building the GNB searchlight part
    mvpa2.seed(mvpa2._random_seed)
    clf  = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')

    repeater   = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1)

    # searchlight run on permuted targets -- the measure of the null
    # distribution estimator below
    null_sl = sphere_gnbsearchlight(clf, ChainNode([splt, permutator], space=splt.get_space()),
                                    postproc=mean_sample(), errorfx=mean_mismatch_error,
                                    **slkwargs)

    distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                           enable_ca=['dist_samples'])
    sl = sphere_gnbsearchlight(clf, splt,
                               reuse_neighbors=True,
                               null_dist=distr_est, postproc=mean_sample(),
                               errorfx=mean_mismatch_error,
                               **slkwargs)
    if __debug__:                         # assert is done only without -O mode
        assert_raises(NotImplementedError, sl, ds)

    # "ad-hoc searchlights can't handle yet varying targets across partitions"
    if False:
        # after above limitation is removed -- enable
        sl_map = sl(ds)
        sl_null_prob = sl.ca.null_prob.samples.copy()

    # reset the random seed again before the generic searchlight part
    mvpa2.seed(mvpa2._random_seed)
    ### 'normal' Searchlight
    clf  = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')
    repeater   = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1)
    # rng=np.random.RandomState(0)) # to trigger failure since the same np.random state
    # would be reused across all pprocesses
    null_cv = CrossValidation(clf, ChainNode([splt, permutator], space=splt.get_space()),
                              postproc=mean_sample())
    null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs)
    distr_est_normal = MCNullDist(repeater, tail='left', measure=null_sl_normal,
                           enable_ca=['dist_samples'])

    cv = CrossValidation(clf, splt, errorfx=mean_mismatch_error,
                         enable_ca=['stats'], postproc=mean_sample() )
    sl = sphere_searchlight(cv, nproc=nproc, null_dist=distr_est_normal, **slkwargs)
    sl_map_normal = sl(ds)
    sl_null_prob_normal = sl.ca.null_prob.samples.copy()

    # For every feature -- we should get some variance in estimates In
    # case of failure they are all really close to each other (up to
    # numerical precision), so variance will be close to 0
    # (both sides are negated so that "var > 1e-5" can be expressed with
    # assert_array_less)
    assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0],
                              axis=1), -1e-5)
    # additionally, no row may consist of a single repeated value
    for s in distr_est_normal.ca.dist_samples.samples[0]:
        ok_(len(np.unique(s)) > 1)
Beispiel #39
0
    def test_analyzer_with_split_classifier(self, clfds):
        """Test analyzers in split classifier
        """
        clf, ds = clfds             # unroll the tuple
        # We need to skip some LARSes here
        _sclf = str(clf)
        if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
            # ADD KnownToFail thingie from NiPy
            return

        # To don't waste too much time testing lets limit to 3 splits
        nsplits = 3
        partitioner = NFoldPartitioner(count=nsplits)
        mclf = SplitClassifier(clf=clf,
                               partitioner=partitioner,
                               enable_ca=['training_stats',
                                              'stats'])
        sana = mclf.get_sensitivity_analyzer(# postproc=absolute_features(),
                                           pass_attr=['fa.nonbogus_targets'],
                                           enable_ca=["sensitivities"])

        ulabels = ds.uniquetargets
        nlabels = len(ulabels)
        # Can't rely on splitcfg since count-limit is done in __call__
        assert(nsplits == len(list(partitioner.generate(ds))))
        sens = sana(ds)
        assert('nonbogus_targets' in sens.fa) # were they passsed?
        # TODO: those few do not expose biases
        if not len(set(clf.__tags__).intersection(('lars', 'glmnet', 'gpr'))):
            assert('biases' in sens.sa)
            # print sens.sa.biases
        # It should return either ...
        #  nlabels * nsplits
        req_nsamples = [ nlabels * nsplits ]
        if nlabels == 2:
            # A single sensitivity in case of binary
            req_nsamples += [ nsplits ]
        else:
            # and for pairs in case of multiclass
            req_nsamples += [ (nlabels * (nlabels - 1) / 2) * nsplits ]
            # and for 1-vs-1 embedded within Multiclass operating on
            # pairs (e.g. SMLR)
            req_nsamples += [req_nsamples[-1] * 2]

            # Also for regression_based -- they can do multiclass
            # but only 1 sensitivity is provided
            if 'regression_based' in clf.__tags__:
                req_nsamples += [ nsplits ]

        # # of features should correspond
        self.assertEqual(sens.shape[1], ds.nfeatures)
        # # of samples/sensitivities should also be reasonable
        self.assertTrue(sens.shape[0] in req_nsamples)

        # Check if labels are present
        self.assertTrue('splits' in sens.sa)
        self.assertTrue('targets' in sens.sa)
        # should be 1D -- otherwise dtype object
        self.assertTrue(sens.sa.targets.ndim == 1)

        sens_ulabels = sens.sa['targets'].unique
        # Some labels might be pairs(tuples) so ndarray would be of
        # dtype object and we would need to get them all
        if sens_ulabels.dtype is np.dtype('object'):
            sens_ulabels = np.unique(
                reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

        assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

        errors = [x.percent_correct
                    for x in sana.clf.ca.stats.matrices]

        # lets go through all sensitivities and see if we selected the right
        # features
        #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
        if '5%' in clf.descr \
               or (nlabels > 2 and 'regression_based' in clf.__tags__):
            # Some meta classifiers (5% of ANOVA) are too harsh ;-)
            # if we get less than 2 features with on-zero sensitivities we
            # cannot really test
            # Also -- regression based classifiers performance for multiclass
            # is expected to suck in general
            return

        if cfg.getboolean('tests', 'labile', default='yes'):
            for conf_matrix in [sana.clf.ca.training_stats] \
                              + sana.clf.ca.stats.matrices:
                self.assertTrue(
                    conf_matrix.percent_correct >= 70,
                    msg="We must have trained on each one more or " \
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct,
                     nlabels))


        # Since  now we have per split and possibly per label -- lets just find
        # mean per each feature per label across splits
        sensm = FxMapper('samples', lambda x: np.sum(x),
                         uattrs=['targets']).forward(sens)
        sensgm = maxofabs_sample().forward(sensm)    # global max of abs of means

        assert_equal(sensgm.shape[0], 1)
        assert_equal(sensgm.shape[1], ds.nfeatures)

        selected = FixedNElementTailSelector(
            len(ds.a.bogus_features))(sensgm.samples[0])

        if cfg.getboolean('tests', 'labile', default='yes'):

            self.assertEqual(
                set(selected), set(ds.a.nonbogus_features),
                msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s"
                % (selected, ds.a.nonbogus_features))

            # Now test each one per label
            # TODO: collect all failures and spit them out at once --
            #       that would make it easy to see if the sensitivity
            #       just has incorrect order of labels assigned
            for sens1 in sensm:
                labels1 = sens1.targets  # labels (1) for this sensitivity
                lndim = labels1.ndim
                label = labels1[0]      # current label

                # XXX whole lndim comparison should be gone after
                #     things get fixed and we arrive here with a tuple!
                if lndim == 1: # just a single label
                    self.assertTrue(label in ulabels)

                    ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                    # should have just 1 feature for the label
                    self.assertEqual(len(ilabel_all), 1)
                    ilabel = ilabel_all[0]

                    maxsensi = np.argmax(sens1) # index of max sensitivity
                    self.assertEqual(maxsensi, ilabel,
                        "Maximal sensitivity for %s was found in %i whenever"
                        " original feature was %i for nonbogus features %s"
                        % (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
                elif lndim == 2 and labels1.shape[1] == 2: # pair of labels
                    # we should have highest (in abs) coefficients in
                    # those two labels
                    maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                    ilabel2 = [np.where(ds.fa.nonbogus_targets == l)[0][0]
                                    for l in label]
                    self.assertEqual(
                        set(maxsensi2), set(ilabel2),
                        "Maximal sensitivity for %s was found in %s whenever"
                        " original features were %s for nonbogus features %s"
                        % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                    """
                    # Now test for the sign of each one in pair ;) in
                    # all binary problems L1 (-1) -> L2(+1), then
                    # weights for L2 should be positive.  to test for
                    # L1 -- invert the sign
                    # We already know (if we haven't failed in previous test),
                    # that those 2 were the strongest -- so check only signs
                    """
                    self.assertTrue(
                        sens1.samples[0, ilabel2[0]] < 0,
                        "With %i classes in pair %s got feature %i for %r >= 0"
                        % (nlabels, label, ilabel2[0], label[0]))
                    self.assertTrue(sens1.samples[0, ilabel2[1]] > 0,
                        "With %i classes in pair %s got feature %i for %r <= 0"
                        % (nlabels, label, ilabel2[1], label[1]))
                else:
                    # yoh could be wrong at this assumption... time will show
                    self.fail("Got unknown number labels per sensitivity: %s."
                              " Should be either a single label or a pair"
                              % labels1)
def generate_roc_curve(mech_vec_list, mech_nm_list,
                       semantic_range = np.arange(0.2, 2.7, 0.3),
                       mech_range = np.arange(0.2, 6.5, 0.7),
                       n_prev_trials = 1, prev_c = 'r',
                       plot_prev=True, sem_c = 'b', sem_m = '+',
                       plot_semantic=True, semantic_label='operating 1st time and \n known mechanism class'):
    """Plot mean excess force against false-positive rate for up to two priors.

    For every multiplier ``n`` a threshold ``mean + n * std`` is compared
    with the recorded trials.  Samples above the threshold are counted as
    false positives (reported as a percentage of all samples); the margin of
    the samples below the threshold is averaged and plotted as the excess
    force.  Curves are drawn on the current ``pp`` figure.

    Parameters
    ----------
    mech_vec_list, mech_nm_list : list
        Parallel lists of mechanism data and mechanism names, handed to
        ``mar.create_blocked_dataset_semantic_classes``.  Names containing
        ``'known'`` are excluded from the semantic-class dataset and are the
        preferred entries for the mechanism-knowledge dataset.
    semantic_range : sequence of float
        Threshold multipliers for the semantic-class curve.
    mech_range : sequence of float
        Threshold multipliers for the mechanism-knowledge curve.
    n_prev_trials : int
        Number of leading trials of each held-out split that are passed to
        ``mar.estimate_theta``.
    prev_c : str
        Colour of the mechanism-knowledge curve.
    plot_prev : bool
        Draw the mechanism-knowledge curve.
    sem_c, sem_m : str
        Colour and marker of the semantic-class curve.
    plot_semantic : bool
        Draw the semantic-class curve.
    semantic_label : str
        Legend label of the semantic-class curve.

    Returns
    -------
    None
    """
    # Dataset for the semantic prior: every mechanism not tagged 'known'.
    t_nm_list, t_mech_vec_list = [], []
    for i, nm in enumerate(mech_nm_list):
        if 'known' in nm:
            continue
        t_nm_list.append(nm)
        t_mech_vec_list.append(mech_vec_list[i])

    data, _ = mar.create_blocked_dataset_semantic_classes(t_mech_vec_list, t_nm_list, append_robot = False)

    thresh_dict = ut.load_pickle('blocked_thresh_dict.pkl') # human + robot data
    mean_charlie_dict = thresh_dict['mean_charlie']
    mean_known_mech_dict = thresh_dict['mean_known_mech']

    #---------------- semantic class prior -------------
    if plot_semantic:
        fp_l_l = []   # per split: false-positive percentage for each multiplier
        mn_l_l = []   # per split: mean excess force for each multiplier

        nfs = NFoldPartitioner(cvtype=1, attr='targets')
        label_splitter = splitters.Splitter(attr='partitions')
        splits = [list(label_splitter.generate(x)) for x in nfs.generate(data)]

        # One split per label; only the held-out part is evaluated.
        for _, l_vdata in splits:
            # Single-argument form prints the same text on Python 2 and 3.
            print("Number of data:  %d" % len(l_vdata.chunks))

            # The first chunk of the split selects the stored statistics.
            chunk = l_vdata.chunks[0]
            trials = l_vdata.samples
            _, mean, std = mean_charlie_dict[chunk]

            # Cut statistics and trials to a common length.
            min_len = min(len(mean), trials.shape[1])
            trials = trials[:, :min_len]
            mean = mean[:min_len]
            std = std[:min_len]

            tot = trials.shape[0] * trials.shape[1]
            mn_list = []
            fp_list = []
            for n in semantic_range:
                err = (mean + n*std) - trials
                false_pos = np.sum(err<0) # samples above the threshold
                fp_list.append(false_pos/(tot*0.01))
                mn_list.append(np.mean(err[np.where(err>0)]))
            fp_l_l.append(fp_list)
            mn_l_l.append(mn_list)

        # Average over the splits, one value per multiplier.
        mn_list = np.mean(np.vstack(mn_l_l), 0).tolist()
        fp_list = np.mean(np.vstack(fp_l_l), 0).tolist()

        pp.plot(fp_list, mn_list, '--'+sem_m+sem_c, label= semantic_label,
                mec=sem_c, ms=8, mew=2)

    #---------------- mechanism knowledge prior -------------
    if plot_prev:
        # Prefer mechanisms tagged 'known'; fall back to all of them.
        t_nm_list, t_mech_vec_list = [], []
        for i, nm in enumerate(mech_nm_list):
            if 'known' in nm:
                t_nm_list.append(nm)
                t_mech_vec_list.append(mech_vec_list[i])
        if not t_nm_list:
            t_mech_vec_list = mech_vec_list
            t_nm_list = mech_nm_list

        data, _ = mar.create_blocked_dataset_semantic_classes(t_mech_vec_list, t_nm_list, append_robot = False)

        nfs = NFoldPartitioner(cvtype=1, attr='chunks')
        chunk_splitter = splitters.Splitter(attr='partitions')
        # Must use the splitter created here: the one from the semantic
        # section does not exist when plot_semantic is False.
        splits = [list(chunk_splitter.generate(x)) for x in nfs.generate(data)]

        err_mean_list = []
        fp_list = []
        for n in mech_range:
            false_pos = 0
            n_trials = 0
            err_list = []
            for _, l_vdata in splits:
                trials = l_vdata.samples
                m = l_vdata.chunks[0]
                one_trial = trials[0:n_prev_trials]

                Ms, _ = mean_known_mech_dict[m]
                mn_list, std_list = mar.estimate_theta(one_trial, Ms, plot=False, add_var = 0.0)
                mn_mech_arr = np.array(mn_list)
                std_mech_arr = np.array(std_list)

                # Cut estimate and trials to a common length.
                min_len = min(len(mn_mech_arr), trials.shape[1])
                trials = trials[:, :min_len]
                mn_mech_arr = mn_mech_arr[:min_len]
                std_mech_arr = std_mech_arr[:min_len]

                for t in trials:
                    err = (mn_mech_arr + n*std_mech_arr) - t
                    false_pos += np.sum(err<0)
                    n_trials += len(err)
                    err_list.append(err[np.where(err>0)])

            e_all = np.concatenate(err_list)
            err_mean_list.append(np.mean(e_all))
            fp_list.append(false_pos/(n_trials*0.01))

        pp.plot(fp_list, err_mean_list, '-o'+prev_c, mec=prev_c,
                ms=5, label='operating 2nd time and \n known mechanism identity')

    pp.xlabel('False positive rate (percentage)', fontsize=22)
    pp.ylabel('Mean excess force (Newtons)', fontsize=22)
    pp.xlim(-0.5,45)
    mpu.legend()
Beispiel #41
0
    def test_gideon_weird_case(self):
        """Test if MappedClassifier could handle a mapper altering number of samples

        'The utter collapse' -- communicated by Peter J. Kohler

        The goal is to average all samples of a category within the
        training and within the testing portion, so that each portion
        ends up with exactly one sample per category.

        The scenario is unusual and exposes the assumption, made by
        mappers so far, that the number of samples never changes.
        """
        from mvpa2.mappers.fx import mean_group_sample
        from mvpa2.clfs.knn import kNN
        from mvpa2.mappers.base import ChainMapper
        from mvpa2.clfs.transerror import ConfusionMatrix

        group_by = ('targets', 'partitions')   # attributes to average within
        n_neighbors = 1                        # for kNN
        n_folds = 1                            # for NFoldPartitioner

        ds = datasets['uni2large'].copy()
        accuracies = []
        for _run in xrange(1):                 # # of random runs
            ds.samples = np.random.randn(*ds.shape)

            # Way 0 (hard): split and average manually, one fold at a time
            fold_maker = NFoldPartitioner(n_folds)
            collapse = mean_group_sample(list(group_by))
            confusion = ConfusionMatrix()
            transfer = TransferMeasure(
                kNN(n_neighbors), Splitter('partitions'),
                postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
                enable_ca = ['stats'])
            errors = []
            for fold in fold_maker.generate(ds):
                collapsed = collapse(fold)
                errors.append(np.asscalar(transfer(collapsed)))
                confusion += transfer.ca.stats
            accuracies.append(confusion.stats['ACC'])

            if False: # not yet working -- see _tent/allow_ch_nsamples
                      # branch for attempt to make it work
                # Way 1 (native): only possible once MappedClassifier
                # tolerates a mapper that changes the number of samples
                mapped_clf = MappedClassifier(
                    clf=kNN(n_neighbors),
                    mapper=mean_group_sample(list(group_by)))
                native_cv = CrossValidation(mapped_clf,
                                            NFoldPartitioner(n_folds),
                                            postproc=None,
                                            enable_ca=['stats'])
                # averaging everything is fine because the sets are
                # expected to be balanced across all chunks
                errors_native = native_cv(ds)

                native_dev = np.max(
                    np.abs(errors_native.samples[:,0] - errors))
                self.assertEqual(native_dev, 0)

            # Way 2: needs no change to MappedClassifier.  A ChainMapper
            # averages first and then hands over to kNN.  Downside:
            # .stats are not exposed by the ChainMapper (yet)
            if __debug__ and 'ENFORCE_CA_ENABLED' in debug.active:
                raise SkipTest("Known to fail while trying to enable "
                               "training_stats for the ChainMapper")
            averaging_knn = ChainMapper(
                [mean_group_sample(list(group_by)), kNN(n_neighbors)],
                space='targets')
            chain_cv = CrossValidation(averaging_knn,
                                       NFoldPartitioner(n_folds),
                                       postproc=None)
            chain_errors = chain_cv(ds)

            # Both working ways must give exactly the same errors
            chain_dev = np.max(np.abs(chain_errors.samples[:,0] - errors))
            self.assertEqual(chain_dev, 0)

        if False: # just to investigate the distribution if we have enough iterations
            import pylab as pl
            distinct = np.unique(accuracies)
            gaps = np.round(distinct[1:] - distinct[:-1], 4)
            step = np.asscalar(np.unique(gaps))
            bins = np.linspace(0., 1., np.round(1./step+1))
            xx = pl.hist(accuracies, bins=bins, align='left')
            pl.xlim((0. - step/2, 1.+step/2))
Beispiel #42
0
def setup_classifier(**kwargs):
    """Build a classifier and the cross-validation measure that evaluates it.

    All of the following keywords are required; any other keyword is ignored.

    clf_type : str
        'SVM', 'GNB', 'LDA', 'QDA', 'SMLR', 'RbfSVM' or 'GP'.  Any other
        value selects the same linear C-SVM as 'SVM'.
    fsel : str
        'True' keeps the upper 5% of features ranked by one-way ANOVA,
        'Fixed' keeps the 100 top-ranked features, 'PCA' applies a
        45-component PCA; any other value disables feature selection.
    cv_type : str
        'n_fold' partitions with NFoldPartitioner, anything else with
        HalfPartitioner.
    cv_folds : number or numeric string
        Passed as ``cvtype`` to NFoldPartitioner.  Values whose integer part
        is 0 are kept as float, everything else is truncated to int.  A
        value of 0 is replaced by 1 for the main partitioner.
    permutations : int or numeric string
        Number of permutations for the Monte-Carlo null distribution;
        0 disables it.
    cv_attribute : str
        Sample attribute the partitioner operates on.

    Returns
    -------
    list
        ``[fclf, cvte]``: the (possibly feature-selecting) classifier and
        the configured CrossValidation measure.

    Raises
    ------
    TypeError
        If any required keyword is missing.
    """
    required = ('clf_type', 'fsel', 'cv_type', 'cv_folds',
                'permutations', 'cv_attribute')
    missing = [key for key in required if key not in kwargs]
    if missing:
        # Every option is read further down; fail here with a clear message
        # rather than with an unbound-variable error later on.
        raise TypeError("setup_classifier() missing required keyword "
                        "argument(s): %s" % ', '.join(missing))

    clf_type = kwargs['clf_type']
    f_sel = kwargs['fsel']
    cv_approach = kwargs['cv_type']
    attribute = kwargs['cv_attribute']
    permutations = int(kwargs['permutations'])

    # Going through float first lets fractional strings such as '0.5' work.
    folds = float(kwargs['cv_folds'])
    cv_type = int(folds) if int(folds) != 0 else folds

    ################# Classifier #######################
    if clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        # 'SVM' and every unrecognised type
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(OneWayAnova(),
                                                FractionTailSelector(0.05,
                                                                     mode='select',
                                                                     tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)

    elif f_sel == 'Fixed':
        logger.info('Fixed Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(OneWayAnova(),
                                                FixedNElementTailSelector(100,
                                                                     mode='select',
                                                                     tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)

    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        logger.info('PCA Feature Selection selected.')
        fsel = SKLTransformer(PCA(n_components=45))

        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets', limit={'partitions': 1},
                                         count=1)
        # NOTE(review): unlike the main partitioner below, cvtype is not
        # guarded against 0 here -- confirm whether that is intended.
        partitioner = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        # NOTE(review): the null distribution is estimated with the bare
        # classifier (clf), not with the feature-selecting one (fclf) --
        # confirm whether that is intended.
        null_cv = CrossValidation(
                                  clf,
                                  ChainNode([partitioner, permutator],
                                            space=partitioner.get_space()),
                                  errorfx=mean_mismatch_error)

        distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                               enable_ca=['dist_samples'])
    else:
        distr_est = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    chain_splitter = ChainNode([splitter_used,
                                Balancer(attr='targets',
                                         count=1,
                                         limit='partitions',
                                         apply_selection=True)],
                               space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')

    return [fclf, cvte]