Ejemplo n.º 1
0
def test_splitclf_sensitivities():
    """Check the per-fold sensitivities reported by a ``SplitClassifier``.

    Two datasets are generated which share nonbogus feature 0 and differ in
    their second nonbogus feature (``i + 1``).  A single analyzer, obtained
    from a ``SplitClassifier`` wrapping ``SMLR``, is applied to both in turn.
    """
    # ``range`` rather than the Python-2-only ``xrange`` so that the test
    # also runs under Python 3 (for two items the difference is irrelevant)
    datasets = [normal_feature_dataset(perlabel=100, nlabels=2,
                                       nfeatures=4,
                                       nonbogus_features=[0, i + 1],
                                       snr=1, nchunks=2)
                for i in range(2)]

    sclf = SplitClassifier(SMLR(),
                           NFoldPartitioner())
    analyzer = sclf.get_sensitivity_analyzer()

    senses1 = analyzer(datasets[0])
    senses2 = analyzer(datasets[1])

    for senses in senses1, senses2:
        # sensitivities coming from two different folds must not coincide
        assert_false(np.allclose(senses.samples[0],
                                 senses.samples[2]))
        assert_false(np.allclose(senses.samples[1],
                                 senses.samples[3]))
    # Moreover with new data we should have got different results
    # (i.e. the classifier must have been retrained correctly)
    for s1, s2 in zip(senses1, senses2):
        assert_false(np.allclose(s1, s2))

    # and the two features with the largest absolute sensitivity should be
    # exactly the nonbogus ones of the respective dataset
    for i, senses in enumerate((senses1, senses2)):
        assert_equal(set(np.argsort(np.max(np.abs(senses), axis=0))[-2:]),
                     set((0, i + 1)))
Ejemplo n.º 2
0
def test_ds_shallowcopy():
    """Verify that ``copy.copy`` of a dataset shares data with the original."""
    # work on a dataset whose samples are an ndarray subclass
    source = normal_feature_dataset()
    source.samples = source.samples.view(myarray)

    def assert_same_content(first, second):
        # samples, targets and chunks have to agree (checked in that order)
        for name in ('samples', 'targets', 'chunks'):
            assert_array_equal(getattr(first, name), getattr(second, name))

    # take the SHALLOW copy and compare it against the source
    shallow = copy.copy(source)
    assert_same_content(source, shallow)

    # the ndarray subclass has to be retained by the copy
    ok_(isinstance(shallow.samples, myarray))

    # writing through the copy is expected to show up in the source as well
    shallow.samples[0, 0] = 1234
    assert_same_content(source, shallow)

    shallow.sa.targets[0] = 'ab'
    shallow.sa.chunks[0] = 234
    assert_same_content(source, shallow)
    ok_(source.sa.targets[0] == 'ab')
    ok_(source.sa.chunks[0] == 234)
Ejemplo n.º 3
0
def test_ds_deepcopy():
    """Verify that ``Dataset.copy()`` yields data independent of the original."""
    # work on a dataset whose samples are an ndarray subclass
    source = normal_feature_dataset()
    source.samples = source.samples.view(myarray)

    clone = source.copy()
    # the ndarray subclass has to be retained by the clone
    ok_(isinstance(clone.samples, myarray))

    # right after cloning everything must still agree
    for name in ('samples', 'targets', 'chunks'):
        assert_array_equal(getattr(source, name), getattr(clone, name))

    # changing samples in the clone must leave the source untouched
    clone.samples[0, 0] = 1234
    ok_(np.any(source.samples != clone.samples))
    assert_array_equal(source.targets, clone.targets)
    assert_array_equal(source.chunks, clone.chunks)

    # now replace the targets of the clone
    replaced_targets = np.hstack(([123], clone.targets[1:]))
    clone.sa.targets = replaced_targets
    ok_(np.any(source.samples != clone.samples))
    ok_(np.any(source.targets != clone.targets))
    assert_array_equal(source.chunks, clone.chunks)

    # and finally its chunks
    replaced_chunks = np.hstack(([1234], clone.chunks[1:]))
    clone.sa.chunks = replaced_chunks
    ok_(np.any(source.samples != clone.samples))
    ok_(np.any(source.targets != clone.targets))
    ok_(np.any(source.chunks != clone.chunks))
Ejemplo n.º 4
0
def test_factorialpartitioner_big():
    """Make sure ``FactorialPartitioner`` copes with a relatively large dataset.

    Only two partitionings are requested each time (``count=2``), and the
    elapsed time since the start is required to stay below 3 seconds after
    each of the two selection strategies.
    """
    ds = normal_feature_dataset(nlabels=6,
                                perlabel=66,
                                nfeatures=2,
                                nchunks=11)

    def partition(ds_=ds, **kwargs):
        # NFold over targets, nested into a factorial partitioning of chunks
        factorial = FactorialPartitioner(
            partitioner=NFoldPartitioner(attr='targets'),
            attr='chunks',
            **kwargs)
        collected = []
        for part in factorial.generate(ds_):
            collected.append(part.sa.partitions)
        return collected

    # generating all partitions would be prohibitively large, so we never
    # call partition(ds) without a count
    started = time()
    first_two = partition(ds, count=2, selection_strategy='first')
    assert_equal(len(first_two), 2)
    # The limit is very generous: on any reasonable, not too busy box this
    # takes a fraction of a second; it is only meant to catch a "naive"
    # implementation
    assert time() - started < 3

    random_two = partition(ds, count=2, selection_strategy='random')
    assert_equal(len(random_two), 2)
    # measured from the same starting point as above
    assert time() - started < 3
Ejemplo n.º 5
0
def test_gnb_sensitivities():
    """Check shape, zero-variance handling and content of GNB sensitivities.

    A 3-label dataset with 5 features (features 0, 1 and 2 nonbogus) is
    analyzed with a common-variance ``GNB``.
    """
    gnb = GNB(common_variance=True)
    ds = normal_feature_dataset(perlabel=4,
                                nlabels=3,
                                nfeatures=5,
                                nchunks=4,
                                snr=10,
                                nonbogus_features=[0, 1, 2]
                                )

    s = gnb.get_sensitivity_analyzer()(ds)
    assert_in('targets', s.sa)
    # one row per pair of targets; floor division keeps the count an
    # integer under Python 3 as well
    ntargets = len(ds.uniquetargets)
    assert_equal(s.shape, (ntargets * (ntargets - 1) // 2, ds.nfeatures))
    # test zero variance case
    # set variance of feature to zero
    ds.samples[:, 3] = 0.3
    s_zerovar = gnb.get_sensitivity_analyzer()
    sens = s_zerovar(ds)
    assert_true(all(sens.samples[:, 3] == 0))

    # test whether tagging and untagging works
    assert 'has_sensitivity' in gnb.__tags__
    gnb.untrain()
    assert 'has_sensitivity' not in gnb.__tags__

    # test whether content of sensitivities makes rough sense
    # e.g.: sensitivity of first feature should be larger than of bogus
    # last feature.
    # The previous form handed a generator expression to assert_true();
    # a generator object is always truthy, so nothing was ever verified.
    # NOTE(review): the comparison is done on the largest absolute
    # sensitivity across all rows rather than row by row, on the assumption
    # that rows for target pairs not involving the first label carry no
    # signal in feature 0 -- confirm and tighten if a per-row guarantee is
    # intended.
    abs_sens = np.abs(sens.samples)
    assert_true(abs_sens[:, 0].max() > abs_sens[:, 4].max())
Ejemplo n.º 6
0
def test_smlr_sensitivities(clf):
    """Sensitivities of an already trained classifier need no dataset.

    ``clf`` is trained on a binary problem and its sensitivity analyzer,
    created with ``force_train=False``, is then called with ``None``.
    """
    training_ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    clf.train(training_ds)

    # no retraining requested, so no dataset has to be handed over
    analyzer = clf.get_sensitivity_analyzer(force_train=False)
    sensitivities = analyzer(None)

    expected_shape = (len(training_ds.UT) - 1, training_ds.nfeatures)
    assert_equal(sensitivities.shape, expected_shape)
Ejemplo n.º 7
0
def test_mdpflowmapper():
    """Train an ``MDPFlowMapper`` on a PCA+SFA flow and forward-map a dataset."""
    pca_sfa_flow = mdp.nodes.PCANode() + mdp.nodes.SFANode()
    mapper = MDPFlowMapper(pca_sfa_flow)
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    mapper.train(ds)
    # after training neither of the two nodes may still be in training mode
    for position in (0, 1):
        assert_false(mapper.flow[position].is_training())

    # forward mapping returns a Dataset whose samples keep their shape
    mapped = mapper.forward(ds)
    assert_true(isinstance(mapped, Dataset))
    assert_equal(mapped.samples.shape, ds.samples.shape)
Ejemplo n.º 8
0
def test_glmnet_c_sensitivities():
    """Sensitivities of a trained ``GLMNET_C`` are available without data."""
    training_ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # GLMNET on a binary problem
    glmnet = GLMNET_C()
    glmnet.train(training_ds)

    # no retraining requested, so no dataset has to be handed over
    analyzer = glmnet.get_sensitivity_analyzer(force_train=False)
    sensitivities = analyzer(None)

    # one row per unique target, one column per feature
    expected_shape = (len(training_ds.UT), training_ds.nfeatures)
    assert_equal(sensitivities.shape, expected_shape)
Ejemplo n.º 9
0
def test_imshow():
    """``matshow`` must attach a matplotlib ``Colorbar`` to its result."""
    from mvpa2.viz import matshow
    from mvpa2.misc.data_generators import normal_feature_dataset
    from matplotlib.colorbar import Colorbar

    image = matshow(normal_feature_dataset(10, 2, 18, 5))

    if not isinstance(image.colorbar, tuple):
        # newer matplotlib versions provide the Colorbar without
        # unnecessary duplication
        assert_is_instance(image.colorbar, Colorbar)
    else:
        # older matplotlib returns a tuple: the Colorbar and its axes,
        # which are available as the Colorbar's .ax anyway
        assert_is_instance(image.colorbar[0], Colorbar)
        assert_true(image.colorbar[1] is image.colorbar[0].ax)
Ejemplo n.º 10
0
def test_sifter_superord_usecase():
    """Cross-validate over super-ordinate categories using a ``Sifter``.

    Six subordinate categories are grouped into three super-ordinate ones.
    The sifted partitioning is expected to yield 8 splits and a much lower
    accuracy than the regular NFold cross-validation.
    """
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC            # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import  Sifter

    def fraction_correct(predictions, targets):
        # accuracy instead of the usual error
        return np.mean(predictions == targets)

    # Simulate 6 categories grouped into 3 super-ordinate ones; there is no
    # actual 'superordinate' effect since subordinate categories are
    # independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,   # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories, derived from the digit in the target label
    superord_labels = []
    for target in ds.targets:
        superord_labels.append('super%d' % (int(target[1])%3,))
    ds.sa['superord'] = superord_labels
    # override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # leave out as many 'subord' values at once as there are superord
    # categories ...
    subord_partitioner = NFoldPartitioner(len(ds.sa['superord'].unique),
                                          attr='subord')
    # ... and keep only those splits where we took 1 from each of the
    # superord categories, leaving things in balance
    balance_sifter = Sifter([('partitions', 2),
                             ('superord',
                              {'uvalues': ds.sa['superord'].unique,
                               'balanced': True})])
    npart = ChainNode([subord_partitioner, balance_sifter],
                      space='partitions')

    # and then do your normal where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=fraction_correct)
    cvte_super = CrossValidation(clf, npart, errorfx=fraction_correct)

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert len(accs_super) == 8
    # I don't think that this would ever fail, so not marking it labile
    assert np.mean(accs_regular) > .8
    assert np.mean(accs_super) < .6
Ejemplo n.º 11
0
    def test_confusionmatrix_nulldist(self):
        """Estimate a permutation null distribution for a confusion matrix.

        A cross-validation whose "error function" returns the whole
        confusion matrix is run with a Monte-Carlo null distribution, once
        without signal (``snr=0``) and once with signal (``snr=2``), and the
        resulting per-cell p-values are checked.
        """
        from mvpa2.clfs.gnb import GNB

        class ConfusionMatrixError(object):
            """Custom error "function"
            """
            def __init__(self, labels=None):
                self.labels = labels
            def __call__(self, predictions, targets):
                cm = ConfusionMatrix(labels=list(self.labels),
                                     targets=targets, predictions=predictions)
                # We have to add a degenerate leading dimension
                # so we could separate them into separate 'samples'
                return cm.matrix[None, :]

        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [0., 2.,]:
            ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                        nonbogus_features=[0,1], nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf, NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(permutator,
                                     tail='right', # because we now look at accuracy not error
                                     enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            cvnp = cv.ca.null_prob.samples
            # assertTrue(shape, (2, 2)) treated the tuple as the failure
            # message and never compared anything; compare for real
            self.assertEqual(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(0.9,
                                      cvnp[(np.array([0,1]), np.array([1,0]))])
Ejemplo n.º 12
0
def test_mdpnodemapper():
    """Exercise training, forward and reverse mapping of ``MDPNodeMapper``."""
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    pca_node = mdp.nodes.PCANode()
    mapper = MDPNodeMapper(
        pca_node, nodeargs={'stoptrain': ((), {'debug': True})})
    mapper.train(ds)

    projected = mapper.forward(ds)
    if externals.versions['mdp'] >= '2.5':
        assert_true(hasattr(mapper.node, 'cov_mtx'))

    assert_true(isinstance(projected, Dataset))
    assert_equal(projected.samples.shape, ds.samples.shape)

    # request projection onto the first 2 components only; this time a
    # plain array is mapped, so the result should differ from the above
    mapper.nodeargs['exec'] = ((), {'n': 2})
    truncated = mapper.forward(ds.samples)
    # the output shape changes although the node still claims otherwise
    assert_equal(mapper.node.output_dim, 4)
    assert_equal(truncated.shape[0], projected.samples.shape[0])
    assert_equal(truncated.shape[1], 2)
    assert_array_equal(truncated, projected.samples[:, :2])

    # reverse mapping of the full projection ...
    restored = mapper.reverse(projected)
    # ... and of the smaller one, which only has to work
    mapper.reverse(truncated)
    assert_equal(restored.samples.shape, ds.samples.shape)

    # retraining has to work on a new dataset too, since we copy the node
    # internally
    bigger_ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=10)
    mapper.train(bigger_ds)
Ejemplo n.º 13
0
def test_mdpflow_additional_arguments_nones():
    """``MDPFlowMapper`` accepts ``None`` for nodes without extra arguments."""
    skip_if_no_external('mdp', min_version='2.5')
    # we have no IdentityNode yet... is there analog?

    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    three_node_flow = (mdp.nodes.PCANode()
                       + mdp.nodes.IdentityNode()
                       + mdp.nodes.FDANode())
    # in MDP itself the training call would look like
    #   flow.train([[ds.samples],
    #               [[ds.samples, ds.sa.targets]]])

    # two argument entries for this three-node flow must be refused
    assert_raises(ValueError, MDPFlowMapper, three_node_flow,
                  node_arguments=[[],[]])

    # only the last node (FDA) gets an additional argument: the targets
    mapper = MDPFlowMapper(three_node_flow,
                           node_arguments=(None, None, [ds.sa.targets]))
    mapper.train(ds)

    forwarded = mapper.forward(ds)
    assert_equal(ds.samples.shape, forwarded.samples.shape)

    reversed_ds = mapper.reverse(forwarded)
    assert_array_almost_equal(ds.samples, reversed_ds.samples)
Ejemplo n.º 14
0
def test_hist():
    """``hist`` returns one ``Subplot`` per group, or a single one."""
    from mvpa2.viz import hist
    from mvpa2.misc.data_generators import normal_feature_dataset
    from matplotlib.axes import Subplot

    ds = normal_feature_dataset(10, 3, 10, 5)

    # grouped by targets and chunks -- 15 subplots are expected
    grouped = hist(ds, ygroup_attr='targets', xgroup_attr='chunks',
                   noticks=None, xlim=(-.5, .5), normed=True)
    assert_equal(len(grouped), 15)
    for subplot in grouped:
        assert_is_instance(subplot, Subplot)

    # simple case without any grouping
    single = hist(ds)
    assert_equal(len(single), 1)
    assert_is_instance(single[0], Subplot)

    # make sure it works with plain arrays too
    from_array = hist(ds.samples)
    assert_equal(len(from_array), 1)
    assert_is_instance(from_array[0], Subplot)
Ejemplo n.º 15
0
def _get_superord_dataset():
    """Simulate a dataset with a super-/subordinate targets structure.

    Returns a dataset carrying the original targets in ``sa.subord``, the
    derived super-ordinate labels in ``sa.superord``, and targets reset to 0.
    """
    # 6 categories in total, grouped into 3 super-ordinate ones; there is
    # no actual 'superordinate' effect since subordinate categories are
    # independent
    simulated = normal_feature_dataset(nlabels=6,
                                       snr=100,  # pure signal! ;)
                                       perlabel=30,
                                       nfeatures=6,
                                       nonbogus_features=range(6),
                                       nchunks=5)
    simulated.sa['subord'] = simulated.sa.targets.copy()

    # 3 superord categories, derived from the digit in the target label
    superord_labels = []
    for target in simulated.targets:
        superord_labels.append('super%d' % (int(target[1]) % 3,))
    simulated.sa['superord'] = superord_labels

    # override original targets just to be sure that we aren't relying on them
    simulated.targets[:] = 0
    return simulated
Ejemplo n.º 16
0
    def test_SplitRFE(self):
        """Smoke test of ``SplitRFE`` used as the mapper of a classifier."""
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        svm = LinearCSVMC(C=1)
        ds = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=30,
                                    snr=1., nonbogus_features=[1,5])
        # flip one of the meaningful features around to see
        # if we are still getting proper selection
        ds.samples[:, ds.a.nonbogus_features[1]] *= -1

        # 4 partitions should be enough for testing
        four_folds = NFoldPartitioner(count=4)
        # discard the lower 20% tail at each step
        tail_selector = FractionTailSelector(
            0.2, mode='discard', tail='lower')
        split_rfe = SplitRFE(svm, four_folds, fselector=tail_selector)
        rfeclf = MappedClassifier(svm, split_rfe)
        repr_before = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(ds)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        # prediction only has to work, its outcome is not inspected
        rfeclf(ds).samples

        # at least 1 of the nonbogus-features should be chosen
        chosen_nonbogus = set(ds.a.nonbogus_features).intersection(
            rfeclf.mapper.slicearg)
        ok_(len(chosen_nonbogus) > 0)

        # check repr to have all needed pieces; str() only has to work
        repr_after = repr(rfeclf)
        str(rfeclf)
        ok_(('partitioner=NFoldP' in repr_after) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner'
             in repr_after))
        ok_('lrn=' in repr_after)
        ok_('slicearg=' not in repr_after)
        assert_equal(repr_after, repr_before)
Ejemplo n.º 17
0
def test_exclude_targets_combinations():
    """Chain NFold with ``ExcludeTargetsCombinationsPartitioner(k=2)``.

    With 4 targets and 3 chunks, 3 * 6 partitions are expected, covering
    6 distinct target pairs in the second split of each partition.
    """
    nfold = NFoldPartitioner()
    exclude_pairs = ExcludeTargetsCombinationsPartitioner(
        k=2, targets_attr="targets", space="partitions")
    partitioner = ChainNode([nfold, exclude_pairs], space="partitions")
    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(snr=0.0, nlabels=4, perlabel=3, nchunks=3,
                                nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)

    splitter = Splitter("partitions")
    target_combs = set()
    target_chunk_combs = set()
    for partitioned in partitions:
        # exactly two splits are expected; only the second one is inspected
        _, second = list(splitter.generate(partitioned))[:2]
        targets_in_split = tuple(np.unique(second.targets))
        target_combs.add(targets_in_split)
        target_chunk_combs.add(
            targets_in_split + tuple(np.unique(second.chunks)))

    # just 6 possible combinations of 2 out of 4
    assert_equal(len(target_combs), 6)
    # together with the chunks all of them are unique
    assert_equal(len(target_chunk_combs), 3 * 6)
Ejemplo n.º 18
0
    def test_hypal_michael_caused_problem(self):
        """Aligned test data of two subjects must not become near-identical.

        Random test samples, the first with a large bias added, are mapped
        through hyperalignment mappers for several ``alpha`` values and
        reversed through the first mapper.
        """
        from mvpa2.misc import data_generators
        from mvpa2.mappers.zscore import zscore
        # Fake data: three random affine transformations of one dataset
        ds = data_generators.normal_feature_dataset(nfeatures=20)
        ds_all = [data_generators.random_affine_transformation(ds)
                  for _ in range(3)]
        for subject_ds in ds_all:
            zscore(subject_ds, chunks_attr=None)
        # Random data per subject for testing, with bias added to the
        # first subject
        ds_test = [np.random.rand(1, ds.nfeatures) for _ in range(len(ds_all))]
        ds_test[0] += np.arange(1, ds.nfeatures + 1) * 100
        # it would have been ridiculous if those were correlated already
        assert np.corrcoef(ds_test[2], ds_test[1])[0, 1] < 0.99

        # Test with varying alpha so we are sure not to have that issue now
        for alpha in (0, 0.01, 0.5, 0.99, 1.0):
            hyper09 = Hyperalignment(alpha=alpha)
            mappers = hyper09(list(ds_all))
            forwarded = [mapper.forward(sample)
                         for mapper, sample in zip(mappers, ds_test)]
            # bring everything back through the first subject's mapper
            reversed_ = [mappers[0].reverse(sample) for sample in forwarded]
            corr = np.corrcoef(reversed_[2], reversed_[1])[0, 1]
            assert corr < 0.99
Ejemplo n.º 19
0
def test_nodeargs():
    """Arguments given to ``PCAMapper`` must reach the underlying node."""
    skip_if_no_external('mdp', min_version='2.4')
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # the svd flag is visible on the node before and after training
    for svd_flag in (True, False):
        pcm = PCAMapper(alg='PCA', svd=svd_flag)
        assert_equal(pcm.node.svd, svd_flag)
        pcm.train(ds)
        assert_equal(pcm.node.svd, svd_flag)

    for output_dim in (0.5, 0.95, 0.99, 10, 50, 100):
        pcm = PCAMapper(alg='PCA', output_dim=output_dim)
        # a float ends up as the node's desired_variance, anything else as
        # its output_dim
        given_as_float = isinstance(output_dim, float)
        for _ in range(2):              # so we also test on trained one
            if given_as_float:
                assert_equal(pcm.node.desired_variance, output_dim)
            else:
                assert_equal(pcm.node.output_dim, output_dim)
            pcm.train(ds)
            if given_as_float:
                assert_not_equal(pcm.node.output_dim, output_dim)
                # some dimensions are chosen
                assert_true(pcm.node.output_dim > 0)
Ejemplo n.º 20
0
    def test_cache_speedup(self):
        """Cross-validation with a ``CachedKernel`` must beat the plain kernel.

        The same RBF shogun SVM is cross-validated with and without kernel
        caching; both must give the same errors and the cached variant
        (including the time spent filling the cache) must be at least
        twice as fast.
        """
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidation(ck, NFoldPartitioner())
        cv_s = CrossValidation(sk, NFoldPartitioner())

        P = 5000
        data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                    means=np.random.randn(2, P), nfeatures=P)

        # time needed to compute (i.e. cache) the kernel once
        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time()-t0

        # cross-validation on top of the cached kernel
        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time()-t0

        # cross-validation with the regular kernel
        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time()-t0

        assert_almost_equal(np.asanyarray(cached_err),
                            np.asanyarray(norm_err))
        ok_(cachetime<ncv_time)
        ok_(ccv_time<ncv_time)

        speedup = ncv_time/(ccv_time+cachetime)

        # Speedup ideally should be 10, though it's not purely linear.
        # assertFalse replaces the deprecated failIf alias, which no longer
        # exists in recent Python versions.
        self.assertFalse(speedup < 2, 'Problem caching data - too slow!')
Ejemplo n.º 21
0
    def test_hypal_michael_caused_problem(self):
        """Aligned test data of two subjects must not become near-identical.

        Random test samples, the first with a large bias added, are mapped
        through hyperalignment mappers for several ``alpha`` values and
        reversed through the first mapper.
        """
        from mvpa2.misc import data_generators
        from mvpa2.mappers.zscore import zscore
        # Fake data: three random affine transformations of one dataset
        ds = data_generators.normal_feature_dataset(nfeatures=20)
        ds_all = []
        for _ in range(3):
            ds_all.append(data_generators.random_affine_transformation(ds))
        for subject_ds in ds_all:
            zscore(subject_ds, chunks_attr=None)
        # Random data per subject for testing, with bias added to the
        # first subject
        ds_test = [np.random.rand(1, ds.nfeatures) for _ in range(len(ds_all))]
        ds_test[0] += np.arange(1, ds.nfeatures + 1) * 100
        # it would have been ridiculous if those were correlated already
        initial_corr = np.corrcoef(ds_test[2], ds_test[1])[0, 1]
        assert initial_corr < 0.99

        # Test with varying alpha so we are sure not to have that issue now
        for alpha in (0, 0.01, 0.5, 0.99, 1.0):
            hyper09 = Hyperalignment(alpha=alpha)
            mappers = hyper09(list(ds_all))
            forwarded = [mapper.forward(sample)
                         for mapper, sample in zip(mappers, ds_test)]
            # bring everything back through the first subject's mapper
            reversed_ = [mappers[0].reverse(sample) for sample in forwarded]
            corr = np.corrcoef(reversed_[2], reversed_[1])[0, 1]
            assert corr < 0.99
Ejemplo n.º 22
0
def test_confusion_as_node():
    """Use ``Confusion`` as the post-processing node of a cross-validation.

    First on its own, where the result must match the confusion matrix kept
    in the conditional attribute, then chained with
    ``BayesConfusionHypothesis``.
    """
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0, perlabel=42, nchunks=3,
                                nonbogus_features=[0,1], nfeatures=2)
    clf = GNB()
    confusion_cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=Confusion(labels=ds.UT),
        enable_ca=['stats'])
    confusion = confusion_cv(ds)
    # needs to be identical to CA
    assert_array_equal(confusion.samples, confusion_cv.ca.stats.matrix)
    assert_array_equal(confusion.sa.predictions, ds.UT)
    assert_array_equal(confusion.fa.targets, ds.UT)

    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    bayes_postproc = ChainNode((Confusion(labels=ds.UT),
                                BayesConfusionHypothesis()))
    bayes_cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=bayes_postproc)
    hypotheses = bayes_cv(ds)
    # only two possible hypotheses with two classes
    assert_equal(len(hypotheses), 2)
    # the first hypothesis is the "can't discriminate anything" one
    first_hypothesis = hypotheses.sa.hypothesis[0]
    assert_equal(len(first_hypothesis), 1)
    assert_equal(len(first_hypothesis[0]), 2)
    # and that hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert np.e**hypotheses.samples[0,0] < np.e**hypotheses.samples[1,0]
Ejemplo n.º 23
0
def test_gnb_sensitivities(logprob):
    """Check shape, labeling and signs of pairwise GNB sensitivities.

    Parameters
    ----------
    logprob
      Passed through as the ``logprob`` argument of ``GNB``.
    """
    gnb = GNB(common_variance=True, logprob=logprob)
    ds = normal_feature_dataset(perlabel=4,
                                nlabels=3,
                                nfeatures=5,
                                nchunks=4,
                                snr=20,
                                nonbogus_features=[0, 1, 2])

    s = gnb.get_sensitivity_analyzer()(ds)
    assert_in('targets', s.sa)
    # one row per pair of targets; floor division keeps the count an
    # integer under Python 3 as well
    ntargets = len(ds.uniquetargets)
    assert_equal(s.shape, (ntargets * (ntargets - 1) // 2, ds.nfeatures))
    # test zero variance case
    # set variance of feature to zero
    ds.samples[:, 3] = 0.3
    s_zerovar = gnb.get_sensitivity_analyzer()
    sens = s_zerovar(ds)
    assert_equal(sens.T.dtype, 'O')  # we store pairs
    assert_equal(sens.T[0], ('L0', 'L1'))
    assert_true(all(sens.samples[:, 3] == 0))

    gnb.untrain()

    # test whether content of sensitivities makes rough sense
    # First feature has information only about L0, so it would be of
    # no use for L1 -vs- L2 classification, so we will go through each pair
    # and make sure that signs etc all correct for each pair.
    # This in principle should be a generic test for multiclass sensitivities
    for (t1, t2), t1t2sens in zip(sens.T, sens.samples):
        # go from literal L1 to 1, L0 to 0 - corresponds to feature
        i1 = int(t1[1])
        i2 = int(t2[1])
        assert t1t2sens[i1] < 0
        assert t1t2sens[i2] > 0
        # the informative feature must also outweigh the bogus last one
        assert t1t2sens[i2] > t1t2sens[4]
Ejemplo n.º 24
0
def test_gnb_sensitivities():
    """Check shape, labeling and signs of pairwise GNB sensitivities.

    A 3-label dataset with 5 features (features 0, 1 and 2 nonbogus) is
    analyzed with a common-variance ``GNB``.
    """
    gnb = GNB(common_variance=True)
    ds = normal_feature_dataset(perlabel=4,
                                nlabels=3,
                                nfeatures=5,
                                nchunks=4,
                                snr=20,
                                nonbogus_features=[0, 1, 2]
                                )

    s = gnb.get_sensitivity_analyzer()(ds)
    assert_in('targets', s.sa)
    # one row per pair of targets; floor division keeps the count an
    # integer under Python 3 as well
    ntargets = len(ds.uniquetargets)
    assert_equal(s.shape, (ntargets * (ntargets - 1) // 2, ds.nfeatures))
    # test zero variance case
    # set variance of feature to zero
    ds.samples[:, 3] = 0.3
    s_zerovar = gnb.get_sensitivity_analyzer()
    sens = s_zerovar(ds)
    assert_equal(sens.T.dtype, 'O')  # we store pairs
    assert_equal(sens.T[0], ('L0', 'L1'))
    assert_true(all(sens.samples[:, 3] == 0))

    gnb.untrain()

    # test whether content of sensitivities makes rough sense
    # First feature has information only about L0, so it would be of
    # no use for L1 -vs- L2 classification, so we will go through each pair
    # and make sure that signs etc all correct for each pair.
    # This in principle should be a generic test for multiclass sensitivities
    for (t1, t2), t1t2sens in zip(sens.T, sens.samples):
        # go from literal L1 to 1, L0 to 0 - corresponds to feature
        i1 = int(t1[1])
        i2 = int(t2[1])
        assert t1t2sens[i1] < 0
        assert t1t2sens[i2] > 0
        # the informative feature must also outweigh the bogus last one
        assert t1t2sens[i2] > t1t2sens[4]
Ejemplo n.º 25
0
    def test_confusionmatrix_nulldist(self):
        """Estimate a permutation null distribution for a confusion matrix.

        A cross-validation using ``ConfusionMatrixError`` as its error
        function is run with a Monte-Carlo null distribution, once without
        signal (``snr=0``) and once with signal (``snr=2``), and the
        resulting per-cell p-values are checked.
        """
        from mvpa2.clfs.gnb import GNB
        from mvpa2.clfs.transerror import ConfusionMatrixError
        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [0., 2.,]:
            ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                        nonbogus_features=[0,1], nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf, NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(permutator,
                                     tail='right', # because we now look at accuracy not error
                                     enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            cvnp = cv.ca.null_prob.samples
            # assertTrue(shape, (2, 2)) treated the tuple as the failure
            # message and never compared anything; compare for real
            self.assertEqual(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(0.9,
                                      cvnp[(np.array([0,1]), np.array([1,0]))])
Ejemplo n.º 26
0
    def test_confusionmatrix_nulldist(self):
        """Estimate a permutation null distribution for a confusion matrix.

        A cross-validation using ``ConfusionMatrixError`` as its error
        function is run with a Monte-Carlo null distribution, once without
        signal (``snr=0``) and once with signal (``snr=2``), and the
        resulting per-cell p-values are checked.
        """
        from mvpa2.clfs.gnb import GNB
        from mvpa2.clfs.transerror import ConfusionMatrixError
        from mvpa2.misc.data_generators import normal_feature_dataset
        for snr in [0., 2.,]:
            ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                        nonbogus_features=[0,1], nfeatures=2)

            clf = GNB()
            num_perm = 50
            permutator = AttributePermutator('targets',
                                             limit='chunks',
                                             count=num_perm)
            cv = CrossValidation(
                clf, NFoldPartitioner(),
                errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
                postproc=mean_sample(),
                null_dist=MCNullDist(permutator,
                                     tail='right', # because we now look at accuracy not error
                                     enable_ca=['dist_samples']),
                enable_ca=['stats'])
            cmatrix = cv(ds)
            cvnp = cv.ca.null_prob.samples
            # assertTrue(shape, (2, 2)) treated the tuple as the failure
            # message and never compared anything; compare for real
            self.assertEqual(cvnp.shape, (2, 2))
            if cfg.getboolean('tests', 'labile', default='yes'):
                if snr == 0.:
                    # all p should be high since no signal
                    assert_array_less(0.05, cvnp)
                else:
                    # diagonal p is low -- we have signal after all
                    assert_array_less(np.diag(cvnp), 0.05)
                    # off diagonals are high p since for them we would
                    # need to look at the other tail
                    assert_array_less(0.9,
                                      cvnp[(np.array([0,1]), np.array([1,0]))])
Ejemplo n.º 27
0
    def test_binds(self):
        """Check the helper functions bound to the dataset as methods.

        ``coarsen_chunks`` is expected to modify the dataset in place,
        whereas the remaining ones must return a result and leave the
        samples untouched.
        """
        ds = normal_feature_dataset()
        ds_data = ds.samples.copy()
        ds_chunks = ds.chunks.copy()
        self.assertTrue(np.all(ds.samples == ds_data)) # sanity check

        funcs = ['coarsen_chunks']

        for f in funcs:
            # plain attribute lookup instead of eval() on a built string
            getattr(ds, f)()
            self.assertTrue(np.any(ds.samples != ds_data) or
                            np.any(ds.chunks != ds_chunks),
                msg="We should have modified original dataset with %s" % f)
            # restore the original state for the next function
            ds.samples = ds_data.copy()
            ds.sa['chunks'].value = ds_chunks.copy()

        # and some which should just return results
        for f in ['aggregate_features', 'remove_invariant_features',
                  'get_samples_per_chunk_target']:
            res = getattr(ds, f)()
            self.assertTrue(res is not None,
                msg='We should have got result from function %s' % f)
            self.assertTrue(np.all(ds.samples == ds_data),
                msg="Function %s should have not modified original dataset" % f)
Ejemplo n.º 28
0
    def test_binds(self):
        """Check the helper functions bound to the dataset as methods.

        ``coarsen_chunks`` is expected to modify the dataset in place,
        whereas the remaining ones must return a result and leave the
        samples untouched.
        """
        ds = normal_feature_dataset()
        ds_data = ds.samples.copy()
        ds_chunks = ds.chunks.copy()
        self.assertTrue(np.all(ds.samples == ds_data)) # sanity check

        funcs = ['coarsen_chunks']

        for f in funcs:
            # plain attribute lookup instead of eval() on a built string
            getattr(ds, f)()
            self.assertTrue(np.any(ds.samples != ds_data) or
                            np.any(ds.chunks != ds_chunks),
                msg="We should have modified original dataset with %s" % f)
            # restore the original state for the next function
            ds.samples = ds_data.copy()
            ds.sa['chunks'].value = ds_chunks.copy()

        # and some which should just return results
        for f in ['aggregate_features', 'remove_invariant_features',
                  'get_samples_per_chunk_target']:
            res = getattr(ds, f)()
            self.assertTrue(res is not None,
                msg='We should have got result from function %s' % f)
            self.assertTrue(np.all(ds.samples == ds_data),
                msg="Function %s should have not modified original dataset" % f)
Ejemplo n.º 29
0
    def test_binds(self):
        """Check the helper functions bound to the dataset as methods.

        ``coarsen_chunks`` is expected to modify the dataset in place,
        whereas the remaining ones must return a result and leave the
        samples untouched.
        """
        ds = normal_feature_dataset()
        ds_data = ds.samples.copy()
        ds_chunks = ds.chunks.copy()
        # assertTrue replaces the deprecated failUnless alias throughout
        self.assertTrue(np.all(ds.samples == ds_data))  # sanity check

        funcs = ["coarsen_chunks"]

        for f in funcs:
            # plain attribute lookup instead of eval() on a built string
            getattr(ds, f)()
            self.assertTrue(
                np.any(ds.samples != ds_data) or np.any(ds.chunks != ds_chunks),
                msg="We should have modified original dataset with %s" % f,
            )
            # restore the original state for the next function
            ds.samples = ds_data.copy()
            ds.sa["chunks"].value = ds_chunks.copy()

        # and some which should just return results
        for f in ["aggregate_features", "remove_invariant_features", "get_samples_per_chunk_target"]:
            res = getattr(ds, f)()
            self.assertTrue(res is not None, msg="We should have got result from function %s" % f)
            self.assertTrue(
                np.all(ds.samples == ds_data), msg="Function %s should have not modified original dataset" % f
            )
Ejemplo n.º 30
0
 def setUp(self):
     """Create the fixture dataset and store it as ``self.dataset``."""
     # 2 labels with 100 samples each, 10 features of which only the first
     # two are nonbogus, spread over 2 chunks
     generator_args = dict(perlabel=100, nlabels=2,
                           nfeatures=10,
                           nonbogus_features=[0,1],
                           snr=0.3, nchunks=2)
     self.dataset = normal_feature_dataset(**generator_args)
Ejemplo n.º 31
0
def test_confusion_as_node():
    """Use ``Confusion`` as a post-processing node of a cross-validation.

    Three stages are exercised: the plain confusion matrix as CV output,
    the same chained with ``BayesConfusionHypothesis`` (needs scipy), and
    finally that chain run inside a radius-0 searchlight with a custom node
    that packs posteriors and hypotheses together.
    """
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion

    ds = normal_feature_dataset(nfeatures=2, nonbogus_features=[0, 1],
                                nchunks=3, perlabel=42, snr=2.0)
    gnb = GNB()

    # --- stage 1: confusion matrix straight out of the cross-validation
    folds = NFoldPartitioner()
    confusion_node = Confusion(labels=ds.UT)
    confusion_cv = CrossValidation(gnb, folds,
                                   errorfx=None,
                                   postproc=confusion_node,
                                   enable_ca=['stats'])
    confusion = confusion_cv(ds)
    # the returned dataset has to mirror what the conditional attribute holds
    assert_array_equal(confusion.samples, confusion_cv.ca.stats.matrix)
    assert_array_equal(confusion.sa.predictions, ds.UT)
    assert_array_equal(confusion.fa.targets, ds.UT)

    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode

    # --- stage 2: append Bayesian hypothesis testing to the confusion node
    folds = NFoldPartitioner()
    chain = ChainNode([Confusion(labels=ds.UT),
                       BayesConfusionHypothesis()])
    bayes_cv = CrossValidation(gnb, folds, errorfx=None, postproc=chain)
    bayes = bayes_cv(ds)

    # with two classes there are exactly two hypotheses
    assert_equals(len(bayes), 2)
    # the first one lumps both classes into a single group
    null_hypothesis = bayes.sa.hypothesis[0]
    assert_equal(len(null_hypothesis), 1)
    assert_equal(len(null_hypothesis[0]), 2)
    # ... and it has to come out less likely than the "separable" one
    p_null = np.e ** bayes.samples[0, 0]
    p_separable = np.e ** bayes.samples[1, 0]
    assert p_null < p_separable

    # --- stage 3: same measure inside a 1d searchlight of radius 0, where
    # hypotheses should be stored per voxel next to the posteriors
    ds.fa['voxel_indices'] = [[0], [1]]
    from mvpa2.base.node import Node
    from mvpa2.measures.searchlight import sphere_searchlight

    class KeepBothPosteriorAndHypothesis(Node):
        # wraps (samples, hypotheses) into a single object-typed cell
        def _call(self, ds):
            packed = np.zeros(1, dtype=object)
            packed[0] = (ds.samples, ds.sa.hypothesis)
            return packed

    bayes_cv.postproc.append(KeepBothPosteriorAndHypothesis())
    searchlight = sphere_searchlight(bayes_cv, radius=0, nproc=1)
    sl_out = searchlight(ds)

    assert_equal(sl_out.shape, (1, 2))
    first_sphere = sl_out.samples[0, 0]
    assert_equal(len(first_sphere), 2)
    assert_equal(first_sphere[0].shape, (2, 2))   # posteriors per 1st SL
    assert_equal(len(first_sphere[1]), 2)         # 2 of hypotheses
Ejemplo n.º 32
0
    def test_rfe(self, clf):
        """Run RFE under three different recipes and check its bookkeeping.

        For every (RFE instance, dataset) pair the RFE is trained and applied,
        after which the conditional attributes it collected (``errors``,
        ``nfeatures``, ``history``) are checked for mutual consistency and the
        input dataset is verified to keep its number of features.

        Parameters
        ----------
        clf
          Classifier under test; it must offer ``get_sensitivity_analyzer``
          and ``__tags__``.  How it is supplied is not visible here
          (presumably a parametrizing decorator -- TODO confirm).
        """

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        # split classifier shared by the sensitivity analyzer and the
        # confusion-based error of the third recipe
        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    Repeater(
                        2
                    ),  # give the same full dataset to sens_ana and cvmeasure
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    Repeater(
                        2),  #  we will use the same full cv-training dataset
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    train_pmeasure=
                    False,  # we just extract it from existing confusion
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            # remember the feature count to detect in-place modification later
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    # index of the minimal error is compared against the
                    # number of features missing from the result
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                # no errors recorded -- the result must keep every feature
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            # NOTE(review): relies on rfe.ca.history supporting element-wise
            # comparison (e.g. being a numpy array) -- confirm
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
Ejemplo n.º 33
0
import mvpa2
import pylab as pl
import numpy as np
from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.mappers.zscore import zscore
"""
Generate a binary dataset without any signal (snr=0).
"""

# NOTE(review): presumably fixes the random seed so that the generated data
# is reproducible -- confirm against the mvpa2.seed documentation
mvpa2.seed(1)
# two labels with 100 samples each, two features, and snr=0 (no signal)
ds_noise = normal_feature_dataset(perlabel=100,
                                  nlabels=2,
                                  nfeatures=2,
                                  snr=0,
                                  nonbogus_features=[0, 1])

# signal levels
sigs = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
"""

To mimic behavior of hard-margin SVM whenever classes become
separable, which is easier to comprehend, we are intentionally setting
very high C value.

"""

# linear SVM with the deliberately high C explained above; the
# 'training_stats' conditional attribute is switched on
clf = LinearCSVMC(C=1000, enable_ca=['training_stats'])
# cross-validation of that classifier over NFoldPartitioner() partitions
# NOTE(review): enable_ca is a bare string here but a list for clf above --
# confirm that both forms are accepted
cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
Ejemplo n.º 34
0
    def test_SplitRFE(self, fmeasure):
        """Smoke test of ``SplitRFE`` used as the mapper of a classifier.

        Trains a ``MappedClassifier`` around ``SplitRFE``, checks that at
        least one informative feature survives, that ``repr`` is stable
        across training, that a run with ``nproc=-1`` (only if joblib is
        available) reproduces the first results, and that nested conditional
        attributes can be collected through a cross-validation callback.

        Parameters
        ----------
        fmeasure
          Passed to ``SplitRFE`` as ``fmeasure``; when it is None the
          sensitivities are asked to be updated.
        """
        # just a smoke test ATM
        from mvpa2.clfs.svm import LinearCSVMC
        from mvpa2.clfs.meta import MappedClassifier
        from mvpa2.misc.data_generators import normal_feature_dataset
        from mvpa2.featsel.rfe import RFE, SplitRFE
        from mvpa2.generators.partition import NFoldPartitioner
        from mvpa2.featsel.helpers import FractionTailSelector
        from mvpa2.testing import ok_, assert_equal

        clf = LinearCSVMC(C=1)
        dataset = normal_feature_dataset(nonbogus_features=[1, 5], snr=1.,
                                         nfeatures=11, nlabels=2, perlabel=20)
        # invert the sign of one informative feature -- selection should
        # still pick up the proper ones
        flipped = dataset.a.nonbogus_features[1]
        dataset.samples[:, flipped] *= -1

        # 3 partitions should be enough for testing
        partitioner = NFoldPartitioner(count=3)
        selector = FractionTailSelector(0.5, mode='discard', tail='lower')
        # sensitivities need updating only when they come from the clf itself
        split_rfe = SplitRFE(clf, partitioner,
                             fselector=selector,
                             fmeasure=fmeasure,
                             update_sensitivity=fmeasure is None)
        rfeclf = MappedClassifier(clf, split_rfe)
        repr_untrained = repr(rfeclf)

        ok_(rfeclf.mapper.nfeatures_min == 0)
        rfeclf.train(dataset)
        ok_(rfeclf.mapper.nfeatures_min > 0)
        predictions = rfeclf(dataset).samples

        # at least 1 of the nonbogus-features should be chosen
        informative = set(dataset.a.nonbogus_features)
        ok_(len(informative.intersection(rfeclf.mapper.slicearg)) > 0)

        # repr must carry the needed pieces and must not change by training
        r = repr(rfeclf)
        s = str(rfeclf)
        ok_(('partitioner=NFoldP' in r) or
            ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
        ok_('lrn=' in r)
        ok_('slicearg=' not in r)
        assert_equal(r, repr_untrained)

        if externals.exists('joblib'):
            rfeclf.mapper.nproc = -1
            # keep the first results around to compare the rerun against
            first_slicearg = rfeclf.mapper.slicearg
            first_predictions = predictions
            rfeclf.train(dataset)
            predictions = rfeclf(dataset).samples
            assert_array_equal(predictions, first_predictions)
            assert_array_equal(first_slicearg, rfeclf.mapper.slicearg)

        # Test that we can collect stats from cas within cross-validation
        sensitivities, nested_errors, nested_nfeatures = [], [], []

        def store_me(data, node, result):
            # harvest sensitivities and the nested RFE statistics per fold
            analyzer = node.measure.get_sensitivity_analyzer(
                force_train=False)
            sensitivities.append(analyzer(data))
            mapper_ca = node.measure.mapper.ca
            nested_errors.append(mapper_ca.nested_errors)
            nested_nfeatures.append(mapper_ca.nested_nfeatures)

        cv = CrossValidation(rfeclf,
                             NFoldPartitioner(count=1),
                             callback=store_me,
                             enable_ca=['stats'])
        cv(dataset)
        # just to make sure we collected them
        for collected in (sensitivities, nested_errors, nested_nfeatures):
            assert_equal(len(collected), 1)
Ejemplo n.º 35
0
    def test_rfe(self, clf):
        """Run RFE under three different recipes and check its bookkeeping.

        Each recipe pairs an RFE instance with a dataset.  The RFE is trained
        and applied, then the conditional attributes it recorded (``errors``,
        ``nfeatures``, ``history``) are checked against each other, and the
        input dataset is verified to retain its number of features.

        Parameters
        ----------
        clf
          Classifier under test; it must provide ``get_sensitivity_analyzer``
          and ``__tags__``.  Where it comes from is not visible here
          (presumably a parametrizing decorator -- TODO confirm).
        """

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'))
        cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        # split classifier used both for sensitivities and for the
        # confusion-based error in the third recipe
        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                pmeasure,
                Splitter('train'),
                fselector=FixedNElementTailSelector(1),
                train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(
                     0.70,
                     mode='select', tail='upper'),
                train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([ FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),             #  we will use the same full cv-training dataset
                 fselector=FractionTailSelector(
                     0.50,
                     mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,    # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
            # prep data
            # data = datasets['uni2medium']
            # remember the feature count to detect in-place modification later
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    # index of the minimal error is compared against the
                    # number of features missing from the result
                    self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less( imin, len(e) )
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue( 1 < imin < len(e) - 1 )
            else:
                # no errors recorded -- the result must keep every feature
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue( (nfeatures[::-1] == rfe.ca.nfeatures).all() )

            # check if history has elements for every step
            self.assertTrue(set(rfe.ca.history)
                            == set(range(len(np.array(rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            # NOTE(review): relies on rfe.ca.history supporting element-wise
            # comparison (e.g. being a numpy array) -- confirm
            self.assertTrue(rfe.ca.nfeatures[-1]
                            == len(np.where(rfe.ca.history
                                           ==max(rfe.ca.history))[0]))
Ejemplo n.º 36
0
def test_rfe_sensmap():
    """Smoke test for sensitivities of an RFE-based feature selection classifier.

    Sensitivities and errors are computed fold by fold "manually" and then
    compared with ``CrossValidation`` (errors must match) and with
    ``RepeatedMeasure`` (sensitivities are asserted NOT to match).  The test
    always finishes by raising ``SkipTest`` pointing at the known issue.

    Background:
    http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    """
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Repeater
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    # three labels, six features of which the first three are non-bogus
    fds = normal_feature_dataset(nchunks=3,
                                 nonbogus_features=range(3),
                                 nfeatures=6,
                                 perlabel=9,
                                 snr=1,  # 100,   # pure signal! ;)
                                 nlabels=3)
    clfsvm = LinearCSVMC()

    # RFE driven by the SVM sensitivities, stopping on cross-validated error
    rfe_sensitivity = clfsvm.get_sensitivity_analyzer(
        postproc=maxofabs_sample())
    rfe_error = CrossValidation(clfsvm,
                                NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())
    rfesvm = RFE(rfe_sensitivity,
                 rfe_error,
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)
    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and
    # classification
    senses = []
    errors = []
    for pset in NFoldPartitioner().generate(fds):
        # split partitioned dataset
        split = list(Splitter('partitions').generate(pset))
        training, testing = split[0], split[1]
        # computing sensitivities also trains the classifier, so it can be
        # asked about the error right away
        senses.append(sensanasvm(training))
        errors.append(mean_mismatch_error(fclfsvm.predict(testing),
                                          testing.targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # rerunning purely for classification via CV has to give the same errors
    cv_errors = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)
    errors_cv = cv_errors(fds)
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    # the two sets of sensitivities are expected to differ (known failure)
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
Ejemplo n.º 37
0
def test_factorialpartitioner():
    """Compare ``FactorialPartitioner`` with an equivalent Sifter-based chain.

    Also covers a dataset with a single superordinate category, a dataset
    with unbalanced subordinate categories (a RuntimeWarning is expected)
    and a tiny dummy dataset with explicitly listed expected partitions.
    """
    def partitions_of(partitioner, dataset):
        # collect .sa.partitions of everything the partitioner generates
        return [p.sa.partitions for p in partitioner.generate(dataset)]

    # Same setup as in test_usecases: 6 categories grouped into 3
    # superordinate ones, without any actual superordinate effect
    ds = normal_feature_dataset(nchunks=5,
                                nonbogus_features=range(6),
                                nfeatures=6,
                                perlabel=30,
                                snr=100,  # pure signal! ;)
                                nlabels=6)
    ds.sa['subord'] = ds.sa.targets.copy()
    # 3 superord categories derived from the digit in the target label
    ds.sa['superord'] = ['super%d' % (int(label[1]) % 3)
                         for label in ds.targets]
    # wipe the original targets so nothing can silently rely on them
    ds.targets[:] = 0

    # variant with one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1'] * len(ds_1super.targets)

    # variant where superordinate categories differ in number of subordinates
    ds_unbalanced = Dataset(range(4),
                            sa=dict(subord=[0, 0, 1, 2],
                                    superord=[1, 1, 2, 2]))

    # reference implementation: n-fold over subord, then sift the splits
    # keeping only those balanced across superord categories
    n_superord = len(ds.sa['superord'].unique)
    fold_node = NFoldPartitioner(n_superord, attr='subord')
    sift_node = Sifter([('partitions', 2),
                        ('superord',
                         dict(uvalues=ds.sa['superord'].unique,
                              balanced=True))])
    npart = ChainNode([fold_node, sift_node], space='partitions')

    # now the new implementation
    factpart = FactorialPartitioner(NFoldPartitioner(attr='subord'),
                                    attr='superord')

    partitions_npart = partitions_of(npart, ds)
    partitions_factpart = partitions_of(factpart, ds)
    assert_array_equal(np.sort(partitions_npart), np.sort(partitions_factpart))

    # with a single superord class it has to match a plain n-fold
    nfold = NFoldPartitioner(attr='subord')
    partitions_nfold = partitions_of(nfold, ds_1super)
    partitions_factpart = partitions_of(factpart, ds_1super)
    assert_array_equal(np.sort(partitions_nfold), np.sort(partitions_factpart))

    # smoke test for unbalanced subord classes
    warning_msg = ('One or more superordinate attributes do not have the same '
                   'number of subordinate attributes. This could yield to '
                   'unbalanced partitions.')
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        partitions_factpart = partitions_of(factpart, ds_unbalanced)

    partitions_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    expectations = zip(partitions_factpart, partitions_unbalanced,
                       superord_unbalanced, subord_unbalanced)
    for out_part, true_part, super_out, sub_out in expectations:
        assert_array_equal(out_part, true_part)
        part1 = ds_unbalanced[out_part == 1]
        part2 = ds_unbalanced[out_part == 2]
        assert_array_equal((part1.sa.superord.tolist(),
                            part2.sa.superord.tolist()),
                           super_out)
        assert_array_equal((part1.sa.subord.tolist(),
                            part2.sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa=dict(subord=range(4),
                               superord=[1, 2] * 2))
    assert_array_equal(
        partitions_of(factpart, ds_dummy),
        [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
Ejemplo n.º 38
0
def test_factorialpartitioner():
    """Check ``FactorialPartitioner`` against a Sifter-based reference chain.

    Beyond the comparison with the reference, the test covers the ``count``
    and ``selection_strategy`` arguments, a dataset with a single
    superordinate category, a dataset with unbalanced subordinate categories
    (a RuntimeWarning is expected) and a small dummy dataset with explicitly
    listed expected partitions.
    """
    # Test against sifter and chainmap implemented in test_usecases
    # -- code below copied from test_usecases --
    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    ds = normal_feature_dataset(
        nlabels=6,
        snr=100,  # pure signal! ;)
        perlabel=30,
        nfeatures=6,
        nonbogus_features=range(6),
        nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    # superordinate label is derived from the second character of the target
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3, )
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't relying on them
    ds.targets[:] = 0

    # let's make two other datasets to test later
    # one superordinate category only
    ds_1super = ds.copy()
    ds_1super.sa['superord'] = ['super1' for i in ds_1super.targets]

    # one superordinate category has only one subordinate
    #ds_unbalanced = ds.copy()
    #nsuper1 = np.sum(ds_unbalanced.sa.superord == 'super1')
    #mask_superord = ds_unbalanced.sa.superord == 'super1'
    #uniq_subord = np.unique(ds_unbalanced.sa.subord[mask_superord])
    #ds_unbalanced.sa.subord[mask_superord] = [uniq_subord[0] for i in range(nsuper1)]
    ds_unbalanced = Dataset(range(4),
                            sa={
                                'subord': [0, 0, 1, 2],
                                'superord': [1, 1, 2, 2]
                            })

    # reference implementation to compare the new partitioner against
    npart = ChainNode(
        [
            ## so we split based on superord
            NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
            ## so it should select only those splits where we took 1 from
            ## each of the superord categories leaving things in balance
            Sifter([('partitions', 2),
                    ('superord', {
                        'uvalues': ds.sa['superord'].unique,
                        'balanced': True
                    })]),
        ],
        space='partitions')

    def partition(partitioner, ds_=ds):
        # collect .sa.partitions of every dataset the partitioner generates;
        # operates on the main dataset unless another one is given
        return [p.sa.partitions for p in partitioner.generate(ds_)]

    # now the new implementation
    # common kwargs
    factkw = dict(partitioner=NFoldPartitioner(attr='subord'), attr='superord')

    fpart = FactorialPartitioner(**factkw)
    p_npart = partition(npart)
    p_fpart = partition(fpart)

    # both implementations must produce the same set of partitions
    assert_array_equal(np.sort(p_npart), np.sort(p_fpart))

    # 'first' strategy with count=2 -- must be the first two of all 8
    fpart2 = FactorialPartitioner(count=2,
                                  selection_strategy='first',
                                  **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart), 8)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[:2], p_fpart2)

    # 1 equidistant -- should be the first one
    fpart1 = FactorialPartitioner(count=1, **factkw)
    p_fpart1 = partition(fpart1)
    assert_equal(len(p_fpart1), 1)
    assert_array_equal(p_fpart[:1], p_fpart1)

    # 2 equidistant -- expected to be every 4th of the 8 partitions
    fpart2 = FactorialPartitioner(count=2, **factkw)
    p_fpart2 = partition(fpart2)
    assert_equal(len(p_fpart2), 2)
    assert_array_equal(p_fpart[::4], p_fpart2)

    # without count -- should be all of them in original order
    fpartr = FactorialPartitioner(selection_strategy='random', **factkw)
    assert_array_equal(p_fpart, partition(fpartr))

    # but if with a count we should get some selection
    fpartr2 = FactorialPartitioner(selection_strategy='random',
                                   count=2,
                                   **factkw)
    # Let's generate a number of random selections:
    rand2_partitions = [partition(fpartr2) for i in xrange(10)]
    for p in rand2_partitions:
        assert_equal(len(p), 2)
    # majority of them must be different
    # (selections are converted to nested tuples to become hashable)
    assert len(set([tuple(map(tuple, x)) for x in rand2_partitions])) >= 5

    # now let's check it behaves correctly if we have only one superord class
    nfold = NFoldPartitioner(attr='subord')
    p_nfold = partition(nfold, ds_1super)
    p_fpart = partition(fpart, ds_1super)
    assert_array_equal(np.sort(p_nfold), np.sort(p_fpart))

    # smoke test for unbalanced subord classes
    warning_msg = 'One or more superordinate attributes do not have the same '\
                  'number of subordinate attributes. This could yield to '\
                  'unbalanced partitions.'
    with assert_warnings([(RuntimeWarning, warning_msg)]):
        p_fpart = partition(fpart, ds_unbalanced)

    # expected partitions and, per partition, the (value 1, value 2) split
    # of the superord and subord attributes
    p_unbalanced = [np.array([2, 2, 2, 1]), np.array([2, 2, 1, 2])]
    superord_unbalanced = [([2], [1, 1, 2]), ([2], [1, 1, 2])]
    subord_unbalanced = [([2], [0, 0, 1]), ([1], [0, 0, 2])]

    for out_part, true_part, super_out, sub_out in \
            zip(p_fpart, p_unbalanced,
                superord_unbalanced, subord_unbalanced):
        assert_array_equal(out_part, true_part)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.superord.tolist(),
                            ds_unbalanced[out_part == 2].sa.superord.tolist()),
                           super_out)
        assert_array_equal((ds_unbalanced[out_part == 1].sa.subord.tolist(),
                            ds_unbalanced[out_part == 2].sa.subord.tolist()),
                           sub_out)

    # now let's test on a dummy dataset
    ds_dummy = Dataset(range(4),
                       sa={
                           'subord': range(4),
                           'superord': [1, 2] * 2
                       })
    p_fpart = partition(fpart, ds_dummy)
    assert_array_equal(
        p_fpart, [[2, 2, 1, 1], [2, 1, 1, 2], [1, 2, 2, 1], [1, 1, 2, 2]])
Ejemplo n.º 39
0
def test_confusion_as_node():
    # Exercise Confusion as a post-processing node of CrossValidation:
    # first on its own, then chained with BayesConfusionHypothesis, and
    # finally with an extra custom node while run inside a searchlight.
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion
    ds = normal_feature_dataset(snr=2.0, perlabel=42, nchunks=3,
                                nonbogus_features=[0, 1], nfeatures=2)
    clf = GNB()
    cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=Confusion(labels=ds.UT),
        enable_ca=['stats'])
    res = cv(ds)
    # the node output needs to be identical to the confusion matrix stored
    # in the conditional attribute
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    # everything below requires scipy
    skip_if_no_external('scipy')

    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(
        clf, NFoldPartitioner(),
        errorfx=None,
        postproc=ChainNode([Confusion(labels=ds.UT),
                            BayesConfusionHypothesis()]))
    res = cv(ds)
    # only two possible hypotheses with two classes
    # (assert_equal rather than the deprecated assert_equals alias)
    assert_equal(len(res), 2)
    # the first hypothesis is the "cannot discriminate anything" one:
    # a single group containing both classes
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and that hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert np.e ** res.samples[0, 0] < np.e ** res.samples[1, 0]

    # Let's see how well it would work within the searchlight when we also
    # would like to store the hypotheses per each voxel.
    # Somewhat an ad-hoc solution for the answer posted on the ML
    #
    # run a 1d searchlight of radius 0; for that just provide a .fa with
    # coordinates
    ds.fa['voxel_indices'] = [[0], [1]]
    # and a custom Node which would collect .sa.hypothesis to place it
    # together with the posterior probabilities
    from mvpa2.base.node import Node
    from mvpa2.measures.searchlight import sphere_searchlight

    class KeepBothPosteriorAndHypothesis(Node):
        # Packs (samples, hypotheses) of the incoming dataset into a single
        # object-dtype element so that both are kept per searchlight center.
        def _call(self, ds):
            out = np.zeros(1, dtype=object)
            out[0] = (ds.samples, ds.sa.hypothesis)
            return out

    cv.postproc.append(KeepBothPosteriorAndHypothesis())
    sl = sphere_searchlight(cv, radius=0, nproc=1)
    res = sl(ds)

    assert_equal(res.shape, (1, 2))
    assert_equal(len(res.samples[0, 0]), 2)
    # posteriors per 1st SL
    assert_equal(res.samples[0, 0][0].shape, (2, 2))
    # 2 of hypotheses
    assert_equal(len(res.samples[0, 0][1]), 2)
Ejemplo n.º 40
0
"""
import mvpa2
import pylab as pl
import numpy as np
from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.measures.base import CrossValidation
from mvpa2.mappers.zscore import zscore

"""
Generate a binary dataset without any signal (snr=0).
"""
# Seed the random number generation so the generated dataset is the same
# on every run of this example.
mvpa2.seed(1)
ds_noise = normal_feature_dataset(perlabel=100, nlabels=2, nfeatures=2, snr=0,
                                  nonbogus_features=[0, 1])

# signal levels
sigs = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]

"""

To mimic the behavior of a hard-margin SVM (which is easier to
comprehend) whenever the classes become separable, we intentionally
set a very high C value.

"""

# Linear SVM with a very high C (C=1000); 'training_stats' is enabled on
# the classifier via enable_ca.
clf = LinearCSVMC(C=1000, enable_ca=['training_stats'])
# Cross-validation of that classifier using NFoldPartitioner, with 'stats'
# enabled via enable_ca.
cve = CrossValidation(clf, NFoldPartitioner(), enable_ca='stats')
# Sensitivity analyzer obtained from the classifier, requested with
# postproc=None.
sana = clf.get_sensitivity_analyzer(postproc=None)