Example 1
def test_ds_shallowcopy():
    # let's use an instance of a somewhat evolved dataset
    ds = normal_feature_dataset()
    ds.samples = ds.samples.view(myarray)

    # SHALLOW copy the beast
    ds_ = copy.copy(ds)
    # verify that we have the same data
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    # array subclass survives
    ok_(isinstance(ds_.samples, myarray))

    # modify and see that we actually DO change the data in both
    ds_.samples[0, 0] = 1234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.targets[0] = 'ab'
    ds_.sa.chunks[0] = 234
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)
    ok_(ds.sa.targets[0] == 'ab')
    ok_(ds.sa.chunks[0] == 234)
Example 2
def test_ds_deepcopy():
    # let's use an instance of a somewhat evolved dataset
    ds = normal_feature_dataset()
    ds.samples = ds.samples.view(myarray)
    # Clone the beast
    ds_ = ds.copy()
    # array subclass survives
    ok_(isinstance(ds_.samples, myarray))

    # verify that we have the same data
    assert_array_equal(ds.samples, ds_.samples)
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    # modify the copy and verify that the data in the original is unchanged
    ds_.samples[0, 0] = 1234
    ok_(np.any(ds.samples != ds_.samples))
    assert_array_equal(ds.targets, ds_.targets)
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.targets = np.hstack(([123], ds_.targets[1:]))
    ok_(np.any(ds.samples != ds_.samples))
    ok_(np.any(ds.targets != ds_.targets))
    assert_array_equal(ds.chunks, ds_.chunks)

    ds_.sa.chunks = np.hstack(([1234], ds_.chunks[1:]))
    ok_(np.any(ds.samples != ds_.samples))
    ok_(np.any(ds.targets != ds_.targets))
    ok_(np.any(ds.chunks != ds_.chunks))
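
The two tests above hinge on the difference between a shallow and a deep copy of a container holding numpy arrays. A minimal, PyMVPA-free sketch of the same semantics (the Bunch class is a hypothetical stand-in for a dataset, not part of PyMVPA):

import copy
import numpy as np

class Bunch(object):
    # hypothetical stand-in for a Dataset: a container holding an array
    def __init__(self, samples):
        self.samples = samples

ds = Bunch(np.zeros((2, 2)))

shallow = copy.copy(ds)            # new container, SAME underlying array
shallow.samples[0, 0] = 1234
assert ds.samples[0, 0] == 1234    # the change is visible in the original

deep = copy.deepcopy(ds)           # recursively duplicates the array as well
deep.samples[0, 0] = 0
assert ds.samples[0, 0] == 1234    # the original stays untouched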
Example 3
    def test_binds(self):
        ds = normal_feature_dataset()
        ds_data = ds.samples.copy()
        ds_chunks = ds.chunks.copy()
        self.failUnless(np.all(ds.samples == ds_data))  # sanity check

        funcs = ['coarsen_chunks']

        for f in funcs:
            eval('ds.%s()' % f)
            self.failUnless(
                np.any(ds.samples != ds_data)
                or np.any(ds.chunks != ds_chunks),
                msg="We should have modified original dataset with %s" % f)
            ds.samples = ds_data.copy()
            ds.sa['chunks'].value = ds_chunks.copy()

        # and some which should just return results
        for f in [
                'aggregate_features', 'remove_invariant_features',
                'get_samples_per_chunk_target'
        ]:
            res = eval('ds.%s()' % f)
            self.failUnless(res is not None,
                            msg='We should have got result from function %s' %
                            f)
            self.failUnless(
                np.all(ds.samples == ds_data),
                msg="Function %s should have not modified original dataset" %
                f)
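
A side note on the eval('ds.%s()' % f) pattern above: the same dynamic dispatch can be written with getattr, which avoids evaluating strings. A minimal sketch with a hypothetical stand-in object (DS is illustrative, not a PyMVPA class):

class DS(object):
    # hypothetical stand-in exposing one of the bound methods tested above
    def coarsen_chunks(self):
        return None

ds = DS()
for f in ['coarsen_chunks']:
    res = getattr(ds, f)()   # equivalent to eval('ds.%s()' % f), without eval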
Example 4
    def test_smlr_sensitivities(self):
        data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

        # use SMLR on a binary problem, but without fitting all weights
        clf = SMLR(fit_all_weights=False)
        clf.train(data)

        # now ask for the sensitivities WITHOUT having to pass the dataset
        # again
        sens = clf.get_sensitivity_analyzer(force_training=False)()
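        # with fit_all_weights=False, weights are fit for all but one of the
        # classes, hence the len(data.UT) - 1 rows expected below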
        self.failUnless(sens.shape == (len(data.UT) - 1, data.nfeatures))
Example 5
def test_mdpflowmapper():
    flow = mdp.nodes.PCANode() + mdp.nodes.SFANode()
    fm = MDPFlowMapper(flow)
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    fm.train(ds)
    assert_false(fm.flow[0].is_training())
    assert_false(fm.flow[1].is_training())

    fds = fm.forward(ds)
    assert_true(isinstance(fds, Dataset))
    assert_equal(fds.samples.shape, ds.samples.shape)
Example 6
def test_glmnet_c_sensitivities():
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # use GLMNET on a binary problem
    clf = GLMNET_C()
    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_train=False)(None)

    #failUnless(sens.shape == (data.nfeatures,))
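    # one sensitivity vector per unique target (class), as asserted below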
    assert_equal(sens.shape, (len(data.UT), data.nfeatures))
Example 7
def test_glmnet_c_sensitivities():
    data = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    # use GLMNET on a binary problem
    clf = GLMNET_C()
    clf.train(data)

    # now ask for the sensitivities WITHOUT having to pass the dataset
    # again
    sens = clf.get_sensitivity_analyzer(force_training=False)()

    #failUnless(sens.shape == (data.nfeatures,))
    assert_equal(sens.shape, (len(data.UT), data.nfeatures))
Example 8
def test_mdpnodemapper():
    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)

    node = mdp.nodes.PCANode()
    mm = MDPNodeMapper(node, nodeargs={'stoptrain': ((), {'debug': True})})

    mm.train(ds)

    fds = mm.forward(ds)
    if externals.versions['mdp'] >= '2.5':
        assert_true(hasattr(mm.node, 'cov_mtx'))

    assert_true(isinstance(fds, Dataset))
    assert_equal(fds.samples.shape, ds.samples.shape)

    # set projection onto first 2 components
    mm.nodeargs['exec'] = ((), {'n': 2})
    # should be different from above
    lfds = mm.forward(ds.samples)
    # output shape changes although the node still claims otherwise
    assert_equal(mm.node.output_dim, 4)
    assert_equal(lfds.shape[0], fds.samples.shape[0])
    assert_equal(lfds.shape[1], 2)
    assert_array_equal(lfds, fds.samples[:, :2])

    # reverse
    rfds = mm.reverse(fds)

    # reversing the even smaller projected output works too
    rlfds = mm.reverse(lfds)
    assert_equal(rfds.samples.shape, ds.samples.shape)

    # retraining has to work on a new dataset too, since we copy the node
    # internally
    dsbig = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=10)
    mm.train(dsbig)
Example 9
    def test_cache_speedup(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)
        from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
        from mvpa.datasets.splitters import NFoldSplitter
        from time import time
        from mvpa.clfs.transerror import TransferError
        from mvpa.kernels.base import CachedKernel
        from mvpa.kernels.sg import RbfSGKernel
        from mvpa.misc.data_generators import normal_feature_dataset

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidatedTransferError(TransferError(ck),
                                           splitter=NFoldSplitter())
        cv_s = CrossValidatedTransferError(TransferError(sk),
                                           splitter=NFoldSplitter())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                      means=np.random.randn(2, P), nfeatures=P)

        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err),
                            np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds' % ncv_time
        #print 'Caching time: %s seconds' % cachetime
        #print 'Cached CV time: %s seconds' % ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s' % speedup

        # Speedup ideally should be 10, though it's not purely linear
        self.failIf(speedup < 2, 'Problem caching data - too slow!')
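
A back-of-envelope sketch of the "ideally should be 10" comment, with purely hypothetical timings: with nchunks=10, plain cross-validation recomputes the kernel in each of the 10 folds, while the cached variant computes it exactly once up front.

kernel_t, fit_t, nchunks = 1.0, 0.05, 10    # assumed costs in seconds
ncv_time = nchunks * (kernel_t + fit_t)     # uncached CV: 10.5 s
cached_total = kernel_t + nchunks * fit_t   # cache once + CV: 1.5 s
print("speedup ~ %.1f" % (ncv_time / cached_total))  # ~7; tends to 10 as fit_t -> 0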
Example 10
def test_mdpflow_additional_arguments_nones():
    skip_if_no_external('mdp', min_version='2.5')
    # we have no IdentityNode yet... is there an analog?

    ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4)
    flow = mdp.nodes.PCANode() + mdp.nodes.IdentityNode() + mdp.nodes.FDANode()
    # this is what it would look like in MDP itself
    #flow.train([[ds.samples],
    #            [[ds.samples, ds.sa.targets]]])
    assert_raises(ValueError, MDPFlowMapper, flow, node_arguments=[[], []])
    fm = MDPFlowMapper(flow, node_arguments=(None, None, [ds.sa.targets]))
    fm.train(ds)
    fds = fm.forward(ds)
    assert_equal(ds.samples.shape, fds.samples.shape)
    rds = fm.reverse(fds)
    assert_array_almost_equal(ds.samples, rds.samples)
Example 11
    def test_cache_speedup(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidatedTransferError(TransferError(ck),
                                           splitter=NFoldSplitter())
        cv_s = CrossValidatedTransferError(TransferError(sk),
                                           splitter=NFoldSplitter())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2,
                                      perlabel=200,
                                      nchunks=10,
                                      means=np.random.randn(2, P),
                                      nfeatures=P)

        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds'%ncv_time
        #print 'Caching time: %s seconds'%cachetime
        #print 'Cached CV time: %s seconds'%ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s'%speedup

        # Speedup ideally should be 10, though it's not purely linear
        self.failIf(speedup < 2, 'Problem caching data - too slow!')
Example 12
    def test_cache_speedup(self):
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidation(ck, NFoldPartitioner())
        cv_s = CrossValidation(sk, NFoldPartitioner())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                      means=np.random.randn(2, P), nfeatures=P)

        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err),
                            np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds' % ncv_time
        #print 'Caching time: %s seconds' % cachetime
        #print 'Cached CV time: %s seconds' % ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s' % speedup

        # Speedup ideally should be 10, though it's not purely linear
        self.failIf(speedup < 2, 'Problem caching data - too slow!')
Example 13
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                           'targets'))
        cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore a few recipes
        for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                pmeasure,
                Splitter('train'),
                fselector=FixedNElementTailSelector(1),
                train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(
                     0.70,
                     mode='select', tail='upper'),
                train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([ FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),             #  we will use the same full cv-training dataset
                 fselector=FractionTailSelector(
                     0.50,
                     mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,    # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.failUnless(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.failUnless(resds.nfeatures == data_nfeatures - e.argmin())
                else:
                    # in this case we can even check whether there was an
                    # actual downward/upward trend... although -- why up???
                    imin = np.argmin(e)
                    self.failUnless(1 < imin < len(e) - 1)
            else:
                self.failUnless(resds.nfeatures == data_nfeatures)

            # simple check that nfeatures comes in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.failUnless((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.failUnless(set(rfe.ca.history)
                            == set(range(len(np.array(rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.failUnless(rfe.ca.nfeatures[-1]
                            == len(np.where(rfe.ca.history
                                            == max(rfe.ca.history))[0]))
Example 14
draft of a complete analysis.

First import the necessary pieces of PyMVPA -- this time each bit individually.
"""
from mvpa.datasets.base import dataset_wizard
from mvpa.datasets.splitters import OddEvenSplitter
from mvpa.clfs.svm import LinearCSVMC
from mvpa.clfs.transerror import TransferError
from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
from mvpa.measures.searchlight import Searchlight
from mvpa.misc.data_generators import normal_feature_dataset

"""For the sake of simplicity, let's use a small artificial dataset."""

# overcomplicated way to generate an example dataset
ds = normal_feature_dataset(perlabel=10, nlabels=2, nchunks=2, nfeatures=10,
                            nonbogus_features=[3, 7], snr=5.0)
dataset = dataset_wizard(samples=ds.samples, targets=ds.targets,
                         chunks=ds.chunks)

"""Now it only takes three lines for a searchlight analysis."""

# setup measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidatedTransferError(TransferError(LinearCSVMC()), OddEvenSplitter())

# setup searchlight with a radius of 5 and the measure configured above
sl = Searchlight(cv, radius=5)

# run searchlight on dataset
sl_map = sl(dataset)

print "Best performing sphere error:", min(sl_map)