Beispiel #1
0
def test_repeater():
    reps = 4
    r = Repeater(reps, space='OMG')
    dsl = [ds for ds in r.generate(Dataset([0,1]))]
    assert_equal(len(dsl), reps)
    for i, ds in enumerate(dsl):
        assert_equal(ds.a.OMG, i)
Beispiel #2
0
def test_repeater():
    reps = 4
    r = Repeater(reps, space='OMG')
    dsl = [ds for ds in r.generate(Dataset([0, 1]))]
    assert_equal(len(dsl), reps)
    for i, ds in enumerate(dsl):
        assert_equal(ds.a.OMG, i)
Beispiel #3
0
def _test_edmund_chong_20120907():  # pragma: no cover
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets
    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1, # for real data > 1
                                      limit='partitions',
                                      apply_selection=True
                                      )],
                            space='partitions')

    clf = LinearCSVMC() #choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator], space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])
Beispiel #4
0
    def test_null_dist_prob(self, l_clf):
        train = datasets['uni2medium']

        num_perm = 10
        permutator = AttributePermutator('targets',
                                         count=num_perm,
                                         limit='chunks')
        # define class to estimate NULL distribution of errors
        # use left tail of the distribution since we use MeanMatchFx as error
        # function and lower is better
        terr = TransferMeasure(l_clf,
                               Repeater(count=2),
                               postproc=BinaryFxNode(mean_mismatch_error,
                                                     'targets'),
                               null_dist=MCNullDist(permutator, tail='left'))

        # check reasonable error range
        err = terr(train)
        self.assertTrue(np.mean(err) < 0.4)

        # Lets do the same for CVTE
        cvte = CrossValidation(l_clf,
                               OddEvenPartitioner(),
                               null_dist=MCNullDist(permutator,
                                                    tail='left',
                                                    enable_ca=['dist_samples'
                                                               ]),
                               postproc=mean_sample())
        cv_err = cvte(train)

        # check that the result is highly significant since we know that the
        # data has signal
        null_prob = np.asscalar(terr.ca.null_prob)

        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(
                null_prob <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal" % null_prob)

            self.assertTrue(
                np.asscalar(cvte.ca.null_prob) <= 0.1,
                msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal" %
                np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds. It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds. it simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)
Beispiel #5
0
    def __init__(
            self,
            lrn,
            partitioner,
            fselector,
            errorfx=mean_mismatch_error,
            analyzer_postproc=maxofabs_sample(),
            # callback?
            **kwargs):
        """
        Parameters
        ----------
        lrn : Learner
          Learner with a sensitivity analyzer which will be used both
          for the sensitivity analysis and transfer error estimation
        partitioner : Partitioner
          Used to generate cross-validation partitions for cross-validation
          to deduce optimal number of features to maintain
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        errorfx : func, optional
          Functor to use for estimation of cross-validation error
        analyzer_postproc : func, optional
          Function to provide to the sensitivity analyzer as postproc
        """
        # Initialize itself preparing for the 2nd invocation
        # with determined number of nfeatures_min
        fmeasure = lrn.get_sensitivity_analyzer(postproc=analyzer_postproc)

        RFE.__init__(self,
                     fmeasure,
                     None,
                     Repeater(2),
                     fselector=fselector,
                     bestdetector=None,
                     train_pmeasure=False,
                     stopping_criterion=None,
                     **kwargs)
        self._lrn = lrn  # should not be modified, thus _
        self.partitioner = partitioner
        self.errorfx = errorfx
        self.analyzer_postproc = analyzer_postproc
Beispiel #6
0
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1, # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())


    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0])) # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]), split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(), errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
Beispiel #7
0
def test_gnbsearchlight_permutations():
    import mvpa2
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.base import  Repeater
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    #import mvpa2.generators.permutation
    #reload(mvpa2.generators.permutation)
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.testing.datasets import datasets
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.testing.tools import assert_raises, ok_, assert_array_less

    # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM']
    # mvpa2.debug.metrics += ['pid']
    count = 10
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    slkwargs = dict(radius=3, space='voxel_indices',  enable_ca=['roi_sizes'],
                    center_ids=[1, 10, 70, 100])

    mvpa2.seed(mvpa2._random_seed)
    clf  = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')

    repeater   = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1)

    null_sl = sphere_gnbsearchlight(clf, ChainNode([splt, permutator], space=splt.get_space()),
                                    postproc=mean_sample(), errorfx=mean_mismatch_error,
                                    **slkwargs)

    distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                           enable_ca=['dist_samples'])
    sl = sphere_gnbsearchlight(clf, splt,
                               reuse_neighbors=True,
                               null_dist=distr_est, postproc=mean_sample(),
                               errorfx=mean_mismatch_error,
                               **slkwargs)
    if __debug__:                         # assert is done only without -O mode
        assert_raises(NotImplementedError, sl, ds)

    # "ad-hoc searchlights can't handle yet varying targets across partitions"
    if False:
        # after above limitation is removed -- enable
        sl_map = sl(ds)
        sl_null_prob = sl.ca.null_prob.samples.copy()

    mvpa2.seed(mvpa2._random_seed)
    ### 'normal' Searchlight
    clf  = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')
    repeater   = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1}, count=1)
    # rng=np.random.RandomState(0)) # to trigger failure since the same np.random state
    # would be reused across all pprocesses
    null_cv = CrossValidation(clf, ChainNode([splt, permutator], space=splt.get_space()),
                              postproc=mean_sample())
    null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs)
    distr_est_normal = MCNullDist(repeater, tail='left', measure=null_sl_normal,
                           enable_ca=['dist_samples'])

    cv = CrossValidation(clf, splt, errorfx=mean_mismatch_error,
                         enable_ca=['stats'], postproc=mean_sample() )
    sl = sphere_searchlight(cv, nproc=nproc, null_dist=distr_est_normal, **slkwargs)
    sl_map_normal = sl(ds)
    sl_null_prob_normal = sl.ca.null_prob.samples.copy()

    # For every feature -- we should get some variance in estimates In
    # case of failure they are all really close to each other (up to
    # numerical precision), so variance will be close to 0
    assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0],
                              axis=1), -1e-5)
    for s in distr_est_normal.ca.dist_samples.samples[0]:
        ok_(len(np.unique(s)) > 1)
Beispiel #8
0
    def test_rfe(self, clf):

        # sensitivity analyser and transfer error quantifier use the SAME clf!
        sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
        pmeasure = ProxyMeasure(clf,
                                postproc=BinaryFxNode(mean_mismatch_error,
                                                      'targets'))
        cvmeasure = CrossValidation(clf,
                                    NFoldPartitioner(),
                                    errorfx=mean_mismatch_error,
                                    postproc=mean_sample())

        rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

        # explore few recipes
        for rfe, data in [
                # because the clf is already trained when computing the sensitivity
                # map, prevent retraining for transfer error calculation
                # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False), self.get_data()),
                # use cross-validation within training to get error for the stopping point
                # but use full training data to derive sensitivity
            (
                RFE(
                    sens_ana,
                    cvmeasure,
                    Repeater(
                        2
                    ),  # give the same full dataset to sens_ana and cvmeasure
                    fselector=FractionTailSelector(0.70,
                                                   mode='select',
                                                   tail='upper'),
                    train_pmeasure=True),
                normal_feature_dataset(perlabel=20,
                                       nchunks=5,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5)),
                # use cross-validation (via SplitClassifier) and get mean
                # of normed sensitivities across those splits
            (
                RFE(
                    rfesvm_split.get_sensitivity_analyzer(
                        postproc=ChainMapper([
                            FxMapper('features', l2_normed),
                            FxMapper('samples', np.mean),
                            FxMapper('samples', np.abs)
                        ])),
                    ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                    Repeater(
                        2),  #  we will use the same full cv-training dataset
                    fselector=FractionTailSelector(0.50,
                                                   mode='select',
                                                   tail='upper'),
                    stopping_criterion=NBackHistoryStopCrit(
                        BestDetector(), 10),
                    train_pmeasure=
                    False,  # we just extract it from existing confusion
                    update_sensitivity=True),
                normal_feature_dataset(perlabel=28,
                                       nchunks=7,
                                       nfeatures=200,
                                       nonbogus_features=[0, 1],
                                       snr=1.5))
        ]:
            # prep data
            # data = datasets['uni2medium']
            data_nfeatures = data.nfeatures

            rfe.train(data)
            resds = rfe(data)

            # fail if orig datasets are changed
            self.assertTrue(data.nfeatures == data_nfeatures)

            # check that the features set with the least error is selected
            if len(rfe.ca.errors):
                e = np.array(rfe.ca.errors)
                if isinstance(rfe._fselector, FixedNElementTailSelector):
                    self.assertTrue(resds.nfeatures == data_nfeatures -
                                    e.argmin())
                else:
                    imin = np.argmin(e)
                    if 'does_feature_selection' in clf.__tags__:
                        # if clf is smart it might figure it out right away
                        assert_array_less(imin, len(e))
                    else:
                        # in this case we can even check if we had actual
                        # going down/up trend... although -- why up???
                        self.assertTrue(1 < imin < len(e) - 1)
            else:
                self.assertTrue(resds.nfeatures == data_nfeatures)

            # silly check if nfeatures is in decreasing order
            nfeatures = np.array(rfe.ca.nfeatures).copy()
            nfeatures.sort()
            self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

            # check if history has elements for every step
            self.assertTrue(
                set(rfe.ca.history) == set(range(len(np.array(
                    rfe.ca.errors)))))

            # Last (the largest number) can be present multiple times even
            # if we remove 1 feature at a time -- just need to stop well
            # in advance when we have more than 1 feature left ;)
            self.assertTrue(rfe.ca.nfeatures[-1] == len(
                np.where(rfe.ca.history == max(rfe.ca.history))[0]))
Beispiel #9
0
    def __init__(
            self,
            lrn,
            partitioner,
            fselector,
            errorfx=mean_mismatch_error,
            fmeasure_postproc=None,
            fmeasure=None,
            nproc=1,
            # callback?
            **kwargs):
        """
        Parameters
        ----------
        lrn : Learner
          Learner with a sensitivity analyzer which will be used both
          for the sensitivity analysis and transfer error estimation
        partitioner : Partitioner
          Used to generate cross-validation partitions for cross-validation
          to deduce optimal number of features to maintain
        fselector : Functor
          Given a sensitivity map it has to return the ids of those
          features that should be kept.
        errorfx : func, optional
          Functor to use for estimation of cross-validation error
        fmeasure_postproc : func, optional
          Function to provide to the sensitivity analyzer as postproc.  If no
          fmeasure is provided and classifier sensitivity is used, then
          maxofabs_sample() would be used for this postproc, unless other
          value is provided
        fmeasure : Function, optional
          Featurewise measure.  If None was provided, lrn's sensitivity
          analyzer will be used.
        """
        # Initialize itself preparing for the 2nd invocation
        # with determined number of nfeatures_min
        # TODO:  move this into _train since better not to assign anything here
        # to avoid possible problems with copies needing to deal with the same
        # lrn... but then we might like again to reconsider delegation instead
        # of subclassing here....
        if fmeasure is None:
            if __debug__:
                debug(
                    'RFE', 'fmeasure was not provided, will be using the '
                    'sensitivity analyzer for %s' % lrn)
            fmeasure = lrn.get_sensitivity_analyzer(
                postproc=fmeasure_postproc
                if fmeasure_postproc is not None else maxofabs_sample())
            train_pmeasure = False
        else:
            assert fmeasure_postproc is None, "There should be no explicit " \
                    "fmeasure_postproc when fmeasure is specified"
            # if user provided explicit value -- use it! otherwise, we do want
            # to train an arbitrary fmeasure
            train_pmeasure = kwargs.pop('train_pmeasure', True)

        RFE.__init__(self,
                     fmeasure,
                     None,
                     Repeater(2),
                     fselector=fselector,
                     bestdetector=None,
                     train_pmeasure=train_pmeasure,
                     stopping_criterion=None,
                     **kwargs)
        self._lrn = lrn  # should not be modified, thus _
        self.partitioner = partitioner
        self.errorfx = errorfx
        self.fmeasure_postproc = fmeasure_postproc
        self.nproc = nproc
Beispiel #10
0
def get_crossvalidation_instance(learner,
                                 partitioner,
                                 errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount,
                            attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(learner_space,
                                         limit={partitioner.get_space(): 1},
                                         count=1)
        # CV with null-distribution estimation that permutes the training data for
        # each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner,
                                  perm_gen_node,
                                  postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater,
                               tail=prob_tail,
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner,
                         gennode,
                         errorfx=errorfx,
                         null_dist=distr_est,
                         postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv