def test_repeater():
    reps = 4
    r = Repeater(reps, space='OMG')
    dsl = [ds for ds in r.generate(Dataset([0, 1]))]
    assert_equal(len(dsl), reps)
    for i, ds in enumerate(dsl):
        assert_equal(ds.a.OMG, i)

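# A minimal standalone sketch (not part of the original tests) of the behavior
# exercised above: Repeater yields `count` copies of the input dataset and,
# when a `space` is given, tags each copy with its repetition index in the
# dataset attribute of that name.  The names '_repeater_sketch' and
# 'repetition' below are illustrative only.
def _repeater_sketch():
    from mvpa2.generators.base import Repeater
    from mvpa2.datasets import Dataset

    r = Repeater(3, space='repetition')
    for i, ds in enumerate(r.generate(Dataset([0, 1]))):
        assert ds.a.repetition == i   # each copy carries its iteration index
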
def _test_edmund_chong_20120907():  # pragma: no cover
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets
    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1,  # for real data > 1
                                      limit='partitions',
                                      apply_selection=True)],
                            space='partitions')

    clf = LinearCSVMC()  # choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator], space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])

def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets',
                                     count=num_perm,
                                     limit='chunks')
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(l_clf,
                           Repeater(count=2),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'),
                           null_dist=MCNullDist(permutator,
                                                tail='left'))

    # check reasonable error range
    err = terr(train)
    self.assertTrue(np.mean(err) < 0.4)

    # Let's do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.assertTrue(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

        # we should be able to access the actual samples of the distribution
        # yoh: why it is 3D really?
        # mih: because these are the distribution samples for the ONE error
        #      collapsed into ONE value across all folds.  It will also be
        #      3d if the return value of the measure isn't a scalar and it is
        #      not collapsed across folds.  It simply corresponds to the shape
        #      of the output dataset of the respective measure (+1 axis)
        # Some permutations could have been skipped since classifier failed
        # to train due to degenerate situation etc, thus accounting for them
        self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                         num_perm - cvte.null_dist.ca.skipped)

def __init__(self, lrn, partitioner, fselector,
             errorfx=mean_mismatch_error,
             analyzer_postproc=maxofabs_sample(),
             # callback?
             **kwargs):
    """
    Parameters
    ----------
    lrn : Learner
      Learner with a sensitivity analyzer which will be used both for the
      sensitivity analysis and transfer error estimation
    partitioner : Partitioner
      Used to generate cross-validation partitions for cross-validation
      to deduce optimal number of features to maintain
    fselector : Functor
      Given a sensitivity map it has to return the ids of those
      features that should be kept.
    errorfx : func, optional
      Functor to use for estimation of cross-validation error
    analyzer_postproc : func, optional
      Function to provide to the sensitivity analyzer as postproc
    """
    # Initialize itself preparing for the 2nd invocation
    # with determined number of nfeatures_min
    fmeasure = lrn.get_sensitivity_analyzer(postproc=analyzer_postproc)
    RFE.__init__(self, fmeasure, None, Repeater(2),
                 fselector=fselector,
                 bestdetector=None,
                 train_pmeasure=False,
                 stopping_criterion=None,
                 **kwargs)
    self._lrn = lrn                       # should not be modified, thus _
    self.partitioner = partitioner
    self.errorfx = errorfx
    self.analyzer_postproc = analyzer_postproc

def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1,  # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error,
                     postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0]))
        # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    # print senses.samples, senses_rm.samples
    # print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: "
                   "https://github.com/PyMVPA/PyMVPA/issues/117")

def test_gnbsearchlight_permutations():
    import mvpa2
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.base import Repeater
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    # import mvpa2.generators.permutation
    # reload(mvpa2.generators.permutation)
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.testing.datasets import datasets
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.testing.tools import assert_raises, ok_, assert_array_less

    # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM']
    # mvpa2.debug.metrics += ['pid']
    count = 10
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    slkwargs = dict(radius=3, space='voxel_indices', enable_ca=['roi_sizes'],
                    center_ids=[1, 10, 70, 100])

    mvpa2.seed(mvpa2._random_seed)
    clf = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')

    repeater = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)

    null_sl = sphere_gnbsearchlight(clf,
                                    ChainNode([splt, permutator],
                                              space=splt.get_space()),
                                    postproc=mean_sample(),
                                    errorfx=mean_mismatch_error,
                                    **slkwargs)

    distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                           enable_ca=['dist_samples'])
    sl = sphere_gnbsearchlight(clf, splt,
                               reuse_neighbors=True,
                               null_dist=distr_est,
                               postproc=mean_sample(),
                               errorfx=mean_mismatch_error,
                               **slkwargs)
    if __debug__:                         # assert is done only without -O mode
        assert_raises(NotImplementedError, sl, ds)

    # "ad-hoc searchlights can't handle yet varying targets across partitions"
    if False:
        # after above limitation is removed -- enable
        sl_map = sl(ds)
        sl_null_prob = sl.ca.null_prob.samples.copy()

    mvpa2.seed(mvpa2._random_seed)
    ### 'normal' Searchlight
    clf = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')
    repeater = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    # rng=np.random.RandomState(0))
    # would trigger failure since the same np.random state
    # would be reused across all pprocesses
    null_cv = CrossValidation(clf,
                              ChainNode([splt, permutator],
                                        space=splt.get_space()),
                              postproc=mean_sample())
    null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs)
    distr_est_normal = MCNullDist(repeater, tail='left',
                                  measure=null_sl_normal,
                                  enable_ca=['dist_samples'])

    cv = CrossValidation(clf, splt, errorfx=mean_mismatch_error,
                         enable_ca=['stats'], postproc=mean_sample())
    sl = sphere_searchlight(cv, nproc=nproc, null_dist=distr_est_normal,
                            **slkwargs)
    sl_map_normal = sl(ds)
    sl_null_prob_normal = sl.ca.null_prob.samples.copy()

    # For every feature -- we should get some variance in estimates.  In
    # case of failure they are all really close to each other (up to
    # numerical precision), so variance will be close to 0
    assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0],
                              axis=1), -1e-5)
    for s in distr_est_normal.ca.dist_samples.samples[0]:
        ok_(len(np.unique(s)) > 1)

def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the
            # stopping point but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),     # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                     postproc=ChainMapper([FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),     # we will use the same full cv-training dataset
                 fselector=FractionTailSelector(0.50, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,  # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history
                                        == max(rfe.ca.history))[0]))

def __init__(self, lrn, partitioner, fselector,
             errorfx=mean_mismatch_error,
             fmeasure_postproc=None,
             fmeasure=None,
             nproc=1,
             # callback?
             **kwargs):
    """
    Parameters
    ----------
    lrn : Learner
      Learner with a sensitivity analyzer which will be used both for the
      sensitivity analysis and transfer error estimation
    partitioner : Partitioner
      Used to generate cross-validation partitions for cross-validation
      to deduce optimal number of features to maintain
    fselector : Functor
      Given a sensitivity map it has to return the ids of those
      features that should be kept.
    errorfx : func, optional
      Functor to use for estimation of cross-validation error
    fmeasure_postproc : func, optional
      Function to provide to the sensitivity analyzer as postproc.  If no
      fmeasure is provided and classifier sensitivity is used, then
      maxofabs_sample() would be used for this postproc, unless other
      value is provided
    fmeasure : Function, optional
      Featurewise measure.  If None was provided, lrn's sensitivity
      analyzer will be used.
    """
    # Initialize itself preparing for the 2nd invocation
    # with determined number of nfeatures_min
    # TODO: move this into _train since better not to assign anything here
    # to avoid possible problems with copies needing to deal with the same
    # lrn... but then we might like again to reconsider delegation instead
    # of subclassing here....

    if fmeasure is None:
        if __debug__:
            debug('RFE',
                  'fmeasure was not provided, will be using the '
                  'sensitivity analyzer for %s' % lrn)
        fmeasure = lrn.get_sensitivity_analyzer(
            postproc=fmeasure_postproc if fmeasure_postproc is not None
                     else maxofabs_sample())
        train_pmeasure = False
    else:
        assert fmeasure_postproc is None, \
            "There should be no explicit fmeasure_postproc when fmeasure " \
            "is specified"
        # if user provided explicit value -- use it! otherwise, we do want
        # to train an arbitrary fmeasure
        train_pmeasure = kwargs.pop('train_pmeasure', True)

    RFE.__init__(self, fmeasure, None, Repeater(2),
                 fselector=fselector,
                 bestdetector=None,
                 train_pmeasure=train_pmeasure,
                 stopping_criterion=None,
                 **kwargs)
    self._lrn = lrn                       # should not be modified, thus _
    self.partitioner = partitioner
    self.errorfx = errorfx
    self.fmeasure_postproc = fmeasure_postproc
    self.nproc = nproc

def get_crossvalidation_instance(learner, partitioner, errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount, attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(
            learner_space,
            limit={partitioner.get_space(): 1},
            count=1)
        # CV with null-distribution estimation that permutes the training
        # data for each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner,
                                  perm_gen_node,
                                  postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater,
                               tail=prob_tail,
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner,
                         gennode,
                         errorfx=errorfx,
                         null_dist=distr_est,
                         postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv

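# A hypothetical usage sketch (not part of the original module): building a
# cross-validation node with permutation-based significance testing via the
# helper above and applying it to a dataset.  The classifier, partitioner and
# permutation count below are arbitrary example choices, and
# '_crossvalidation_instance_sketch' is an illustrative name.
def _crossvalidation_instance_sketch(ds):
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error

    cv = get_crossvalidation_instance(LinearCSVMC(), NFoldPartitioner(),
                                      mean_mismatch_error,
                                      permutations=100)
    res = cv(ds)                  # fold-averaged error (avg_datafold_results=True)
    return res, cv.ca.null_prob   # p-value from the Monte Carlo null distribution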