def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()              # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which standardizes all features within each
        sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None]) \
                / np.std(data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])
    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([ZScoreFeaturesMapper(auto_train=True),
                              CrossValidation(sample_clf, NFoldPartitioner())])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)
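# A minimal numpy-only sketch (illustrative, not part of the original test
# suite) of what ZScoreFeaturesMapper above computes: every sample (row) is
# standardized independently, so each output row has mean 0 and std 1.
def _sketch_zscore_per_sample():
    x = np.random.randn(4, 10)
    z = (x - x.mean(axis=1)[:, None]) / x.std(axis=1)[:, None]
    assert np.allclose(z.mean(axis=1), 0)
    assert np.allclose(z.std(axis=1), 1)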
def test_noise_classification(self):
    # get a dataset with a very high SNR
    data = get_mv_pattern(10)

    # do crossval with default errorfx and 'mean' combiner
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # must return a scalar value
    result = cv(data)
    # must be perfect
    self.assertTrue((result.samples < 0.05).all())

    # do crossval with permuted regressors
    cv = CrossValidation(
        sample_clf_nl,
        ChainNode([NFoldPartitioner(),
                   AttributePermutator('targets', count=10)],
                  space='partitions'))
    results = cv(data)

    # results must not be the same
    self.assertTrue(len(np.unique(results.samples)) > 1)

    # must be at chance level
    pmean = np.array(results).mean()
    self.assertTrue(0.42 < pmean < 0.58)
def test_sifter_with_balancing():
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)),
                 sa={'chunks': [0, 1, 2, 3, 4, 5],
                     'targets': ['c', 'c', 'c', 'p', 'p', 'p']})

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # so we will take 4 chunks out of the available 6, but would care only
    # about those partitions where we have a balanced number of 'c' and 'p'
    # entries
    assert_raises(ValueError,
                  lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
                  ds)

    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks'),
                     Sifter([('partitions', 2),
                             ('targets',
                              dict(uvalues=['c', 'p'], balanced=True))])])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)
    for ds_ in dss:
        testing = ds[ds_.sa.partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[ds_.sa.partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
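# Counting sketch (illustrative only): the 15 partitions above are just
# C(6, 4) -- the number of ways NFoldPartitioner(cvtype=4) can pick 4 test
# chunks out of the 6 available.
def _sketch_cvtype4_count():
    from itertools import combinations
    assert len(list(combinations(range(6), 4))) == 15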
def test_nfold_random_counted_selection_partitioner(self):
    # Lets get somewhat extensive but complete one and see if
    # everything is legit. 0.5 must correspond to 50%, in our case
    # 5 out of 10 unique chunks
    split_partitions = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(0.5).generate(self.data)]
    # 252 is # of combinations of 5 from 10
    assert_equal(len(split_partitions), 252)
    # verify that all of them are unique
    assert_equal(len(set(split_partitions)), 252)

    # now let's limit our query
    kwargs = dict(count=10, selection_strategy='random')
    split10_partitions = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(5, **kwargs).generate(self.data)]
    split10_partitions_ = [
        tuple(x.sa.partitions)
        for x in NFoldPartitioner(0.5, **kwargs).generate(self.data)]
    # to make sure that I deal with sets of tuples correctly:
    assert_equal(len(set(split10_partitions)), 10)
    assert_equal(len(split10_partitions), 10)
    assert_equal(len(split10_partitions_), 10)

    # and they must differ (same ones are possible but very very unlikely)
    assert_not_equal(split10_partitions, split10_partitions_)

    # but every one of them must be within the known exhaustive set
    assert_equal(set(split_partitions).intersection(split10_partitions),
                 set(split10_partitions))
    assert_equal(set(split_partitions).intersection(split10_partitions_),
                 set(split10_partitions_))
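# Arithmetic sketch (illustrative only): the 252 above is C(10, 5), the
# number of ways to choose 5 test chunks from the 10 unique ones.
def _sketch_choose_5_of_10():
    from math import factorial
    assert factorial(10) // (factorial(5) ** 2) == 252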
def test_custom_targets(self, lrn):
    """Simple test if a learner could cope with custom sa not targets
    """
    # Since we are comparing performances of two learners, we need
    # to assure that if they depend on some random seed -- they
    # would use the same value.  Currently we have such stochastic
    # behavior in SMLR
    if 'seed' in lrn.params:
        from mvpa2 import _random_seed
        lrn = lrn.clone()               # clone the beast
        lrn.params.seed = _random_seed  # reuse the same seed
    lrn_ = lrn.clone()
    lrn_.set_space('custom')

    te = CrossValidation(lrn, NFoldPartitioner())
    te_ = CrossValidation(lrn_, NFoldPartitioner())
    nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
    dsname = ('uni%dsmall' % nclasses,
              'sin_modulated')[int(lrn.__is_regression__)]
    ds = datasets[dsname]
    ds_ = ds.copy()
    ds_.sa['custom'] = ds_.sa['targets']
    ds_.sa.pop('targets')
    self.assertTrue('targets' in ds.sa,
                    msg="'targets' should remain in original ds")

    try:
        cve = te(ds)
        cve_ = te_(ds_)
    except Exception as e:
        self.fail("Failed with %r" % e)
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC          # fast one to use for tests
    from mvpa2.measures.base import CrossValidation

    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    # Let's simulate the beast -- 6 categories total, grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,    # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]       # 3 superord categories
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
    ], space='partitions')

    # and then do your normal where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert(len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert(np.mean(accs_regular) > .8)
    assert(np.mean(accs_super) < .6)
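# Counting sketch (illustrative only): each of the 3 superordinate
# categories above contains 2 subordinate ones (L0/L3, L1/L4, L2/L5), and a
# balanced testing partition takes exactly one subordinate from each
# superordinate, hence the 2**3 == 8 surviving splits asserted above.
def _sketch_sifted_split_count():
    from itertools import product
    pairs = [('L0', 'L3'), ('L1', 'L4'), ('L2', 'L5')]
    assert len(list(product(*pairs))) == 2 ** 3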
def test_gnbsearchlight_matchaccuracy(self):
    # was not able to deal with custom errorfx collapsing samples
    # after 55e147e0bd30fbf4edede3faef3a15c6c65b33ea
    ds = datasets['3dmedium'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace
    sl_err = sphere_gnbsearchlight(GNB(), NFoldPartitioner(cvtype=1),
                                   radius=0)
    sl_acc = sphere_gnbsearchlight(GNB(), NFoldPartitioner(cvtype=1),
                                   radius=0,
                                   errorfx=mean_match_accuracy)
    assert_array_almost_equal(sl_err(ds), 1.0 - sl_acc(ds).samples)
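# Identity sketch (illustrative only): the assertion above rests on mismatch
# error and match accuracy being complementary, i.e. err == 1 - acc.
def _sketch_error_accuracy_identity():
    p = np.array([0, 1, 1, 0])
    t = np.array([0, 1, 0, 0])
    assert np.mean(p != t) == 1.0 - np.mean(p == t)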
def test_split_clf_on_chainpartitioner(self):
    # pretty much a smoke test for #156
    ds = datasets['uni2small']
    part = ChainNode([NFoldPartitioner(cvtype=1),
                      Balancer(attr='targets', count=2,
                               limit='partitions', apply_selection=True)])
    partitions = list(part.generate(ds))
    sclf = SplitClassifier(sample_clf_lin, part,
                           enable_ca=['stats', 'splits'])
    sclf.train(ds)
    pred = sclf.predict(ds)
    assert_equal(len(pred), len(ds))  # rudimentary check
    assert_equal(len(sclf.ca.splits), len(partitions))
    assert_equal(len(sclf.clfs), len(partitions))

    # now let's do sensitivity analyzer just in case
    sclf.untrain()
    sensana = sclf.get_sensitivity_analyzer()
    sens = sensana(ds)
    # basic check that sensitivities varied across splits
    from mvpa2.mappers.fx import FxMapper
    sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
    assert_true(np.any(sens_stds != 0))
def test_split_classifier_extended(self, clf_):
    clf2 = clf_.clone()
    ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
    clf = SplitClassifier(clf=clf_,  # SameSignClassifier(),
                          enable_ca=['stats', 'feature_ids'])
    clf.train(ds)                    # train the beast
    error = clf.ca.stats.error

    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()

    if 'non-deterministic' not in clf.__tags__:
        self.assertTrue(
            abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(error < 0.25,
                        msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(len(clf.clfs), len(ds.UC),
                     msg="Should have number of classifiers equal # of epochs")
def test_split_classifier(self):
    ds = self.data_bin_1
    clf = SplitClassifier(clf=SameSignClassifier(),
                          enable_ca=['stats', 'training_stats',
                                     'feature_ids'])
    clf.train(ds)                    # train the beast
    error = clf.ca.stats.error
    tr_error = clf.ca.training_stats.error

    clf2 = clf.clone()
    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds)
    cverror = cverror.samples.squeeze()
    tr_cverror = cv.ca.training_stats.error

    self.assertEqual(
        error, cverror,
        msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

    self.assertEqual(
        tr_error, tr_cverror,
        msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

    self.assertEqual(clf.ca.stats.percent_correct, 100,
                     msg="Dummy clf should train perfectly")
    # CV and SplitClassifier should get the same confusion matrices
    assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(len(clf.clfs), len(ds.UC),
                     msg="Should have number of classifiers equal # of epochs")
    self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                     msg="Should classify correctly")

    # feature_ids must be list of lists, and since it is not
    # feature-selecting classifier used - we expect all features
    # to be utilized
    #  NOT ANYMORE -- for BoostedClassifier we have now union of all
    #  used features across slave classifiers. That makes
    #  semantics clear. If you need to get deeper -- use upcoming
    #  harvesting facility ;-)
    # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
    # self.assertTrue(np.array([len(ids)==ds.nfeatures
    #                           for ids in clf.feature_ids]).all())

    # Just check if we get it at all ;-)
    summary = clf.summary()
def test_classifier_generalization(self, clf):
    """Simple test if classifiers can generalize ok on simple data
    """
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
    # check the default
    # self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

    ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
    try:
        cve = te(ds).samples.squeeze()
    except Exception as e:
        self.fail("Failed with %s" % e)

    if cfg.getboolean('tests', 'labile', default='yes'):
        if nclasses > 2 and \
           ((clf.descr is not None and 'on 5%(' in clf.descr)
            or 'regression_based' in clf.__tags__):
            # skip those since they are barely applicable/testable here
            raise SkipTest("Skip testing of cve on %s" % clf)

        self.assertTrue(cve < 0.25,  # TODO: use multinom distribution
                        msg="Got transfer error %g on %s with %d labels"
                            % (cve, ds, len(ds.UT)))
def test_exclude_targets_combinations_subjectchunks():
    partitioner = ChainNode([NFoldPartitioner(attr='subjects'),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=1,
                                 targets_attr='chunks',
                                 space='partitions')],
                            space='partitions')
    # targets do not need even to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so implicit test here
        # if not -- would be error
        testing_subj = np.asscalar(np.unique(ds_.sa.subjects[testing_partition]))
        testing_subjs.append(testing_subj)
        testing_chunk = np.asscalar(np.unique(ds_.sa.chunks[testing_partition]))
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(testing_subj not in ds_.sa.subjects[training_partition])
        ok_(testing_chunk not in ds_.sa.chunks[training_partition])
    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3)))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3, 3))))))
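# Equivalent formulation sketch (illustrative only): on Pythons newer than
# 2.5 the exhaustive set of (subject, chunk) pairs checked above can be
# spelled directly with itertools.product instead of the np.where trick.
def _sketch_product_equivalence():
    import itertools
    assert set(itertools.product(range(3), range(3))) == \
        set(zip(*np.where(np.ones((3, 3)))))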
def test_permute_superord():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter
    from mvpa2.generators.permutation import AttributePermutator

    ds = _get_superord_dataset()
    # mvpa2.seed(1)
    part = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique),
                         attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord',
                 {'uvalues': ds.sa['superord'].unique,
                  'balanced': True})]),
        AttributePermutator(['superord'], limit=['partitions', 'chunks']),
    ], space='partitions')

    for ds_perm in part.generate(ds):
        # it does permutation
        assert(np.sum(ds_perm.sa.superord != ds.sa.superord) != 0)
def test_splitclf_sensitivities():
    datasets = [normal_feature_dataset(perlabel=100, nlabels=2,
                                       nfeatures=4,
                                       nonbogus_features=[0, i + 1],
                                       snr=1, nchunks=2)
                for i in xrange(2)]

    sclf = SplitClassifier(SMLR(), NFoldPartitioner())
    analyzer = sclf.get_sensitivity_analyzer()

    senses1 = analyzer(datasets[0])
    senses2 = analyzer(datasets[1])

    for senses in senses1, senses2:
        # This should be False when comparing two folds
        assert_false(np.allclose(senses.samples[0], senses.samples[2]))
        assert_false(np.allclose(senses.samples[1], senses.samples[3]))

    # Moreover with new data we should have got different results
    # (i.e. it must have retrained correctly)
    for s1, s2 in zip(senses1, senses2):
        assert_false(np.allclose(s1, s2))

    # and we should have "selected" "correct" voxels
    for i, senses in enumerate((senses1, senses2)):
        assert_equal(set(np.argsort(np.max(np.abs(senses), axis=0))[-2:]),
                     set((0, i + 1)))
def test_unpartitioned_cv(self):
    data = get_mv_pattern(10)
    # only one big chunk
    data.sa.chunks[:] = 1
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # must fail, because the data cannot be split into training and testing
    assert_raises(ValueError, cv, data)
def _test_mcasey20120222():  # pragma: no cover
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    mapper = ChainMapper([mean_group_sample(['targets', 'chunks']),
                          SVDMapper()])
    clf = MappedClassifier(LinearCSVMC(), mapper)
    cvte = CrossValidation(clf, NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])

    ds = dataset_wizard(samples=np.arange(32).reshape((8, -1)),
                        targets=[1, 1, 2, 2, 1, 1, 2, 2],
                        chunks=[1, 1, 1, 1, 2, 2, 2, 2])

    errors = cvte(ds)
def test_split_featurewise_dataset_measure(self):
    ds = datasets['uni3small']
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        ChainNode([NFoldPartitioner(),
                   Splitter('partitions', attr_values=[1])]))

    sens = sana(ds)
    # a sensitivity for each chunk and each label combination
    assert_equal(sens.shape,
                 (len(ds.sa['chunks'].unique)
                  * len(ds.sa['targets'].unique),
                  ds.nfeatures))

    # Lets try more complex example with 'boosting'
    ds = datasets['uni3medium']
    ds.init_origids('samples')
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        Balancer(amount=0.25, count=2, apply_selection=True),
        enable_ca=['datasets', 'repetition_results'])
    sens = sana(ds)

    assert_equal(sens.shape,
                 (2 * len(ds.sa['targets'].unique), ds.nfeatures))
    splits = sana.ca.datasets
    self.assertEqual(len(splits), 2)
    self.assertTrue(np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
    # should have used different samples
    self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
    # and should have got different sensitivities
    self.assertTrue(np.any(sens[0] != sens[3]))
def _test_edmund_chong_20120907():  # pragma: no cover
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets

    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1,  # for real data > 1
                                      limit='partitions',
                                      apply_selection=True)],
                            space='partitions')

    clf = LinearCSVMC()  # choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator],
                  space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])
def test_multiclass_without_combiner():
    # The goal is to obtain all pairwise results as the resultant dataset
    # avoiding even calling any combiner
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(clf, combiner=None)

    # without combining results at all
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_equal(len(res), len(ds))
    assert_equal(res.nfeatures, 3)  # 3 pairs for 3 classes
    assert_array_equal(res.UT, ds.UT)
    assert_array_equal(np.unique(np.array(res.fa.targets.tolist())), ds.UT)
    # TODO -- check that we have all the pairs?
    assert_array_equal(res.sa['cvfolds'].unique, np.arange(len(ds.UC)))
    if mcv.ca.is_enabled('training_stats'):
        # we must have received a dictionary per each pair
        training_stats = mcv.ca.training_stats
        assert_equal(set(training_stats.keys()),
                     set([('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')]))
        for pair, cm in training_stats.iteritems():
            assert_array_equal(cm.labels, ds.UT)
            # we should have no predictions for absent label
            assert_array_equal(cm.matrix[~np.in1d(ds.UT, pair)], 0)
            # while altogether all samples were processed once
            assert_array_equal(cm.stats['P'], len(ds))
            # and number of sets should be equal number of chunks here
            assert_equal(len(cm.sets), len(ds.UC))
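# Counting sketch (illustrative only): "3 pairs for 3 classes" above is
# C(3, 2) pairwise sub-problems built by MulticlassClassifier.
def _sketch_pairwise_count():
    from itertools import combinations
    assert len(list(combinations(['L0', 'L1', 'L2'], 2))) == 3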
def test_multiclass_classifier_cv(clf, ds):
    # Extending test_clf.py:ClassifiersTests.test_multiclass_classifier
    # Compare performance with our MaximalVote to the one done natively
    # by e.g. LIBSVM
    clf = clf.clone()
    clf.params.C = 1  # so it doesn't auto-adjust
    mclf = MulticlassClassifier(clf=clf.clone())
    part = NFoldPartitioner()
    cv = CrossValidation(clf, part, enable_ca=['stats', 'training_stats'])
    mcv = CrossValidation(mclf, part, enable_ca=['stats', 'training_stats'])

    er = cv(ds)
    mer = mcv(ds)

    # errors should be the same
    assert_array_equal(er, mer)
    assert_equal(str(cv.ca.training_stats), str(mcv.ca.training_stats))
    # if it was a binary task, cv.ca.stats would also have AUC column
    # while mcv would not :-/  TODO
    if len(ds.UT) == 2:
        # so just compare the matrix and ACC
        assert_array_equal(cv.ca.stats.matrix, mcv.ca.stats.matrix)
        assert_equal(cv.ca.stats.stats['ACC'], mcv.ca.stats.stats['ACC'])
    else:
        assert_equal(str(cv.ca.stats), str(mcv.ca.stats))
def test_multiclass_classifier_pass_ds_attributes():
    # TODO: replicate/extend basic testing of pass_attr
    #       in some more "basic" test_*
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(
        clf,
        pass_attr=['ids', 'sa.chunks', 'a.bogus_features',
                   # 'ca.raw_estimates'  # this one is binary_clf x samples
                   # list ATM; that is why raw_predictions_ds was born
                   'ca.raw_predictions_ds',
                   'ca.estimates',  # this one is ok
                   'ca.predictions',
                   ],
        enable_ca=['all'])
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_array_equal(sorted(res.sa.ids), ds.sa.ids)
    assert_array_equal(res.chunks, ds.chunks[res.sa.ids])
    assert_array_equal(res.sa.predictions, res.samples[:, 0])
    assert_array_equal(res.sa.cvfolds,
                       np.repeat(range(len(ds.UC)),
                                 len(ds) / len(ds.UC)))
def __test_matthias_question(self):
    rfe_clf = LinearCSVMC(C=1)

    rfesvm_split = SplitClassifier(rfe_clf)
    clf = FeatureSelectionClassifier(
        clf=LinearCSVMC(C=1),
        feature_selection=RFE(
            sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(
                combiner=first_axis_mean,
                transformer=np.abs),
            transfer_error=ConfusionBasedError(rfesvm_split,
                                               confusion_state="confusion"),
            stopping_criterion=FixedErrorThresholdStopCrit(0.20),
            feature_selector=FractionTailSelector(0.2, mode='discard',
                                                  tail='lower'),
            update_sensitivity=True))

    no_permutations = 1000
    permutator = AttributePermutator('targets', count=no_permutations)
    cv = CrossValidation(clf, NFoldPartitioner(),
                         null_dist=MCNullDist(permutator, tail='left'),
                         enable_ca=['stats'])
    error = cv(datasets['uni2small'])
    self.assertTrue(error < 0.4)
    self.assertTrue(cv.ca.null_prob < 0.05)
def test_exclude_targets_combinations():
    partitioner = ChainNode([NFoldPartitioner(),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=2,
                                 targets_attr='targets',
                                 space='partitions')],
                            space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0., nlabels=4, perlabel=3, nchunks=3,
                                nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
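# Counting sketch (illustrative only): the 6 above is C(4, 2) -- all ways
# ExcludeTargetsCombinationsPartitioner(k=2) can hold out 2 of the 4 labels,
# which with 3 chunks yields the 3 * 6 partitions checked earlier.
def _sketch_holdout_combinations():
    from itertools import combinations
    assert len(list(combinations(range(4), 2))) == 6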
def test_cached_qe_gnbsearchlight(self):
    ds1 = datasets['3dsmall'].copy(deep=True)
    qe = IndexQueryEngine(myspace=Sphere(2))
    cached_qe = CachedQueryEngine(qe)
    gnb_sl = GNBSearchlight(GNB(), NFoldPartitioner(), qe=cached_qe)
    res = gnb_sl(ds1)
    assert_false(cached_qe.ids is None)
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # check function appropriate for the given numpy version
    _a = np.arange(5)
    __a = _a[:4][:3]
    if __a.base is _a:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif __a.base.base is _a:
        # prior to 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))
    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # we get no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(s[1].samples.base.base is self.data.samples)
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(s[0].samples.base.base is step_ds.samples)
        assert_true(s[1].samples.base.base is step_ds.samples)
def test_partial_searchlight_with_full_report(self):
    ds = self.dataset.copy()
    center_ids = np.zeros(ds.nfeatures, dtype='bool')
    center_ids[[3, 50]] = True
    ds.fa['center_ids'] = center_ids
    # compute N-1 cross-validation for each sphere
    cv = CrossValidation(GNB(), NFoldPartitioner())
    # construct diameter 1 (or just radius 0) searchlight
    # one time give center ids as a list, the other one takes it from the
    # dataset itself
    sls = (sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
           sphere_searchlight(None, radius=0, center_ids=[3, 50]),
           sphere_searchlight(cv, radius=0, center_ids='center_ids'),
           )
    for sl in sls:
        # assure that we could set cv post constructor
        if sl.datameasure is None:
            sl.datameasure = cv
        # run searchlight
        results = sl(ds)
        # only two spheres but error for all CV-folds
        self.assertEqual(results.shape, (len(self.dataset.UC), 2))
        # Test if results hold if we "set" a "new" datameasure
        sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
        results2 = sl(ds)
        assert_array_almost_equal(results, results2)

    # test if we graciously puke if center_ids are out of bounds
    dataset0 = ds[:, :50]  # so we have no 50th feature
    self.assertRaises(IndexError, sls[0], dataset0)
    # but it should be fine on the one that gets the ids from the dataset
    # itself
    results = sl(dataset0)
    assert_equal(results.nfeatures, 1)

    # check whether roi_seeds are correct
    sl = sphere_searchlight(lambda x: np.vstack((x.fa.roi_seed, x.samples)),
                            radius=1, add_center_fa=True, center_ids=[12])
    res = sl(ds)
    assert_array_equal(res.samples[1:, res.samples[0].astype('bool')].squeeze(),
                       ds.samples[:, 12])
def test_incorrect_parameter_error(self):
    # Just a sample class
    from mvpa2.generators.partition import NFoldPartitioner
    try:
        spl = NFoldPartitioner(1, incorrect=None)
        raise AssertionError("Must have failed with an exception here "
                             "due to incorrect parameter")
    except Exception as e:
        estr = str(e)
def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):
    conf = read_configuration(path, conf_file, type)

    for arg in kwargs:
        conf[arg] = kwargs[arg]
        if arg == 'radius':
            radius = kwargs[arg]

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []

    for ds in ds_merged:
        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')
        sl_map = sl(ds)
        sl_map.samples *= -1
        sl_map.samples += 1

        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)
        maps.append(nif)

    datetime = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = datetime + '_' + analysis + '_' + mask + '_' + task
    command = 'mkdir ' + os.path.join(path, '0_results', new_dir)
    os.system(command)

    parent_dir = os.path.join(path, '0_results', new_dir)

    for s, map in zip(subjects, maps):
        name = s
        command = 'mkdir ' + os.path.join(parent_dir, name)
        os.system(command)
        results_dir = os.path.join(parent_dir, name)
        fname = name + '_radius_' + str(radius) + '_searchlight_map.nii.gz'
        map.to_filename(os.path.join(results_dir, fname))

    return maps
def test_cached_kernel_different_datasets(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    # Inspired by the problem Swaroop ran into
    k = LinearSGKernel(normalizer_cls=False)
    k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
    ck = CachedKernel(k_)

    clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
    clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

    cvte = CrossValidation(clf, NFoldPartitioner())
    cvte_ = CrossValidation(clf_, NFoldPartitioner())

    postproc = BinaryFxNode(mean_mismatch_error, 'targets')
    te = ProxyMeasure(clf, postproc=postproc)
    te_ = ProxyMeasure(clf_, postproc=postproc)

    for r in xrange(2):
        ds1 = datasets['uni2medium']
        errs1 = cvte(ds1)
        ck.compute(ds1)
        ok_(ck._recomputed)
        errs1_ = cvte_(ds1)
        ok_(~ck._recomputed)
        assert_array_equal(errs1, errs1_)

        ds2 = datasets['uni3small']
        errs2 = cvte(ds2)
        ck.compute(ds2)
        ok_(ck._recomputed)
        errs2_ = cvte_(ds2)
        ok_(~ck._recomputed)
        assert_array_equal(errs2, errs2_)

        ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
        te.train(datasets['uni3small'][::2])
        terr = np.asscalar(te(datasets['uni3small'][ssel]))
        te_.train(datasets['uni3small'][::2])
        terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
        ok_(~ck._recomputed)
        ok_(terr == terr_)