def test_multiclass_classifier_cv(clf, ds):
    # Extending test_clf.py:ClassifiersTests.test_multiclass_classifier
    # Compare performance with our MaximalVote to the one done natively
    # by e.g. LIBSVM
    clf = clf.clone()
    clf.params.C = 1  # so it doesn't auto-adjust
    mclf = MulticlassClassifier(clf=clf.clone())
    part = NFoldPartitioner()
    cv = CrossValidation(clf, part, enable_ca=['stats', 'training_stats'])
    mcv = CrossValidation(mclf, part, enable_ca=['stats', 'training_stats'])
    er = cv(ds)
    mer = mcv(ds)
    # errors should be the same
    assert_array_equal(er, mer)
    assert_equal(str(cv.ca.training_stats), str(mcv.ca.training_stats))
    # if it was a binary task, cv.ca.stats would also have an AUC column
    # while mcv would not :-/  TODO
    if len(ds.UT) == 2:
        # so just compare the matrix and ACC
        assert_array_equal(cv.ca.stats.matrix, mcv.ca.stats.matrix)
        assert_equal(cv.ca.stats.stats['ACC'], mcv.ca.stats.stats['ACC'])
    else:
        assert_equal(str(cv.ca.stats), str(mcv.ca.stats))

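# A minimal sketch of the pattern exercised above (illustrative, not part of
# the original suite): wrap a binary SVM into MulticlassClassifier's pairwise
# voting scheme and cross-validate it on one of the testing datasets.
def _sketch_multiclass_cv():
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MulticlassClassifier
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.testing.datasets import datasets

    mclf = MulticlassClassifier(clf=LinearCSVMC(C=1))
    cv = CrossValidation(mclf, NFoldPartitioner(), enable_ca=['stats'])
    err = cv(datasets['uni3small'])   # one error value per fold
    return cv.ca.stats.stats['ACC']   # overall accuracy across folds
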
def _test_edmund_chong_20120907():  # pragma: no cover
    # commented out to avoid syntax warnings while compiling
    # from mvpa2.suite import *
    from mvpa2.testing.datasets import datasets
    repeater = Repeater(count=20)

    partitioner = ChainNode([NFoldPartitioner(cvtype=1),
                             Balancer(attr='targets',
                                      count=1,  # for real data > 1
                                      limit='partitions',
                                      apply_selection=True)],
                            space='partitions')

    clf = LinearCSVMC()  # choice of classifier
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    null_cv = CrossValidation(
        clf,
        ChainNode([partitioner, permutator], space=partitioner.get_space()),
        errorfx=mean_mismatch_error)
    distr_est = MCNullDist(repeater, tail='left', measure=null_cv,
                           enable_ca=['dist_samples'])
    cvte = CrossValidation(clf, partitioner,
                           errorfx=mean_mismatch_error,
                           null_dist=distr_est,
                           enable_ca=['stats'])
    errors = cvte(datasets['uni2small'])

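# Illustrative sketch (not in the original): once ``cvte`` above has been
# applied, the permutation-based significance and the raw null samples can be
# read from its conditional attributes.
def _sketch_null_dist_query(cvte):
    p = np.asscalar(cvte.ca.null_prob)            # p-value of observed error
    null_errors = cvte.null_dist.ca.dist_samples  # enabled via 'dist_samples'
    return p, null_errors
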
def test_custom_targets(self, lrn):
    """Simple test if a learner could cope with custom sa not targets
    """
    # Since we are comparing performances of two learners, we need
    # to assure that if they depend on some random seed -- they
    # would use the same value.  Currently we have such stochastic
    # behavior in SMLR
    if 'seed' in lrn.params:
        from mvpa2 import _random_seed
        lrn = lrn.clone()               # clone the beast
        lrn.params.seed = _random_seed  # reuse the same seed
    lrn_ = lrn.clone()
    lrn_.set_space('custom')

    te = CrossValidation(lrn, NFoldPartitioner())
    te_ = CrossValidation(lrn_, NFoldPartitioner())
    nclasses = 2 * (1 + int('multiclass' in lrn.__tags__))
    dsname = ('uni%dsmall' % nclasses,
              'sin_modulated')[int(lrn.__is_regression__)]
    ds = datasets[dsname]
    ds_ = ds.copy()
    ds_.sa['custom'] = ds_.sa['targets']
    ds_.sa.pop('targets')
    self.assertTrue('targets' in ds.sa,
                    msg="'targets' should remain in original ds")
    try:
        cve = te(ds)
        cve_ = te_(ds_)
    except Exception as e:
        self.fail("Failed with %r" % e)

def test_chained_crossvalidation_searchlight():
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.base import Mapper
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.testing.datasets import datasets

    dataset = datasets['3dlarge'].copy()
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()  # fast and deterministic

    class ZScoreFeaturesMapper(Mapper):
        """Very basic mapper which would take care about standardizing
        all features within each sample separately
        """
        def _forward_data(self, data):
            return (data - np.mean(data, axis=1)[:, None]) \
                / np.std(data, axis=1)[:, None]

    # only do partial to save time
    sl_kwargs = dict(radius=2, center_ids=[3, 50])
    clf_mapped = MappedClassifier(sample_clf, ZScoreFeaturesMapper())
    cv = CrossValidation(clf_mapped, NFoldPartitioner())
    sl = sphere_searchlight(cv, **sl_kwargs)
    results_mapped = sl(dataset)

    cv_chained = ChainMapper([ZScoreFeaturesMapper(auto_train=True),
                              CrossValidation(sample_clf,
                                              NFoldPartitioner())])
    sl_chained = sphere_searchlight(cv_chained, **sl_kwargs)
    results_chained = sl_chained(dataset)

    assert_array_equal(results_mapped, results_chained)

def test_noise_classification(self):
    # get a dataset with a very high SNR
    data = get_mv_pattern(10)
    # do crossval with default errorfx and 'mean' combiner
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # must return a scalar value
    result = cv(data)
    # must be perfect
    self.assertTrue((result.samples < 0.05).all())

    # do crossval with permuted regressors
    cv = CrossValidation(
        sample_clf_nl,
        ChainNode([NFoldPartitioner(),
                   AttributePermutator('targets', count=10)],
                  space='partitions'))
    results = cv(data)

    # results must not be the same
    self.assertTrue(len(np.unique(results.samples)) > 1)

    # must be at chance level
    pmean = np.array(results).mean()
    self.assertTrue(pmean < 0.58 and pmean > 0.42)

def test_searchlight_errors_per_trial():
    # To make sure that searchlight can return error/accuracy per trial
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.partition import OddEvenPartitioner
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.testing.datasets import datasets
    from mvpa2.misc.errorfx import prediction_target_matches

    dataset = datasets['3dsmall'].copy()
    # randomly permute samples so we break any random correspondence
    # to strengthen tests below
    sample_idx = np.arange(len(dataset))
    dataset = dataset[np.random.permutation(sample_idx)]

    dataset.sa.targets = ['L%d' % l for l in dataset.sa.targets]
    dataset.fa['voxel_indices'] = dataset.fa.myspace
    sample_clf = GNB()  # fast and deterministic

    part = OddEvenPartitioner()
    # only do partial to save time
    cv = CrossValidation(sample_clf, part,
                         errorfx=None)  # prediction_target_matches
    # Just to compare error
    cv_error = CrossValidation(sample_clf, part)

    # Large searchlight radius so we get the entire ROI, 2 centers just to
    # make sure that all stacking works correctly
    sl = sphere_searchlight(cv, radius=10, center_ids=[0, 1])
    results = sl(dataset)

    sl_gnb = sphere_gnbsearchlight(sample_clf, part, radius=10,
                                   errorfx=None, center_ids=[0, 1])
    results_gnbsl = sl_gnb(dataset)

    # inspect both results
    # verify that partitioning was done correctly
    partitions = list(part.generate(dataset))
    for res in (results, results_gnbsl):
        assert('targets' in res.sa.keys())  # should carry targets
        assert('cvfolds' in res.sa.keys())  # should carry cvfolds
        for ipart in xrange(len(partitions)):
            assert_array_equal(
                dataset[partitions[ipart].sa.partitions == 2].targets,
                res.sa.targets[res.sa.cvfolds == ipart])

    assert_datasets_equal(results, results_gnbsl)

    # one "accuracy" per each trial
    assert_equal(results.shape, (len(dataset), 2))
    # with accuracies the same in both searchlights since the same
    # features were to be selected in both cases due to the large radii
    errors_dataset = cv(dataset)
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 0])
    assert_array_equal(errors_dataset.samples[:, 0], results.samples[:, 1])
    # and error matching (up to precision) the one we get if we run with
    # the default error function
    assert_array_almost_equal(
        np.mean(results.targets[:, None] != results.samples, axis=0)[0],
        np.mean(cv_error(dataset)))

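# Illustrative sketch (not in the original): with ``errorfx=None`` the
# searchlight result keeps one prediction per testing sample and center, so
# per-trial correctness across centers can be derived afterwards.
def _sketch_per_trial_accuracy(sl, dataset):
    res = sl(dataset)  # one prediction per trial per searchlight center
    correct = res.samples == np.asanyarray(res.sa.targets)[:, None]
    return correct.mean(axis=1)  # fraction of centers correct per trial
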
def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate ones, and actually without any 'superordinate' effect
    # since the subordinate categories are independent
    ds = normal_feature_dataset(nlabels=6,
                                snr=100,  # pure signal! ;)
                                perlabel=30,
                                nfeatures=6,
                                nonbogus_features=range(6),
                                nchunks=5)
    ds.sa['subord'] = ds.sa.targets.copy()
    ds.sa['superord'] = ['super%d' % (int(i[1]) % 3,)
                         for i in ds.targets]  # 3 superord categories
    # let's override original targets just to be sure that we aren't
    # relying on them
    ds.targets[:] = 0

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    # and then do your normal cross-validation where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)

def test_regression_as_classifier(self, regr):
    """Basic tests of metaclass for using regressions as classifiers
    """
    for dsname in 'uni2small', 'uni4small':
        ds = datasets[dsname]

        clf = RegressionAsClassifier(regr, enable_ca=['distances'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])

        error = cv(ds).samples.squeeze()

        nlabels = len(ds.uniquetargets)
        if nlabels == 2 \
           and cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.3,
                            msg="Got error %.2f on %s dataset"
                                % (error, dsname))

        # Check if does not puke on repr and str
        self.assertTrue(str(clf) != "")
        self.assertTrue(repr(clf) != "")

        self.assertEqual(clf.ca.distances.shape,
                         (ds.nsamples / 2, nlabels))

def _test_mcasey20120222():  # pragma: no cover
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    mapper = ChainMapper([mean_group_sample(['targets', 'chunks']),
                          SVDMapper()])
    clf = MappedClassifier(LinearCSVMC(), mapper)
    cvte = CrossValidation(clf, NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])
    ds = dataset_wizard(samples=np.arange(32).reshape((8, -1)),
                        targets=[1, 1, 2, 2, 1, 1, 2, 2],
                        chunks=[1, 1, 1, 1, 2, 2, 2, 2])
    errors = cvte(ds)

def test_unpartitioned_cv(self):
    data = get_mv_pattern(10)
    # only one big chunk
    data.sa.chunks[:] = 1
    cv = CrossValidation(sample_clf_nl, NFoldPartitioner())
    # needs to fail, because the data can't be split into training and
    # testing portions
    assert_raises(ValueError, cv, data)

def test_classifier_generalization(self, clf):
    """Simple test if classifiers can generalize ok on simple data
    """
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
    # check the default
    #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)
    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))
    ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
    try:
        cve = te(ds).samples.squeeze()
    except Exception as e:
        self.fail("Failed with %s" % e)

    if cfg.getboolean('tests', 'labile', default='yes'):
        if nclasses > 2 and \
           ((clf.descr is not None and 'on 5%(' in clf.descr)
            or 'regression_based' in clf.__tags__):
            # skip those since they are barely applicable/testable here
            raise SkipTest("Skip testing of cve on %s" % clf)
        self.assertTrue(cve < 0.25,  # TODO: use multinom distribution
                        msg="Got transfer error %g on %s with %d labels"
                            % (cve, ds, len(ds.UT)))

def test_multiclass_classifier_pass_ds_attributes():
    # TODO: replicate/extend basic testing of pass_attr
    #       in some more "basic" test_*
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(
        clf,
        pass_attr=['ids', 'sa.chunks', 'a.bogus_features',
                   # 'ca.raw_estimates',  # this one is a binary_clf x samples
                   #                      # list ATM; that is why
                   #                      # raw_predictions_ds was born
                   'ca.raw_predictions_ds',
                   'ca.estimates',  # this one is ok
                   'ca.predictions',
                   ],
        enable_ca=['all'])
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_array_equal(sorted(res.sa.ids), ds.sa.ids)
    assert_array_equal(res.chunks, ds.chunks[res.sa.ids])
    assert_array_equal(res.sa.predictions, res.samples[:, 0])
    assert_array_equal(res.sa.cvfolds,
                       np.repeat(range(len(ds.UC)),
                                 len(ds) / len(ds.UC)))

def test_cv_no_generator_custom_splitter(self):
    ds = Dataset(np.arange(4),
                 sa={'category': ['to', 'to', 'from', 'from'],
                     'targets': ['a', 'b', 'c', 'd']})

    class Measure(Classifier):
        def _train(self, ds_):
            assert_array_equal(ds_.samples, ds.samples[2:])
            assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

        def _predict(self, ds_):
            # we pass a shallow copy
            assert (ds_ is not ds)
            # could be called to predict training or testing data
            if np.all(ds_.sa.targets != ['c', 'd']):
                assert_array_equal(ds_.samples, ds.samples[:2])
                assert_array_equal(ds_.sa.category, ['to'] * len(ds_))
            else:
                assert_array_equal(ds_.sa.category, ['from'] * len(ds_))
            return ['c', 'd']

    measure = Measure()
    cv = CrossValidation(measure,
                         splitter=Splitter('category', ['from', 'to']))
    res = cv(ds)
    assert_array_equal(res, [[1]])  # failed perfectly ;-)

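# Illustrative sketch (not in the original) of the Splitter semantics the
# test above relies on, assuming Splitter yields one subset per listed
# attribute value: training samples come from category 'from', testing
# samples from category 'to'.
def _sketch_custom_splitter_semantics():
    ds = Dataset(np.arange(4),
                 sa={'category': ['to', 'to', 'from', 'from'],
                     'targets': ['a', 'b', 'c', 'd']})
    splits = list(Splitter('category', ['from', 'to']).generate(ds))
    assert_array_equal(splits[0].sa.category, ['from'] * 2)  # training part
    assert_array_equal(splits[1].sa.category, ['to'] * 2)    # testing part
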
def test_multiclass_without_combiner():
    # The goal is to obtain all pairwise results as the resultant dataset
    # avoiding even calling any combiner
    clf = LinearCSVMC(C=1)
    ds = datasets['uni3small'].copy()
    ds.sa['ids'] = np.arange(len(ds))
    mclf = MulticlassClassifier(clf, combiner=None)

    # without combining results at all
    mcv = CrossValidation(mclf, NFoldPartitioner(), errorfx=None)
    res = mcv(ds)
    assert_equal(len(res), len(ds))
    assert_equal(res.nfeatures, 3)  # 3 pairs for 3 classes
    assert_array_equal(res.UT, ds.UT)
    assert_array_equal(np.unique(np.array(res.fa.targets.tolist())), ds.UT)
    # TODO -- check that we have all the pairs?
    assert_array_equal(res.sa['cvfolds'].unique, np.arange(len(ds.UC)))
    if mcv.ca.is_enabled('training_stats'):
        # we must have received a dictionary per each pair
        training_stats = mcv.ca.training_stats
        assert_equal(set(training_stats.keys()),
                     set([('L0', 'L1'), ('L0', 'L2'), ('L1', 'L2')]))
        for pair, cm in training_stats.iteritems():
            assert_array_equal(cm.labels, ds.UT)
            # we should have no predictions for the absent label
            assert_array_equal(cm.matrix[~np.in1d(ds.UT, pair)], 0)
            # while altogether all samples were processed once
            assert_array_equal(cm.stats['P'], len(ds))
            # and the number of sets should equal the number of chunks here
            assert_equal(len(cm.sets), len(ds.UC))

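# Illustrative sketch (not in the original): with ``combiner=None`` each
# feature (column) of ``res`` is the output of one pairwise binary
# classifier, which only ever predicts labels from its own pair.
def _sketch_pairwise_columns(res):
    for ipair, pair in enumerate(res.fa.targets):
        pair_predictions = res.samples[:, ipair]
        assert set(pair_predictions).issubset(set(pair))
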
def test_split_classifier_extended(self, clf_):
    clf2 = clf_.clone()
    ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
    clf = SplitClassifier(clf=clf_,  # SameSignClassifier(),
                          enable_ca=['stats', 'feature_ids'])
    clf.train(ds)  # train the beast
    error = clf.ca.stats.error

    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()

    if 'non-deterministic' not in clf.__tags__:
        self.assertTrue(
            abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(error < 0.25,
                        msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(len(clf.clfs), len(ds.UC),
                     msg="Should have number of classifiers equal # of epochs")

def test_split_classifier(self):
    ds = self.data_bin_1
    clf = SplitClassifier(clf=SameSignClassifier(),
                          enable_ca=['stats', 'training_stats',
                                     'feature_ids'])
    clf.train(ds)  # train the beast
    error = clf.ca.stats.error
    tr_error = clf.ca.training_stats.error

    clf2 = clf.clone()
    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds)
    cverror = cverror.samples.squeeze()
    tr_cverror = cv.ca.training_stats.error

    self.assertEqual(
        error, cverror,
        msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

    self.assertEqual(
        tr_error, tr_cverror,
        msg="We should get the same training error using split classifier"
            " as using CrossValidation. Got %s and %s"
            % (tr_error, tr_cverror))

    self.assertEqual(clf.ca.stats.percent_correct, 100,
                     msg="Dummy clf should train perfectly")
    # CV and SplitClassifier should get the same confusion matrices
    assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(len(clf.clfs), len(ds.UC),
                     msg="Should have number of classifiers equal # of epochs")
    self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                     msg="Should classify correctly")

    # feature_ids must be list of lists, and since it is not a
    # feature-selecting classifier used -- we expect all features
    # to be utilized
    #  NOT ANYMORE -- for BoostedClassifier we have now union of all
    #  used features across slave classifiers.  That makes
    #  semantics clear.  If you need to get deeper -- use upcoming
    #  harvesting facility ;-)
    # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
    # self.assertTrue(np.array([len(ids) == ds.nfeatures
    #                           for ids in clf.feature_ids]).all())

    # Just check if we get it at all ;-)
    summary = clf.summary()

def __test_matthias_question(self):
    rfe_clf = LinearCSVMC(C=1)

    rfesvm_split = SplitClassifier(rfe_clf)
    clf = \
        FeatureSelectionClassifier(
            clf=LinearCSVMC(C=1),
            feature_selection=RFE(
                sensitivity_analyzer=rfesvm_split.get_sensitivity_analyzer(
                    combiner=first_axis_mean,
                    transformer=np.abs),
                transfer_error=ConfusionBasedError(
                    rfesvm_split,
                    confusion_state="confusion"),
                stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                feature_selector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                update_sensitivity=True))

    no_permutations = 1000
    permutator = AttributePermutator('targets', count=no_permutations)
    cv = CrossValidation(clf, NFoldPartitioner(),
                         null_dist=MCNullDist(permutator, tail='left'),
                         enable_ca=['stats'])
    error = cv(datasets['uni2small'])
    self.assertTrue(error < 0.4)
    self.assertTrue(cv.ca.null_prob < 0.05)

def test_partial_searchlight_with_full_report(self):
    ds = self.dataset.copy()
    center_ids = np.zeros(ds.nfeatures, dtype='bool')
    center_ids[[3, 50]] = True
    ds.fa['center_ids'] = center_ids
    # compute N-1 cross-validation for each sphere
    cv = CrossValidation(GNB(), NFoldPartitioner())
    # construct diameter 1 (or just radius 0) searchlight
    # one time give center ids as a list, the other one takes it from the
    # dataset itself
    sls = (sphere_searchlight(cv, radius=0, center_ids=[3, 50]),
           sphere_searchlight(None, radius=0, center_ids=[3, 50]),
           sphere_searchlight(cv, radius=0, center_ids='center_ids'),
           )
    for sl in sls:
        # assure that we could set cv post constructor
        if sl.datameasure is None:
            sl.datameasure = cv
        # run searchlight
        results = sl(ds)
        # only two spheres but error for all CV-folds
        self.assertEqual(results.shape, (len(self.dataset.UC), 2))
        # Test if results hold if we "set" a "new" datameasure
        sl.datameasure = CrossValidation(GNB(), NFoldPartitioner())
        results2 = sl(ds)
        assert_array_almost_equal(results, results2)

    # test if we graciously puke if center_ids are out of bounds
    dataset0 = ds[:, :50]  # so we have no 50th feature
    self.assertRaises(IndexError, sls[0], dataset0)
    # but it should be fine on the one that gets the ids from the dataset
    # itself
    results = sl(dataset0)
    assert_equal(results.nfeatures, 1)

    # check whether roi_seeds are correct
    sl = sphere_searchlight(lambda x: np.vstack((x.fa.roi_seed, x.samples)),
                            radius=1, add_center_fa=True, center_ids=[12])
    res = sl(ds)
    assert_array_equal(
        res.samples[1:, res.samples[0].astype('bool')].squeeze(),
        ds.samples[:, 12])

def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets', count=num_perm,
                                     limit='chunks')
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(l_clf, Repeater(count=2),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'),
                           null_dist=MCNullDist(permutator, tail='left'))

    # check reasonable error range
    err = terr(train)
    self.assertTrue(np.mean(err) < 0.4)

    # Let's do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.assertTrue(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

    # we should be able to access the actual samples of the distribution
    # yoh: why it is 3D really?
    # mih: because these are the distribution samples for the ONE error
    #      collapsed into ONE value across all folds. It will also be
    #      3d if the return value of the measure isn't a scalar and it is
    #      not collapsed across folds. it simply corresponds to the shape
    #      of the output dataset of the respective measure (+1 axis)
    # Some permutations could have been skipped since classifier failed
    # to train due to degenerate situation etc, thus accounting for them
    self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                     num_perm - cvte.null_dist.ca.skipped)

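# Illustrative sketch (not in the original) of the shape logic discussed in
# the comments above: ``dist_samples`` has the shape of the measure's output
# dataset plus one axis for the permutations actually completed.  With
# ``postproc=mean_sample()`` that output is a single value, hence (1, 1, N).
def _sketch_dist_samples_shape(cvte, num_perm):
    dist = cvte.null_dist.ca.dist_samples
    assert dist.shape[:2] == (1, 1)
    assert dist.shape[2] == num_perm - cvte.null_dist.ca.skipped
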
def test_nblocks(self):
    skip_if_no_external('pprocess')
    # just a basic test to see that we are getting the same
    # results with different nblocks
    ds = datasets['3dsmall'].copy(deep=True)[:, :13]
    ds.fa['voxel_indices'] = ds.fa.myspace
    cv = CrossValidation(GNB(), OddEvenPartitioner())
    res1 = sphere_searchlight(cv, radius=1, nproc=2)(ds)
    res2 = sphere_searchlight(cv, radius=1, nproc=2, nblocks=5)(ds)
    assert_array_equal(res1, res2)

def test_searchlight_cross_decoding(path, subjects, conf_file, type, **kwargs):
    conf = read_configuration(path, conf_file, type)

    for arg in kwargs:
        conf[arg] = kwargs[arg]
        if arg == 'radius':
            radius = kwargs[arg]

    debug.active += ["SLC"]

    ds_merged = get_merged_ds(path, subjects, conf_file, type, **kwargs)

    clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    cv = CrossValidation(clf, NFoldPartitioner(attr='task'))

    maps = []
    for ds in ds_merged:
        ds.targets[ds.targets == 'point'] = 'face'
        ds.targets[ds.targets == 'saccade'] = 'place'

        sl = sphere_searchlight(cv, radius, space='voxel_indices')
        sl_map = sl(ds)
        # convert error maps into accuracy maps
        sl_map.samples *= -1
        sl_map.samples += 1
        nif = map2nifti(sl_map, imghdr=ds.a.imghdr)
        maps.append(nif)

    datetime = get_time()
    analysis = 'cross_searchlight'
    mask = conf['mask_area']
    task = type

    new_dir = datetime + '_' + analysis + '_' + mask + '_' + task
    parent_dir = os.path.join(path, '0_results', new_dir)
    os.makedirs(parent_dir)

    for s, map_ in zip(subjects, maps):
        results_dir = os.path.join(parent_dir, s)
        os.makedirs(results_dir)
        fname = s + '_radius_' + str(radius) + '_searchlight_map.nii.gz'
        map_.to_filename(os.path.join(results_dir, fname))

    return maps

def test_cached_kernel_different_datasets(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    # Inspired by the problem Swaroop ran into
    k = LinearSGKernel(normalizer_cls=False)
    k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
    ck = CachedKernel(k_)

    clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
    clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

    cvte = CrossValidation(clf, NFoldPartitioner())
    cvte_ = CrossValidation(clf_, NFoldPartitioner())

    postproc = BinaryFxNode(mean_mismatch_error, 'targets')
    te = ProxyMeasure(clf, postproc=postproc)
    te_ = ProxyMeasure(clf_, postproc=postproc)

    for r in xrange(2):
        ds1 = datasets['uni2medium']
        errs1 = cvte(ds1)
        ck.compute(ds1)
        ok_(ck._recomputed)
        errs1_ = cvte_(ds1)
        ok_(not ck._recomputed)
        assert_array_equal(errs1, errs1_)

        ds2 = datasets['uni3small']
        errs2 = cvte(ds2)
        ck.compute(ds2)
        ok_(ck._recomputed)
        errs2_ = cvte_(ds2)
        ok_(not ck._recomputed)
        assert_array_equal(errs2, errs2_)

        ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
        te.train(datasets['uni3small'][::2])
        terr = np.asscalar(te(datasets['uni3small'][ssel]))
        te_.train(datasets['uni3small'][::2])
        terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
        ok_(not ck._recomputed)
        ok_(terr == terr_)

def test_sifter_superord_usecase():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.svm import LinearCSVMC  # fast one to use for tests
    from mvpa2.measures.base import CrossValidation
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    ds = _get_superord_dataset()

    npart = ChainNode([
        ## so we split based on superord
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        ## so it should select only those splits where we took 1 from
        ## each of the superord categories leaving things in balance
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')

    # and then do your normal cross-validation where clf is space='superord'
    clf = LinearCSVMC(space='superord')
    cvte_regular = CrossValidation(clf, NFoldPartitioner(),
                                   errorfx=lambda p, t: np.mean(p == t))
    cvte_super = CrossValidation(clf, npart,
                                 errorfx=lambda p, t: np.mean(p == t))

    accs_regular = cvte_regular(ds)
    accs_super = cvte_super(ds)

    # With sifting we should get only 2^3 = 8 splits
    assert (len(accs_super) == 8)
    # I don't think that this would ever fail, so not marking it labile
    assert (np.mean(accs_regular) > .8)
    assert (np.mean(accs_super) < .6)

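# Illustrative sketch (not in the original): the effect of the Sifter above
# can be seen directly by enumerating the generated partition sets -- only
# the balanced selections (one subordinate category per superordinate one)
# survive, giving 2**3 sets for this dataset.
def _sketch_sifter_partition_count():
    from mvpa2.base.node import ChainNode
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.generators.base import Sifter

    ds = _get_superord_dataset()  # same helper as used above
    npart = ChainNode([
        NFoldPartitioner(len(ds.sa['superord'].unique), attr='subord'),
        Sifter([('partitions', 2),
                ('superord', {'uvalues': ds.sa['superord'].unique,
                              'balanced': True})]),
    ], space='partitions')
    assert len(list(npart.generate(ds))) == 2 ** 3
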
def test_cache_speedup(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
    sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

    cv_c = CrossValidation(ck, NFoldPartitioner())
    cv_s = CrossValidation(sk, NFoldPartitioner())

    #data = datasets['uni4large']
    P = 5000
    data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10,
                                  means=np.random.randn(2, P), nfeatures=P)

    t0 = time()
    ck.params.kernel.compute(data)
    cachetime = time() - t0

    t0 = time()
    cached_err = cv_c(data)
    ccv_time = time() - t0

    t0 = time()
    norm_err = cv_s(data)
    ncv_time = time() - t0

    assert_almost_equal(np.asanyarray(cached_err),
                        np.asanyarray(norm_err))
    ok_(cachetime < ncv_time)
    ok_(ccv_time < ncv_time)
    #print 'Regular CV time: %s seconds' % ncv_time
    #print 'Caching time: %s seconds' % cachetime
    #print 'Cached CV time: %s seconds' % ccv_time

    speedup = ncv_time / (ccv_time + cachetime)
    #print 'Speedup factor: %s' % speedup

    # Speedup ideally should be 10, though it's not purely linear
    self.failIf(speedup < 2, 'Problem caching data - too slow!')

def test_confusion_as_node():
    from mvpa2.misc.data_generators import normal_feature_dataset
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import Confusion

    ds = normal_feature_dataset(snr=2.0, perlabel=42, nchunks=3,
                                nonbogus_features=[0, 1], nfeatures=2)
    clf = GNB()
    cv = CrossValidation(clf, NFoldPartitioner(),
                         errorfx=None,
                         postproc=Confusion(labels=ds.UT),
                         enable_ca=['stats'])
    res = cv(ds)
    # needs to be identical to CA
    assert_array_equal(res.samples, cv.ca.stats.matrix)
    assert_array_equal(res.sa.predictions, ds.UT)
    assert_array_equal(res.fa.targets, ds.UT)

    skip_if_no_external('scipy')
    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    from mvpa2.base.node import ChainNode
    # same again, but this time with Bayesian hypothesis testing at the end
    cv = CrossValidation(clf, NFoldPartitioner(),
                         errorfx=None,
                         postproc=ChainNode((Confusion(labels=ds.UT),
                                             BayesConfusionHypothesis())))
    res = cv(ds)
    # only two possible hypotheses with two classes
    assert_equals(len(res), 2)
    # the first hypothesis is that we can't discriminate anything
    assert_equal(len(res.sa.hypothesis[0]), 1)
    assert_equal(len(res.sa.hypothesis[0][0]), 2)
    # and that hypothesis is actually less likely than the other one
    # (both classes can be distinguished)
    assert (np.e ** res.samples[0, 0] < np.e ** res.samples[1, 0])

def test_james_problem_multiclass(self):
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    #FxMapper('features', l2_normed),
                    #FxMapper('samples', np.mean),
                    #FxMapper('samples', np.abs),
                    FxMapper('features',
                             lambda x: np.argsort(np.abs(x))),
                    #maxofabs_sample()
                    mean_sample()
                    ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

def _test_gnb_overflow_haxby():  # pragma: no cover
    # example from https://github.com/PyMVPA/PyMVPA/issues/581
    # a heavier version of the above test
    import os
    import numpy as np

    from mvpa2.datasets.sources.native import load_tutorial_data
    from mvpa2.clfs.gnb import GNB
    from mvpa2.measures.base import CrossValidation
    from mvpa2.generators.partition import HalfPartitioner
    from mvpa2.mappers.zscore import zscore
    from mvpa2.mappers.detrend import poly_detrend
    from mvpa2.datasets.miscfx import remove_invariant_features
    from mvpa2.testing.datasets import *

    datapath = '/usr/share/data/pymvpa2-tutorial/'
    haxby = load_tutorial_data(
        datapath,
        roi='vt',
        add_fa={'vt_thr_glm': os.path.join(datapath, 'haxby2001', 'sub001',
                                           'masks', 'orig', 'vt.nii.gz')})
    # poly_detrend(haxby, polyord=1, chunks_attr='chunks')
    haxby = haxby[np.array([l in ['rest', 'scrambled']  # 'house', 'face'
                            for l in haxby.targets], dtype='bool')]
    #zscore(haxby, chunks_attr='chunks', param_est=('targets', ['rest']),
    #       dtype='float32')
    # haxby = haxby[haxby.sa.targets != 'rest']
    haxby = remove_invariant_features(haxby)

    clf = GNB(enable_ca='estimates', logprob=True, normalize=True)

    #clf.train(haxby)
    #clf.predict(haxby)
    # estimates a bit "overfit" to judge in the train/predict
    # on the same data

    cv = CrossValidation(clf,
                         HalfPartitioner(attr='chunks'),
                         postproc=None,
                         enable_ca=['stats'])

    cv_results = cv(haxby)
    res1_est = clf.ca.estimates
    print "Estimates:\n", res1_est
    print "Exp(estimates):\n", np.round(np.exp(res1_est), 3)
    assert np.all(np.isfinite(res1_est))

def test_classifier_generalization(self, clf):
    """Simple test if classifiers can generalize ok on simple data
    """
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
    # check the default
    #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)
    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))
    ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
    try:
        cve = te(ds).samples.squeeze()
    except Exception as e:
        self.fail("Failed with %s" % e)

def test_values(self, clf):
    if isinstance(clf, MulticlassClassifier):
        # TODO: handle those values correctly
        return
    ds = datasets['uni2small']
    clf.ca.change_temporarily(enable_ca=['estimates'])
    cv = CrossValidation(clf, OddEvenPartitioner(),
                         enable_ca=['stats', 'training_stats'])
    _ = cv(ds)
    #print clf.descr, clf.values[0]
    # basic test whether we get 1 set of values per each sample
    self.assertEqual(len(clf.ca.estimates), ds.nsamples / 2)

    clf.ca.reset_changed_temporarily()

def test_perturbation_sensitivity_analyzer(self):
    # compute N-1 cross-validation as datameasure
    cv = CrossValidation(sample_clf_lin, NFoldPartitioner())
    # do perturbation analysis using gaussian noise
    pa = NoisePerturbationSensitivity(cv, noise=np.random.normal)

    # run analysis
    map = pa(self.dataset)

    # check for correct size of map
    self.assertTrue(map.nfeatures == self.dataset.nfeatures)

    # dataset is noise -> mean sensitivity should be zero
    self.assertTrue(-0.2 < np.mean(map) < 0.2)