def test_james_problem_multiclass(self):
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    #FxMapper('features', l2_normed),
                    #FxMapper('samples', np.mean),
                    #FxMapper('samples', np.abs)
                    FxMapper('features', lambda x: np.argsort(np.abs(x))),
                    #maxofabs_sample()
                    mean_sample()
                    ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors)),

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

def test_classifier_generalization(self, clf):
    """Simple test if classifiers can generalize ok on simple data
    """
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample())
    # check the default
    #self.assertTrue(te.transerror.errorfx is mean_mismatch_error)

    nclasses = 2 * (1 + int('multiclass' in clf.__tags__))

    ds = datasets['uni%d%s' % (nclasses, self._get_clf_ds(clf))]
    try:
        cve = te(ds).samples.squeeze()
    except Exception as e:
        self.fail("Failed with %s" % e)

    if cfg.getboolean('tests', 'labile', default='yes'):
        if nclasses > 2 and \
           ((clf.descr is not None and 'on 5%(' in clf.descr)
            or 'regression_based' in clf.__tags__):
            # skip those since they are barely applicable/testable here
            raise SkipTest("Skip testing of cve on %s" % clf)
        self.assertTrue(
            cve < 0.25,  # TODO: use multinom distribution
            msg="Got transfer error %g on %s with %d labels"
                % (cve, ds, len(ds.UT)))

def test_split_classifier(self):
    ds = self.data_bin_1
    clf = SplitClassifier(
        clf=SameSignClassifier(),
        enable_ca=['stats', 'training_stats', 'feature_ids'])
    clf.train(ds)                         # train the beast
    error = clf.ca.stats.error
    tr_error = clf.ca.training_stats.error

    clf2 = clf.clone()
    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds)
    cverror = cverror.samples.squeeze()
    tr_cverror = cv.ca.training_stats.error

    self.assertEqual(
        error, cverror,
        msg="We should get the same error using split classifier as"
            " using CrossValidation. Got %s and %s" % (error, cverror))

    self.assertEqual(
        tr_error, tr_cverror,
        msg="We should get the same training error using split classifier as"
            " using CrossValidation. Got %s and %s" % (tr_error, tr_cverror))

    self.assertEqual(clf.ca.stats.percent_correct, 100,
                     msg="Dummy clf should train perfectly")
    # CV and SplitClassifier should get the same confusion matrices
    assert_array_equal(clf.ca.stats.matrix, cv.ca.stats.matrix)

    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(
        len(clf.clfs), len(ds.UC),
        msg="Should have number of classifiers equal # of epochs")
    self.assertEqual(clf.predict(ds.samples), list(ds.targets),
                     msg="Should classify correctly")

    # feature_ids must be list of lists, and since it is not
    # feature-selecting classifier used - we expect all features
    # to be utilized
    #  NOT ANYMORE -- for BoostedClassifier we have now union of all
    #  used features across slave classifiers. That makes
    #  semantics clear. If you need to get deeper -- use upcoming
    #  harvesting facility ;-)
    # self.assertEqual(len(clf.feature_ids), len(ds.uniquechunks))
    # self.assertTrue(np.array([len(ids)==ds.nfeatures
    #                           for ids in clf.feature_ids]).all())

    # Just check if we get it at all ;-)
    summary = clf.summary()

def test_split_classifier_extended(self, clf_):
    clf2 = clf_.clone()
    ds = datasets['uni2%s' % self._get_clf_ds(clf2)]
    clf = SplitClassifier(
        clf=clf_,  #SameSignClassifier(),
        enable_ca=['stats', 'feature_ids'])
    clf.train(ds)                         # train the beast
    error = clf.ca.stats.error

    cv = CrossValidation(clf2, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()

    if not 'non-deterministic' in clf.__tags__:
        self.assertTrue(
            abs(error - cverror) < 0.01,
            msg="We should get the same error using split classifier as"
                " using CrossValidation. Got %s and %s" % (error, cverror))

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(error < 0.25,
                        msg="clf should generalize more or less fine. "
                            "Got error %s" % error)
    self.assertEqual(len(clf.ca.stats.sets), len(ds.UC),
                     msg="Should have 1 confusion per each split")
    self.assertEqual(
        len(clf.clfs), len(ds.UC),
        msg="Should have number of classifiers equal # of epochs")

def test_regression_as_classifier(self, regr):
    """Basic tests of metaclass for using regressions as classifiers
    """
    for dsname in 'uni2small', 'uni4small':
        ds = datasets[dsname]

        clf = RegressionAsClassifier(regr, enable_ca=['distances'])
        cv = CrossValidation(clf, OddEvenPartitioner(),
                             postproc=mean_sample(),
                             enable_ca=['stats', 'training_stats'])

        error = cv(ds).samples.squeeze()

        nlabels = len(ds.uniquetargets)
        if nlabels == 2 \
           and cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.3,
                            msg="Got error %.2f on %s dataset"
                                % (error, dsname))

        # Check if does not puke on repr and str
        self.assertTrue(str(clf) != "")
        self.assertTrue(repr(clf) != "")

        self.assertEqual(clf.ca.distances.shape,
                         (ds.nsamples / 2, nlabels))

def test_usecase_concordancesl(self):
    import numpy as np
    from mvpa2.base.dataset import vstack
    from mvpa2.mappers.fx import mean_sample

    # Take our sample 3d dataset
    ds1 = datasets['3dsmall'].copy(deep=True)
    ds1.fa['voxel_indices'] = ds1.fa.myspace
    ds1.sa['subject'] = [1]  # not really necessary -- but let's for clarity
    ds1 = mean_sample()(ds1)  # so we get just a single representative sample

    def corr12(ds):
        corr = np.corrcoef(ds.samples)
        assert(corr.shape == (2, 2))  # for paranoid ones
        return corr[0, 1]

    for nsc, thr, thr_mean in (
            (0, 1.0, 1.0),
            (0.1, 0.3, 0.8)):   # just a bit of noise
        ds2 = ds1.copy(deep=True)    # make a copy for the 2nd subject
        ds2.sa['subject'] = [2]
        ds2.samples += nsc * np.random.normal(size=ds1.shape)

        # make sure that both have the same voxel indices
        assert(np.all(ds1.fa.voxel_indices == ds2.fa.voxel_indices))
        ds_both = vstack((ds1, ds2))  # join 2 images into a single dataset
                                      # with .sa.subject distinguishing both

        sl = sphere_searchlight(corr12, radius=2)
        slmap = sl(ds_both)
        ok_(np.all(slmap.samples >= thr))
        ok_(np.mean(slmap.samples) >= thr_mean)

def test_james_problem(self):
    percent = 80
    dataset = datasets['uni2small']

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['confusion'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

def test_null_dist_prob(self, l_clf):
    train = datasets['uni2medium']

    num_perm = 10
    permutator = AttributePermutator('targets',
                                     count=num_perm,
                                     limit='chunks')
    # define class to estimate NULL distribution of errors
    # use left tail of the distribution since we use MeanMatchFx as error
    # function and lower is better
    terr = TransferMeasure(l_clf,
                           Repeater(count=2),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'),
                           null_dist=MCNullDist(permutator,
                                                tail='left'))

    # check reasonable error range
    err = terr(train)
    self.assertTrue(np.mean(err) < 0.4)

    # Lets do the same for CVTE
    cvte = CrossValidation(l_clf, OddEvenPartitioner(),
                           null_dist=MCNullDist(permutator,
                                                tail='left',
                                                enable_ca=['dist_samples']),
                           postproc=mean_sample())
    cv_err = cvte(train)

    # check that the result is highly significant since we know that the
    # data has signal
    null_prob = np.asscalar(terr.ca.null_prob)

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(
            null_prob <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got %f) since we know that the data has signal"
                % null_prob)

        self.assertTrue(
            np.asscalar(cvte.ca.null_prob) <= 0.1,
            msg="Failed to check that the result is highly significant "
                "(got p(cvte)=%f) since we know that the data has signal"
                % np.asscalar(cvte.ca.null_prob))

    # we should be able to access the actual samples of the distribution
    # yoh: why it is 3D really?
    # mih: because these are the distribution samples for the ONE error
    #      collapsed into ONE value across all folds. It will also be
    #      3d if the return value of the measure isn't a scalar and it is
    #      not collapsed across folds. it simply corresponds to the shape
    #      of the output dataset of the respective measure (+1 axis)
    # Some permutations could have been skipped since classifier failed
    # to train due to degenerate situation etc, thus accounting for them
    self.assertEqual(cvte.null_dist.ca.dist_samples.shape[2],
                     num_perm - cvte.null_dist.ca.skipped)

def test_function_ptrs(fname):
    skip_if_no_external('nibabel')
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    h5save(fname, ds)
    ds_loaded = h5load(fname)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh), ds.samples)

def test_function_ptrs():
    if not externals.exists('nibabel'):
        raise SkipTest
    ds = load_example_fmri_dataset()
    # add a mapper with a function ptr inside
    ds = ds.get_mapped(mean_sample())
    f = tempfile.NamedTemporaryFile()
    h5save(f.name, ds)
    ds_loaded = h5load(f.name)
    fresh = load_example_fmri_dataset().O
    # check that the reconstruction function pointer in the FxMapper points
    # to the right one
    assert_array_equal(ds_loaded.a.mapper.forward(fresh), ds.samples)

def test_confusionmatrix_nulldist(self):
    from mvpa2.clfs.gnb import GNB

    class ConfusionMatrixError(object):
        """Custom error "function"
        """
        def __init__(self, labels=None):
            self.labels = labels

        def __call__(self, predictions, targets):
            cm = ConfusionMatrix(labels=list(self.labels),
                                 targets=targets, predictions=predictions)
            #print cm.matrix
            # We have to add a degenerate leading dimension
            # so we could separate them into separate 'samples'
            return cm.matrix[None, :]

    from mvpa2.misc.data_generators import normal_feature_dataset
    for snr in [0., 2.]:
        ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                    nonbogus_features=[0, 1], nfeatures=2)

        clf = GNB()
        num_perm = 50
        permutator = AttributePermutator('targets',
                                         limit='chunks',
                                         count=num_perm)
        cv = CrossValidation(
            clf, NFoldPartitioner(),
            errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
            postproc=mean_sample(),
            null_dist=MCNullDist(permutator,
                                 tail='right',  # because we now look at accuracy not error
                                 enable_ca=['dist_samples']),
            enable_ca=['stats'])
        cmatrix = cv(ds)
        #print "Result:\n", cmatrix.samples
        cvnp = cv.ca.null_prob.samples
        #print cvnp
        self.assertTrue(cvnp.shape, (2, 2))
        if cfg.getboolean('tests', 'labile', default='yes'):
            if snr == 0.:
                # all p should be high since no signal
                assert_array_less(0.05, cvnp)
            else:
                # diagonal p is low -- we have signal after all
                assert_array_less(np.diag(cvnp), 0.05)
                # off diagonals are high p since for them we would
                # need to look at the other tail
                assert_array_less(0.9,
                                  cvnp[(np.array([0, 1]), np.array([1, 0]))])

def test_james_problem(self):
    percent = 80
    dataset = datasets['uni2small']

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors)),

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['confusion'])  # TODO -- it is stats
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

    assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
    assert (len(cv_storage.storage[0]) == 2)
    assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

    self.assertTrue(error < 0.2)

def test_ifs(self, svm):

    # measure for feature selection criterion and performance assesment
    # use the SAME clf!
    errorfx = mean_mismatch_error
    fmeasure = CrossValidation(svm, NFoldPartitioner(), postproc=mean_sample())
    pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

    ifs = IFS(fmeasure,
              pmeasure,
              Splitter('purpose', attr_values=['train', 'test']),
              fselector=
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              FixedNElementTailSelector(1, tail='lower', mode='select'),
              )
    wdata = self.get_data()
    wdata.sa['purpose'] = np.repeat('train', len(wdata))
    tdata = self.get_data()
    tdata.sa['purpose'] = np.repeat('test', len(tdata))
    ds = vstack((wdata, tdata))
    orig_nfeatures = ds.nfeatures

    ifs.train(ds)
    resds = ifs(ds)

    # fail if orig datasets are changed
    self.assertTrue(ds.nfeatures == orig_nfeatures)

    # check that the features set with the least error is selected
    self.assertTrue(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.assertTrue(resds.nfeatures == e.argmin() + 1)

    # repeat with dataset where selection order is known
    wsignal = datasets['dumb2'].copy()
    wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
    tsignal = datasets['dumb2'].copy()
    tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
    signal = vstack((wsignal, tsignal))
    ifs.train(signal)
    resds = ifs(signal)
    self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())

def test_confusionmatrix_nulldist(self):
    from mvpa2.clfs.gnb import GNB
    from mvpa2.clfs.transerror import ConfusionMatrixError
    from mvpa2.misc.data_generators import normal_feature_dataset
    for snr in [0., 2.]:
        ds = normal_feature_dataset(snr=snr, perlabel=42, nchunks=3,
                                    nonbogus_features=[0, 1], nfeatures=2)

        clf = GNB()
        num_perm = 50
        permutator = AttributePermutator('targets',
                                         limit='chunks',
                                         count=num_perm)
        cv = CrossValidation(
            clf, NFoldPartitioner(),
            errorfx=ConfusionMatrixError(labels=ds.sa['targets'].unique),
            postproc=mean_sample(),
            null_dist=MCNullDist(permutator,
                                 tail='right',  # because we now look at accuracy not error
                                 enable_ca=['dist_samples']),
            enable_ca=['stats'])
        cmatrix = cv(ds)
        #print "Result:\n", cmatrix.samples
        cvnp = cv.ca.null_prob.samples
        #print cvnp
        self.assertTrue(cvnp.shape, (2, 2))
        if cfg.getboolean('tests', 'labile', default='yes'):
            if snr == 0.:
                # all p should be high since no signal
                assert_array_less(0.05, cvnp)
            else:
                # diagonal p is low -- we have signal after all
                assert_array_less(np.diag(cvnp), 0.05)
                # off diagonals are high p since for them we would
                # need to look at the other tail
                assert_array_less(0.9,
                                  cvnp[(np.array([0, 1]), np.array([1, 0]))])

def test_adhocsearchlight_perm_testing(self):
    # just a smoke test pretty much
    ds = datasets['3dmedium'].copy()
    #ds.samples += np.random.normal(size=ds.samples.shape)*10
    mvpa2.seed()
    ds.fa['voxel_indices'] = ds.fa.myspace
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.clfs.stats import MCNullDist
    permutator = AttributePermutator('targets', count=8,
                                     limit='chunks')
    distr_est = MCNullDist(permutator, tail='left',
                           enable_ca=['dist_samples'])
    slargs = (kNN(1),
              NFoldPartitioner(0.5,
                               selection_strategy='random',
                               count=9))
    slkwargs = dict(radius=1, postproc=mean_sample())

    sl_nodistr = sphere_m1nnsearchlight(*slargs, **slkwargs)
    skip_if_no_external('scipy')    # needed for null_t
    sl = sphere_m1nnsearchlight(
        *slargs,
        null_dist=distr_est,
        enable_ca=['null_t'],
        reuse_neighbors=True,
        **slkwargs
        )
    mvpa2.seed()
    res_nodistr = sl_nodistr(ds)
    mvpa2.seed()
    res = sl(ds)
    # verify that we at least got the same main result
    # ah (yoh) -- null dist is estimated before the main
    # estimate so we can't guarantee correspondence :-/
    # assert_array_equal(res_nodistr, res)
    # only resemblance (TODO, may be we want to get/setstate
    # for rng before null_dist.fit?)

    # and dimensions correspond
    assert_array_equal(distr_est.ca.dist_samples.shape, (1, ds.nfeatures, 8))
    assert_array_equal(sl.ca.null_t.samples.shape, (1, ds.nfeatures))

def do_searchlight(glm_dataset, radius, output_basename, with_null_prob=False):
    clf = LinearCSVMC(space='condition')
    # clf = RbfCSVMC(C=5.0)
    splt = NFoldPartitioner()
    cv = CrossValidation(clf, splt,
                         errorfx=mean_match_accuracy,
                         enable_ca=['stats'], postproc=mean_sample())
    distr_est = []
    if with_null_prob:
        permutator = AttributePermutator('condition', count=100,
                                         limit='chunks')
        distr_est = MCNullDist(permutator, tail='left',
                               enable_ca=['dist_samples'])
        """
        repeater = Repeater(count=100)
        permutator = AttributePermutator('condition',
                                         limit={'partitions': 1}, count=1)
        null_cv = CrossValidation(clf,
                                  ChainNode([splt, permutator],
                                            space=splt.get_space()),
                                  postproc=mean_sample())
        null_sl = sphere_searchlight(null_cv, radius=radius,
                                     space='voxel_indices',
                                     enable_ca=['roi_sizes'])
        distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                               enable_ca=['dist_samples'])
        """
        sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
                                null_dist=distr_est,
                                enable_ca=['roi_sizes', 'roi_feature_ids'])
    else:
        sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
                                enable_ca=['roi_sizes', 'roi_feature_ids'])
    # ds = glm_dataset.copy(deep=False,
    #                       sa=['condition','chunks'],
    #                       fa=['voxel_indices'],
    #                       a=['mapper'])
    # debug.active += ["SLC"]
    sl_map = sl(glm_dataset)
    errresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    errresults.to_filename('{}-acc.nii.gz'.format(output_basename))
    sl_map.samples *= -1
    sl_map.samples += 1
    niftiresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    niftiresults.to_filename('{}-err.nii.gz'.format(output_basename))
    # TODO: save p value map
    if with_null_prob:
        nullt_results = map2nifti(sl_map, data=sl.ca.null_t,
                                  imghdr=glm_dataset.a.imghdr)
        nullt_results.to_filename('{}-t.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=sl.ca.null_prob,
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-prob.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=distr_est.cdf(sl_map.samples),
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-cdf.nii.gz'.format(output_basename))

def _call(self, ds):
    if len(ds) > 1:
        # average all samples into one, assuming we got something like one
        # sample per subject as input
        avgr = mean_sample()
        ds = avgr(ds)

    # threshold input; at this point we only have one sample left
    thrd = ds.samples[0] > self._thrmap

    # mapper default
    mapper = IdentityMapper()
    # overwrite if possible
    if hasattr(ds, 'a') and 'mapper' in ds.a:
        mapper = ds.a.mapper

    # reverse-map input
    osamp = mapper.reverse1(thrd)

    # prep output dataset
    outds = ds.copy(deep=False)
    outds.fa['featurewise_thresh'] = self._thrmap

    # determine clusters
    labels, num = measurements.label(osamp)
    area = measurements.sum(osamp,
                            labels,
                            index=np.arange(1, num + 1)).astype(int)

    # for the rest we need the labels flattened
    labels = mapper.forward1(labels)

    # relabel clusters starting with the biggest and increase index with
    # decreasing size
    ordered_labels = np.zeros(labels.shape, dtype=int)
    ordered_area = np.zeros(area.shape, dtype=int)
    for i, idx in enumerate(np.argsort(area)):
        ordered_labels[labels == idx + 1] = num - i
        ordered_area[i] = area[idx]
    area = ordered_area[::-1]
    labels = ordered_labels
    del ordered_labels  # this one can be big

    # store cluster labels after forward-mapping
    outds.fa['clusters_featurewise_thresh'] = labels.copy()

    # update cluster size histogram with the actual result to get a
    # proper lower bound for p-values
    # this will make a copy, because the original matrix is int
    cluster_probs_raw = _transform_to_pvals(
        area, self._null_cluster_sizes.astype('float'))

    if self.params.multicomp_correction is None:
        probs_corr = np.array(cluster_probs_raw)
        rej = probs_corr <= self.params.fwe_rate
        outds.a['clusterstats'] = \
            np.rec.fromarrays(
                [area, cluster_probs_raw], names=('size', 'prob_raw'))
    else:
        # do a local import as only this tiny portion needs statsmodels
        import statsmodels.stats.multitest as smm
        rej, probs_corr = smm.multipletests(
            cluster_probs_raw,
            alpha=self.params.fwe_rate,
            method=self.params.multicomp_correction)[:2]
        # store corrected per-cluster probabilities
        outds.a['clusterstats'] = \
            np.rec.fromarrays(
                [area, cluster_probs_raw, probs_corr],
                names=('size', 'prob_raw', 'prob_corrected'))

    # remove cluster labels that did not pass the FWE threshold
    for i, r in enumerate(rej):
        if not r:
            labels[labels == i + 1] = 0
    outds.fa['clusters_fwe_thresh'] = labels
    return outds

def get_crossvalidation_instance(learner, partitioner, errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount, attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(
            learner_space,
            limit={partitioner.get_space(): 1},
            count=1)
        # CV with null-distribution estimation that permutes the training data for
        # each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner,
                                  perm_gen_node,
                                  postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater,
                               tail=prob_tail,
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner,
                         gennode,
                         errorfx=errorfx,
                         null_dist=distr_est,
                         postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv

def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf, postproc=BinaryFxNode(mean_mismatch_error,
                                                       'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),            # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(
                     0.70,
                     mode='select', tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                    postproc=ChainMapper([FxMapper('features', l2_normed),
                                          FxMapper('samples', np.mean),
                                          FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),             # we will use the same full cv-training dataset
                 fselector=FractionTailSelector(
                     0.50,
                     mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,    # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history == max(rfe.ca.history))[0]))

def get_linear_svm_measure():
    clf = LinearCSVMC(space="condition")
    splt = ChainNode(
        [NFoldPartitioner(),
         Balancer(attr="condition",
                  count=1,
                  limit="partitions",
                  apply_selection=True)],
        space="partitions",
    )
    # splt = NFoldPartitioner()
    cv = CrossValidation(clf, splt,
                         errorfx=mean_match_accuracy,
                         enable_ca=["stats"],
                         postproc=mean_sample())
    return cv

def test_regressions(self, regr):
    """Simple tests on regressions
    """
    if not externals.exists('scipy'):
        raise SkipTest
    else:
        from mvpa2.misc.errorfx import corr_error
    ds = datasets['chirp_linear']
    # we want numeric labels to maintain the previous behavior, especially
    # since we deal with regressions here
    ds.sa.targets = AttributeMap().to_numeric(ds.targets)

    cve = CrossValidation(regr, NFoldPartitioner(), postproc=mean_sample(),
                          errorfx=corr_error,
                          enable_ca=['training_stats', 'stats'])
    # check the default
    #self.assertTrue(cve.transerror.errorfx is corr_error)

    corr = np.asscalar(cve(ds).samples)

    # Our CorrErrorFx should never return NaN
    self.assertTrue(not np.isnan(corr))
    self.assertTrue(corr == cve.ca.stats.stats['CCe'])

    splitregr = SplitClassifier(
        regr, partitioner=OddEvenPartitioner(),
        enable_ca=['training_stats', 'stats'])
    splitregr.train(ds)
    split_corr = splitregr.ca.stats.stats['CCe']
    split_corr_tr = splitregr.ca.training_stats.stats['CCe']

    for confusion, error in (
            (cve.ca.stats, corr),
            (splitregr.ca.stats, split_corr),
            (splitregr.ca.training_stats, split_corr_tr),
            ):
        #TODO: test confusion statistics
        # Part of it for now -- CCe
        for conf in confusion.summaries:
            stats = conf.stats
            if cfg.getboolean('tests', 'labile', default='yes'):
                self.assertTrue(stats['CCe'] < 0.5)
            self.assertEqual(stats['CCe'], stats['Summary CCe'])

        s0 = confusion.as_string(short=True)
        s1 = confusion.as_string(short=False)

        for s in [s0, s1]:
            self.assertTrue(len(s) > 10,
                            msg="We should get some string representation "
                                "of regression summary. Got %s" % s)
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(error < 0.2,
                            msg="Regressions should perform well on a simple "
                                "dataset. Got correlation error of %s " % error)

        # Test access to summary statistics
        # YOH: lets start making testing more reliable.
        #      p-value for such accident to have is verrrry tiny,
        #      so if regression works -- it better has at least 0.5 ;)
        #      otherwise fix it! ;)
        # YOH: not now -- issues with libsvr in SG and linear kernel
        if cfg.getboolean('tests', 'labile', default='yes'):
            self.assertTrue(confusion.stats['CCe'] < 0.5)

    # just to check if it works fine
    split_predictions = splitregr.predict(ds.samples)

def test_tree_classifier(self):
    """Basic tests for TreeClassifier
    """
    ds = datasets['uni4medium']
    # make it simple of the beast -- take only informative ones
    # because classifiers for the tree are selected randomly, so
    # performance varies a lot and we just need to check on
    # correct operation
    ds = ds[:, ds.fa.nonbogus_targets != [None]]

    clfs = clfswh['binary']         # pool of classifiers
    # Lets permute so each time we try some different combination
    # of the classifiers but exclude those operating on %s of
    # features since we might not have enough for that
    clfs = [clfs[i] for i in np.random.permutation(len(clfs))
            if not '%' in str(clfs[i])]

    # NB: It is necessary that the same classifier was not used at
    # different nodes, since it would be re-trained for a new set
    # of targets, thus leading to incorrect behavior/high error.
    #
    # Clone only those few leading ones which we will use
    # throughout the test
    clfs = [clf.clone() for clf in clfs[:4]]

    # Test conflicting definition
    tclf = TreeClassifier(clfs[0], {
        'L0+2': (('L0', 'L2'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.assertRaises(ValueError, tclf.train, ds)
    """Should raise exception since label 2 is in both"""

    # Test insufficient definition
    tclf = TreeClassifier(clfs[0], {
        'L0+5': (('L0', 'L5'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.assertRaises(ValueError, tclf.train, ds)
    """Should raise exception since no group for L1"""

    # proper definition now
    tclf = TreeClassifier(clfs[0], {
        'L0+1': (('L0', 'L1'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})

    # Lets test train/test cycle using CVTE
    cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()
    try:
        rtclf = repr(tclf)
    except:
        self.fail(msg="Could not obtain repr for TreeClassifier")

    # Test accessibility of .clfs
    self.assertTrue(tclf.clfs['L0+1'] is clfs[1])
    self.assertTrue(tclf.clfs['L2+3'] is clfs[2])

    cvtrc = cv.ca.training_stats
    cvtc = cv.ca.stats
    if cfg.getboolean('tests', 'labile', default='yes'):
        # just a dummy check to make sure everything is working
        self.assertTrue(cvtrc != cvtc)
        self.assertTrue(cverror < 0.3,
                        msg="Got too high error = %s using %s"
                            % (cverror, tclf))

    # Test trailing nodes with no classifier
    # That is why we use separate pool of classifiers here
    # (that is probably old/not-needed since switched to use clones)
    clfs_mc = clfswh['multiclass']         # pool of classifiers
    clfs_mc = [clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
               if not '%' in str(clfs_mc[i])]
    clfs_mc = [clf.clone() for clf in clfs_mc[:4]]  # and clones again

    tclf = TreeClassifier(clfs_mc[0], {
        'L0': (('L0',), None),
        'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])})

    cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = np.asscalar(cv(ds))
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(cverror < 0.3,
                        msg="Got too high error = %s using %s"
                            % (cverror, tclf))

def test_gnbsearchlight_permutations():
    import mvpa2
    from mvpa2.base.node import ChainNode
    from mvpa2.clfs.gnb import GNB
    from mvpa2.generators.base import Repeater
    from mvpa2.generators.partition import NFoldPartitioner, OddEvenPartitioner
    #import mvpa2.generators.permutation
    #reload(mvpa2.generators.permutation)
    from mvpa2.generators.permutation import AttributePermutator
    from mvpa2.testing.datasets import datasets
    from mvpa2.measures.base import CrossValidation
    from mvpa2.measures.gnbsearchlight import sphere_gnbsearchlight
    from mvpa2.measures.searchlight import sphere_searchlight
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.clfs.stats import MCNullDist
    from mvpa2.testing.tools import assert_raises, ok_, assert_array_less

    # mvpa2.debug.active = ['APERM', 'SLC'] #, 'REPM']
    # mvpa2.debug.metrics += ['pid']
    count = 10
    nproc = 1 + int(mvpa2.externals.exists('pprocess'))
    ds = datasets['3dsmall'].copy()
    ds.fa['voxel_indices'] = ds.fa.myspace

    slkwargs = dict(radius=3, space='voxel_indices', enable_ca=['roi_sizes'],
                    center_ids=[1, 10, 70, 100])

    mvpa2.seed(mvpa2._random_seed)
    clf = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')

    repeater = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)

    null_sl = sphere_gnbsearchlight(clf,
                                    ChainNode([splt, permutator],
                                              space=splt.get_space()),
                                    postproc=mean_sample(),
                                    errorfx=mean_mismatch_error,
                                    **slkwargs)

    distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                           enable_ca=['dist_samples'])
    sl = sphere_gnbsearchlight(clf, splt,
                               reuse_neighbors=True,
                               null_dist=distr_est,
                               postproc=mean_sample(),
                               errorfx=mean_mismatch_error,
                               **slkwargs)
    if __debug__:
        # assert is done only without -O mode
        assert_raises(NotImplementedError, sl, ds)

    # "ad-hoc searchlights can't handle yet varying targets across partitions"
    if False:
        # after above limitation is removed -- enable
        sl_map = sl(ds)
        sl_null_prob = sl.ca.null_prob.samples.copy()

    mvpa2.seed(mvpa2._random_seed)
    ### 'normal' Searchlight
    clf = GNB()
    splt = NFoldPartitioner(cvtype=2, attr='chunks')
    repeater = Repeater(count=count)
    permutator = AttributePermutator('targets', limit={'partitions': 1},
                                     count=1)
    # rng=np.random.RandomState(0))
    # to trigger failure since the same np.random state
    # would be reused across all pprocesses
    null_cv = CrossValidation(clf, ChainNode([splt, permutator],
                                             space=splt.get_space()),
                              postproc=mean_sample())
    null_sl_normal = sphere_searchlight(null_cv, nproc=nproc, **slkwargs)
    distr_est_normal = MCNullDist(repeater, tail='left',
                                  measure=null_sl_normal,
                                  enable_ca=['dist_samples'])

    cv = CrossValidation(clf, splt, errorfx=mean_mismatch_error,
                         enable_ca=['stats'], postproc=mean_sample())
    sl = sphere_searchlight(cv, nproc=nproc, null_dist=distr_est_normal,
                            **slkwargs)
    sl_map_normal = sl(ds)
    sl_null_prob_normal = sl.ca.null_prob.samples.copy()

    # For every feature -- we should get some variance in estimates In
    # case of failure they are all really close to each other (up to
    # numerical precision), so variance will be close to 0
    assert_array_less(-np.var(distr_est_normal.ca.dist_samples.samples[0],
                              axis=1), -1e-5)
    for s in distr_est_normal.ca.dist_samples.samples[0]:
        ok_(len(np.unique(s)) > 1)

def do_searchlight(glm_dataset, radius, output_basename, with_null_prob=False,
                   clf=LinearCSVMC(space='condition')):
    if(len(glob(output_basename + "*")) > 0):
        print "sl already ran"
        return
    splt = ChainNode([NFoldPartitioner(),
                      Balancer(attr='condition',
                               count=1,
                               limit='partitions',
                               apply_selection=True)],
                     space='partitions')
    #splt = NFoldPartitioner()
    cv = CrossValidation(clf, splt,
                         errorfx=mean_match_accuracy,
                         enable_ca=['stats'], postproc=mean_sample())
    distr_est = []
    if with_null_prob:
        permutator = AttributePermutator('condition', count=100,
                                         limit='chunks')
        distr_est = MCNullDist(permutator, tail='left',
                               enable_ca=['dist_samples'])
        """
        repeater = Repeater(count=100)
        permutator = AttributePermutator('condition',
                                         limit={'partitions': 1}, count=1)
        null_cv = CrossValidation(clf,
                                  ChainNode([splt, permutator],
                                            space=splt.get_space()),
                                  postproc=mean_sample())
        null_sl = sphere_searchlight(null_cv, radius=radius,
                                     space='voxel_indices',
                                     enable_ca=['roi_sizes'])
        distr_est = MCNullDist(repeater, tail='left', measure=null_sl,
                               enable_ca=['dist_samples'])
        sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
                                null_dist=distr_est,
                                enable_ca=['roi_sizes', 'roi_feature_ids']
                                # ,result_fx = _fill_in_scattered_results
                                #  average across all spheres
                                )
        """
    else:
        kwa = {'voxel_indices': KNNNeighbourhood(radius,
                                                 glm_dataset.fa['voxel_indices'])}
        qe = IndexQueryEngine(**kwa)
        # init the searchlight with the queryengine
        sl = Searchlight(cv, queryengine=qe, roi_ids=None,
                         enable_ca=['roi_sizes', 'roi_feature_ids']
                         # ,results_fx = _fill_in_scattered_results
                         #  average across all spheres
                         )
    #;v sl = sphere_searchlight(cv, radius=radius, space='voxel_indices',
    #                           ,result_fx = _fill_in_scattered_results
    #                            average across all spheres
    #                           )
    # ds = glm_dataset.copy(deep=False,
    #                       sa=['condition','chunks'],
    #                       fa=['voxel_indices'],
    #                       a=['mapper'])
    from datetime import datetime
    print "starting sl {}".format(datetime.now())
    sl_map = sl(glm_dataset)
    print "finished sl {}".format(datetime.now())
    import pickle
    pickle.dump(sl_map, open("{}_sl_map.p".format(output_basename), "wb"))
    # pickle.dump(sl.ca.roi_feature_ids,
    #             open("{}_sl_feature_ids.p".format(output_basename), "wb"))
    # print len(sl.ca.roi_feature_ids[0])
    acc_results = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    acc_nii_filename = '{}-acc.nii.gz'.format(output_basename)
    acc_results.to_filename(acc_nii_filename)
    sl_map.samples *= -1
    sl_map.samples += 1
    niftiresults = map2nifti(sl_map, imghdr=glm_dataset.a.imghdr)
    niftiresults.to_filename('{}-err.nii.gz'.format(output_basename))
    # TODO: check p value map
    if with_null_prob:
        nullt_results = map2nifti(sl_map, data=sl.ca.null_t,
                                  imghdr=glm_dataset.a.imghdr)
        nullt_results.to_filename('{}-t.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=sl.ca.null_prob,
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-prob.nii.gz'.format(output_basename))
        nullprob_results = map2nifti(sl_map, data=distr_est.cdf(sl_map.samples),
                                     imghdr=glm_dataset.a.imghdr)
        nullprob_results.to_filename('{}-cdf.nii.gz'.format(output_basename))
    return sl_map

def test_multiclass_ties(clf):
    if 'lars' in clf.__tags__:
        raise SkipTest("Known to crash while running this test")
    ds = _dsties1

    # reassign data between ties, so we know that decision is data, not order driven
    ds_ = ds.copy(deep=True)
    ds_.samples[ds.a.ties_idx[1]] = ds.samples[ds.a.ties_idx[0]]
    ds_.samples[ds.a.ties_idx[0]] = ds.samples[ds.a.ties_idx[1]]
    ok_(np.any(ds_.samples != ds.samples))

    clf_ = clf.clone()
    clf = clf.clone()
    clf.ca.enable(['estimates', 'predictions'])
    clf_.ca.enable(['estimates', 'predictions'])
    te = TransferMeasure(clf, Splitter('train'),
                         postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
                         enable_ca=['stats'])
    te_ = TransferMeasure(clf_, Splitter('train'),
                          postproc=BinaryFxNode(mean_mismatch_error, 'targets'),
                          enable_ca=['stats'])

    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats'])
    te_ = CrossValidation(clf_, NFoldPartitioner(), postproc=mean_sample(),
                          enable_ca=['stats'])

    error = te(ds)
    matrix = te.ca.stats.matrix

    # if ties were broken randomly we should have got nearly the same
    # number of hits for tied targets
    ties_indices = [te.ca.stats.labels.index(c) for c in ds.a.ties]
    hits = np.diag(te.ca.stats.matrix)[ties_indices]

    # First check is to see if we swap data between tied labels we
    # are getting the same results if we permute labels accordingly,
    # i.e. that tie resolution is not dependent on the labels order
    # but rather on the data
    te_(ds_)
    matrix_swapped = te_.ca.stats.matrix

    if False:  # 0 in hits:
        print clf, matrix, matrix_swapped
        print clf.ca.estimates[:, 2] - clf.ca.estimates[:, 0]
        #print clf.ca.estimates

    # TODO: for now disabled all the non-compliant ones to pass the
    # tests.  For visibility decided to skip them instead of just
    # exclusion, and skipping only here to possibly catch crashes
    # which might happen before
    if len(set(('libsvm', 'sg', 'skl', 'gpr', 'blr')
               ).intersection(clf.__tags__)):
        raise SkipTest("Skipped %s because it is known to fail" % clf)

    ok_(not (np.array_equal(matrix, matrix_swapped) and 0 in hits))

    # this check is valid only if ties are not broken randomly
    # like it is the case with SMLR
    if not ('random_tie_breaking' in clf.__tags__
            # since __tags__ would not go that high up e.g. in
            # <knn on SMLR non-0>
            or 'SMLR' in str(clf)):
        assert_array_equal(hits, np.diag(matrix_swapped)[ties_indices[::-1]])

    # Second check is to just see if we didn't get an obvious bias and
    # got 0 in one of the hits, although it is labile
    if cfg.getboolean('tests', 'labile', default='yes'):
        ok_(not 0 in hits)
def _call(self, ds):
    if len(ds) > 1:
        # average all samples into one, assuming we got something like one
        # sample per subject as input
        avgr = mean_sample()
        ds = avgr(ds)

    # threshold input; at this point we only have one sample left
    thrd = ds.samples[0] > self._thrmap

    # mapper default
    mapper = IdentityMapper()
    # overwrite if possible
    if hasattr(ds, 'a') and 'mapper' in ds.a:
        mapper = ds.a.mapper

    # reverse-map input
    othrd = _verified_reverse1(mapper, thrd)
    # TODO: what is your purpose in life osamp? ;-)
    osamp = _verified_reverse1(mapper, ds.samples[0])

    # prep output dataset
    outds = ds.copy(deep=False)
    outds.fa['featurewise_thresh'] = self._thrmap

    # determine clusters
    labels, num = measurements.label(othrd, structure=np.ones([3, 3, 3]))
    area = measurements.sum(othrd,
                            labels,
                            index=np.arange(1, num + 1)).astype(int)
    com = measurements.center_of_mass(
        osamp, labels=labels, index=np.arange(1, num + 1))
    maxpos = measurements.maximum_position(
        osamp, labels=labels, index=np.arange(1, num + 1))
    # for the rest we need the labels flattened
    labels = mapper.forward1(labels)

    # relabel clusters starting with the biggest and increase index with
    # decreasing size
    ordered_labels = np.zeros(labels.shape, dtype=int)
    ordered_area = np.zeros(area.shape, dtype=int)
    ordered_com = np.zeros((num, len(osamp.shape)), dtype=float)
    ordered_maxpos = np.zeros((num, len(osamp.shape)), dtype=float)
    for i, idx in enumerate(np.argsort(area)):
        ordered_labels[labels == idx + 1] = num - i
        # kinda ugly, but we are looping anyway
        ordered_area[i] = area[idx]
        ordered_com[i] = com[idx]
        ordered_maxpos[i] = maxpos[idx]
    labels = ordered_labels
    area = ordered_area[::-1]
    com = ordered_com[::-1]
    maxpos = ordered_maxpos[::-1]
    del ordered_labels  # this one can be big
    # store cluster labels after forward-mapping
    outds.fa['clusters_featurewise_thresh'] = labels.copy()
    # location info
    outds.a['clusterlocations'] = \
        np.rec.fromarrays(
            [com, maxpos], names=('center_of_mass', 'max'))

    # update cluster size histogram with the actual result to get a
    # proper lower bound for p-values
    # this will make a copy, because the original matrix is int
    cluster_probs_raw = _transform_to_pvals(
        area, self._null_cluster_sizes.astype('float'))

    clusterstats = (
        [area, cluster_probs_raw],
        ['size', 'prob_raw']
    )
    # evaluate a bunch of stats for all clusters
    morestats = {}
    for cid in xrange(len(area)):
        # keep clusters on outer loop, because selection is more expensive
        clvals = ds.samples[0, labels == cid + 1]
        for id_, fx in (
                ('mean', np.mean),
                ('median', np.median),
                ('min', np.min),
                ('max', np.max),
                ('std', np.std)):
            stats = morestats.get(id_, [])
            stats.append(fx(clvals))
            morestats[id_] = stats

    for k, v in morestats.items():
        clusterstats[0].append(v)
        clusterstats[1].append(k)

    if self.params.multicomp_correction is not None:
        # do a local import as only this tiny portion needs statsmodels
        import statsmodels.stats.multitest as smm
        rej, probs_corr = smm.multipletests(
            cluster_probs_raw,
            alpha=self.params.fwe_rate,
            method=self.params.multicomp_correction)[:2]
        # store corrected per-cluster probabilities
        clusterstats[0].append(probs_corr)
        clusterstats[1].append('prob_corrected')
        # remove cluster labels that did not pass the FWE threshold
        for i, r in enumerate(rej):
            if not r:
                labels[labels == i + 1] = 0
        outds.fa['clusters_fwe_thresh'] = labels
    outds.a['clusterstats'] = \
        np.rec.fromarrays(clusterstats[0], names=clusterstats[1])
    return outds
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total grouped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories are independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1,  # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0]))
        # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: "
                   "https://github.com/PyMVPA/PyMVPA/issues/117")
def test_tree_classifier(self):
    """Basic tests for TreeClassifier
    """
    ds = datasets['uni4medium']
    # make it simple of the beast -- take only informative ones
    # because classifiers for the tree are selected randomly, so
    # performance varies a lot and we just need to check on
    # correct operation
    ds = ds[:, ds.fa.nonbogus_targets != [None]]

    clfs = clfswh['binary']         # pool of classifiers
    # Lets permute so each time we try some different combination
    # of the classifiers but exclude those operating on %s of
    # features since we might not have enough for that
    clfs = [clfs[i] for i in np.random.permutation(len(clfs))
            if not '%' in str(clfs[i])]

    # NB: It is necessary that the same classifier was not used at
    # different nodes, since it would be re-trained for a new set
    # of targets, thus leading to incorrect behavior/high error.
    #
    # Clone only those few leading ones which we will use
    # throughout the test
    clfs = [clf.clone() for clf in clfs[:4]]

    # Test conflicting definition
    tclf = TreeClassifier(clfs[0], {
        'L0+2': (('L0', 'L2'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.assertRaises(ValueError, tclf.train, ds)
    """Should raise exception since label 2 is in both"""

    # Test insufficient definition
    tclf = TreeClassifier(clfs[0], {
        'L0+5': (('L0', 'L5'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})
    self.assertRaises(ValueError, tclf.train, ds)
    """Should raise exception since no group for L1"""

    # proper definition now
    tclf = TreeClassifier(clfs[0], {
        'L0+1': (('L0', 'L1'), clfs[1]),
        'L2+3': (('L2', 'L3'), clfs[2])})

    # Lets test train/test cycle using CVTE
    cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = cv(ds).samples.squeeze()
    try:
        rtclf = repr(tclf)
    except:
        self.fail(msg="Could not obtain repr for TreeClassifier")

    # Test accessibility of .clfs
    self.assertTrue(tclf.clfs['L0+1'] is clfs[1])
    self.assertTrue(tclf.clfs['L2+3'] is clfs[2])

    cvtrc = cv.ca.training_stats
    cvtc = cv.ca.stats
    if cfg.getboolean('tests', 'labile', default='yes'):
        # just a dummy check to make sure everything is working
        self.assertTrue(cvtrc != cvtc)
        self.assertTrue(cverror < 0.3,
                        msg="Got too high error = %s using %s" % (cverror, tclf))

    # Test trailing nodes with no classifier
    # That is why we use separate pool of classifiers here
    # (that is probably old/not-needed since switched to use clones)
    clfs_mc = clfswh['multiclass']         # pool of classifiers
    clfs_mc = [clfs_mc[i] for i in np.random.permutation(len(clfs_mc))
               if not '%' in str(clfs_mc[i])]
    clfs_mc = [clf.clone() for clf in clfs_mc[:4]]  # and clones again

    tclf = TreeClassifier(clfs_mc[0], {
        'L0': (('L0',), None),
        'L1+2+3': (('L1', 'L2', 'L3'), clfs_mc[1])})

    cv = CrossValidation(tclf, OddEvenPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats', 'training_stats'])
    cverror = np.asscalar(cv(ds))
    if cfg.getboolean('tests', 'labile', default='yes'):
        self.assertTrue(cverror < 0.3,
                        msg="Got too high error = %s using %s" % (cverror, tclf))
def get_crossvalidation_instance(learner, partitioner, errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    import copy  # added: needed for deep-copying the generator chain below
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if not balance_training is None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount, attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if not balancer is None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(
            learner_space,
            limit={partitioner.get_space(): 1},
            count=1)
        # CV with null-distribution estimation that permutes the training data for
        # each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner,
                                  perm_gen_node,
                                  postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater,
                               tail=prob_tail,
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner,
                         gennode,
                         errorfx=errorfx,
                         null_dist=distr_est,
                         postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv
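# Not part of the original module: a hypothetical usage sketch of
# get_crossvalidation_instance(). The synthetic dataset and the choice of
# learner/partitioner/error function are assumptions for illustration only.
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.misc.errorfx import mean_mismatch_error
from mvpa2.misc.data_generators import normal_feature_dataset

ds = normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=50,
                            nonbogus_features=[0, 1], snr=2.0)
cv = get_crossvalidation_instance(LinearCSVMC(), NFoldPartitioner(),
                                  mean_mismatch_error,
                                  permutations=10)   # small Monte-Carlo run
res = cv(ds)
print res.samples        # mean error across folds (postproc=mean_sample())
print cv.ca.null_prob    # p-value estimated from the 10 permutations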
# imports added here so the snippet is self-contained
import numpy as np
from mvpa2.base import externals
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import OddEvenPartitioner

print externals.exists('libsvm')

from mvpa2.measures.base import CrossValidation
from mvpa2.measures.searchlight import sphere_searchlight
from mvpa2.testing.datasets import datasets
from mvpa2.mappers.fx import mean_sample

"""For the sake of simplicity, let's use a small artificial dataset."""

# Let's just use our tiny 4D dataset from the testing battery
dataset = datasets['3dlarge']

"""Now it only takes three lines for a searchlight analysis."""

# setup the measure to be computed in each sphere (cross-validated
# generalization error on odd/even splits)
cv = CrossValidation(LinearCSVMC(), OddEvenPartitioner())

# setup the searchlight with a 2-voxel radius and the measure configured above
sl = sphere_searchlight(cv, radius=2, space='myspace',
                        postproc=mean_sample())

# run the searchlight on the dataset
sl_map = sl(dataset)

print 'Best performing sphere error:', np.min(sl_map.samples)

"""
If this analysis is done on an fMRI dataset using `NiftiDataset`, the
resulting searchlight map (`sl_map`) can be mapped back into the original
dataspace and viewed as a brain overlay. :ref:`Another example
<example_searchlight>` shows a typical application of this algorithm.

.. Mention the fact that it also is a special `SensitivityAnalyzer`
"""
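# Not part of the original example: a hedged sketch of mapping a searchlight
# result back into voxel space for overlay viewing. It reuses the imports from
# the snippet above; 'bold.nii.gz', 'mask.nii.gz' and the per-volume targets/
# chunks are illustrative assumptions (the toy '3dlarge' dataset above has no
# NIfTI geometry to map back into).
from mvpa2.datasets.mri import fmri_dataset, map2nifti

targets = ['face', 'house'] * 20             # assumed: one label per volume
chunks = np.repeat(np.arange(5), 8)          # assumed: run index per volume
fmri_ds = fmri_dataset('bold.nii.gz', targets=targets, chunks=chunks,
                       mask='mask.nii.gz')
sl = sphere_searchlight(CrossValidation(LinearCSVMC(), OddEvenPartitioner()),
                        radius=2, space='voxel_indices',
                        postproc=mean_sample())
err_map = sl(fmri_ds)
# reverse-map the per-sphere errors through the dataset's mapper and save
map2nifti(fmri_ds, err_map.samples).to_filename('searchlight_error.nii.gz')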