def test_odd_even_split(self):
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')

    splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

    self.assertTrue(len(splits) == 2)

    for i, p in enumerate(splits):
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 50)
        self.assertTrue(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in oes.generate(splits[0][0])]

    for split in moresplits:
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)
def test_repeated_features(self):
    class CountFeatures(Measure):
        is_trained = True

        def _call(self, ds):
            return Dataset([ds.nfeatures],
                           fa={'nonbogus_targets':
                               list(ds.fa['nonbogus_targets'].unique)})

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)
    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))

    # due to https://github.com/numpy/numpy/issues/641 we are
    # using list(set(...)) construct and there order of
    # nonbogus_targets.unique can vary from run to run, thus there
    # is no guarantee that we would get 18 first, which is a
    # questionable assumption anyways, thus performing checks
    # which do not require any specific order.
    # And yet due to another issue
    # https://github.com/numpy/numpy/issues/3759
    # we can't just compare == None to get the bool mask
    None_fa = np.array([x is None for x in res.fa.nonbogus_targets])
    assert_array_equal(res.samples[0, None_fa], [18])
    assert_array_equal(res.samples[0, ~None_fa], [1, 1])

    if sys.version_info[0] < 3:
        # with python2 order seems to be consistent
        assert_array_equal(res.samples[0], [18, 1, 1])
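# A NumPy-only sketch (illustrative, not part of the test suite) of why the
# test above builds its boolean mask with a list comprehension instead of an
# elementwise comparison against None: per
# https://github.com/numpy/numpy/issues/3759 "== None" on arrays is
# deprecated/ambiguous, while "x is None" per element is unambiguous.
import numpy as np

fa = np.array([None, 'a', 'b'], dtype=object)
mask = np.array([x is None for x in fa])
assert mask.tolist() == [True, False, False]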
def test_exclude_targets_combinations():
    partitioner = ChainNode([NFoldPartitioner(),
                             ExcludeTargetsCombinationsPartitioner(
                                 k=2,
                                 targets_attr='targets',
                                 space='partitions')],
                            space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0., nlabels=4, perlabel=3, nchunks=3,
                                nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)         # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
def test_label_splitter(self):
    oes = OddEvenPartitioner(attr='targets')
    spl = Splitter(attr='partitions')

    splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

    assert_array_equal(splits[0][0].sa['targets'].unique, [0, 2])
    assert_array_equal(splits[0][1].sa['targets'].unique, [1, 3])
    assert_array_equal(splits[1][0].sa['targets'].unique, [1, 3])
    assert_array_equal(splits[1][1].sa['targets'].unique, [0, 2])
def test_clf_transfer_measure(self):
    # and now on a classifier
    clf = SMLR()
    enode = BinaryFxNode(mean_mismatch_error, 'targets')
    tm = TransferMeasure(clf, Splitter('chunks', count=2),
                         enable_ca=['stats'])
    res = tm(self.dataset)
    manual_error = np.mean(res.samples.squeeze() != res.sa.targets)
    postproc_error = enode(res)
    tm_err = TransferMeasure(clf, Splitter('chunks', count=2),
                             postproc=enode)
    auto_error = tm_err(self.dataset)
    ok_(manual_error == postproc_error.samples[0, 0])
def test_slicing(self):
    hs = HalfPartitioner()
    spl = Splitter(attr="partitions")
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # check function appropriate for the given numpy version
    _a = np.arange(5)
    __a = _a[:4][:3]
    if __a.base is _a:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif __a.base.base is _a:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))
    spl = Splitter(attr="partitions", noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # we get no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)
    nfs = NFoldPartitioner()
    spl = Splitter(attr="partitions")
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(s[1].samples))
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={"chunks": np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr="partitions")
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples, step_ds.samples))
        assert_true(is_the_same_base(s[1].samples, step_ds.samples))
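# A NumPy-only sketch (illustrative) of the ".base" chaining behaviour that
# test_slicing above has to accommodate: slicing a slice used to keep a
# chain of view bases (view.base.base is the original array), while
# numpy >= 1.7.0b1 collapses the chain (view.base is the original array).
import numpy as np

_a = np.arange(5)
_view = _a[:4][:3]           # a view of a view, either way
assert _view.base is _a or _view.base.base is _a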
def test_simplest_cv_pat_gen(self):
    # create the generator
    nfs = NFoldPartitioner(cvtype=1)
    spl = Splitter(attr='partitions')

    # now get the xval pattern sets (One-Fold CV)
    xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

    self.assertTrue(len(xvpat) == 10)

    for i, p in enumerate(xvpat):
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 90)
        self.assertTrue(p[1].nsamples == 10)
        self.assertTrue(p[1].chunks[0] == i)
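# A minimal usage sketch (assuming mvpa2 is importable; the toy dataset
# shape is arbitrary) of the two-stage pipeline the test above exercises:
# the partitioner only labels samples with a 'partitions' attribute, while
# the splitter materializes the training/testing datasets from those labels.
from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.generators.splitters import Splitter

ds = normal_feature_dataset(perlabel=5, nlabels=2, nchunks=5, nfeatures=4)
folds = [list(Splitter('partitions').generate(p))
         for p in NFoldPartitioner(cvtype=1).generate(ds)]
assert len(folds) == 5                      # one fold per chunk
train, test = folds[0]
assert train.nsamples + test.nsamples == ds.nsamples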
def test_repeated_features(self):
    print(self.dataset)
    print(self.dataset.fa.nonbogus_targets)

    class CountFeatures(Measure):
        is_trained = True

        def _call(self, ds):
            return ds.nfeatures

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)
    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))
    assert_array_equal(res.samples[0], [18, 1, 1])
def test_split_featurewise_dataset_measure(self):
    ds = datasets['uni3small']
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        ChainNode([NFoldPartitioner(),
                   Splitter('partitions', attr_values=[1])]))

    sens = sana(ds)
    # a sensitivity for each chunk and each label combination
    assert_equal(sens.shape,
                 (len(ds.sa['chunks'].unique)
                  * len(ds.sa['targets'].unique),
                  ds.nfeatures))

    # Lets try more complex example with 'boosting'
    ds = datasets['uni3medium']
    ds.init_origids('samples')
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        Balancer(amount=0.25, count=2, apply_selection=True),
        enable_ca=['datasets', 'repetition_results'])
    sens = sana(ds)

    assert_equal(sens.shape,
                 (2 * len(ds.sa['targets'].unique), ds.nfeatures))
    splits = sana.ca.datasets
    self.assertEqual(len(splits), 2)
    self.assertTrue(
        np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
    # should have used different samples
    self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
    # and should have got different sensitivities
    self.assertTrue(np.any(sens[0] != sens[3]))
def test_cv_no_generator_custom_splitter(self):
    ds = Dataset(np.arange(4),
                 sa={'category': ['to', 'to', 'from', 'from'],
                     'targets': ['a', 'b', 'c', 'd']})

    class Measure(Classifier):
        def _train(self, ds_):
            assert_array_equal(ds_.samples, ds.samples[2:])
            assert_array_equal(ds_.sa.category, ['from'] * len(ds_))

        def _predict(self, ds_):
            # we pass a shallow copy
            assert (ds_ is not ds)
            # could be called to predict training or testing data
            if np.all(ds_.sa.targets != ['c', 'd']):
                assert_array_equal(ds_.samples, ds.samples[:2])
                assert_array_equal(ds_.sa.category, ['to'] * len(ds_))
            else:
                assert_array_equal(ds_.sa.category, ['from'] * len(ds_))
            return ['c', 'd']

    measure = Measure()
    cv = CrossValidation(measure,
                         splitter=Splitter('category', ['from', 'to']))
    res = cv(ds)
    assert_array_equal(res, [[1]])  # failed perfectly ;-)
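# A minimal sketch (assuming mvpa2 is importable) of what the
# Splitter('category', ['from', 'to']) used above does: the listed
# attr_values fix the order of the emitted datasets, so 'from' samples come
# out first (the training set) and 'to' samples second (the testing set).
import numpy as np
from mvpa2.datasets.base import Dataset
from mvpa2.generators.splitters import Splitter

ds = Dataset(np.arange(4),
             sa={'category': ['to', 'to', 'from', 'from']})
train, test = list(Splitter('category', ['from', 'to']).generate(ds))
assert list(train.sa.category) == ['from', 'from']
assert list(test.sa.category) == ['to', 'to']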
def test_confusion_based_error(self, l_clf):
    train = datasets['uni2medium']
    train = train[train.sa.train == 1]
    # to check if we fail to classify for 3 labels
    test3 = datasets['uni3medium']
    test3 = test3[test3.sa.train == 1]
    err = ConfusionBasedError(clf=l_clf)
    terr = TransferMeasure(l_clf, Splitter('train', attr_values=[1, 1]),
                           postproc=BinaryFxNode(mean_mismatch_error,
                                                 'targets'))

    self.assertRaises(UnknownStateError, err, None)
    """Shouldn't be able to access the state yet"""

    l_clf.train(train)
    e, te = err(None), terr(train)
    te = np.asscalar(te)
    self.assertTrue(
        abs(e - te) < 1e-10,
        msg="ConfusionBasedError (%.2g) should be equal to TransferError "
            "(%.2g) on traindataset" % (e, te))

    # this will print nasty WARNING but it is ok -- it is just checking code
    # NB warnings are not printed while doing whole testing
    warning("Don't worry about the following warning.")
    if 'multiclass' in l_clf.__tags__:
        self.assertFalse(terr(test3) is None)

    # try copying the beast
    terr_copy = copy(terr)
def test_counted_splitting(self):
    spl = Splitter(attr='partitions')
    # count > #chunks, should result in 10 splits
    nchunks = len(self.data.sa['chunks'].unique)
    for strategy in Partitioner._STRATEGIES:
        for count, target in [(nchunks * 2, nchunks),
                              (nchunks, nchunks),
                              (nchunks - 1, nchunks - 1),
                              (3, 3),
                              (0, 0),
                              (1, 1)]:
            nfs = NFoldPartitioner(cvtype=1, count=count,
                                   selection_strategy=strategy)
            splits = [list(spl.generate(p))
                      for p in nfs.generate(self.data)]
            self.assertTrue(len(splits) == target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.assertEqual(nsplits_cfg, target)

            # Check if "lastsplit" dsattr was assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.assertTrue(ds_.a.lastpartitionset)
                # test all now
                for isplit, split in enumerate(splits):
                    for ds_ in split:
                        self.assertEqual(ds_.a.lastpartitionset,
                                         isplit == nsplits - 1)

            # Check results of different strategies
            if strategy == 'first':
                self.assertEqual(chosenchunks, list(range(target)))
            elif strategy == 'equidistant':
                if target == 3:
                    self.assertEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.assertTrue(
                    len(set(chosenchunks)) == len(chosenchunks))
                self.assertTrue(target == len(chosenchunks))
            else:
                raise RuntimeError("Add unittest for strategy %s"
                                   % strategy)
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have performed basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets', attr_values=[0, 1, 1, 2, 3, 3, 3],
                    count=4, noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have performed basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(splits), 80)
def _train(self, dataset):
    pmeasure = ProxyMeasure(
        self.lrn,
        postproc=BinaryFxNode(self.errorfx, self.lrn.space),
        skip_train=True  # do not train since fmeasure will
    )
    # First we need to replicate our RFE construct but this time
    # with pmeasure for the classifier
    rfe = RFE(
        self.fmeasure,
        pmeasure,
        Splitter('partitions'),
        fselector=self.fselector,
        bestdetector=None,
        train_pmeasure=False,
        stopping_criterion=None,  # full "track"
        update_sensitivity=self.update_sensitivity,
        enable_ca=['errors', 'nfeatures'])

    errors, nfeatures = [], []

    if __debug__:
        debug("RFEC", "Stage 1: initial nested CV/RFE for %s", (dataset,))

    for partition in self.partitioner.generate(dataset):
        rfe.train(partition)
        errors.append(rfe.ca.errors)
        nfeatures.append(rfe.ca.nfeatures)

    # mean errors across splits and find optimal number
    errors_mean = np.mean(errors, axis=0)
    nfeatures_mean = np.mean(nfeatures, axis=0)
    # we will take the "mean location" of the min to stay
    # within the most 'stable' choice
    mins_idx = np.where(errors_mean == np.min(errors_mean))[0]
    min_idx = mins_idx[int(len(mins_idx) / 2)]
    min_error = errors_mean[min_idx]
    assert (min_error == np.min(errors_mean))
    nfeatures_min = nfeatures_mean[min_idx]

    if __debug__:
        debug(
            "RFEC",
            "Choosing among %d choices to have %d features with "
            "mean error=%.2g (initial mean error %.2g)",
            (len(mins_idx), nfeatures_min, min_error, errors_mean[0]))

    self.nfeatures_min = nfeatures_min

    if __debug__:
        debug(
            "RFEC",
            "Stage 2: running RFE on full training dataset to "
            "distil best %d features" % nfeatures_min)

    super(SplitRFE, self)._train(dataset)
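# The "mean location of the min" idiom from above in isolation (NumPy-only,
# illustrative): among all positions attaining the minimal mean error, pick
# the middle one, which is a more stable choice than the first or the last
# minimum when a plateau of equally good solutions exists.
import numpy as np

errors_mean = np.array([0.5, 0.2, 0.2, 0.2, 0.4])
mins_idx = np.where(errors_mean == np.min(errors_mean))[0]   # [1, 2, 3]
min_idx = mins_idx[int(len(mins_idx) / 2)]                   # -> 2
assert errors_mean[min_idx] == np.min(errors_mean)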
def test_svms(self, clf):
    knows_probabilities = \
        'probabilities' in clf.ca.keys() and clf.params.probability
    enable_ca = ['estimates']
    if knows_probabilities:
        enable_ca += ['probabilities']

    clf.ca.change_temporarily(enable_ca=enable_ca)
    spl = Splitter('train', count=2)
    traindata, testdata = list(spl.generate(datasets['uni2small']))
    clf.train(traindata)
    predicts = clf.predict(testdata.samples)
    # values should be different from predictions for SVMs we have
    self.assertTrue(np.any(predicts != clf.ca.estimates))

    if knows_probabilities and clf.ca.is_set('probabilities'):
        # XXX test more thoroughly what we are getting here ;-)
        self.assertEqual(len(clf.ca.probabilities), len(testdata.samples))
    clf.ca.reset_changed_temporarily()
def test_splitter_gnbsearghlight(self):
    ds1 = datasets['3dsmall'].copy(deep=True)

    gnb_sl = GNBSearchlight(GNB(),
                            generator=CustomPartitioner([([0], [1])]),
                            qe=IndexQueryEngine(myspace=Sphere(2)),
                            splitter=Splitter(attr='partitions',
                                              attr_values=[1, 2]),
                            errorfx=None)
    res = gnb_sl(ds1)
    assert_equal(res.nsamples, (ds1.chunks == 1).sum())
def _forward_dataset(self, ds):
    if self.__chunks_attr is None:
        return self._forward_dataset_helper(ds)
    else:
        # strip down dataset to speedup local processing
        if self.__attr_strategy == "remove":
            keep_sa = []
        else:
            keep_sa = None
        proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
        # process all chunks individually
        # use a customsplitter to speed-up splitting
        spl = Splitter(self.__chunks_attr)
        dses = [self._forward_dataset_helper(d)
                for d in spl.generate(proc_ds)]
        # and merge them again
        mds = vstack(dses)
        # put back attributes
        mds.fa.update(ds.fa)
        mds.a.update(ds.a)
        return mds
def test_transfer_measure(self):
    # come up with my own measure that only checks if training data
    # and test data are the same
    class MyMeasure(Measure):
        def _train(self, ds):
            self._tds = ds

        def _call(self, ds):
            return Dataset(ds.samples == self._tds.samples)

    tm = TransferMeasure(MyMeasure(), Splitter('chunks', count=2))
    # result should not be all True (== identical)
    assert_true((tm(self.dataset).samples == False).any())
def get_partitioner(split_attr='group_split'):
    splitter = Splitter(attr='partitions', attr_values=(2, 3))
    if split_attr == 'group_split':
        splitrule = [
            # (leave, training, testing)
            (['3', '4'], ['1'], ['2']),
            (['3', '4'], ['2'], ['1']),
            (['1'], ['2'], ['3', '4']),
            (['2'], ['1'], ['3', '4']),

            (['1', '2'], ['3'], ['4']),
            (['1', '2'], ['4'], ['3']),
            (['3'], ['4'], ['1', '2']),
            (['4'], ['3'], ['1', '2']),
        ]
        partitioner = CustomPartitioner(splitrule=splitrule,
                                        attr=split_attr)
    elif split_attr == 'subject':
        partitioner = MemoryGroupSubjectPartitioner(
            group_attr='group_split',
            subject_attr=split_attr,
            attr=split_attr)
    elif split_attr == "group_mdm":
        partitioner = LeaveOneSubjectPerGroupPartitioner(
            group_attr='group',
            subject_attr="subject",
            attr="subject")
    elif split_attr == "subject_ofp":
        partitioner = NFoldPartitioner(attr="subject")
        splitter = Splitter(attr="partitions")
    elif split_attr == 'group':
        partitioner = NFoldPartitioner(attr=split_attr)
        splitter = Splitter(attr='partitions')

    return partitioner, splitter
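# A hypothetical usage sketch of get_partitioner() above (assuming mvpa2
# and a dataset carrying the required attributes; the toy dataset below is
# illustrative only): whichever branch is taken, the returned pair is meant
# to be consumed as partitioner -> splitter.
import numpy as np
from mvpa2.datasets.base import Dataset

ds = Dataset(np.random.randn(8, 2),
             sa={'group_split': ['1', '2', '3', '4'] * 2,
                 'targets': ['a', 'b'] * 4})
partitioner, splitter = get_partitioner('group_split')
for partition in partitioner.generate(ds):
    train, test = list(splitter.generate(partition))[:2]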
def test_custom_split(self):
    # simulate half splitter
    hs = CustomPartitioner([(None, [0, 1, 2, 3, 4]),
                            (None, [5, 6, 7, 8, 9])])
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    self.assertTrue(len(splits) == 2)

    for i, p in enumerate(splits):
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 50)
        self.assertTrue(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check fully customized split with working and validation set specified
    cs = CustomPartitioner([([0, 3, 4], [5, 9])])
    # we want to discard the unselected partition of the data,
    # hence attr_values
    # these two splitters should do exactly the same thing
    splitters = (Splitter(attr='partitions', attr_values=[1, 2]),
                 Splitter(attr='partitions', ignore_values=(0,)))
    for spl in splitters:
        splits = [list(spl.generate(p)) for p in cs.generate(self.data)]
        self.assertTrue(len(splits) == 1)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 30)
            self.assertTrue(p[1].nsamples == 20)

        self.assertTrue((splits[0][1].sa['chunks'].unique == [5, 9]).all())
        self.assertTrue((splits[0][0].sa['chunks'].unique
                         == [0, 3, 4]).all())
def test_james_problem_multiclass(self):
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    #FxMapper('features', l2_normed),
                    #FxMapper('samples', np.mean),
                    #FxMapper('samples', np.abs)
                    FxMapper('features',
                             lambda x: np.argsort(np.abs(x))),
                    #maxofabs_sample()
                    mean_sample()
                ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage, enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))
def test_pseudo_cv_measure(self):
    clf = SMLR()
    enode = BinaryFxNode(mean_mismatch_error, 'targets')
    tm = TransferMeasure(clf, Splitter('partitions'), postproc=enode)
    cvgen = NFoldPartitioner()
    rm = RepeatedMeasure(tm, cvgen)
    res = rm(self.dataset)
    # one error per fold
    assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))

    # we can do the same with Crossvalidation
    cv = CrossValidation(clf, cvgen,
                         enable_ca=['stats', 'training_stats', 'datasets'])
    res = cv(self.dataset)
    assert_equal(res.shape, (len(self.dataset.sa['chunks'].unique), 1))
def test_half_split(self):
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')

    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    self.assertTrue(len(splits) == 2)

    for i, p in enumerate(splits):
        self.assertTrue(len(p) == 2)
        self.assertTrue(p[0].nsamples == 50)
        self.assertTrue(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)
def test_james_problem(self):
    percent = 80
    dataset = datasets['uni2small']

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['confusion'])  # TODO -- it is stats
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

    assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
    assert (len(cv_storage.storage[0]) == 2)
    assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

    self.assertTrue(error < 0.2)
def test_single_class(self, clf):
    """Test if binary and multiclass can handle single class
    training/testing
    """
    ds = datasets['uni2small']
    ds = ds[ds.sa.targets == 'L0']  # only 1 label
    assert(ds.sa['targets'].unique == ['L0'])

    ds_ = list(OddEvenPartitioner().generate(ds))[0]
    # Here is our "nice" 0.6 substitute for TransferError:
    trerr = TransferMeasure(clf, Splitter('train'),
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    try:
        err = np.asscalar(trerr(ds_))
    except Exception as e:
        self.fail(str(e))
def test_ifs(self, svm):
    # measure for feature selection criterion and performance assessment
    # use the SAME clf!
    errorfx = mean_mismatch_error
    fmeasure = CrossValidation(svm, NFoldPartitioner(),
                               postproc=mean_sample())
    pmeasure = ProxyMeasure(svm, postproc=BinaryFxNode(errorfx, 'targets'))

    ifs = IFS(fmeasure,
              pmeasure,
              Splitter('purpose', attr_values=['train', 'test']),
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              fselector=FixedNElementTailSelector(1, tail='lower',
                                                  mode='select'),
              )
    wdata = self.get_data()
    wdata.sa['purpose'] = np.repeat('train', len(wdata))
    tdata = self.get_data()
    tdata.sa['purpose'] = np.repeat('test', len(tdata))
    ds = vstack((wdata, tdata))
    orig_nfeatures = ds.nfeatures

    ifs.train(ds)
    resds = ifs(ds)

    # fail if orig datasets are changed
    self.assertTrue(ds.nfeatures == orig_nfeatures)

    # check that the features set with the least error is selected
    self.assertTrue(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.assertTrue(resds.nfeatures == e.argmin() + 1)

    # repeat with dataset where selection order is known
    wsignal = datasets['dumb2'].copy()
    wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
    tsignal = datasets['dumb2'].copy()
    tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
    signal = vstack((wsignal, tsignal))
    ifs.train(signal)
    resds = ifs(signal)
    self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
def test_gnb(self):
    gnb = GNB()
    gnb_nc = GNB(common_variance=False)
    gnb_n = GNB(normalize=True)
    gnb_n_nc = GNB(normalize=True, common_variance=False)
    gnb_lin = GNB(common_variance=True)

    ds = datasets['uni2medium']

    # Generic silly coverage just to assure that it works in all
    # possible scenarios:
    bools = (True, False)
    # There should be better way... heh
    for cv in bools:                # common_variance?
        for prior in ('uniform', 'laplacian_smoothing', 'ratio'):
            tp = None               # predictions -- all above should
                                    # result in the same predictions
            for n in bools:         # normalized?
                for ls in bools:    # logspace?
                    for es in ((), ('estimates')):
                        gnb_ = GNB(common_variance=cv,
                                   prior=prior,
                                   normalize=n,
                                   logprob=ls,
                                   enable_ca=es)
                        tm = TransferMeasure(gnb_, Splitter('train'))
                        predictions = tm(ds).samples[:, 0]
                        if tp is None:
                            tp = predictions
                        assert_array_equal(predictions, tp)
                        # if normalized -- check if estimates are such
                        if n and 'estimates' in es:
                            v = gnb_.ca.estimates
                            if ls:      # in log space -- take exp ;)
                                v = np.exp(v)
                            d1 = np.sum(v, axis=1) - 1.0
                            self.assertTrue(np.max(np.abs(d1)) < 1e-5)
        # smoke test to see whether invocation of sensitivity analyser
        # blows if gnb classifier isn't linear, and to see whether it
        # doesn't blow when it is linear.
        if cv:
            assert 'has_sensitivity' in gnb_.__tags__
            gnb_.get_sensitivity_analyzer()
        if not cv:
            with self.assertRaises(NotImplementedError):
                gnb_.get_sensitivity_analyzer()
def test_gnbsearchlight_3partitions_and_splitter(self):
    ds = self.dataset[:, :20]
    # custom partitioner which provides 3 partitions
    part = CustomPartitioner([([2], [3], [1])])
    gnb_sl = sphere_gnbsearchlight(GNB(), part)
    res_gnb_sl = gnb_sl(ds)

    # compare results to full blown searchlight
    sl = sphere_searchlight(CrossValidation(GNB(), part))
    res_sl = sl(ds)

    assert_datasets_equal(res_gnb_sl, res_sl)

    # and theoretically for this simple single cross-validation we could
    # just use Splitter
    splitter = Splitter('chunks', [2, 3])
    # we have to put explicit None since can't become a kwarg in 1 day any
    # longer here
    gnb_sl_ = sphere_gnbsearchlight(GNB(), None, splitter=splitter)
    res_gnb_sl_ = gnb_sl_(ds)
    assert_datasets_equal(res_gnb_sl, res_gnb_sl_)
def _sl_call(self, dataset, roi_ids, nproc):
    """Call to SimpleStatBaseSearchlight
    """
    # Local bindings
    generator = self.generator
    qe = self.queryengine
    errorfx = self.errorfx

    if __debug__:
        time_start = time.time()

    targets_sa_name = self._get_space()
    targets_sa = dataset.sa[targets_sa_name]

    if __debug__:
        debug_slc_ = 'SLC_' in debug.active

    # get the dataset information into easy vars
    X = dataset.samples
    if len(X.shape) != 2:
        raise ValueError(
            'Unlike a classifier, %s (for now) operates on already '
            'flattened datasets' % (self.__class__.__name__))
    labels = targets_sa.value
    ulabels = targets_sa.unique
    nlabels = len(ulabels)
    label2index = dict((l, il) for il, l in enumerate(ulabels))
    labels_numeric = np.array([label2index[l] for l in labels])
    self._ulabels_numeric = [label2index[l] for l in ulabels]
    # set the feature dimensions
    nsamples = len(X)
    nrois = len(roi_ids)
    s_shape = X.shape[1:]           # shape of a single sample
    # The shape of results
    r_shape = (nrois,) + X.shape[2:]

    #
    # Everything toward optimization ;)
    #
    # Silly Yarik thinks that it might be worth to pre-compute
    # statistics per each feature within a block of the samples
    # which always come together in splits -- most often it is a
    # (chunk, label) combination, but since we simply use a
    # generator -- who knows! Therefore lets figure out what are
    # those blocks and operate on them instead of original samples.
    #
    # After additional thinking about this -- probably it would be
    # just minor additional improvements (ie not worth it) but
    # since it is coded already -- let it be so

    # 1. Query generator for the splits we will have
    if __debug__:
        debug('SLC',
              'Phase 1. Initializing partitions using %s on %s'
              % (generator, dataset))

    # Lets just create a dummy ds which will store for us actual sample
    # indicies
    # XXX we could make it even more lightweight I guess...
    dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
    splitter = Splitter(attr=generator.get_space())
    partitions = list(generator.generate(dataset_indicies))
    if __debug__:
        for p in partitions:
            if not (np.all(p.sa[targets_sa_name].value == labels)):
                raise NotImplementedError(
                    "%s does not yet support partitioners altering the "
                    "targets (e.g. permutators)" % self.__class__)

    nsplits = len(partitions)
    # ATM we need to keep the splits instead since they are used
    # in two places in the code: step 2 and 5
    splits = list(tuple(splitter.generate(ds_)) for ds_ in partitions)
    del partitions                    # not used any longer

    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug('SLC',
              'Phase 2. Blocking data for %i splits and %i labels'
              % (nsplits, nlabels))
    # array of indicies for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations
    combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
    # labels
    combinations[:, 0] = labels_numeric
    for ipartition, (split1, split2) in enumerate(splits):
        combinations[split1.samples[:, 0], 1 + ipartition] = 1
        combinations[split2.samples[:, 0], 1 + ipartition] = 2
        # Check for over-sampling, i.e. no same sample used twice here
        if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                len(np.unique(split2.samples[:, 0])) == len(split2)):
            raise RuntimeError(
                "%s needs a partitioner which does not reuse "
                "the same samples more than once" % self.__class__)
    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(list(set(descriptions)))
    nblocks = len(udescriptions)
    description2block = dict([(d, i)
                              for i, d in enumerate(udescriptions)])
    # Indices for samples to point to their block
    self.__sample2block = sample2block = \
        np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    #
    if __debug__:
        debug('SLC',
              'Phase 3. Computing statistics for %i blocks' % (nblocks,))

    self._compute_pb_stats(labels_numeric, X, (nblocks,) + s_shape)

    # derived classes might decide differently on what they
    # actually need, so defer reserving the space and computing
    # stats to them
    self._reserve_pl_stats_space((nlabels, ) + s_shape)

    # results
    results = np.zeros((nsplits,) + r_shape)

    # 4. Lets deduce all neighbors... might need to be RF into the
    #    parallel part later on
    # TODO: needs OPT since this is the step consuming 50% of time
    #       or more allow to cache them entirely so this would
    #       not be an unnecessary burden during permutation testing
    if not self.reuse_neighbors or self.__roi_fids is None:
        if __debug__:
            debug('SLC',
                  'Phase 4. Deducing neighbors information for %i ROIs'
                  % (nrois,))
        roi_fids = [qe.query_byid(f) for f in roi_ids]
    else:
        if __debug__:
            debug('SLC',
                  'Phase 4. Reusing neighbors information for %i ROIs'
                  % (nrois,))
        roi_fids = self.__roi_fids

    self.ca.roi_feature_ids = roi_fids

    roi_sizes = []
    if isinstance(roi_fids, list):
        nroi_fids = len(roi_fids)
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = [len(x) for x in roi_fids]
    elif externals.exists('scipy') and isinstance(roi_fids, sps.spmatrix):
        nroi_fids = roi_fids.shape[1]
        if self.ca.is_enabled('roi_sizes'):
            # very expensive operation, so better not to ask over again
            # roi_sizes = [roi_fids.getrow(r).nnz
            #              for r in range(nroi_fids)]
            warning("Since 'sparse' trick is used, extracting sizes of "
                    "roi's are expensive at this point. Get them from the "
                    ".ca value of the original instance before "
                    "calling again and using reuse_neighbors")
    else:
        raise RuntimeError("Should not be reachable")

    # Since this is ad-hoc implementation of the searchlight, we are not
    # passing those via ds.a but rather assign directly to self.ca
    self.ca.roi_sizes = roi_sizes

    indexsum = self._indexsum
    if indexsum == 'sparse':
        if not self.reuse_neighbors or self.__roi_fids is None:
            if __debug__:
                debug('SLC',
                      'Phase 4b. Converting neighbors to sparse matrix '
                      'representation')
            # convert to "sparse representation" where column j contains
            # 1s only at the roi_fids[j] indices
            roi_fids = inds_to_coo(roi_fids,
                                   shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError(
            "Do not know how to deal with indexsum=%s" % indexsum)

    # Store roi_fids
    if self.reuse_neighbors and self.__roi_fids is None:
        self.__roi_fids = roi_fids

    # 5. Lets do actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop')

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out for a given splits the blocks we want to work
        # with
        # sample_indicies
        training_sis = split[0].samples[:, 0]
        testing_sis = split[1].samples[:, 0]

        # That is the GNB specificity
        targets, predictions = self._sl_call_on_a_split(
            split, X,               # X2 might like to go
            training_sis, testing_sis,
            ## training_nsamples,    # GO? == np.sum(pl.nsamples)
            ## training_non0labels,
            ## pl.sums, pl.means, pl.sums2, pl.variances,
            # passing nroi_fids as well since in 'sparse' way it has no
            # 'length'
            nroi_fids, roi_fids,
            indexsum_fx,
            labels_numeric,
        )

        # assess the errors
        if __debug__:
            debug('SLC', " Assessing accuracies")

        if errorfx is mean_mismatch_error:
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
        else:
            # somewhat silly but a way which allows to use pre-crafted
            # error functions without a chance to screw up
            for i, fpredictions in enumerate(predictions.T):
                results[isplit, i] = errorfx(fpredictions, targets)

    if __debug__:
        debug('SLC', "%s._call() is done in %.3g sec"
              % (self.__class__.__name__, time.time() - time_start))

    return Dataset(results)
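# A NumPy-only sketch (illustrative) of the "blocks" encoding built in
# Phase 2 above: each sample gets a description row of (label, membership
# in split 1, membership in split 2, ...); samples sharing a description
# always travel together through every split, so statistics can be computed
# once per block and reused.
import numpy as np

combinations = np.array([[0, 1], [0, 1], [1, 2], [1, 2], [0, 2]])
descriptions = [tuple(c) for c in combinations]
udescriptions = sorted(set(descriptions))    # [(0, 1), (0, 2), (1, 2)]
description2block = dict((d, i) for i, d in enumerate(udescriptions))
sample2block = np.array([description2block[d] for d in descriptions])
assert list(sample2block) == [0, 0, 2, 2, 1]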
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1,  # 100,   # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(
                     postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error,
                     postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(),
                                                         10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(
        postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and
    # classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        # and it also should train the classifier so we would ask it
        # about error
        senses.append(sensanasvm(split[0]))
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification
    # with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: "
                   "https://github.com/PyMVPA/PyMVPA/issues/117")
def test_multiclass_ties(clf):
    if 'lars' in clf.__tags__:
        raise SkipTest("Known to crash while running this test")
    ds = _dsties1

    # reassign data between ties, so we know that decision is data,
    # not order driven
    ds_ = ds.copy(deep=True)
    ds_.samples[ds.a.ties_idx[1]] = ds.samples[ds.a.ties_idx[0]]
    ds_.samples[ds.a.ties_idx[0]] = ds.samples[ds.a.ties_idx[1]]
    ok_(np.any(ds_.samples != ds.samples))

    clf_ = clf.clone()
    clf = clf.clone()
    clf.ca.enable(['estimates', 'predictions'])
    clf_.ca.enable(['estimates', 'predictions'])
    te = TransferMeasure(clf, Splitter('train'),
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets'),
                         enable_ca=['stats'])
    te_ = TransferMeasure(clf_, Splitter('train'),
                          postproc=BinaryFxNode(mean_mismatch_error,
                                                'targets'),
                          enable_ca=['stats'])
    te = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         enable_ca=['stats'])
    te_ = CrossValidation(clf_, NFoldPartitioner(), postproc=mean_sample(),
                          enable_ca=['stats'])
    error = te(ds)
    matrix = te.ca.stats.matrix

    # if ties were broken randomly we should have got nearly the same
    # number of hits for tied targets
    ties_indices = [te.ca.stats.labels.index(c) for c in ds.a.ties]
    hits = np.diag(te.ca.stats.matrix)[ties_indices]

    # First check is to see if we swap data between tied labels we
    # are getting the same results if we permute labels accordingly,
    # i.e. that tie resolution is not dependent on the labels order
    # but rather on the data
    te_(ds_)
    matrix_swapped = te_.ca.stats.matrix

    if False:  # 0 in hits:
        print(clf, matrix, matrix_swapped)
        print(clf.ca.estimates[:, 2] - clf.ca.estimates[:, 0])
        #print clf.ca.estimates

    # TODO: for now disabled all the non-compliant ones to pass the
    #       tests.  For visibility decided to skip them instead of just
    #       exclusion and skipping only here to possibly catch crashes
    #       which might happen before
    if len(set(('libsvm', 'sg', 'skl', 'gpr',
                'blr')).intersection(clf.__tags__)):
        raise SkipTest("Skipped %s because it is known to fail" % clf)

    ok_(not (np.array_equal(matrix, matrix_swapped) and 0 in hits))

    # this check is valid only if ties are not broken randomly
    # like it is the case with SMLR
    if not ('random_tie_breaking' in clf.__tags__
            # since __tags__ would not go that high up e.g. in
            # <knn on SMLR non-0>
            or 'SMLR' in str(clf)):
        assert_array_equal(hits,
                           np.diag(matrix_swapped)[ties_indices[::-1]])

    # Second check is to just see if we didn't get an obvious bias and
    # got 0 in one of the hits, although it is labile
    if cfg.getboolean('tests', 'labile', default='yes'):
        ok_(not 0 in hits)
def test_n_group_split(self):
    """Test NGroupSplitter alongside with the reversal of the
    order of spit out datasets
    """
    # Test 2 groups like HalfSplitter first
    hs = NGroupPartitioner(2)

    for isreversed, splitter in enumerate((hs, hs)):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        self.assertTrue(len(splits) == 2)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)

    # now test more groups
    s5 = NGroupPartitioner(5)

    # get the splits
    for isreversed, s5splitter in enumerate((s5, s5)):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p))
                  for p in s5splitter.generate(self.data)]

        # must have 5 splits
        self.assertTrue(len(splits) == 5)

        # check split content
        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [2, 3, 4, 5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [2, 3])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 4, 5, 6, 7, 8, 9])
        # ...
        assert_array_equal(splits[4][1 - isreversed].sa['chunks'].unique,
                           [8, 9])
        assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4, 5, 6, 7])

    # Test for too many groups
    def splitcall(spl, dat):
        return list(spl.generate(dat))
    s20 = NGroupPartitioner(20)
    self.assertRaises(ValueError, splitcall, s20, self.data)
def _sl_call(self, dataset, roi_ids, nproc): """Call to GNBSearchlight """ # Local bindings gnb = self.gnb params = gnb.params generator = self.generator errorfx = self.errorfx qe = self.queryengine ## if False: ## class A(Learner): ## pass ## self = A() ## import numpy as np ## from mvpa2.clfs.gnb import GNB ## from mvpa2.generators.partition import NFoldPartitioner ## from mvpa2.misc.errorfx import mean_mismatch_error ## from mvpa2.testing.datasets import datasets as tdatasets ## from mvpa2.datasets import Dataset ## from mvpa2.misc.neighborhood import IndexQueryEngine, Sphere ## from mvpa2.clfs.distance import absmin_distance ## import time ## if __debug__: ## from mvpa2.base import debug ## debug.active += ['SLC.*'] ## # XXX is it that ugly? ## debug.active.pop(debug.active.index('SLC_')) ## debug.metrics += ['reltime'] ## dataset = tdatasets['3dlarge'].copy() ## dataset.fa['voxel_indices'] = dataset.fa.myspace ## sphere = Sphere(radius=1, ## distance_func=absmin_distance) ## qe = IndexQueryEngine(myspace=sphere) ## # Fracisco's data ## #dataset = ds_fp ## qe = IndexQueryEngine(voxel_indices=sphere) ## qe.train(dataset) ## roi_ids = np.arange(dataset.nfeatures) ## gnb = GNB() ## params = gnb.params ## generator = NFoldPartitioner() ## errorfx = mean_mismatch_error if __debug__: time_start = time.time() targets_sa_name = gnb.get_space() targets_sa = dataset.sa[targets_sa_name] if __debug__: debug_slc_ = 'SLC_' in debug.active # get the dataset information into easy vars X = dataset.samples if len(X.shape) != 2: raise ValueError, \ 'Unlike GNB, GNBSearchlight (for now) operates on already' \ 'flattened datasets' labels = targets_sa.value ulabels = targets_sa.unique nlabels = len(ulabels) label2index = dict((l, il) for il, l in enumerate(ulabels)) labels_numeric = np.array([label2index[l] for l in labels]) ulabels_numeric = [label2index[l] for l in ulabels] # set the feature dimensions nsamples = len(X) nrois = len(roi_ids) s_shape = X.shape[1:] # shape of a single sample # The shape of results r_shape = (nrois,) + X.shape[2:] # # Everything toward optimization ;) # # Silly Yarik thinks that it might be worth to pre-compute # statistics per each feature within a block of the samples # which always come together in splits -- most often it is a # (chunk, label) combination, but since we simply use a # generator -- who knows! Therefore lets figure out what are # those blocks and operate on them instead of original samples. # # After additional thinking about this -- probably it would be # just minor additional improvements (ie not worth it) but # since it is coded already -- let it be so # 1. Query generator for the splits we will have if __debug__: debug('SLC', 'Phase 1. Initializing partitions using %s on %s' % (generator, dataset)) # Lets just create a dummy ds which will store for us actual sample # indicies # XXX we could make it even more lightweight I guess... dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa) splitter = Splitter(attr=generator.get_space()) splits = list(tuple(splitter.generate(ds_)) for ds_ in generator.generate(dataset_indicies)) nsplits = len(splits) # 2. Figure out the new 'chunks x labels' blocks of combinations # of samples if __debug__: debug('SLC', 'Phase 2. Blocking data for %i splits and %i labels' % (nsplits, nlabels)) # array of indicies for label, split1, split2, ... 
    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug('SLC',
              'Phase 2. Blocking data for %i splits and %i labels'
              % (nsplits, nlabels))

    # array of indices for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations
    combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
    # labels
    combinations[:, 0] = labels_numeric

    for ipartition, (split1, split2) in enumerate(splits):
        combinations[split1.samples[:, 0], 1 + ipartition] = 1
        combinations[split2.samples[:, 0], 1 + ipartition] = 2
        # Check for over-sampling, i.e. that no sample is used twice here
        if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                len(np.unique(split2.samples[:, 0])) == len(split2)):
            raise RuntimeError(
                "GNBSearchlight needs a partitioner which does not reuse "
                "the same samples more than once")

    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(list(set(descriptions)))
    nblocks = len(udescriptions)
    description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
    # Indices for samples to point to their block
    sample2block = np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    #
    if __debug__:
        debug('SLC',
              'Phase 3. Computing statistics for %i blocks' % (nblocks,))

    #
    # reusable containers which should stay of the same size
    #

    # sums and sums of squares per each block
    sums = np.zeros((nblocks,) + s_shape)
    # sums of squares
    sums2 = np.zeros((nblocks,) + s_shape)

    # per each label:
    means = np.zeros((nlabels,) + s_shape)
    # means of squares for stddev computation
    means2 = np.zeros((nlabels,) + s_shape)
    variances = np.zeros((nlabels,) + s_shape)
    # degenerate dimensions are added for easy broadcasting later on
    nsamples_per_class = np.zeros((nlabels,) + (1,) * len(s_shape))

    # results
    results = np.zeros((nsplits,) + r_shape)

    block_counts = np.zeros((nblocks,))
    block_labels = [None] * nblocks

    X2 = np.square(X)
    # silly way for now
    for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
        sums[ib] += s
        sums2[ib] += s2
        block_counts[ib] += 1
        if block_labels[ib] is None:
            block_labels[ib] = l
        else:
            assert(block_labels[ib] == l)
    block_labels = np.asanyarray(block_labels)
    # an additional silly check for the paranoid
    assert(block_labels.dtype.kind == 'i')

    # 4. Let's deduce all the neighbors... might need to be refactored
    #    into the parallel part later on
    if __debug__:
        debug('SLC',
              'Phase 4. Deducing neighbors information for %i ROIs'
              % (nrois,))
    roi_fids = [qe.query_byid(f) for f in roi_ids]
    nroi_fids = len(roi_fids)
    # makes sense to waste precious ms only if ca is enabled
    if self.ca.is_enabled('roi_sizes'):
        roi_sizes = [len(x) for x in roi_fids]
    else:
        roi_sizes = []

    indexsum = self._indexsum
    if indexsum == 'sparse':
        if __debug__:
            debug('SLC',
                  'Phase 4b. Converting neighbors to sparse matrix '
                  'representation')
        # convert to the "sparse representation" where column j contains
        # 1s only at the roi_fids[j] indices
        roi_fids = inds_to_coo(roi_fids,
                               shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError(
            "Do not know how to deal with indexsum=%s" % indexsum)
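    # A minimal sketch (comments only; assumes scipy.sparse, and the toy
    # shapes/values are hypothetical -- the real helpers are inds_to_coo
    # and lastdim_columnsums_spmatrix) of the 'sparse' indexsum trick:
    # per-ROI sums over the feature axis collapse into one matrix product.
    #
    #   import scipy.sparse as sp
    #   toy_roi_fids = [[0, 1], [1, 2]]          # features of 2 ROIs
    #   M = sp.coo_matrix(([1] * 4, ([0, 1, 1, 2], [0, 0, 1, 1])),
    #                     shape=(3, 2))          # nfeatures x nrois
    #   lprob = np.random.rand(4, 5, 3)          # class x sample x feature
    #   sums = np.asarray(lprob.reshape(-1, 3) * M).reshape(4, 5, 2)
    #   # sums[c, s, j] == lprob[c, s, toy_roi_fids[j]].sum()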
    # 5. Let's do the actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop')

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out for the given split which blocks we want to
        # work with
        # sample indices
        training_sis = split[0].samples[:, 0]
        # blocks making up the training split
        training_bis = np.unique(sample2block[training_sis])

        # now let's do our GNB business
        training_nsamples = 0
        for il, l in enumerate(ulabels_numeric):
            bis_il = training_bis[block_labels[training_bis] == l]
            nsamples_per_class[il] = N_float = \
                float(np.sum(block_counts[bis_il]))
            training_nsamples += N_float
            if N_float == 0.0:
                variances[il] = means[il] = means2[il] = 0.
            else:
                means[il] = np.sum(sums[bis_il], axis=0) / N_float
                # not yet normed
                means2[il] = np.sum(sums2[bis_il], axis=0)

        ## Actually compute the non-0 variances
        non0labels = (nsamples_per_class.squeeze() != 0)
        if np.all(non0labels):
            # For a possible tiny speed up avoiding copying and
            # using (no) slicing
            non0labels = slice(None)

        if params.common_variance:
            variances[:] = \
                np.sum(means2 - nsamples_per_class * np.square(means),
                       axis=0) \
                / training_nsamples
        else:
            variances[non0labels] = \
                (means2 - nsamples_per_class * np.square(means))[non0labels] \
                / nsamples_per_class[non0labels]

        # assign priors
        priors = gnb._get_priors(
            nlabels, training_nsamples, nsamples_per_class)

        # proceed the way we do in the GNB code with logprob=True,
        # i.e. operating within the exponents -- should lead to some
        # performance advantage
        norm_weight = -0.5 * np.log(2 * np.pi * variances)
        # the last added dimension would be for ROIs
        logpriors = np.log(priors[:, np.newaxis, np.newaxis])

        if __debug__:
            debug('SLC', "  'Training' is done")

        # Now it is time to "classify" our samples, and for that we
        # first need to compute the corresponding (unnormalized log-)
        # probabilities
        data = X[split[1].samples[:, 0]]
        targets = labels_numeric[split[1].samples[:, 0]]

        # argument of exponentiation
        scaled_distances = \
            -0.5 * (((data - means[:, np.newaxis, ...]) ** 2)
                    / variances[:, np.newaxis, ...])

        # incorporate the normalization from normals
        lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances

        ## First we need to reshape to get class x samples x features
        lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1,))

        ## Now we come to the naive part which requires looping
        ## through all the spheres
        if __debug__:
            debug('SLC', "  Doing 'Searchlight'")
        # resultant logprobs for each class x sample x roi
        lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids,))
        indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

        lprob_cs_sl += logpriors
        lprob_cs_cp_sl = lprob_cs_sl

        # for each of the ROIs take the class with the maximal
        # (log)probability
        predictions = lprob_cs_cp_sl.argmax(axis=0)
        # no need to map back [self.ulabels[c] for c in winners]
        #predictions = winners

        # assess the errors
        if __debug__:
            debug('SLC', "  Assessing accuracies")

        if errorfx is mean_mismatch_error:
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
        else:
            # somewhat silly, but a way which allows the use of pre-crafted
            # error functions without a chance to screw up
            for i, fpredictions in enumerate(predictions.T):
                results[isplit, i] = errorfx(fpredictions, targets)

    if __debug__:
        debug('SLC', "GNBSearchlight is done in %.3g sec"
              % (time.time() - time_start))

    return Dataset(results), roi_sizes
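# Example usage (a sketch, not a verbatim recipe: the positional-argument
# order gnb/generator/queryengine is assumed from the local bindings above --
# consult the class docstring for the authoritative signature):
#
#   from mvpa2.clfs.gnb import GNB
#   from mvpa2.generators.partition import NFoldPartitioner
#   from mvpa2.misc.neighborhood import IndexQueryEngine, Sphere
#   from mvpa2.misc.errorfx import mean_mismatch_error
#
#   qe = IndexQueryEngine(voxel_indices=Sphere(radius=1))
#   qe.train(dataset)
#   sl = GNBSearchlight(GNB(), NFoldPartitioner(), qe,
#                       errorfx=mean_mismatch_error)
#   err_map = sl(dataset)   # one error value per split x ROI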