def test_custom_split(self):
    """Check splits obtained from explicitly specified custom partitions.

    Two scenarios are covered:

    1. two complementary partition specs that emulate a half split, and
    2. a single, fully specified (working set, validation set) pair where
       only the partitions listed in ``attr_values`` are requested from
       the splitter.
    """
    # simulate half splitter
    hs = CustomPartitioner([(None, [0, 1, 2, 3, 4]),
                            (None, [5, 6, 7, 8, 9])])
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    self.failUnless(len(splits) == 2)

    for p in splits:
        self.failUnless(len(p) == 2)
        self.failUnless(p[0].nsamples == 50)
        self.failUnless(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check fully customized split with working and validation set specified
    cs = CustomPartitioner([([0, 3, 4], [5, 9])])
    # we want to discard the unselected partition of the data, hence
    # attr_values
    spl = Splitter(attr='partitions', attr_values=[1, 2])
    splits = [list(spl.generate(p)) for p in cs.generate(self.data)]
    self.failUnless(len(splits) == 1)

    for p in splits:
        self.failUnless(len(p) == 2)
        self.failUnless(p[0].nsamples == 30)
        self.failUnless(p[1].nsamples == 20)

    # assert_array_equal (rather than ``(a == b).all()``) also reports a
    # proper failure if the arrays differ in length
    assert_array_equal(splits[0][1].sa['chunks'].unique, [5, 9])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 3, 4])
def test_splitter():
    """Exercise Splitter on sample attributes, feature attributes and chained."""
    ds = give_data()

    # --- default configuration: split by the 'chunks' attribute ---
    by_chunks = Splitter('chunks')
    # a direct call of the splitter is expected to be unsupported
    assert_raises(NotImplementedError, by_chunks, ds)
    chunk_splits = list(by_chunks.generate(ds))
    assert_equal(len(chunk_splits), len(ds.sa['chunks'].unique))
    for part in chunk_splits:
        # plain slicing is expected, i.e. the samples are a view on ds
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.sa['chunks'].unique), 1)
        assert_true('lastsplit' in part.a)
    assert_true(chunk_splits[-1].a.lastsplit)

    # --- customized: explicit (repeated) values, limited count, no slicing ---
    by_targets = Splitter('targets',
                          attr_values=[0, 1, 1, 2, 3, 3, 3],
                          count=4,
                          noslicing=True)
    target_splits = list(by_targets.generate(ds))
    assert_equal(len(target_splits), 4)
    for part in target_splits:
        # with noslicing the samples must NOT be a view on ds
        assert_false(part.samples.base is ds.samples)
        assert_equal(len(part.sa['targets'].unique), 1)
        assert_equal(len(part.sa['chunks'].unique), 10)
    assert_true(target_splits[-1].a.lastsplit)

    # value 1 was requested twice in a row, so these two splits coincide
    assert_array_equal(target_splits[1].samples, target_splits[2].samples)

    # --- split by a feature attribute ---
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # the splitter is expected to figure out that 'roi' lives in ds.fa
    by_roi = Splitter('roi')
    roi_splits = list(by_roi.generate(ds))
    assert_equal(len(roi_splits), 2)
    for part in roi_splits:
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.fa['roi'].unique), 1)
        assert_equal(part.shape, (100, 5))

    # --- chained splitters ---
    chain = ChainNode([by_targets, by_roi, by_chunks])
    chained_splits = list(chain.generate(ds))
    # 4 target splits x 2 roi splits x 10 chunk splits
    assert_equal(len(chained_splits), 80)
def test_slicing(self):
    """Verify when partitioning/splitting yields views rather than copies.

    All checks compare the ``base`` of the resulting ``samples`` against
    the ``samples`` of the dataset that was partitioned.
    """
    data = self.data

    def splits_per_partition(partitioner, splitter, ds):
        # one list of split datasets for every generated partition
        return [list(splitter.generate(part))
                for part in partitioner.generate(ds)]

    half = HalfPartitioner()
    splitter = Splitter(attr='partitions')

    # partitioning on its own: the partitioned datasets share the data
    for part in list(half.generate(data)):
        assert_true(part.samples.base is data.samples)

    # splitting on top of it: we get slicing all the time
    for pair in splits_per_partition(half, splitter, data):
        assert_true(pair[0].samples.base.base is data.samples)
        assert_true(pair[1].samples.base.base is data.samples)

    # with noslicing=True there must be no slicing at all
    splitter = Splitter(attr='partitions', noslicing=True)
    for pair in splits_per_partition(half, splitter, data):
        assert_false(pair[0].samples.base is data.samples)
        assert_false(pair[1].samples.base is data.samples)

    # N-fold partitioning
    nfold = NFoldPartitioner()
    splitter = Splitter(attr='partitions')
    nfold_splits = splits_per_partition(nfold, splitter, data)
    last_index = len(nfold_splits) - 1
    for index, pair in enumerate(nfold_splits):
        # the training portion is a slice only for the first and last fold
        if index in (0, last_index):
            assert_true(pair[0].samples.base.base is data.samples)
        else:
            assert_true(pair[0].samples.base is None)
        # the other portion is sliced all the time
        assert_true(pair[1].samples.base.base is data.samples)

    # dataset with alternating chunk labels, partitioned into odd/even
    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    odd_even = OddEvenPartitioner()
    splitter = Splitter(attr='partitions')

    # partitioned datasets share the data
    for part in list(odd_even.generate(step_ds)):
        assert_true(part.samples.base is step_ds.samples)

    odd_even_splits = splits_per_partition(odd_even, splitter, step_ds)
    assert_equal(len(odd_even_splits), 2)
    for pair in odd_even_splits:
        # we get slicing all the time
        assert_true(pair[0].samples.base.base is step_ds.samples)
        assert_true(pair[1].samples.base.base is step_ds.samples)
def test_label_splitter(self):
    """Odd/even partitioning on 'targets' must separate the target values."""
    partitioner = OddEvenPartitioner(attr='targets')
    splitter = Splitter(attr='partitions')
    splits = [list(splitter.generate(part))
              for part in partitioner.generate(self.data)]

    # expected unique targets, per partition: (first split, second split)
    expected = [
        ([0, 2], [1, 3]),
        ([1, 3], [0, 2]),
    ]
    for index, (first_targets, second_targets) in enumerate(expected):
        assert_array_equal(splits[index][0].sa['targets'].unique,
                           first_targets)
        assert_array_equal(splits[index][1].sa['targets'].unique,
                           second_targets)
def test_simplest_cv_pat_gen(self):
    """Leave-one-chunk-out folds: 10 folds of 90 + 10 samples each."""
    # set up the generators
    partitioner = NFoldPartitioner(cvtype=1)
    splitter = Splitter(attr='partitions')

    # collect the cross-validation pattern sets (one-fold CV)
    folds = [list(splitter.generate(part))
             for part in partitioner.generate(self.data)]

    self.failUnless(len(folds) == 10)

    fold_index = 0
    for fold in folds:
        self.failUnless(len(fold) == 2)
        larger, smaller = fold
        self.failUnless(larger.nsamples == 90)
        self.failUnless(smaller.nsamples == 10)
        # the chunk of the smaller portion matches the fold number
        self.failUnless(smaller.chunks[0] == fold_index)
        fold_index += 1
def test_half_split(self):
    """Check HalfPartitioner + Splitter: two partitions of 50/50 samples.

    Also re-partitions one of the resulting halves to make sure both
    portions are still produced.
    """
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    self.failUnless(len(splits) == 2)

    for p in splits:
        self.failUnless(len(p) == 2)
        self.failUnless(p[0].nsamples == 50)
        self.failUnless(p[1].nsamples == 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        # identity comparison: we only care that a dataset was produced,
        # not about whatever ``!=`` means for dataset objects
        self.failUnless(split[0] is not None)
        self.failUnless(split[1] is not None)
def test_repeated_features(self):
    """RepeatedMeasure over feature-attribute splits, concatenated as features.

    A trivial measure that reports the number of features is run once per
    split of ``fa.nonbogus_targets``; the per-split results must end up
    side by side in a single sample.
    """
    class CountFeatures(Measure):
        # flag the measure as trained so it can be called right away
        is_trained = True

        def _call(self, ds):
            return ds.nfeatures

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)
    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))
    assert_array_equal(res.samples[0], [18, 1, 1])
def test_counted_splitting(self):
    """Check NFoldPartitioner's ``count`` limit for every selection strategy.

    For each strategy listed in ``Partitioner._STRATEGIES`` and a range of
    requested counts, verifies the number of generated partitions, the
    number of partition specs, the ``lastpartitionset`` flag and the
    chunks that got selected.
    """
    spl = Splitter(attr='partitions')
    # count > #chunks should be capped at the number of chunks
    nchunks = len(self.data.sa['chunks'].unique)
    for strategy in Partitioner._STRATEGIES:
        for count, target in [(nchunks * 2, nchunks),
                              (nchunks, nchunks),
                              (nchunks - 1, nchunks - 1),
                              (3, 3),
                              (0, 0),
                              (1, 1)]:
            nfs = NFoldPartitioner(cvtype=1, count=count,
                                   selection_strategy=strategy)
            splits = [list(spl.generate(p))
                      for p in nfs.generate(self.data)]
            self.failUnless(len(splits) == target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.failUnlessEqual(nsplits_cfg, target)

            # Check if "lastpartitionset" dsattr was assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.failUnless(ds_.a.lastpartitionset)
                # test all now: the flag must be set for the last
                # partition set and only for it
                for isplit, split in enumerate(splits):
                    for ds_ in split:
                        self.failUnlessEqual(
                            bool(ds_.a.lastpartitionset),
                            isplit == nsplits - 1)

            # Check results of different strategies
            if strategy == 'first':
                self.failUnlessEqual(chosenchunks, list(range(target)))
            elif strategy == 'equidistant':
                if target == 3:
                    self.failUnlessEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.failUnless(
                    len(set(chosenchunks)) == len(chosenchunks))
                self.failUnless(target == len(chosenchunks))
            else:
                # a new strategy was added without extending this test
                raise RuntimeError("Add unittest for strategy %s"
                                   % strategy)
def test_svms(self, clf):
    """Train/predict with an SVM and sanity-check its conditional attributes.

    Parameters
    ----------
    clf
      Classifier under test. Its ``estimates`` (and, if supported and
      enabled via ``clf.params.probability``, ``probabilities``)
      conditional attributes are enabled only for the duration of the
      test.
    """
    knows_probabilities = \
        'probabilities' in clf.ca.keys() and clf.params.probability
    enable_ca = ['estimates']
    if knows_probabilities:
        enable_ca += ['probabilities']

    clf.ca.change_temporarily(enable_ca=enable_ca)
    # restore the classifier's ca state even if an assertion below fails,
    # so the temporary change cannot leak out of this test
    try:
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.failUnless(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.failUnlessEqual(len(clf.ca.probabilities),
                                 len(testdata.samples))
    finally:
        clf.ca.reset_changed_temporarily()
def _forward_dataset(self, ds):
    """Forward-map a dataset, optionally processing it chunk by chunk.

    Without a configured chunks attribute the whole dataset is passed to
    ``_forward_dataset_helper`` in one go. Otherwise a stripped-down
    shallow copy is split by that attribute, every piece is processed
    individually, and the pieces are stacked back together with the
    original feature and dataset attributes restored.
    """
    # nothing to split on: process everything at once
    if self.__chunks_attr is None:
        return self._forward_dataset_helper(ds)

    # strip down the dataset to speed up local processing; with the
    # 'remove' strategy no sample attributes are carried along
    keep_sa = [] if self.__attr_strategy == 'remove' else None
    stripped = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])

    # process all chunks individually
    chunk_splitter = Splitter(self.__chunks_attr)
    processed = []
    for chunk_ds in chunk_splitter.generate(stripped):
        processed.append(self._forward_dataset_helper(chunk_ds))

    # merge the pieces again and put back the stripped attributes
    merged = vstack(processed)
    merged.fa.update(ds.fa)
    merged.a.update(ds.a)
    return merged
def test_n_group_split(self):
    """Test NGroupPartitioner along with reversing the order of the
    datasets produced by the Splitter.
    """
    # Test 2 groups like HalfSplitter first
    hs = NGroupPartitioner(2)

    # isreversed doubles as an index offset: 0 -> default order,
    # 1 -> Splitter(reverse=True) swaps the two datasets of each split
    for isreversed in (0, 1):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        self.failUnless(len(splits) == 2)

        for p in splits:
            self.failUnless(len(p) == 2)
            self.failUnless(p[0].nsamples == 50)
            self.failUnless(p[1].nsamples == 50)

        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        # identity check: we only need to know a dataset was produced
        self.failUnless(split[0] is not None)
        self.failUnless(split[1] is not None)

    # now test more groups
    s5 = NGroupPartitioner(5)

    # get the splits
    for isreversed in (0, 1):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in s5.generate(self.data)]

        # must have 5 splits (one per group)
        self.failUnless(len(splits) == 5)

        # check split content
        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [2, 3, 4, 5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [2, 3])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 4, 5, 6, 7, 8, 9])
        # ...
        assert_array_equal(splits[4][1 - isreversed].sa['chunks'].unique,
                           [8, 9])
        assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4, 5, 6, 7])

    # Test for too many groups: exhausting the generator must raise
    def splitcall(partitioner, dat):
        return list(partitioner.generate(dat))

    s20 = NGroupPartitioner(20)
    self.assertRaises(ValueError, splitcall, s20, self.data)
def _sl_call(self, dataset, roi_ids, nproc):
    """Call to GNBSearchlight

    Runs a Gaussian Naive Bayes "searchlight": per-feature statistics are
    accumulated once per block of samples that always travel together
    through the splits, and are then combined per split and per ROI
    instead of re-training a classifier for every ROI.

    Parameters
    ----------
    dataset
      Dataset to analyze. Its ``samples`` must be 2-dimensional.
    roi_ids
      Ids of the ROI centers; each is passed to the query engine's
      ``query_byid`` to obtain the feature ids of that ROI.
    nproc
      Not used within this method.

    Returns
    -------
    tuple
      ``(Dataset(results), roi_sizes)`` where ``results`` holds one row
      per split and one column per ROI, and ``roi_sizes`` lists the
      number of features per ROI (empty list if the ``roi_sizes``
      conditional attribute is not enabled).

    Raises
    ------
    ValueError
      If the samples are not 2-dimensional or ``self._indexsum`` is
      neither 'sparse' nor 'fancy'.
    RuntimeError
      If the partitioner places the same sample more than once into one
      portion of a split.
    """
    # Local bindings
    gnb = self.gnb
    params = gnb.params
    generator = self.generator
    errorfx = self.errorfx
    qe = self.queryengine

    if __debug__:
        time_start = time.time()

    targets_sa_name = gnb.get_space()
    targets_sa = dataset.sa[targets_sa_name]

    # get the dataset information into easy vars
    X = dataset.samples
    if len(X.shape) != 2:
        raise ValueError(
            'Unlike GNB, GNBSearchlight (for now) operates on already '
            'flattened datasets')
    labels = targets_sa.value
    ulabels = targets_sa.unique
    nlabels = len(ulabels)
    label2index = dict((l, il) for il, l in enumerate(ulabels))
    labels_numeric = np.array([label2index[l] for l in labels])
    ulabels_numeric = [label2index[l] for l in ulabels]
    # set the feature dimensions
    nsamples = len(X)
    nrois = len(roi_ids)
    s_shape = X.shape[1:]           # shape of a single sample
    # The shape of results
    r_shape = (nrois,) + X.shape[2:]

    #
    # Everything toward optimization ;)
    #
    # Silly Yarik thinks that it might be worth to pre-compute
    # statistics per each feature within a block of the samples
    # which always come together in splits -- most often it is a
    # (chunk, label) combination, but since we simply use a
    # generator -- who knows! Therefore lets figure out what are
    # those blocks and operate on them instead of original samples.
    #
    # After additional thinking about this -- probably it would be
    # just minor additional improvements (ie not worth it) but
    # since it is coded already -- let it be so

    # 1. Query generator for the splits we will have
    if __debug__:
        debug('SLC',
              'Phase 1. Initializing partitions using %s on %s'
              % (generator, dataset))

    # Lets just create a dummy ds which will store for us actual sample
    # indices
    # XXX we could make it even more lightweight I guess...
    dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
    splitter = Splitter(attr=generator.get_space())
    splits = list(tuple(splitter.generate(ds_))
                  for ds_ in generator.generate(dataset_indicies))
    nsplits = len(splits)

    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug('SLC',
              'Phase 2. Blocking data for %i splits and %i labels'
              % (nsplits, nlabels))
    # array of indices for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations
    combinations = np.ones((nsamples, 1+nsplits), dtype=int)*-1
    # labels
    combinations[:, 0] = labels_numeric
    for ipartition, (split1, split2) in enumerate(splits):
        combinations[split1.samples[:, 0], 1+ipartition] = 1
        combinations[split2.samples[:, 0], 1+ipartition] = 2
        # Check for over-sampling, i.e. no same sample used twice here
        if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                len(np.unique(split2.samples[:, 0])) == len(split2)):
            raise RuntimeError(
                "GNBSearchlight needs a partitioner which does not reuse "
                "the same samples more than once")

    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(set(descriptions))
    nblocks = len(udescriptions)
    description2block = dict((d, i) for i, d in enumerate(udescriptions))
    # Indices for samples to point to their block
    sample2block = np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    #
    if __debug__:
        debug('SLC',
              'Phase 3. Computing statistics for %i blocks' % (nblocks,))

    #
    # reusable containers which should stay of the same size
    #

    # sums per each block
    sums = np.zeros((nblocks, ) + s_shape)
    # sums of squares per each block
    sums2 = np.zeros((nblocks, ) + s_shape)

    # per each label:
    means = np.zeros((nlabels, ) + s_shape)
    # means of squares for stddev computation
    means2 = np.zeros((nlabels, ) + s_shape)
    variances = np.zeros((nlabels, ) + s_shape)
    # degenerate dimensions are added for easy broadcasting later on
    nsamples_per_class = np.zeros((nlabels,) + (1,)*len(s_shape))

    # results
    results = np.zeros((nsplits,) + r_shape)

    block_counts = np.zeros((nblocks,))
    block_labels = [None] * nblocks

    X2 = np.square(X)
    # silly way for now
    for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
        sums[ib] += s
        sums2[ib] += s2
        block_counts[ib] += 1
        if block_labels[ib] is None:
            block_labels[ib] = l
        else:
            # all samples of a block must carry the same label
            assert(block_labels[ib] == l)
    block_labels = np.asanyarray(block_labels)
    # additional silly tests for paranoid
    # (compare by value: identity of string objects is not guaranteed)
    assert(block_labels.dtype.kind == 'i')

    # 4. Lets deduce all neighbors... might need to be RF into the
    #    parallel part later on
    if __debug__:
        debug('SLC',
              'Phase 4. Deducing neighbors information for %i ROIs'
              % (nrois,))
    roi_fids = [qe.query_byid(f) for f in roi_ids]
    nroi_fids = len(roi_fids)
    # makes sense to waste precious ms only if ca is enabled
    if self.ca.is_enabled('roi_sizes'):
        roi_sizes = [len(x) for x in roi_fids]
    else:
        roi_sizes = []

    indexsum = self._indexsum
    if indexsum == 'sparse':
        if __debug__:
            debug('SLC',
                  'Phase 4b. Converting neighbors to sparse matrix '
                  'representation')
        # convert to "sparse representation" where column j contains
        # 1s only at the roi_fids[j] indices
        roi_fids = inds_to_coo(roi_fids,
                               shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError(
            "Do not know how to deal with indexsum=%s" % indexsum)

    # 5. Lets do actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop' )

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out for a given split the blocks we want to work
        # with
        # sample indices
        training_sis = split[0].samples[:, 0]
        # convert to blocks training split
        training_bis = np.unique(sample2block[training_sis])

        # now lets do our GNB business
        training_nsamples = 0
        for il, l in enumerate(ulabels_numeric):
            bis_il = training_bis[block_labels[training_bis] == l]
            nsamples_per_class[il] = N_float = \
                float(np.sum(block_counts[bis_il]))
            training_nsamples += N_float
            if N_float == 0.0:
                variances[il] = means[il] = means2[il] = 0.
            else:
                means[il] = np.sum(sums[bis_il], axis=0) / N_float
                # Not yet normed
                means2[il] = np.sum(sums2[bis_il], axis=0)

        ## Actually compute the non-0 variances
        non0labels = (nsamples_per_class.squeeze() != 0)
        if np.all(non0labels):
            # For a possible tiny speed up avoiding copying and
            # using (no) slicing
            non0labels = slice(None)

        if params.common_variance:
            variances[:] = \
                np.sum(means2 - nsamples_per_class*np.square(means),
                       axis=0) \
                / training_nsamples
        else:
            variances[non0labels] = \
                (means2 - nsamples_per_class*np.square(means))[non0labels] \
                / nsamples_per_class[non0labels]

        # assign priors
        priors = gnb._get_priors(
            nlabels, training_nsamples, nsamples_per_class)

        # proceed in a way we have in GNB code with logprob=True,
        # i.e. operating within the exponents -- should lead to some
        # performance advantage
        norm_weight = -0.5 * np.log(2*np.pi*variances)
        # last added dimension would be for ROIs
        logpriors = np.log(priors[:, np.newaxis, np.newaxis])

        if __debug__:
            debug('SLC', " 'Training' is done")

        # Now it is time to "classify" our samples, and for that we
        # first need to compute the corresponding (log)probabilities
        data = X[split[1].samples[:, 0]]
        targets = labels_numeric[split[1].samples[:, 0]]

        # argument of exponentiation
        scaled_distances = \
            -0.5 * (((data - means[:, np.newaxis, ...])**2) \
                    / variances[:, np.newaxis, ...])

        # incorporate the normalization from normals
        lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances

        ## First we need to reshape to get class x samples x features
        lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1,))

        ## Now we come to naive part which requires looping
        ## through all spheres
        if __debug__:
            debug('SLC', " Doing 'Searchlight'")
        # resultant logprobs for each class x sample x roi
        lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids,))
        indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

        lprob_cs_sl += logpriors
        # for each of the ROIs take the class with maximal (log)probability;
        # no need to map back to the original labels since targets are
        # numeric indices as well
        predictions = lprob_cs_sl.argmax(axis=0)
        # assess the errors
        if __debug__:
            debug('SLC', " Assessing accuracies")

        if errorfx is mean_mismatch_error:
            # fast path: fraction of mismatches per ROI, all ROIs at once
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
        else:
            # somewhat silly but a way which allows to use pre-crafted
            # error functions without a chance to screw up
            for i, fpredictions in enumerate(predictions.T):
                results[isplit, i] = errorfx(fpredictions, targets)

    if __debug__:
        debug('SLC', "GNBSearchlight is done in %.3g sec" %
              (time.time() - time_start))

    return Dataset(results), roi_sizes