def test_custom_split(self):
    """Check Splitter output for partitions made by CustomPartitioner.

    Two scenarios are covered: a hand-written emulation of a half
    partitioner, and a single fully specified (training, testing) pair
    where the remaining samples have to be dropped by the splitter.
    """
    lower_chunks = [0, 1, 2, 3, 4]
    upper_chunks = [5, 6, 7, 8, 9]

    # simulate half splitter
    half_like = CustomPartitioner([(None, [0, 1, 2, 3, 4]),
                                   (None, [5, 6, 7, 8, 9])])
    splitter = Splitter(attr='partitions')
    folds = [list(splitter.generate(pds))
             for pds in half_like.generate(self.data)]
    self.assertTrue(len(folds) == 2)

    for fold in folds:
        self.assertTrue(len(fold) == 2)
        for part in fold:
            self.assertTrue(part.nsamples == 50)

    # (fold index, part index) -> chunks expected in that part
    for fold_idx, part_idx, expected in ((0, 1, lower_chunks),
                                         (0, 0, upper_chunks),
                                         (1, 1, upper_chunks),
                                         (1, 0, lower_chunks)):
        assert_array_equal(folds[fold_idx][part_idx].sa['chunks'].unique,
                           expected)

    # check fully customized split with working and validation set specified
    custom = CustomPartitioner([([0, 3, 4], [5, 9])])
    # we want to discard the unselected partition of the data, hence
    # attr_values; these two splitters should do exactly the same thing
    equivalent_splitters = (
        Splitter(attr='partitions', attr_values=[1, 2]),
        Splitter(attr='partitions', ignore_values=(0,)),
    )
    for splitter in equivalent_splitters:
        folds = [list(splitter.generate(pds))
                 for pds in custom.generate(self.data)]
        self.assertTrue(len(folds) == 1)

        for fold in folds:
            self.assertTrue(len(fold) == 2)
            self.assertTrue(fold[0].nsamples == 30)
            self.assertTrue(fold[1].nsamples == 20)

        only_fold = folds[0]
        self.assertTrue((only_fold[1].sa['chunks'].unique == [5, 9]).all())
        self.assertTrue((only_fold[0].sa['chunks'].unique == [0, 3, 4]).all())
def test_slicing(self):
    """Check when Splitter output shares memory with the source samples.

    The identity checks go through ``is_the_same_base`` which adapts to
    how the installed numpy reports ``.base`` for a slice of a slice.
    """
    half = HalfPartitioner()
    splitter = Splitter(attr="partitions")

    # partitioned dataset shared the data
    for pds in list(half.generate(self.data)):
        assert_true(pds.samples.base is self.data.samples)
    folds = [list(splitter.generate(pds)) for pds in half.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # check function appropriate for the given numpy version
    probe = np.arange(5)
    chained = probe[:4][:3]
    if chained.base is probe:
        # 1.7.0b1: .base points straight at the owner
        base_hops = 1
    elif chained.base.base is probe:
        # prior 1.7.0b1: one .base per slicing step
        base_hops = 2
    else:
        raise RuntimeError("Uknown handling of .base by numpy")

    def is_the_same_base(x, base=self.data.samples):
        owner = x
        for _ in range(base_hops):
            owner = owner.base
        return owner is base

    # we get slicing all the time
    for fold in folds:
        for part in (fold[0], fold[1]):
            assert_true(is_the_same_base(part.samples))

    # with noslicing=True we get no slicing at all
    splitter = Splitter(attr="partitions", noslicing=True)
    folds = [list(splitter.generate(pds)) for pds in half.generate(self.data)]
    for fold in folds:
        for part in (fold[0], fold[1]):
            assert_false(part.samples.base is self.data.samples)

    nfold = NFoldPartitioner()
    splitter = Splitter(attr="partitions")
    folds = [list(splitter.generate(pds)) for pds in nfold.generate(self.data)]
    last_fold = len(folds) - 1
    for fold_idx, fold in enumerate(folds):
        # training only first and last split
        if fold_idx in (0, last_fold):
            assert_true(is_the_same_base(fold[0].samples))
        else:
            assert_true(fold[0].samples.base is None)
        # testing part: we get slicing all the time
        assert_true(is_the_same_base(fold[1].samples))

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={"chunks": np.tile([0, 1], 10)})
    odd_even = OddEvenPartitioner()
    splitter = Splitter(attr="partitions")
    # partitioned dataset shared the data
    for pds in list(odd_even.generate(step_ds)):
        assert_true(pds.samples.base is step_ds.samples)
    folds = [list(splitter.generate(pds)) for pds in odd_even.generate(step_ds)]
    assert_equal(len(folds), 2)
    # we get slicing all the time
    for fold in folds:
        for part in (fold[0], fold[1]):
            assert_true(is_the_same_base(part.samples, step_ds.samples))
def test_custom_split(self):
    """Check Splitter output for partitions made by CustomPartitioner.

    Covers an emulated half partitioner and a single fully specified
    (training, testing) pair where unselected samples must be dropped.

    Uses ``assertEqual`` instead of the deprecated ``failUnless`` alias
    (removed from ``unittest`` in Python 3.12) so that failures also
    report the offending values.
    """
    # simulate half splitter
    hs = CustomPartitioner([(None, [0, 1, 2, 3, 4]),
                            (None, [5, 6, 7, 8, 9])])
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    self.assertEqual(len(splits), 2)

    for p in splits:
        self.assertEqual(len(p), 2)
        self.assertEqual(p[0].nsamples, 50)
        self.assertEqual(p[1].nsamples, 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # check fully customized split with working and validation set specified
    cs = CustomPartitioner([([0, 3, 4], [5, 9])])
    # we want to discard the unselected partition of the data, hence
    # attr_values; these two splitters should do exactly the same thing
    splitters = (Splitter(attr='partitions', attr_values=[1, 2]),
                 Splitter(attr='partitions', ignore_values=(0,)))
    for spl in splitters:
        splits = [list(spl.generate(p)) for p in cs.generate(self.data)]
        self.assertEqual(len(splits), 1)

        for p in splits:
            self.assertEqual(len(p), 2)
            self.assertEqual(p[0].nsamples, 30)
            self.assertEqual(p[1].nsamples, 20)
        # assert_array_equal also catches a shape mismatch, which the
        # former `(a == b).all()` construct could not report cleanly
        assert_array_equal(splits[0][1].sa['chunks'].unique, [5, 9])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 3, 4])
def test_odd_even_split(self):
    """Check Splitter output for partitions made by OddEvenPartitioner.

    Each of the two folds must contain two parts of 50 samples holding
    the odd and the even chunks respectively; partitioning one of the
    resulting parts again must still yield two parts.
    """
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in oes.generate(self.data)]
    self.assertEqual(len(splits), 2)

    for p in splits:
        self.assertEqual(len(p), 2)
        self.assertEqual(p[0].nsamples, 50)
        self.assertEqual(p[1].nsamples, 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in oes.generate(splits[0][0])]

    for split in moresplits:
        # identity test: `!= None` would go through the objects' own
        # comparison operators
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)
def test_slicing(self):
    """Check when Splitter output shares memory with the source samples.

    All "is this a slice of the original data" checks go through
    ``is_the_same_base``, which adapts to how the installed numpy
    reports ``.base`` for a slice of a slice.  Previously some checks
    still hard-coded ``.base.base`` and therefore could only pass on
    numpy versions older than 1.7.0b1.
    """
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # check function appropriate for the given numpy version
    probe = np.arange(5)
    chained = probe[:4][:3]
    if chained.base is probe:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif chained.base.base is probe:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))

    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # with noslicing=True we get no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)

    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(s[1].samples))

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples, step_ds.samples))
        assert_true(is_the_same_base(s[1].samples, step_ds.samples))
def test_splitter():
    """Exercise Splitter on sample attributes, feature attributes and chains."""
    ds = give_data()

    # split with defaults
    by_chunks = Splitter('chunks')
    # calling the splitter directly is not implemented, only generate()
    assert_raises(NotImplementedError, by_chunks, ds)

    parts = list(by_chunks.generate(ds))
    assert_equal(len(parts), len(ds.sa['chunks'].unique))
    for part in parts:
        # it should have performed basic slicing!
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.sa['chunks'].unique), 1)
        assert_true('lastsplit' in part.a)
    assert_true(parts[-1].a.lastsplit)

    # now again, more customized
    by_targets = Splitter('targets',
                          attr_values=[0, 1, 1, 2, 3, 3, 3],
                          count=4,
                          noslicing=True)
    parts = list(by_targets.generate(ds))
    assert_equal(len(parts), 4)
    for part in parts:
        # it should NOT have performed basic slicing!
        assert_false(part.samples.base is ds.samples)
        assert_equal(len(part.sa['targets'].unique), 1)
        assert_equal(len(part.sa['chunks'].unique), 10)
    assert_true(parts[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(parts[1].samples, parts[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    by_roi = Splitter('roi')
    parts = list(by_roi.generate(ds))
    assert_equal(len(parts), 2)
    for part in parts:
        assert_true(part.samples.base is ds.samples)
        assert_equal(len(part.fa['roi'].unique), 1)
        assert_equal(part.shape, (100, 5))

    # and finally test chained splitters
    chained = ChainNode([by_targets, by_roi, by_chunks])
    parts = list(chained.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(parts), 80)
def test_repeated_features(self):
    """Check RepeatedMeasure concatenating per-split results as features.

    The dataset is split by the ``nonbogus_targets`` feature attribute
    and a trivial measure reports the feature count of every split.
    """
    # NOTE: no docstring on the helper class on purpose, a comment is
    # enough for a throw-away measure
    class CountFeatures(Measure):
        is_trained = True

        def _call(self, ds):
            return Dataset(
                [ds.nfeatures],
                fa={'nonbogus_targets':
                    list(ds.fa['nonbogus_targets'].unique)})

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)
    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))
    # due to https://github.com/numpy/numpy/issues/641 we are
    # using list(set(...)) construct and there order of
    # nonbogus_targets.unique can vary from run to run, thus there
    # is no guarantee that we would get 18 first, which is a
    # questionable assumption anyways, thus performing checks
    # which do not require any specific order.
    # And yet due to another issue
    # https://github.com/numpy/numpy/issues/3759
    # we can't just compare the whole array against None to get the
    # bool mask, so it is built element by element.  Identity (`is`)
    # is the right test for None; `==` would dispatch to whatever
    # comparison the element type implements.
    None_fa = np.array([x is None for x in res.fa.nonbogus_targets])
    assert_array_equal(res.samples[0, None_fa], [18])
    assert_array_equal(res.samples[0, ~None_fa], [1, 1])

    if sys.version_info[0] < 3:
        # with python2 order seems to be consistent
        assert_array_equal(res.samples[0], [18, 1, 1])
def test_exclude_targets_combinations():
    """Check NFold chained with ExcludeTargetsCombinationsPartitioner(k=2).

    With 3 chunks and 4 targets we expect 3 * 6 partitionings: for every
    fold each of the 6 possible pairs of targets shows up once in the
    second (testing) part.
    """
    exclude_pairs = ExcludeTargetsCombinationsPartitioner(
        k=2, targets_attr='targets', space='partitions')
    partitioner = ChainNode([NFoldPartitioner(), exclude_pairs],
                            space='partitions')

    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)

    splitter = Splitter('partitions')
    seen_combs = set()
    seen_comb_chunks = set()
    for pds in partitions:
        # only the first two parts matter, and only the second is checked
        _, testing = list(splitter.generate(pds))[:2]
        comb = tuple(np.unique(testing.targets))
        seen_combs.add(comb)
        seen_comb_chunks.add(comb + tuple(np.unique(testing.chunks)))

    # just 6 possible combinations of 2 out of 4
    assert_equal(len(seen_combs), 6)
    # all unique
    assert_equal(len(seen_comb_chunks), 3 * 6)
def test_repeated_features(self):
    """Check RepeatedMeasure concatenating per-split results as features.

    The dataset is split by the ``nonbogus_targets`` feature attribute
    and a trivial measure reports the feature count of every split.
    """
    class CountFeatures(Measure):
        is_trained = True

        def _call(self, ds):
            unique_targets = list(ds.fa['nonbogus_targets'].unique)
            return Dataset([ds.nfeatures],
                           fa={'nonbogus_targets': unique_targets})

    splitter = Splitter('fa.nonbogus_targets')
    nsplits = len(list(splitter.generate(self.dataset)))
    assert_equal(nsplits, 3)

    repeated = RepeatedMeasure(CountFeatures(), splitter,
                               concat_as='features')
    res = repeated(self.dataset)
    assert_equal(res.shape, (1, nsplits))

    # due to https://github.com/numpy/numpy/issues/641 we are
    # using list(set(...)) construct and there order of
    # nonbogus_targets.unique can vary from run to run, thus there
    # is no guarantee that we would get 18 first, which is a
    # questionable assumption anyways, thus performing checks
    # which do not require any specific order.
    # And yet due to another issue
    # https://github.com/numpy/numpy/issues/3759
    # we can't just compare the whole array against None to get the
    # bool mask, hence the element-wise identity test.
    is_none = np.array([target is None
                        for target in res.fa.nonbogus_targets])
    counts = res.samples[0]
    assert_array_equal(counts[is_none], [18])
    assert_array_equal(counts[~is_none], [1, 1])

    if sys.version_info[0] < 3:
        # with python2 order seems to be consistent
        assert_array_equal(res.samples[0], [18, 1, 1])
def test_splitter():
    """Exercise Splitter on sample attributes, feature attributes and chains."""
    data = give_data()

    # split with defaults
    chunk_splitter = Splitter('chunks')
    # only generate() is supported, a direct call is not implemented
    assert_raises(NotImplementedError, chunk_splitter, data)

    pieces = list(chunk_splitter.generate(data))
    nchunks = len(data.sa['chunks'].unique)
    assert_equal(len(pieces), nchunks)

    for piece in pieces:
        # it should have performed basic slicing!
        assert_true(piece.samples.base is data.samples)
        assert_equal(len(piece.sa['chunks'].unique), 1)
        assert_true('lastsplit' in piece.a)
    assert_true(pieces[-1].a.lastsplit)

    # now again, more customized
    target_splitter = Splitter('targets', attr_values=[0, 1, 1, 2, 3, 3, 3],
                               count=4, noslicing=True)
    pieces = list(target_splitter.generate(data))
    assert_equal(len(pieces), 4)
    for piece in pieces:
        # it should NOT have performed basic slicing!
        assert_false(piece.samples.base is data.samples)
        assert_equal(len(piece.sa['targets'].unique), 1)
        assert_equal(len(piece.sa['chunks'].unique), 10)
    assert_true(pieces[-1].a.lastsplit)

    # two should be identical
    second, third = pieces[1], pieces[2]
    assert_array_equal(second.samples, third.samples)

    # now go wild and split by feature attribute
    data.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    roi_splitter = Splitter('roi')
    pieces = list(roi_splitter.generate(data))
    assert_equal(len(pieces), 2)
    for piece in pieces:
        assert_true(piece.samples.base is data.samples)
        assert_equal(len(piece.fa['roi'].unique), 1)
        assert_equal(piece.shape, (100, 5))

    # and finally test chained splitters
    chain = ChainNode([target_splitter, roi_splitter, chunk_splitter])
    pieces = list(chain.generate(data))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(pieces), 80)
def test_label_splitter(self):
    """Check odd/even partitioning on 'targets' followed by splitting."""
    partitioner = OddEvenPartitioner(attr='targets')
    splitter = Splitter(attr='partitions')
    folds = [list(splitter.generate(pds))
             for pds in partitioner.generate(self.data)]

    # (fold index, part index, unique targets expected in that part)
    expectations = (
        (0, 0, [0, 2]),
        (0, 1, [1, 3]),
        (1, 0, [1, 3]),
        (1, 1, [0, 2]),
    )
    for fold_idx, part_idx, expected_targets in expectations:
        assert_array_equal(folds[fold_idx][part_idx].sa['targets'].unique,
                           expected_targets)
def test_slicing(self):
    """Check when Splitter output shares memory with the source samples.

    A split is a slice of a partitioned dataset which itself is a view
    of the original samples.  How numpy reports ``.base`` for such a
    slice of a slice depends on the numpy version (the chain of bases
    is collapsed since 1.7.0b1), so the identity checks are done via
    ``is_the_same_base`` instead of a hard-coded ``.base.base``.
    """
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

    # pick the check matching the .base behavior of the installed numpy
    probe = np.arange(5)
    chained = probe[:4][:3]
    if chained.base is probe:
        # numpy >= 1.7.0b1: .base points straight at the owner
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif chained.base.base is probe:
        # older numpy: one .base per slicing step
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Unknown handling of .base by numpy")

    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))

    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # with noslicing=True we get no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)

    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(s[1].samples))

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)
    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples, step_ds.samples))
        assert_true(is_the_same_base(s[1].samples, step_ds.samples))
def test_label_splitter(self):
    """Check odd/even partitioning on 'targets' followed by splitting."""
    even_targets, odd_targets = [0, 2], [1, 3]

    odd_even = OddEvenPartitioner(attr='targets')
    splitter = Splitter(attr='partitions')
    first_fold, second_fold = [list(splitter.generate(pds))
                               for pds in odd_even.generate(self.data)][:2]

    # the two folds are mirror images of each other
    assert_array_equal(first_fold[0].sa['targets'].unique, even_targets)
    assert_array_equal(first_fold[1].sa['targets'].unique, odd_targets)
    assert_array_equal(second_fold[0].sa['targets'].unique, odd_targets)
    assert_array_equal(second_fold[1].sa['targets'].unique, even_targets)
def test_simplest_cv_pat_gen(self):
    """Check leave-one-chunk-out folds from NFoldPartitioner(cvtype=1).

    Uses ``assertEqual`` instead of the deprecated ``failUnless`` alias
    (removed from ``unittest`` in Python 3.12) so that a failure reports
    the values involved.
    """
    # create the generator
    nfs = NFoldPartitioner(cvtype=1)
    spl = Splitter(attr='partitions')
    # now get the xval pattern sets (One-Fold CV)
    xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

    self.assertEqual(len(xvpat), 10)

    for i, p in enumerate(xvpat):
        self.assertEqual(len(p), 2)
        self.assertEqual(p[0].nsamples, 90)
        self.assertEqual(p[1].nsamples, 10)
        # the i-th fold holds out chunk i
        self.assertEqual(p[1].chunks[0], i)
def test_simplest_cv_pat_gen(self):
    """Check leave-one-chunk-out folds from NFoldPartitioner(cvtype=1)."""
    # create the generator
    partitioner = NFoldPartitioner(cvtype=1)
    splitter = Splitter(attr='partitions')

    # now get the xval pattern sets (One-Fold CV)
    folds = []
    for pds in partitioner.generate(self.data):
        folds.append(list(splitter.generate(pds)))

    self.assertTrue(len(folds) == 10)

    for fold_idx, fold in enumerate(folds):
        self.assertTrue(len(fold) == 2)
        training, testing = fold[0], fold[1]
        self.assertTrue(training.nsamples == 90)
        self.assertTrue(testing.nsamples == 10)
        # the held-out chunk matches the fold number
        self.assertTrue(testing.chunks[0] == fold_idx)
def test_half_split(self):
    """Check Splitter output for partitions made by HalfPartitioner."""
    lower_chunks = [0, 1, 2, 3, 4]
    upper_chunks = [5, 6, 7, 8, 9]

    half = HalfPartitioner()
    splitter = Splitter(attr='partitions')
    folds = [list(splitter.generate(pds))
             for pds in half.generate(self.data)]
    self.assertTrue(len(folds) == 2)

    for fold in folds:
        self.assertTrue(len(fold) == 2)
        for part in fold:
            self.assertTrue(part.nsamples == 50)

    # (fold index, part index, chunks expected in that part)
    for fold_idx, part_idx, expected in ((0, 1, lower_chunks),
                                         (0, 0, upper_chunks),
                                         (1, 1, upper_chunks),
                                         (1, 0, lower_chunks)):
        assert_array_equal(folds[fold_idx][part_idx].sa['chunks'].unique,
                           expected)

    # partition one of the resulting halves once more and make sure
    # both parts are still produced
    nested_folds = [list(splitter.generate(pds))
                    for pds in half.generate(folds[0][0])]

    for nested in nested_folds:
        for part in (nested[0], nested[1]):
            self.assertTrue(part is not None)
def test_half_split(self):
    """Check Splitter output for partitions made by HalfPartitioner.

    Uses ``assertEqual``/``assertTrue`` instead of the deprecated
    ``failUnless`` alias (removed from ``unittest`` in Python 3.12) and
    tests for ``None`` by identity.
    """
    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    self.assertEqual(len(splits), 2)

    for p in splits:
        self.assertEqual(len(p), 2)
        self.assertEqual(p[0].nsamples, 50)
        self.assertEqual(p[1].nsamples, 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
    assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
    assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

    # partition one of the resulting halves once more and make sure
    # both parts are still produced
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        # identity test: `!= None` would go through the objects' own
        # comparison operators
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)
def test_odd_even_split(self):
    """Check Splitter output for partitions made by OddEvenPartitioner.

    Both folds must consist of two 50-sample parts holding the odd and
    the even chunks; partitioning a part again must still give two parts.
    """
    odd_chunks = [1, 3, 5, 7, 9]
    even_chunks = [0, 2, 4, 6, 8]

    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in oes.generate(self.data)]
    self.assertEqual(len(splits), 2)

    for p in splits:
        self.assertEqual(len(p), 2)
        self.assertEqual(p[0].nsamples, 50)
        self.assertEqual(p[1].nsamples, 50)

    assert_array_equal(splits[0][1].sa['chunks'].unique, odd_chunks)
    assert_array_equal(splits[0][0].sa['chunks'].unique, even_chunks)
    assert_array_equal(splits[1][0].sa['chunks'].unique, odd_chunks)
    assert_array_equal(splits[1][1].sa['chunks'].unique, even_chunks)

    # check if it works on pure odd and even chunk ids
    moresplits = [list(spl.generate(p))
                  for p in oes.generate(splits[0][0])]

    for split in moresplits:
        # None has to be tested by identity, not with `!=`
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)
def test_repeated_features(self):
    """Check RepeatedMeasure concatenating per-split results as features.

    The dataset is split by the ``nonbogus_targets`` feature attribute
    and a trivial measure reports the feature count of every split.

    The two leftover debugging ``print`` statements were dropped: they
    only cluttered the test output and, being Python-2-only syntax, made
    the whole module fail to compile under Python 3.
    """
    class CountFeatures(Measure):
        is_trained = True

        def _call(self, ds):
            return ds.nfeatures

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)
    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))
    # NOTE(review): relies on the splits coming out in a fixed order --
    # confirm that the order of unique attribute values is stable
    assert_array_equal(res.samples[0], [18, 1, 1])
def test_repeated_features(self):
    """Check RepeatedMeasure concatenating per-split results as features.

    A trivial measure counts the features of every split obtained by
    splitting on the ``nonbogus_targets`` feature attribute.

    Debugging ``print`` statements were removed: they were Python-2-only
    syntax (a SyntaxError on Python 3) and not part of what is tested.
    """
    class CountFeatures(Measure):
        is_trained = True

        def _call(self, ds):
            return ds.nfeatures

    cf = CountFeatures()
    spl = Splitter('fa.nonbogus_targets')
    nsplits = len(list(spl.generate(self.dataset)))
    assert_equal(nsplits, 3)

    rm = RepeatedMeasure(cf, spl, concat_as='features')
    res = rm(self.dataset)
    assert_equal(res.shape, (1, nsplits))
    # NOTE(review): assumes a fixed order of the splits -- confirm that
    # the order of unique attribute values is deterministic
    assert_array_equal(res.samples[0], [18, 1, 1])
def test_counted_splitting(self):
    """Check NFoldPartitioner's `count` limit for every selection strategy.

    Fixes relative to the previous version:

    * the per-split ``lastpartitionset`` check was a bare comparison
      expression whose result was discarded (and, being a chained
      comparison, did not even express the intended condition); it is
      a real assertion now;
    * deprecated ``failUnless*`` aliases (removed in Python 3.12) are
      replaced by their ``assert*`` counterparts;
    * ``raise`` uses the call form valid on both Python 2 and 3;
    * the expected chunks are a list, since a list never equals a
      ``range`` object on Python 3.
    """
    spl = Splitter(attr='partitions')
    # count > #chunks, should result in 10 splits
    nchunks = len(self.data.sa['chunks'].unique)
    for strategy in Partitioner._STRATEGIES:
        for count, target in [(nchunks * 2, nchunks),
                              (nchunks, nchunks),
                              (nchunks - 1, nchunks - 1),
                              (3, 3),
                              (0, 0),
                              (1, 1)]:
            nfs = NFoldPartitioner(cvtype=1, count=count,
                                   selection_strategy=strategy)
            splits = [list(spl.generate(p))
                      for p in nfs.generate(self.data)]
            self.assertEqual(len(splits), target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.assertEqual(nsplits_cfg, target)

            # Check if "lastsplit" dsattr was assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.assertTrue(ds_.a.lastpartitionset)
                # test all now: flag must be set on the last set only
                for isplit, split in enumerate(splits):
                    for ds_ in split:
                        self.assertEqual(bool(ds_.a.lastpartitionset),
                                         isplit == nsplits - 1)

            # Check results of different strategies
            if strategy == 'first':
                self.assertEqual(chosenchunks, list(range(target)))
            elif strategy == 'equidistant':
                if target == 3:
                    self.assertEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.assertEqual(len(set(chosenchunks)), len(chosenchunks))
                self.assertEqual(target, len(chosenchunks))
            else:
                raise RuntimeError("Add unittest for strategy %s"
                                   % strategy)
def test_counted_splitting(self):
    """Check NFoldPartitioner's `count` limit for every selection strategy.

    For each strategy and requested count the number of generated
    partition sets, the partition specs and the ``lastpartitionset``
    flag are verified, followed by strategy specific checks of which
    chunks got selected.

    The per-split ``lastpartitionset`` check used to be a bare (chained)
    comparison whose result was thrown away; it is an assertion now.
    ``raise`` uses the call form (valid on Python 2 and 3) and the
    expected chunk list is built with ``list(range(...))`` because a
    list never equals a ``range`` object on Python 3.
    """
    spl = Splitter(attr='partitions')
    # count > #chunks, should result in 10 splits
    nchunks = len(self.data.sa['chunks'].unique)
    # (requested count, number of partition sets expected)
    scenarios = [(nchunks * 2, nchunks), (nchunks, nchunks),
                 (nchunks - 1, nchunks - 1), (3, 3), (0, 0), (1, 1)]
    for strategy in Partitioner._STRATEGIES:
        for count, target in scenarios:
            nfs = NFoldPartitioner(cvtype=1,
                                   count=count,
                                   selection_strategy=strategy)
            splits = [
                list(spl.generate(p)) for p in nfs.generate(self.data)
            ]
            self.assertEqual(len(splits), target)
            chosenchunks = [int(s[1].uniquechunks) for s in splits]

            # Test if configuration matches as well
            nsplits_cfg = len(nfs.get_partition_specs(self.data))
            self.assertEqual(nsplits_cfg, target)

            # Check if "lastsplit" dsattr was assigned appropriately
            nsplits = len(splits)
            if nsplits > 0:
                # dummy-proof testing of last split
                for ds_ in splits[-1]:
                    self.assertTrue(ds_.a.lastpartitionset)
                # test all now: only the last set may carry the flag
                for isplit, split in enumerate(splits):
                    is_last = isplit == nsplits - 1
                    for ds_ in split:
                        self.assertEqual(bool(ds_.a.lastpartitionset),
                                         is_last)

            # Check results of different strategies
            if strategy == 'first':
                self.assertEqual(chosenchunks, list(range(target)))
            elif strategy == 'equidistant':
                if target == 3:
                    self.assertEqual(chosenchunks, [0, 3, 7])
            elif strategy == 'random':
                # none is selected twice
                self.assertEqual(len(set(chosenchunks)), len(chosenchunks))
                self.assertEqual(target, len(chosenchunks))
            else:
                raise RuntimeError("Add unittest for strategy %s"
                                   % strategy)
def test_svms(self, clf):
    """Check that an SVM's estimates differ from its predictions.

    If the classifier exposes and has enabled probability estimation,
    the number of reported probabilities is checked as well.

    Parameters
    ----------
    clf
      Classifier under test.  Its conditional attributes are changed
      temporarily; the change is reverted in a ``finally`` clause so a
      failing assertion (or a failing train/predict) cannot leak the
      enabled attributes into whatever uses the same instance next.
    """
    knows_probabilities = \
        'probabilities' in clf.ca.keys() and clf.params.probability
    enable_ca = ['estimates']
    if knows_probabilities:
        enable_ca += ['probabilities']

    clf.ca.change_temporarily(enable_ca=enable_ca)
    try:
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual(len(clf.ca.probabilities),
                             len(testdata.samples))
    finally:
        clf.ca.reset_changed_temporarily()
def test_svms(self, clf):
    """Check that an SVM's estimates differ from its predictions.

    When the classifier knows about probabilities and has them set, the
    number of probabilities must match the number of test samples.

    Parameters
    ----------
    clf
      Classifier under test.  Conditional attributes enabled for the
      duration of the test are always reset, even if the test fails
      half-way, so the instance is handed back in its original state.
    """
    knows_probabilities = \
        'probabilities' in clf.ca.keys() and clf.params.probability
    enable_ca = ['estimates']
    if knows_probabilities:
        enable_ca += ['probabilities']

    clf.ca.change_temporarily(enable_ca=enable_ca)
    try:
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual(len(clf.ca.probabilities),
                             len(testdata.samples))
    finally:
        # undo change_temporarily() no matter how the test went
        clf.ca.reset_changed_temporarily()
def _forward_dataset(self, ds):
    """Forward-map a dataset, optionally one chunk at a time.

    Without a chunks attribute the whole dataset goes through
    ``_forward_dataset_helper`` in one piece.  Otherwise the dataset is
    split on that attribute, each piece is processed by the helper, and
    the results are stacked again with the feature and dataset
    attributes of the input put back.
    """
    if self.__chunks_attr is None:
        return self._forward_dataset_helper(ds)

    # strip down dataset to speedup local processing
    # NOTE(review): sa=None presumably keeps all sample attributes --
    # confirm against Dataset.copy
    keep_sa = [] if self.__attr_strategy == "remove" else None
    stripped = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])

    # process all chunks individually
    # use a customsplitter to speed-up splitting
    chunk_splitter = Splitter(self.__chunks_attr)
    processed = []
    for chunk_ds in chunk_splitter.generate(stripped):
        processed.append(self._forward_dataset_helper(chunk_ds))

    # and merge them again
    merged = vstack(processed)
    # put back attributes
    merged.fa.update(ds.fa)
    merged.a.update(ds.a)
    return merged
def test_exclude_targets_combinations():
    """Check NFold chained with ExcludeTargetsCombinationsPartitioner(k=2).

    3 chunks times the 6 ways of picking 2 out of 4 targets must give
    18 partitionings, each with a distinct (targets, chunks) signature
    in its second (testing) part.
    """
    nfold = NFoldPartitioner()
    exclude_two = ExcludeTargetsCombinationsPartitioner(
        k=2, targets_attr="targets", space="partitions")
    partitioner = ChainNode([nfold, exclude_two], space="partitions")

    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(
        snr=0.0, nlabels=4, perlabel=3, nchunks=3,
        nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)

    splitter = Splitter("partitions")
    # testing parts of all partitionings; unpacking insists on there
    # being at least two parts per partitioning
    testing_parts = []
    for pds in partitions:
        _, teds = list(splitter.generate(pds))[:2]
        testing_parts.append(teds)

    combs = [tuple(np.unique(teds.targets)) for teds in testing_parts]
    comb_chunks = [comb + tuple(np.unique(teds.chunks))
                   for comb, teds in zip(combs, testing_parts)]

    # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(combs)), 6)
    # all unique
    assert_equal(len(set(comb_chunks)), 3 * 6)
def _forward_dataset(self, ds):
    """Forward-map a dataset, chunk-wise if a chunks attribute is set.

    With no chunks attribute configured the dataset is handed to
    ``_forward_dataset_helper`` as a whole.  Otherwise a stripped-down
    shallow copy is split by that attribute, every piece is passed to
    the helper, the pieces are stacked, and ``fa``/``a`` of the input
    dataset are restored on the result.
    """
    chunks_attr = self.__chunks_attr
    if chunks_attr is None:
        return self._forward_dataset_helper(ds)

    # strip down dataset to speedup local processing
    if self.__attr_strategy == 'remove':
        sa_to_keep = []
    else:
        # NOTE(review): presumably "keep everything" -- verify against
        # Dataset.copy
        sa_to_keep = None
    proc_ds = ds.copy(deep=False, sa=sa_to_keep, fa=[], a=[])

    # process all chunks individually
    # use a customsplitter to speed-up splitting
    pieces = Splitter(chunks_attr).generate(proc_ds)
    # and merge them again
    result = vstack([self._forward_dataset_helper(piece)
                     for piece in pieces])

    # put back attributes
    result.fa.update(ds.fa)
    result.a.update(ds.a)
    return result
def _sl_call(self, dataset, roi_ids, nproc):
    """Run the ad-hoc searchlight on `dataset` for the given ROI centers.

    The work is organized in the phases announced by the debug messages:

    1. generate the partitions and split each into its first two parts
       (training, testing);
    2. group samples into blocks, where all samples of a block share
       the label and the role (part 1, part 2, or neither) in every
       split;
    3. compute per-block statistics (``_compute_pb_stats``) and reserve
       space for per-label statistics (``_reserve_pl_stats_space``);
    4. obtain the feature ids of every ROI from the query engine, or
       reuse the stored ones;
    5. loop over the splits, let ``_sl_call_on_a_split`` produce
       targets and predictions, and turn them into results.

    Parameters
    ----------
    dataset
      Dataset to operate on; ``dataset.samples`` must be 2-dimensional.
    roi_ids
      Ids of the ROI centers.  Each one is passed to
      ``queryengine.query_byid`` and the sequence is stored as
      ``fa['center_ids']`` of the output.
    nproc
      Not referenced in this method.

    Returns
    -------
    Dataset
      Results with ``sa['cvfolds']`` and ``fa['center_ids']`` assigned.
      ``sa['targets']`` is assigned only if targets were collected,
      which happens when no error function is set.

    Raises
    ------
    ValueError
      If the samples are not 2-dimensional or ``self._indexsum`` is
      neither 'sparse' nor 'fancy'.
    NotImplementedError
      (Only with ``__debug__``) if a partition's targets differ from
      the targets of `dataset`.
    RuntimeError
      If a split uses the same sample more than once within one of its
      parts, or the ROI feature ids are neither a list nor a scipy
      sparse matrix.
    """
    # Local bindings
    generator = self.generator
    qe = self.queryengine
    errorfx = self.errorfx

    if __debug__:
        # only needed for the timing report in the final debug message
        time_start = time.time()

    targets_sa_name = self._get_space()
    targets_sa = dataset.sa[targets_sa_name]

    if __debug__:
        debug_slc_ = 'SLC_' in debug.active

    # get the dataset information into easy vars
    X = dataset.samples
    if len(X.shape) != 2:
        raise ValueError(
            'Unlike a classifier, %s (for now) operates on already'
            'flattened datasets' % (self.__class__.__name__))
    labels = targets_sa.value
    ulabels = targets_sa.unique
    nlabels = len(ulabels)
    # map every label onto its position within the unique labels
    label2index = dict((l, il) for il, l in enumerate(ulabels))
    labels_numeric = np.array([label2index[l] for l in labels])
    self._ulabels_numeric = [label2index[l] for l in ulabels]
    # set the feature dimensions
    nsamples = len(X)
    nrois = len(roi_ids)
    s_shape = X.shape[1:]  # shape of a single sample
    # The shape of results (X is 2D here, so X.shape[2:] is empty)
    r_shape = (nrois, ) + X.shape[2:]

    def assign_ulabels(a):
        # Translate an array of label indices back into the labels
        # themselves, returning a new array of ulabels' dtype
        out = np.empty(shape=a.shape, dtype=ulabels.dtype)
        it = np.nditer([a, out],
                       flags=['external_loop', 'buffered'],
                       op_flags=[['readonly'],
                                 ['writeonly', 'allocate', 'no_broadcast']])
        for x, y in it:
            y[...] = ulabels[x]
        return it.operands[1]

    #
    # Everything toward optimization ;)
    #
    # Silly Yarik thinks that it might be worth to pre-compute
    # statistics per each feature within a block of the samples
    # which always come together in splits -- most often it is a
    # (chunk, label) combination, but since we simply use a
    # generator -- who knows! Therefore lets figure out what are
    # those blocks and operate on them instead of original samples.
    #
    # After additional thinking about this -- probably it would be
    # just minor additional improvements (ie not worth it) but
    # since it is coded already -- let it be so

    # 1. Query generator for the splits we will have
    if __debug__:
        debug(
            'SLC', 'Phase 1. Initializing partitions using %s on %s' %
            (generator, dataset))

    # Lets just create a dummy ds which will store for us actual sample
    # indices
    # XXX we could make it even more lightweight I guess...
    dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
    # default splitter keeps only partitions 1 and 2
    splitter = Splitter(attr=generator.get_space(), attr_values=[1, 2]) \
        if self._splitter is None \
        else self._splitter
    # without a generator the whole dataset is the only "partition"
    partitions = list(generator.generate(dataset_indicies)) \
        if generator \
        else [dataset_indicies]

    if __debug__:
        for p in partitions:
            if not (np.all(p.sa[targets_sa_name].value == labels)):
                raise NotImplementedError(
                    "%s does not yet support partitioners altering the targets "
                    "(e.g. permutators)" % self.__class__)

    nsplits = len(partitions)
    # ATM we need to keep the splits instead since they are used
    # in two places in the code: step 2 and 5
    # We care only about training and testing partitions (i.e. first two)
    splits = list(tuple(splitter.generate(ds_))[:2] for ds_ in partitions)
    del partitions  # not used any longer

    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug(
            'SLC', 'Phase 2. Blocking data for %i splits and %i labels' %
            (nsplits, nlabels))
    # array of indices for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations; -1 marks "sample not used in that split"
    combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
    # labels
    combinations[:, 0] = labels_numeric
    for ipartition, (split1, split2) in enumerate(splits):
        # samples of the splits are the original sample indices
        combinations[split1.samples[:, 0], 1 + ipartition] = 1
        combinations[split2.samples[:, 0], 1 + ipartition] = 2
        # Check for over-sampling, i.e. no same sample used twice here
        if not (len(np.unique(split1.samples[:, 0])) == len(split1)
                and len(np.unique(split2.samples[:, 0])) == len(split2)):
            raise RuntimeError(
                "%s needs a partitioner which does not reuse "
                "the same the same samples more than once" % self.__class__)
    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(list(set(descriptions)))
    nblocks = len(udescriptions)
    description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
    # Indices for samples to point to their block
    self.__sample2block = sample2block = \
        np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    #
    if __debug__:
        debug('SLC', 'Phase 3. Computing statistics for %i blocks' % (nblocks, ))

    self._compute_pb_stats(labels_numeric, X, (nblocks, ) + s_shape)

    # derived classes might decide differently on what they
    # actually need, so defer reserving the space and computing
    # stats to them
    self._reserve_pl_stats_space((nlabels, ) + s_shape)

    # results
    if errorfx is mean_mismatch_error:
        # if we know how it would look like, prepare the storage
        results = np.zeros((nsplits, ) + r_shape)
    else:
        # Otherwise delay assembling the results
        results = []

    all_targets, all_cvfolds = [], []

    # 4. Lets deduce all neighbors... might need to be RF into the
    #    parallel part later on
    # TODO: needs OPT since this is the step consuming 50% of time
    #       or more allow to cache them entirely so this would
    #       not be an unnecessary burden during permutation testing
    if not self.reuse_neighbors or self.__roi_fids is None:
        if __debug__:
            debug(
                'SLC', 'Phase 4. Deducing neighbors information for %i ROIs' %
                (nrois, ))
        roi_fids = [qe.query_byid(f) for f in roi_ids]
    else:
        if __debug__:
            debug(
                'SLC', 'Phase 4. Reusing neighbors information for %i ROIs' %
                (nrois, ))
        roi_fids = self.__roi_fids

    self.ca.roi_feature_ids = roi_fids

    roi_sizes = []
    if isinstance(roi_fids, list):
        nroi_fids = len(roi_fids)
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = [len(x) for x in roi_fids]
    elif externals.exists('scipy') and isinstance(roi_fids, sps.spmatrix):
        # sparse matrix (one column per ROI) stored by a previous call
        nroi_fids = roi_fids.shape[1]
        if self.ca.is_enabled('roi_sizes'):
            # very expensive operation, so better not to ask over again
            # roi_sizes = [roi_fids.getrow(r).nnz for r in range(nroi_fids)]
            warning(
                "Since 'sparse' trick is used, extracting sizes of "
                "roi's are expensive at this point. Get them from the "
                ".ca value of the original instance before "
                "calling again and using reuse_neighbors")
    else:
        raise RuntimeError("Should not be reachable")

    # Since this is ad-hoc implementation of the searchlight, we are not passing
    # those via ds.a but rather assign directly to self.ca
    self.ca.roi_sizes = roi_sizes

    indexsum = self._indexsum
    if indexsum == 'sparse':
        if not self.reuse_neighbors or self.__roi_fids is None:
            if __debug__:
                debug(
                    'SLC', 'Phase 4b. Converting neighbors to sparse matrix '
                    'representation')
            # convert to "sparse representation" where column j contains
            # 1s only at the roi_fids[j] indices
            roi_fids = inds_to_coo(roi_fids,
                                   shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError("Do not know how to deal with indexsum=%s" % indexsum)

    # Store roi_fids (in whatever representation they are by now) so a
    # later call can reuse them
    if self.reuse_neighbors and self.__roi_fids is None:
        self.__roi_fids = roi_fids

    # 5. Lets do actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop')

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out for a given splits the blocks we want to work
        # with
        # sample_indices
        training_sis = split[0].samples[:, 0]
        testing_sis = split[1].samples[:, 0]

        # That is the GNB specificity
        targets, predictions = self._sl_call_on_a_split(
            split, X,  # X2 might light to go
            training_sis, testing_sis,
            ## training_nsamples, # GO? == np.sum(pl.nsamples)
            ## training_non0labels,
            ## pl.sums, pl.means, pl.sums2, pl.variances,
            # passing nroi_fids as well since in 'sparse' way it has no 'length'
            nroi_fids, roi_fids,
            indexsum_fx,
            labels_numeric,
        )

        # assess the errors
        if __debug__:
            debug('SLC', " Assessing accuracies")

        if errorfx is mean_mismatch_error:
            # fraction of mismatching predictions, computed for all
            # columns of predictions at once
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
            all_cvfolds += [isplit]
        elif errorfx:
            # somewhat silly but a way which allows to use pre-crafted
            # error functions without a chance to screw up
            result = np.atleast_2d(
                np.array([
                    errorfx(fpredictions, targets)
                    for fpredictions in predictions.T
                ]))
            results.append(result)
            all_cvfolds += [isplit] * result.shape[0]
        else:
            # and if no errorfx -- we just need to assign original
            # labels to the predictions BUT keep in mind that it is a matrix
            results.append(assign_ulabels(predictions))
            all_targets += [ulabels[i] for i in targets]
            all_cvfolds += [isplit] * len(targets)

        pass  # end of the split loop

    if isinstance(results, list):
        # we have just collected them, now they need to be vstacked
        results = np.vstack(results)
        assert (results.ndim >= 2)

    if __debug__:
        debug(
            'SLC', "%s._call() is done in %.3g sec" %
            (self.__class__.__name__, time.time() - time_start))

    out = Dataset(results)
    if all_targets:
        out.sa['targets'] = all_targets
    out.sa['cvfolds'] = all_cvfolds
    out.fa['center_ids'] = roi_ids
    return out
def test_n_group_split(self):
    """Test NGroupPartitioner together with order reversal in Splitter.

    Every partitioning is run twice: once through a plain
    ``Splitter(attr='partitions')`` and once with ``reverse=True``.  The
    reversed run must yield the same two datasets per split in swapped
    order, which is why the expectations below index the split with
    ``isreversed`` / ``1 - isreversed``.
    """
    # Test 2 groups like HalfSplitter first
    hs = NGroupPartitioner(2)

    for isreversed, partitioner in enumerate((hs, hs)):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p))
                  for p in partitioner.generate(self.data)]
        self.assertEqual(len(splits), 2)

        for p in splits:
            self.assertEqual(len(p), 2)
            self.assertEqual(p[0].nsamples, 50)
            self.assertEqual(p[1].nsamples, 50)

        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])

    # check if it works on pure odd and even chunk ids
    # (re-partition one half of the last split generated above)
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        # identity check: we only care that both parts were produced
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)

    # now test more groups
    s5 = NGroupPartitioner(5)

    # get the splits
    for isreversed, s5splitter in enumerate((s5, s5)):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p))
                  for p in s5splitter.generate(self.data)]

        # must have 5 splits -- one per group
        self.assertEqual(len(splits), 5)

        # check split content
        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [2, 3, 4, 5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [2, 3])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 4, 5, 6, 7, 8, 9])
        # ...
        assert_array_equal(splits[4][1 - isreversed].sa['chunks'].unique,
                           [8, 9])
        assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4, 5, 6, 7])

    # Test for too many groups: requesting 20 groups from this data
    # (chunks 0-9 in the checks above) has to raise ValueError
    def splitcall(spl, dat):
        return list(spl.generate(dat))

    s20 = NGroupPartitioner(20)
    self.assertRaises(ValueError, splitcall, s20, self.data)
def _sl_call(self, dataset, roi_ids, nproc):
    """Call to GNBSearchlight

    Runs a Gaussian Naive Bayes searchlight: per-block sums and sums of
    squares are computed once, then for every split the per-label
    means/variances are assembled from the training blocks and all ROIs
    are classified at once through ``indexsum_fx``.

    Parameters
    ----------
    dataset : Dataset
      Dataset whose ``samples`` must already be 2D (samples x features).
    roi_ids : sequence
      Ids of the ROI centers; each one is handed to
      ``self.queryengine.query_byid`` to obtain its feature ids.
    nproc
      Not used by this implementation.

    Returns
    -------
    tuple
      ``(Dataset(results), roi_sizes)`` where ``results`` has one row
      per split and one column per ROI, and ``roi_sizes`` is a list with
      the number of features per ROI (empty unless the ``roi_sizes``
      conditional attribute is enabled).

    Raises
    ------
    ValueError
      If ``dataset.samples`` is not 2D or ``self._indexsum`` is neither
      ``'sparse'`` nor ``'fancy'``.
    RuntimeError
      If the partitioner places the same sample more than once into one
      part of a split.
    """
    # Local bindings
    gnb = self.gnb
    params = gnb.params
    generator = self.generator
    errorfx = self.errorfx
    qe = self.queryengine

    if __debug__:
        time_start = time.time()

    targets_sa_name = gnb.get_space()
    targets_sa = dataset.sa[targets_sa_name]

    # get the dataset information into easy vars
    X = dataset.samples
    if len(X.shape) != 2:
        raise ValueError(
            'Unlike GNB, GNBSearchlight (for now) operates on already '
            'flattened datasets')

    labels = targets_sa.value
    ulabels = targets_sa.unique
    nlabels = len(ulabels)
    label2index = dict((l, il) for il, l in enumerate(ulabels))
    labels_numeric = np.array([label2index[l] for l in labels])
    ulabels_numeric = [label2index[l] for l in ulabels]
    # set the feature dimensions
    nsamples = len(X)
    nrois = len(roi_ids)
    s_shape = X.shape[1:]           # shape of a single sample
    # The shape of results
    r_shape = (nrois,) + X.shape[2:]

    #
    # Everything toward optimization ;)
    #
    # Silly Yarik thinks that it might be worth to pre-compute
    # statistics per each feature within a block of the samples
    # which always come together in splits -- most often it is a
    # (chunk, label) combination, but since we simply use a
    # generator -- who knows! Therefore lets figure out what are
    # those blocks and operate on them instead of original samples.
    #
    # After additional thinking about this -- probably it would be
    # just minor additional improvements (ie not worth it) but
    # since it is coded already -- let it be so

    # 1. Query generator for the splits we will have
    if __debug__:
        debug('SLC',
              'Phase 1. Initializing partitions using %s on %s'
              % (generator, dataset))

    # Lets just create a dummy ds which will store for us actual sample
    # indices
    # XXX we could make it even more lightweight I guess...
    dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
    splitter = Splitter(attr=generator.get_space())
    splits = [tuple(splitter.generate(ds_))
              for ds_ in generator.generate(dataset_indicies)]
    nsplits = len(splits)

    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug('SLC',
              'Phase 2. Blocking data for %i splits and %i labels'
              % (nsplits, nlabels))
    # array of indices for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations
    combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
    # labels
    combinations[:, 0] = labels_numeric
    for ipartition, (split1, split2) in enumerate(splits):
        combinations[split1.samples[:, 0], 1 + ipartition] = 1
        combinations[split2.samples[:, 0], 1 + ipartition] = 2
        # Check for over-sampling, i.e. no same sample used twice here
        if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                len(np.unique(split2.samples[:, 0])) == len(split2)):
            raise RuntimeError(
                "GNBSearchlight needs a partitioner which does not reuse "
                "the same the same samples more than once")
    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(set(descriptions))
    nblocks = len(udescriptions)
    description2block = dict((d, i) for i, d in enumerate(udescriptions))
    # Indices for samples to point to their block
    sample2block = np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    #
    if __debug__:
        debug('SLC',
              'Phase 3. Computing statistics for %i blocks' % (nblocks,))

    #
    # reusable containers which should stay of the same size
    #

    # sums and sums of squares per each block
    sums = np.zeros((nblocks, ) + s_shape)
    # sums of squares
    sums2 = np.zeros((nblocks, ) + s_shape)

    # per each label:
    means = np.zeros((nlabels, ) + s_shape)
    # means of squares for stddev computation
    means2 = np.zeros((nlabels, ) + s_shape)
    variances = np.zeros((nlabels, ) + s_shape)
    # degenerate dimension are added for easy broadcasting later on
    nsamples_per_class = np.zeros((nlabels,) + (1,) * len(s_shape))

    # results
    results = np.zeros((nsplits,) + r_shape)

    block_counts = np.zeros((nblocks,))
    block_labels = [None] * nblocks

    X2 = np.square(X)
    # silly way for now
    for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
        sums[ib] += s
        sums2[ib] += s2
        block_counts[ib] += 1
        if block_labels[ib] is None:
            block_labels[ib] = l
        else:
            # all samples of a block must carry the same label
            assert(block_labels[ib] == l)
    block_labels = np.asanyarray(block_labels)
    # additional silly tests for paranoid
    # (compare by value: identity of string objects is not guaranteed)
    assert(block_labels.dtype.kind == 'i')

    # 4. Lets deduce all neighbors... might need to be RF into the
    #    parallel part later on
    if __debug__:
        debug('SLC',
              'Phase 4. Deducing neighbors information for %i ROIs'
              % (nrois,))
    roi_fids = [qe.query_byid(f) for f in roi_ids]
    nroi_fids = len(roi_fids)
    # makes sense to waste precious ms only if ca is enabled
    if self.ca.is_enabled('roi_sizes'):
        roi_sizes = [len(x) for x in roi_fids]
    else:
        roi_sizes = []

    indexsum = self._indexsum
    if indexsum == 'sparse':
        if __debug__:
            debug('SLC',
                  'Phase 4b. Converting neighbors to sparse matrix '
                  'representation')
        # convert to "sparse representation" where column j contains
        # 1s only at the roi_fids[j] indices
        roi_fids = inds_to_coo(roi_fids,
                               shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError(
            "Do not know how to deal with indexsum=%s" % indexsum)

    # 5. Lets do actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop')

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out for a given splits the blocks we want to work
        # with
        # sample_indices
        training_sis = split[0].samples[:, 0]
        # convert to blocks training split
        training_bis = np.unique(sample2block[training_sis])

        # now lets do our GNB business
        training_nsamples = 0
        for il, l in enumerate(ulabels_numeric):
            bis_il = training_bis[block_labels[training_bis] == l]
            nsamples_per_class[il] = N_float = \
                float(np.sum(block_counts[bis_il]))
            training_nsamples += N_float
            if N_float == 0.0:
                variances[il] = means[il] = means2[il] = 0.
            else:
                means[il] = np.sum(sums[bis_il], axis=0) / N_float
                # Not yet normed
                means2[il] = np.sum(sums2[bis_il], axis=0)

        ## Actually compute the non-0 variances
        non0labels = (nsamples_per_class.squeeze() != 0)
        if np.all(non0labels):
            # For a possible tiny speed up avoiding copying and
            # using (no) slicing
            non0labels = slice(None)

        if params.common_variance:
            variances[:] = \
                np.sum(means2 - nsamples_per_class * np.square(means),
                       axis=0) \
                / training_nsamples
        else:
            variances[non0labels] = \
                (means2
                 - nsamples_per_class * np.square(means))[non0labels] \
                / nsamples_per_class[non0labels]

        # assign priors
        priors = gnb._get_priors(
            nlabels, training_nsamples, nsamples_per_class)

        # proceed in a way we have in GNB code with logprob=True,
        # i.e. operating within the exponents -- should lead to some
        # performance advantage
        norm_weight = -0.5 * np.log(2 * np.pi * variances)
        # last added dimension would be for ROIs
        logpriors = np.log(priors[:, np.newaxis, np.newaxis])

        if __debug__:
            debug('SLC', " 'Training' is done")

        # Now it is time to "classify" our samples.
        # and for that we first need to compute corresponding
        # probabilities (or may be un
        testing_sis = split[1].samples[:, 0]
        data = X[testing_sis]
        targets = labels_numeric[testing_sis]

        # argument of exponentiation
        scaled_distances = \
            -0.5 * (((data - means[:, np.newaxis, ...]) ** 2)
                    / variances[:, np.newaxis, ...])

        # incorporate the normalization from normals
        lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances

        ## First we need to reshape to get class x samples x features
        lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1,))

        ## Now we come to naive part which requires looping
        ## through all spheres
        if __debug__:
            debug('SLC', " Doing 'Searchlight'")
        # resultant logprobs for each class x sample x roi
        lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids,))
        indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

        lprob_cs_sl += logpriors
        # for each of the ROIs take the class with maximal (log)probability
        # no need to map back to the original labels: targets are
        # numeric as well
        predictions = lprob_cs_sl.argmax(axis=0)

        # assess the errors
        if __debug__:
            debug('SLC', " Assessing accuracies")

        if errorfx is mean_mismatch_error:
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
        else:
            # somewhat silly but a way which allows to use pre-crafted
            # error functions without a chance to screw up
            for i, fpredictions in enumerate(predictions.T):
                results[isplit, i] = errorfx(fpredictions, targets)

    if __debug__:
        debug('SLC', "GNBSearchlight is done in %.3g sec"
              % (time.time() - time_start))

    return Dataset(results), roi_sizes
def _sl_call(self, dataset, roi_ids, nproc):
    """Call to SimpleStatBaseSearchlight

    Groups the samples into blocks which always travel together through
    the splits, lets the derived class pre-compute per-block statistics
    (``_compute_pb_stats`` / ``_reserve_pl_stats_space``), determines the
    features of every ROI and then delegates training/prediction of each
    split to ``_sl_call_on_a_split``.

    Parameters
    ----------
    dataset : Dataset
      Dataset whose ``samples`` must already be 2D (samples x features).
    roi_ids : sequence
      Ids of the ROI centers; each one is handed to
      ``self.queryengine.query_byid`` unless previously stored neighbors
      are reused (``self.reuse_neighbors``).
    nproc
      Not used by this implementation.

    Returns
    -------
    Dataset
      Errors with one row per split and one column per ROI.

    Raises
    ------
    ValueError
      If ``dataset.samples`` is not 2D or ``self._indexsum`` is neither
      ``'sparse'`` nor ``'fancy'``.
    NotImplementedError
      (only when ``__debug__``) if the partitioner alters the targets.
    RuntimeError
      If the partitioner places the same sample more than once into one
      part of a split, or the ROI feature ids are of an unexpected type.
    """
    # Local bindings
    generator = self.generator
    qe = self.queryengine
    errorfx = self.errorfx

    if __debug__:
        time_start = time.time()

    targets_sa_name = self._get_space()
    targets_sa = dataset.sa[targets_sa_name]

    # get the dataset information into easy vars
    X = dataset.samples
    if len(X.shape) != 2:
        raise ValueError(
            'Unlike a classifier, %s (for now) operates on already '
            'flattened datasets' % (self.__class__.__name__))

    labels = targets_sa.value
    ulabels = targets_sa.unique
    nlabels = len(ulabels)
    label2index = dict((l, il) for il, l in enumerate(ulabels))
    labels_numeric = np.array([label2index[l] for l in labels])
    self._ulabels_numeric = [label2index[l] for l in ulabels]
    # set the feature dimensions
    nsamples = len(X)
    nrois = len(roi_ids)
    s_shape = X.shape[1:]           # shape of a single sample
    # The shape of results
    r_shape = (nrois,) + X.shape[2:]

    #
    # Everything toward optimization ;)
    #
    # Silly Yarik thinks that it might be worth to pre-compute
    # statistics per each feature within a block of the samples
    # which always come together in splits -- most often it is a
    # (chunk, label) combination, but since we simply use a
    # generator -- who knows! Therefore lets figure out what are
    # those blocks and operate on them instead of original samples.
    #
    # After additional thinking about this -- probably it would be
    # just minor additional improvements (ie not worth it) but
    # since it is coded already -- let it be so

    # 1. Query generator for the splits we will have
    if __debug__:
        debug('SLC',
              'Phase 1. Initializing partitions using %s on %s'
              % (generator, dataset))

    # Lets just create a dummy ds which will store for us actual sample
    # indices
    # XXX we could make it even more lightweight I guess...
    dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
    splitter = Splitter(attr=generator.get_space())
    partitions = list(generator.generate(dataset_indicies))
    if __debug__:
        for p in partitions:
            if not (np.all(p.sa[targets_sa_name].value == labels)):
                raise NotImplementedError(
                    "%s does not yet support partitioners altering the targets "
                    "(e.g. permutators)" % self.__class__)

    nsplits = len(partitions)
    # ATM we need to keep the splits instead since they are used
    # in two places in the code: step 2 and 5
    splits = [tuple(splitter.generate(ds_)) for ds_ in partitions]
    del partitions                    # not used any longer

    # 2. Figure out the new 'chunks x labels' blocks of combinations
    #    of samples
    if __debug__:
        debug('SLC',
              'Phase 2. Blocking data for %i splits and %i labels'
              % (nsplits, nlabels))
    # array of indices for label, split1, split2, ...
    # through which we will pass later on to figure out
    # unique combinations
    combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
    # labels
    combinations[:, 0] = labels_numeric
    for ipartition, (split1, split2) in enumerate(splits):
        combinations[split1.samples[:, 0], 1 + ipartition] = 1
        combinations[split2.samples[:, 0], 1 + ipartition] = 2
        # Check for over-sampling, i.e. no same sample used twice here
        if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                len(np.unique(split2.samples[:, 0])) == len(split2)):
            raise RuntimeError(
                "%s needs a partitioner which does not reuse "
                "the same the same samples more than once"
                % self.__class__)
    # sample descriptions -- should be unique for
    # samples within the same block
    descriptions = [tuple(c) for c in combinations]
    udescriptions = sorted(set(descriptions))
    nblocks = len(udescriptions)
    description2block = dict((d, i) for i, d in enumerate(udescriptions))
    # Indices for samples to point to their block
    self.__sample2block = \
        np.array([description2block[d] for d in descriptions])

    # 3. Compute statistics per each block
    #
    if __debug__:
        debug('SLC',
              'Phase 3. Computing statistics for %i blocks' % (nblocks,))

    self._compute_pb_stats(labels_numeric, X, (nblocks,) + s_shape)

    # derived classes might decide differently on what they
    # actually need, so defer reserving the space and computing
    # stats to them
    self._reserve_pl_stats_space((nlabels, ) + s_shape)

    # results
    results = np.zeros((nsplits,) + r_shape)

    # 4. Lets deduce all neighbors... might need to be RF into the
    #    parallel part later on
    # TODO: needs OPT since this is the step consuming 50% of time
    #       or more allow to cache them entirely so this would
    #       not be an unnecessary burden during permutation testing
    if not self.reuse_neighbors or self.__roi_fids is None:
        if __debug__:
            debug('SLC',
                  'Phase 4. Deducing neighbors information for %i ROIs'
                  % (nrois,))
        roi_fids = [qe.query_byid(f) for f in roi_ids]
    else:
        if __debug__:
            debug('SLC',
                  'Phase 4. Reusing neighbors information for %i ROIs'
                  % (nrois,))
        roi_fids = self.__roi_fids

    self.ca.roi_feature_ids = roi_fids

    roi_sizes = []
    if isinstance(roi_fids, list):
        nroi_fids = len(roi_fids)
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = [len(x) for x in roi_fids]
    elif externals.exists('scipy') and isinstance(roi_fids, sps.spmatrix):
        nroi_fids = roi_fids.shape[1]
        if self.ca.is_enabled('roi_sizes'):
            # very expensive operation, so better not to ask over again
            # roi_sizes = [roi_fids.getrow(r).nnz for r in range(nroi_fids)]
            warning("Since 'sparse' trick is used, extracting sizes of "
                    "roi's are expensive at this point. Get them from the "
                    ".ca value of the original instance before "
                    "calling again and using reuse_neighbors")
    else:
        raise RuntimeError("Should not be reachable")

    # Since this is ad-hoc implementation of the searchlight, we are not
    # passing those via ds.a but rather assign directly to self.ca
    self.ca.roi_sizes = roi_sizes

    indexsum = self._indexsum
    if indexsum == 'sparse':
        # stored neighbors were already converted on the call which
        # stored them, so convert only freshly queried ones
        if not self.reuse_neighbors or self.__roi_fids is None:
            if __debug__:
                debug('SLC',
                      'Phase 4b. Converting neighbors to sparse matrix '
                      'representation')
            # convert to "sparse representation" where column j contains
            # 1s only at the roi_fids[j] indices
            roi_fids = inds_to_coo(roi_fids,
                                   shape=(dataset.nfeatures, nroi_fids))
        indexsum_fx = lastdim_columnsums_spmatrix
    elif indexsum == 'fancy':
        indexsum_fx = lastdim_columnsums_fancy_indexing
    else:
        raise ValueError(
            "Do not know how to deal with indexsum=%s" % indexsum)

    # Store roi_fids
    if self.reuse_neighbors and self.__roi_fids is None:
        self.__roi_fids = roi_fids

    # 5. Lets do actual "splitting" and "classification"
    if __debug__:
        debug('SLC', 'Phase 5. Major loop')

    for isplit, split in enumerate(splits):
        if __debug__:
            debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
        # figure out for a given splits the blocks we want to work
        # with
        # sample_indices
        training_sis = split[0].samples[:, 0]
        testing_sis = split[1].samples[:, 0]

        # That is the GNB specificity
        targets, predictions = self._sl_call_on_a_split(
            split, X,               # X2 might light to go
            training_sis, testing_sis,
            ## training_nsamples,     # GO? == np.sum(pl.nsamples)
            ## training_non0labels,
            ## pl.sums, pl.means, pl.sums2, pl.variances,
            # passing nroi_fids as well since in 'sparse' way it has
            # no 'length'
            nroi_fids, roi_fids,
            indexsum_fx,
            labels_numeric,
            )

        # assess the errors
        if __debug__:
            debug('SLC', " Assessing accuracies")

        if errorfx is mean_mismatch_error:
            results[isplit, :] = \
                (predictions != targets[:, None]).sum(axis=0) \
                / float(len(targets))
        else:
            # somewhat silly but a way which allows to use pre-crafted
            # error functions without a chance to screw up
            for i, fpredictions in enumerate(predictions.T):
                results[isplit, i] = errorfx(fpredictions, targets)

    if __debug__:
        debug('SLC', "%s._call() is done in %.3g sec" %
              (self.__class__.__name__, time.time() - time_start))

    return Dataset(results)
def test_n_group_split(self):
    """Test NGroupPartitioner together with order reversal in Splitter.

    Every partitioning is run twice: once through a plain
    ``Splitter(attr='partitions')`` and once with ``reverse=True``.  The
    reversed run must yield the same two datasets per split in swapped
    order, which is why the expectations below index the split with
    ``isreversed`` / ``1 - isreversed``.
    """
    # Test 2 groups like HalfSplitter first
    hs = NGroupPartitioner(2)

    for isreversed, partitioner in enumerate((hs, hs)):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p))
                  for p in partitioner.generate(self.data)]
        # assertTrue instead of the deprecated failUnless alias
        self.assertTrue(len(splits) == 2)

        for p in splits:
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4])

    # check if it works on pure odd and even chunk ids
    # (re-partition one half of the last split generated above)
    moresplits = [list(spl.generate(p))
                  for p in hs.generate(splits[0][0])]

    for split in moresplits:
        # identity check: we only care that both parts were produced
        self.assertTrue(split[0] is not None)
        self.assertTrue(split[1] is not None)

    # now test more groups
    s5 = NGroupPartitioner(5)

    # get the splits
    for isreversed, s5splitter in enumerate((s5, s5)):
        if isreversed:
            spl = Splitter(attr='partitions', reverse=True)
        else:
            spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p))
                  for p in s5splitter.generate(self.data)]

        # must have 5 splits -- one per group
        self.assertTrue(len(splits) == 5)

        # check split content
        assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                           [0, 1])
        assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                           [2, 3, 4, 5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                           [2, 3])
        assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                           [0, 1, 4, 5, 6, 7, 8, 9])
        # ...
        assert_array_equal(splits[4][1 - isreversed].sa['chunks'].unique,
                           [8, 9])
        assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                           [0, 1, 2, 3, 4, 5, 6, 7])

    # Test for too many groups: requesting 20 groups from this data
    # (chunks 0-9 in the checks above) has to raise ValueError
    def splitcall(spl, dat):
        return list(spl.generate(dat))

    s20 = NGroupPartitioner(20)
    self.assertRaises(ValueError, splitcall, s20, self.data)