def test_permute_chunks():
    # Checks the 'chunks' strategy of AttributePermutator: after permuting,
    # the overall target order must differ, the samples must be unchanged and
    # the targets inside every chunk must still be sorted.

    def is_sorted(values):
        return np.array_equal(np.sort(values), values)

    ds = give_data()

    # Replace the target labels by a running index. Targets are not permuted
    # within chunks, so with the stock labels assure=True would be an error.
    n_targets = len(ds.sa.targets)
    ds.sa['targets'] = range(n_targets)

    permutator = AttributePermutator(attr='targets',
                                     chunk_attr='chunks',
                                     strategy='chunks',
                                     assure=True)
    permuted = permutator(ds)

    # global order changed, data untouched
    assert_false(is_sorted(permuted.sa.targets))
    assert_true(np.array_equal(permuted.samples, ds.samples))

    # within each chunk the targets are still in ascending order
    for chunk_id in np.unique(permuted.sa.chunks):
        within_chunk = permuted[permuted.sa.chunks == chunk_id]
        assert_true(is_sorted(within_chunk.sa.targets))

    # without a chunk_attr, calling the 'chunks' strategy must raise
    permutator = AttributePermutator(attr='targets', strategy='chunks')
    assert_raises(ValueError, permutator, ds)
def test_basic_collectable():
    # Covers the Collectable basics used here: default state, late and
    # immediate assignment, str/repr round trip, shallow copy and the ban on
    # names with a leading underscore.
    coll = Collectable()

    # nothing is set on a fresh instance
    assert_equal(coll.name, None)
    assert_equal(coll.value, None)
    assert_equal(coll.__doc__, None)

    # assign name and value after construction
    coll.name = 'somename'
    coll.value = 12345
    assert_equal(coll.name, 'somename')
    assert_equal(coll.value, 12345)

    # provide everything to the constructor
    coll = Collectable('value', 'myname', "This is a test")
    assert_equal(coll.name, 'myname')
    assert_equal(coll.value, 'value')
    assert_equal(coll.__doc__, "This is a test")
    assert_equal(str(coll), 'myname')

    # repr must evaluate back into an equivalent object
    rebuilt = eval(repr(coll))
    assert_equal(rebuilt.name, 'myname')
    assert_equal(rebuilt.value, 'value')
    assert_equal(rebuilt.__doc__, "This is a test")

    # shallow copy does not create a view of value array
    coll.value = np.arange(5)
    shallow = copy.copy(coll)
    assert_false(shallow.value.base is coll.value)

    # names starting with _ are not allowed
    assert_raises(ValueError, coll._set_name, "_underscore")
def test_slicing(self):
    # Checks, via the numpy ``.base`` attribute, which partitioned and split
    # datasets share their samples with the source dataset and which do not.
    half_part = HalfPartitioner()
    splitter = Splitter(attr="partitions")

    partitioned = list(half_part.generate(self.data))
    for part in partitioned:
        # partitioned dataset shared the data
        assert_true(part.samples.base is self.data.samples)

    split_pairs = [list(splitter.generate(part))
                   for part in half_part.generate(self.data)]

    # with numpy 1.7.0b1 "chaining" was deprecated so let's create
    # check function appropriate for the given numpy version
    probe = np.arange(5)
    chained_view = probe[:4][:3]
    if chained_view.base is probe:
        # 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif chained_view.base.base is probe:
        # prior 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError("Uknown handling of .base by numpy")

    for pair in split_pairs:
        # we get slicing all the time
        assert_true(is_the_same_base(pair[0].samples))
        assert_true(is_the_same_base(pair[1].samples))

    splitter = Splitter(attr="partitions", noslicing=True)
    split_pairs = [list(splitter.generate(part))
                   for part in half_part.generate(self.data)]
    for pair in split_pairs:
        # no slicing at all
        assert_false(pair[0].samples.base is self.data.samples)
        assert_false(pair[1].samples.base is self.data.samples)

    nfold_part = NFoldPartitioner()
    splitter = Splitter(attr="partitions")
    split_pairs = [list(splitter.generate(part))
                   for part in nfold_part.generate(self.data)]
    last_index = len(split_pairs) - 1
    for index, pair in enumerate(split_pairs):
        # training only first and last split
        if index in (0, last_index):
            assert_true(is_the_same_base(pair[0].samples))
        else:
            assert_true(pair[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(pair[1].samples))

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={"chunks": np.tile([0, 1], 10)})
    oddeven_part = OddEvenPartitioner()
    splitter = Splitter(attr="partitions")

    partitioned = list(oddeven_part.generate(step_ds))
    for part in partitioned:
        # partitioned dataset shared the data
        assert_true(part.samples.base is step_ds.samples)

    split_pairs = [list(splitter.generate(part))
                   for part in oddeven_part.generate(step_ds)]
    assert_equal(len(split_pairs), 2)
    for pair in split_pairs:
        # we get slicing all the time
        assert_true(is_the_same_base(pair[0].samples, step_ds.samples))
        assert_true(is_the_same_base(pair[1].samples, step_ds.samples))
def test_strip_boundary():
    # A single boundary in 'btest' (between samples 19 and 20) must cost
    # exactly three samples when stripped with the (1, 2) setting used here.
    ds = datasets['hollow']
    ds.sa['btest'] = np.repeat([0, 1], 20)

    stripper = StripBoundariesSamples('btest', 1, 2)
    stripped = stripper(ds)

    assert_equal(len(stripped), len(ds) - 3)
    # none of the samples around the boundary may survive
    for sample_id in (19, 20, 21):
        assert_false(sample_id in stripped.samples.sid)
def test_balancer():
    # Exercises Balancer: marking vs. applying the selection, use as a
    # generator, `limit`, `include_offlimit`, fixed/fractional `amount` and
    # balancing on a feature attribute.
    #
    # The per-value counts are wrapped in list() before being compared or
    # turned into arrays: a bare `.values()` only equals a list where it
    # returns one (Python 2). Presumably get_nelements_per_value returns a
    # dict -- the `.values()` calls suggest so.
    ds = give_data()

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)

    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [2] * 4)

    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples
    # included
    assert_equal(
        list(get_nelements_per_value(
            res[res.sa.chunks == 3].sa.targets).values()),
        [2] * 4)
    assert_equal(list(get_nelements_per_value(res.sa.chunks).values()),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])

    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [1] * 4)

    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(list(get_nelements_per_value(ds.sa.targets).values()))
            * 0.5),
        np.array(list(get_nelements_per_value(res.sa.targets).values())))

    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.fa.one).values()), [4] * 2)
def test_reprs(self):
    # Very basic smoke test: repr() of every name in the partition module
    # that ends with 'Partitioner' must not contain 'ERROR'.
    import mvpa2.generators.partition as mgp

    partitioner_names = [name for name in dir(mgp)
                         if name.endswith('Partitioner')]
    for name in partitioner_names:
        # this one class is instantiated with three positional arguments,
        # all others with a single one
        if name == 'ExcludeTargetsCombinationsPartitioner':
            ctor_args = (1,) + (1, 1)
        else:
            ctor_args = (1,)
        partitioner_cls = getattr(mgp, name)
        text = repr(partitioner_cls(*ctor_args))
        assert_false('ERROR' in text)
def test_cosmo_do_not_store_unsupported_datatype():
    # A dataset attribute holding an arbitrary object must not produce an
    # 'a' entry in the map2cosmo output; adding an array attribute must.
    ds = Dataset(np.zeros((0, 0)))

    class ArbitraryClass(object):
        pass

    # only the unsupported value present -> no 'a' key
    ds.a['unused'] = ArbitraryClass()
    converted = cosmo.map2cosmo(ds)
    assert_false('a' in converted.keys())

    # with an additional array-valued attribute -> 'a' key shows up
    ds.a['foo'] = np.zeros((1,))
    converted = cosmo.map2cosmo(ds)
    assert_true('a' in converted.keys())
def test_splitter():
    # Runs Splitter on sample attributes, with custom attr_values/count and
    # noslicing, on a feature attribute, and finally chained in a ChainNode.
    ds = give_data()

    # split with defaults
    by_chunks = Splitter('chunks')
    # calling a splitter directly is expected to be unsupported
    assert_raises(NotImplementedError, by_chunks, ds)

    pieces = list(by_chunks.generate(ds))
    assert_equal(len(pieces), len(ds.sa['chunks'].unique))
    for piece in pieces:
        # basic slicing must have been performed
        assert_true(piece.samples.base is ds.samples)
        assert_equal(len(piece.sa['chunks'].unique), 1)
        assert_true('lastsplit' in piece.a)
    assert_true(pieces[-1].a.lastsplit)

    # now again, more customized
    by_targets = Splitter('targets',
                          attr_values=[0, 1, 1, 2, 3, 3, 3],
                          count=4,
                          noslicing=True)
    pieces = list(by_targets.generate(ds))
    assert_equal(len(pieces), 4)
    for piece in pieces:
        # basic slicing must NOT have been performed
        assert_false(piece.samples.base is ds.samples)
        assert_equal(len(piece.sa['targets'].unique), 1)
        assert_equal(len(piece.sa['chunks'].unique), 10)
    assert_true(pieces[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(pieces[1].samples, pieces[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    by_roi = Splitter('roi')
    pieces = list(by_roi.generate(ds))
    assert_equal(len(pieces), 2)
    for piece in pieces:
        assert_true(piece.samples.base is ds.samples)
        assert_equal(len(piece.fa['roi'].unique), 1)
        assert_equal(piece.shape, (100, 5))

    # and finally test chained splitters
    chained = ChainNode([by_targets, by_roi, by_chunks])
    pieces = list(chained.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(pieces), 80)
def test_discarded_boundaries(self):
    # Chains odd/even partitioning with boundary stripping and checks that
    # the samples around each of the three chunk boundaries are removed.
    ds = datasets["hollow"]
    # four runs
    ds.sa["chunks"] = np.repeat(np.arange(4), 10)

    # do odd even splitting for lots of boundaries in few splits
    chain = ChainNode([OddEvenPartitioner(),
                       StripBoundariesSamples("chunks", 1, 2)])
    kept_ids = [out.samples.sid for out in chain.generate(ds)]

    # both dataset should have the same samples, because the boundaries are
    # identical and the same sample should be stripped
    assert_array_equal(kept_ids[0], kept_ids[1])

    # we strip 3 samples per boundary
    assert_equal(len(kept_ids[0]), len(ds) - (3 * 3))
    for sample_id in (9, 10, 11, 19, 20, 21, 29, 30, 31):
        assert_false(sample_id in kept_ids[0])
def test_slicing(self):
    # Checks, via the numpy ``.base`` attribute, which partitioned and split
    # datasets share their samples with the source dataset and which do not.
    #
    # With numpy 1.7.0b1 the way ``.base`` is reported for a view of a view
    # changed: it may point straight at the owning array instead of at the
    # intermediate view. A hard-coded ``.base.base`` therefore only works on
    # older numpy, so the matching check is chosen by probing numpy once.
    probe = np.arange(5)
    chained_view = probe[:4][:3]
    if chained_view.base is probe:
        # 1.7.0b1 and later
        def is_the_same_base(x, base=self.data.samples):
            return x.base is base
    elif chained_view.base is not None and chained_view.base.base is probe:
        # prior to 1.7.0b1
        def is_the_same_base(x, base=self.data.samples):
            return x.base.base is base
    else:
        raise RuntimeError('Unknown handling of .base by numpy')

    hs = HalfPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(hs.generate(self.data))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is self.data.samples)

    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples))
        assert_true(is_the_same_base(s[1].samples))

    spl = Splitter(attr='partitions', noslicing=True)
    splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
    for s in splits:
        # no slicing at all
        assert_false(s[0].samples.base is self.data.samples)
        assert_false(s[1].samples.base is self.data.samples)

    nfs = NFoldPartitioner()
    spl = Splitter(attr='partitions')
    splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
    for i, s in enumerate(splits):
        # training only first and last split
        if i == 0 or i == len(splits) - 1:
            assert_true(is_the_same_base(s[0].samples))
        else:
            assert_true(s[0].samples.base is None)
        # we get slicing all the time
        assert_true(is_the_same_base(s[1].samples))

    step_ds = Dataset(np.random.randn(20, 2),
                      sa={'chunks': np.tile([0, 1], 10)})
    oes = OddEvenPartitioner()
    spl = Splitter(attr='partitions')
    splits = list(oes.generate(step_ds))
    for s in splits:
        # partitioned dataset shared the data
        assert_true(s.samples.base is step_ds.samples)

    splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
    assert_equal(len(splits), 2)
    for s in splits:
        # we get slicing all the time
        assert_true(is_the_same_base(s[0].samples, step_ds.samples))
        assert_true(is_the_same_base(s[1].samples, step_ds.samples))
def test_permute_chunks():
    # Checks the 'chunks' strategy of AttributePermutator: after permuting,
    # the overall target order must differ, the samples must be unchanged and
    # the targets inside every chunk must still be sorted.

    def is_sorted(values):
        return np.array_equal(np.sort(values), values)

    ds = give_data()

    # Replace the target labels by a running index. Targets are not permuted
    # within chunks, so with the stock labels assure=True would be an error.
    n_targets = len(ds.sa.targets)
    ds.sa['targets'] = range(n_targets)

    permutator = AttributePermutator(attr='targets',
                                     chunk_attr='chunks',
                                     strategy='chunks',
                                     assure=True)
    permuted = permutator(ds)

    # global order changed, data untouched
    assert_false(is_sorted(permuted.sa.targets))
    assert_true(np.array_equal(permuted.samples, ds.samples))

    # within each chunk the targets are still in ascending order
    for chunk_id in np.unique(permuted.sa.chunks):
        within_chunk = permuted[permuted.sa.chunks == chunk_id]
        assert_true(is_sorted(within_chunk.sa.targets))
def test_transpose():
    # TransposeMapper must swap the first two axes together with the sample
    # and feature attributes; applying it twice, or reversing it, must give
    # back the original dataset.
    from mvpa2.mappers.shape import TransposeMapper

    original = Dataset(np.arange(24).reshape(2, 3, 4),
                       sa={"testsa": np.arange(2)},
                       fa={"testfa": np.arange(3)})
    mapper = TransposeMapper()

    transposed = mapper(original)
    assert_equal(transposed.shape, (3, 2, 4))
    # the attribute collections trade places
    assert_true("testfa" in transposed.sa)
    assert_true("testsa" in transposed.fa)
    assert_false(transposed.fa is transposed.sa)

    # and back
    twice = mapper(transposed)
    assert_array_equal(twice.samples, original.samples)
    assert_equal(twice.sa, original.sa)
    assert_equal(twice.fa, original.fa)

    # or this way
    reversed_ds = mapper.reverse(transposed)
    assert_array_equal(reversed_ds.samples, original.samples)
    assert_equal(reversed_ds.sa, original.sa)
    assert_equal(reversed_ds.fa, original.fa)

    # both ways back must agree with each other
    assert_array_equal(reversed_ds.samples, twice.samples)
    assert_equal(reversed_ds.sa, twice.sa)
    assert_equal(reversed_ds.fa, twice.fa)
def test_transpose():
    # TransposeMapper must swap the first two axes together with the sample
    # and feature attributes; applying it twice, or reversing it, must give
    # back the original dataset.
    from mvpa2.mappers.shape import TransposeMapper

    original = Dataset(np.arange(24).reshape(2, 3, 4),
                       sa={'testsa': np.arange(2)},
                       fa={'testfa': np.arange(3)})
    mapper = TransposeMapper()

    transposed = mapper(original)
    assert_equal(transposed.shape, (3, 2, 4))
    # the attribute collections trade places
    assert_true('testfa' in transposed.sa)
    assert_true('testsa' in transposed.fa)
    assert_false(transposed.fa is transposed.sa)

    # and back
    twice = mapper(transposed)
    assert_array_equal(twice.samples, original.samples)
    assert_equal(twice.sa, original.sa)
    assert_equal(twice.fa, original.fa)

    # or this way
    reversed_ds = mapper.reverse(transposed)
    assert_array_equal(reversed_ds.samples, original.samples)
    assert_equal(reversed_ds.sa, original.sa)
    assert_equal(reversed_ds.fa, original.fa)

    # both ways back must agree with each other
    assert_array_equal(reversed_ds.samples, twice.samples)
    assert_equal(reversed_ds.sa, twice.sa)
    assert_equal(reversed_ds.fa, twice.fa)
def test_attrpermute():
    # Exercises AttributePermutator: constructor docs, plain permutation,
    # `limit` (attribute name, single value, list of values), repeated and
    # seeded generation, feature attributes and the 'uattrs' strategy.
    #
    # Fixes relative to the previous revision:
    # * the 1-to-1 mapping check used assert_true(a, b), which only tests the
    #   truthiness of `a`; it is now a real equality check
    # * the seeded run is verified on its own output (pds1), not on the list
    #   left over from the preceding unseeded run
    # * zip() results are materialised before comparison, so the inequality
    #   checks compare contents wherever zip() returns an iterator

    # Was about to use borrowkwargs but didn't work out. Test doesn't hurt
    doc = AttributePermutator.__init__.__doc__
    assert_in('limit : ', doc)
    assert_not_in('collection : ', doc)

    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2

    def assert_all_different_permutations(pds):
        # every permutation differs from the original order and from every
        # other permutation in the list
        assert_equal(len(pds), nruns)
        for i, p in enumerate(pds):
            assert_false(np.all(p.sa.ids == ds.sa.ids))
            for p_ in pds[i + 1:]:
                assert_false(np.all(p.sa.ids == p_.sa.ids))

    permutation = AttributePermutator(['targets', 'ids'], assure=True,
                                      count=nruns)
    pds = list(permutation.generate(ds))
    assert_all_different_permutations(pds)

    # if we provide seeding, and generate, it should also return different
    # datasets
    permutation = AttributePermutator(['targets', 'ids'], count=nruns, rng=1)
    pds1 = list(permutation.generate(ds))
    assert_all_different_permutations(pds1)

    # but if we regenerate -- should all be the same to before
    pds2 = list(permutation.generate(ds))
    assert_equal(len(pds1), len(pds2))
    for p1, p2 in zip(pds1, pds2):
        assert_datasets_equal(p1, p2)

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(list(zip(ds.targets)), list(zip(pds.targets)))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings
        assert_equal(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(list(zip(ds.targets, ds.sa.odds)),
                     list(zip(pds.targets, pds.sa.odds)))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))
def test_flatten():
    # Runs FlattenMapper (alone and wrapped in a one-element ChainMapper)
    # forward and backward over plain arrays and Datasets, checking shapes,
    # values, survival of the array subclass and handling of feature
    # attributes.
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()

    # expected flattened samples: four rows of 16 consecutive integers
    target_rows = [list(range(row * 16, (row + 1) * 16)) for row in range(4)]
    target = np.array(target_rows).view(myarray)

    # expected per-feature index into the original (2, 2, 4) sample shape
    index_target = np.array([[i, j, k]
                             for i in range(2)
                             for j in range(2)
                             for k in range(4)])

    # test only flattening the first two dimensions
    partial_mapper = FlattenMapper(maxdims=2)
    partial_mapper.train(data)
    assert_equal(partial_mapper(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper
    # and a chain that only has a FlattenMapper as the one element
    mappers = [
        FlattenMapper(space='voxel'),
        ChainMapper([FlattenMapper(space='voxel'),
                     StaticFeatureSelection(slice(None))]),
    ]
    for fm in mappers:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse1(target[0]),
                           _verified_reverse1(fm, target[0]))
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        flat_ds = fm.forward(ds)
        ok_(isinstance(flat_ds, Dataset))
        ok_(isinstance(flat_ds.samples, myarray))
        assert_array_equal(flat_ds.samples, target)
        assert_array_equal(flat_ds.fa.awesome,
                           np.arange(np.prod(samples_shape)))
        assert_true(isinstance(flat_ds.fa['awesome'], ArrayCollectable))
        # test index creation
        assert_array_equal(index_target, flat_ds.fa.voxel)

        # and back
        restored_ds = fm.reverse(flat_ds)
        ok_(isinstance(restored_ds, Dataset))
        ok_(isinstance(restored_ds.samples, myarray))
        assert_array_equal(restored_ds.samples, data)
        assert_array_equal(restored_ds.fa.awesome, fattr)
        assert_true(isinstance(restored_ds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in restored_ds.fa)
def test_attrpermute():
    # Exercises AttributePermutator: plain permutation, `limit` given as an
    # attribute name, a single value and a list of values, repeated
    # generation, and permutation of a feature attribute.
    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    samples_before = ds.samples.copy()

    permutator = AttributePermutator(['targets', 'ids'], assure=True)
    permuted = permutator(ds)
    # should not touch the data
    assert_array_equal(samples_before, permuted.samples)
    # even keep the very same array
    assert_true(permuted.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(permuted.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(permuted.sa.targets, ds.sa.targets[permuted.sa.ids])
    # other attribute should remain intact
    assert_array_equal(permuted.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutator = AttributePermutator('ids', limit='chunks')
    permuted = permutator(ds)
    # first ten should remain first ten
    assert_false(np.any(permuted.sa.ids[:10] > 9))

    # same thing, but only permute single chunk
    permutator = AttributePermutator('ids', limit={'chunks': 3})
    permuted = permutator(ds)
    # one chunk should change
    assert_false(np.any(permuted.sa.ids[30:40] > 39))
    assert_false(np.any(permuted.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(permuted.sa.ids[:30], range(30))

    # or a list of chunks
    permutator = AttributePermutator('ids', limit={'chunks': [3, 4]})
    permuted = permutator(ds)
    # two chunks should change
    assert_false(np.any(permuted.sa.ids[30:50] > 49))
    assert_false(np.any(permuted.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(permuted.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutator = AttributePermutator(['targets', 'ids'], assure=True,
                                     count=nruns)
    generated = list(permutator.generate(ds))
    assert_equal(len(generated), nruns)
    for one_run in generated:
        assert_false(np.all(one_run.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutator = AttributePermutator('fa.ids', assure=True)
    permuted = permutator(ds)
    assert_false(np.all(permuted.fa.ids == ds.fa.ids))
def assert_all_different_permutations(pds):
    # Asserts that `pds` holds `nruns` permuted datasets whose `sa.ids`
    # differ from those of `ds` and from each other. `nruns` and `ds` are not
    # parameters; they are looked up in the surrounding scope.
    assert_equal(len(pds), nruns)
    for position, current in enumerate(pds):
        # must not reproduce the original order ...
        assert_false(np.all(current.sa.ids == ds.sa.ids))
        # ... nor the order of any permutation further down the list
        for later in pds[position + 1:]:
            assert_false(np.all(current.sa.ids == later.sa.ids))
def test_flatten():
    # Runs FlattenMapper (alone and wrapped in a one-element ChainMapper)
    # forward and backward over plain arrays and Datasets, checking shapes,
    # values, survival of the array subclass and handling of feature
    # attributes.
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape).view(myarray)
    pristinedata = data.copy()

    # expected flattened samples: four rows of 16 consecutive integers
    target_rows = [list(range(row * 16, (row + 1) * 16)) for row in range(4)]
    target = np.array(target_rows).view(myarray)

    # expected per-feature index into the original (2, 2, 4) sample shape
    index_target = np.array([[i, j, k]
                             for i in range(2)
                             for j in range(2)
                             for k in range(4)])

    # test only flattening the first two dimensions
    partial_mapper = FlattenMapper(maxdims=2)
    partial_mapper.train(data)
    assert_equal(partial_mapper(data).shape, (4, 4, 4))

    # array subclass survives
    ok_(isinstance(data, myarray))

    # actually, there should be no difference between a plain FlattenMapper
    # and a chain that only has a FlattenMapper as the one element
    mappers = [
        FlattenMapper(space='voxel'),
        ChainMapper([FlattenMapper(space='voxel'),
                     StaticFeatureSelection(slice(None))]),
    ]
    for fm in mappers:
        # not working if untrained
        assert_raises(RuntimeError,
                      fm.forward1,
                      np.arange(np.sum(samples_shape) + 1))

        fm.train(data)

        ok_(isinstance(fm.forward(data), myarray))
        ok_(isinstance(fm.forward1(data[2]), myarray))
        assert_array_equal(fm.forward(data), target)
        assert_array_equal(fm.forward1(data[2]), target[2])
        assert_raises(ValueError, fm.forward, np.arange(4))

        # all of that leaves that data unmodified
        assert_array_equal(data, pristinedata)

        # reverse mapping
        ok_(isinstance(fm.reverse(target), myarray))
        ok_(isinstance(fm.reverse1(target[0]), myarray))
        ok_(isinstance(fm.reverse(target[1:2]), myarray))
        assert_array_equal(fm.reverse(target), data)
        assert_array_equal(fm.reverse1(target[0]), data[0])
        assert_array_equal(fm.reverse(target[1:2]), data[1:2])
        assert_raises(ValueError, fm.reverse, np.arange(14))

        # check one dimensional data, treated as scalar samples
        oned = np.arange(5)
        fm.train(Dataset(oned))
        # needs 2D
        assert_raises(ValueError, fm.forward, oned)
        # doesn't match mapper, since Dataset turns `oned` into (5,1)
        assert_raises(ValueError, fm.forward, oned)
        assert_equal(Dataset(oned).nfeatures, 1)

        # try dataset mode, with some feature attribute
        fattr = np.arange(np.prod(samples_shape)).reshape(samples_shape)
        ds = Dataset(data, fa={'awesome': fattr.copy()})
        assert_equal(ds.samples.shape, data_shape)
        fm.train(ds)
        flat_ds = fm.forward(ds)
        ok_(isinstance(flat_ds, Dataset))
        ok_(isinstance(flat_ds.samples, myarray))
        assert_array_equal(flat_ds.samples, target)
        assert_array_equal(flat_ds.fa.awesome,
                           np.arange(np.prod(samples_shape)))
        assert_true(isinstance(flat_ds.fa['awesome'], ArrayCollectable))
        # test index creation
        assert_array_equal(index_target, flat_ds.fa.voxel)

        # and back
        restored_ds = fm.reverse(flat_ds)
        ok_(isinstance(restored_ds, Dataset))
        ok_(isinstance(restored_ds.samples, myarray))
        assert_array_equal(restored_ds.samples, data)
        assert_array_equal(restored_ds.fa.awesome, fattr)
        assert_true(isinstance(restored_ds.fa['awesome'], ArrayCollectable))
        assert_false('voxel' in restored_ds.fa)
def test_balancer():
    # Exercises Balancer: marking vs. applying the selection, use as a
    # generator, reproducibility with a seeded rng, interleaved __call__ and
    # generator use, `limit`, `include_offlimit`, fixed/fractional `amount`
    # and balancing on a feature attribute.
    #
    # Fixes relative to the previous revision:
    # * `res2` was computed but the assertion ran the balancer once more
    #   instead of using it
    # * xrange()/gen.next() replaced by range()/next(gen), which work on both
    #   Python 2 and 3
    # * per-value counts are wrapped in list() before being compared or
    #   turned into arrays (presumably get_nelements_per_value returns a
    #   dict -- the `.values()` calls suggest so)
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)

    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun again, it would be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != res2.sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those
    # multiple balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)
    seq1, seq2, seq3 = [], [], []
    for i in range(3):
        seq1.append(next(gen1))
        seq2.append(next(gen2))
        seq3.append(b(ds))

    # Produces expected sequences
    for i in range(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [2] * 4)

    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples
    # included
    assert_equal(
        list(get_nelements_per_value(
            res[res.sa.chunks == 3].sa.targets).values()),
        [2] * 4)
    assert_equal(list(get_nelements_per_value(res.sa.chunks).values()),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])

    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [1] * 4)

    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(list(get_nelements_per_value(ds.sa.targets).values()))
            * 0.5),
        np.array(list(get_nelements_per_value(res.sa.targets).values())))

    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.fa.one).values()), [4] * 2)
def test_balancer():
    # Exercises Balancer: marking vs. applying the selection, use as a
    # generator, reproducibility with a seeded rng, interleaved __call__ and
    # generator use, `limit`, `include_offlimit`, fixed/fractional `amount`
    # and balancing on a feature attribute.
    #
    # Fixes relative to the previous revision:
    # * `res2` was computed but the assertion ran the balancer once more
    #   instead of using it
    # * xrange()/gen.next() replaced by range()/next(gen), which work on both
    #   Python 2 and 3
    # * per-value counts are wrapped in list() before being compared or
    #   turned into arrays (presumably get_nelements_per_value returns a
    #   dict -- the `.values()` calls suggest so)
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples

    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)

    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)

    # if we rerun again, it would be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != res2.sa.ids))

    # but if we create a balancer providing seed rng int,
    # should be identical results
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those
    # multiple balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)
    seq1, seq2, seq3 = [], [], []
    for i in range(3):
        seq1.append(next(gen1))
        seq2.append(next(gen2))
        seq3.append(b(ds))

    # Produces expected sequences
    for i in range(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3, ))
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [2] * 4)

    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples
    # included
    assert_equal(
        list(get_nelements_per_value(
            res[res.sa.chunks == 3].sa.targets).values()),
        [2] * 4)
    assert_equal(
        list(get_nelements_per_value(res.sa.chunks).values()),
        [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])

    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.sa.targets).values()),
                 [1] * 4)

    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(
            np.array(list(get_nelements_per_value(ds.sa.targets).values()))
            * 0.5),
        np.array(list(get_nelements_per_value(res.sa.targets).values())))

    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(list(get_nelements_per_value(res.fa.one).values()), [4] * 2)
def test_attrpermute():
    # Exercises AttributePermutator: plain permutation of sample attributes,
    # permutation restricted via `limit`, multiple permutations via
    # `generate`, feature-attribute permutation, and the 'uattrs'
    # (reassignment) strategy.
    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutation = AttributePermutator(['targets', 'ids'], assure=True,
                                      count=nruns)
    pds = list(permutation.generate(ds))
    assert_equal(len(pds), nruns)
    for p in pds:
        assert_false(np.all(p.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things.
    # zip() results are materialized: bare zip iterators (Python 3) never
    # compare equal, which would make this check vacuous.
    assert_not_equal(list(zip(ds.targets)), list(zip(pds.targets)))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings.
        # Must be assert_equal: assert_true(a, b) would take `b` as the
        # failure message and only test the truthiness of `a`.
        assert_equal(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(list(zip(ds.targets, ds.sa.odds)),
                     list(zip(pds.targets, pds.sa.odds)))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))
def test_attrmap():
    # Round-trips literal <-> numeric labels through AttributeMap, both with
    # the mapping derived from the data and with a user-supplied one, and
    # checks the errors raised for missing or mismatching maps.
    map_default = {'eins': 0, 'zwei': 2, 'sieben': 1}
    map_custom = {'eins': 11, 'zwei': 22, 'sieben': 33}
    literal = ['eins', 'zwei', 'sieben', 'eins', 'sieben', 'eins']
    literal_nonmatching = ['uno', 'dos', 'tres']
    num_default = [0, 2, 1, 0, 1, 0]
    num_custom = [11, 22, 33, 11, 33, 11]

    # no custom mapping given: the map is empty (and falsy) to begin with
    attrmap = AttributeMap()
    assert_false(attrmap)
    ok_(len(attrmap) == 0)
    assert_array_equal(attrmap.to_numeric(literal), num_default)
    assert_array_equal(attrmap.to_literal(num_default), literal)
    # after the conversions it holds the three labels
    ok_(attrmap)
    ok_(len(attrmap) == 3)

    #
    # Tests for recursive mapping + preserving datatype
    class ndarray_subclass(np.ndarray):
        pass

    # without recurse=True the nested input is expected to raise
    assert_raises(KeyError, attrmap.to_literal, [(1, 2), 2, 0])

    fancy_list = [(1, 2), 2, [0], np.array([0, 1]).view(ndarray_subclass)]
    fancy_tuple = tuple(fancy_list)
    fancy_array = np.array(fancy_list, dtype=object)
    # per-position types the mapped result is expected to keep
    expected_types = (tuple, str, list, ndarray_subclass)

    for container in (fancy_list, fancy_tuple, fancy_array):
        mapped = attrmap.to_literal(container, recurse=True)
        assert_equal(mapped[0], ('sieben', 'zwei'))
        assert_equal(mapped[1], 'zwei')
        assert_equal(mapped[2], ['eins'])
        assert_array_equal(mapped[3], ['eins', 'sieben'])

        # types of result and subsequences should be preserved
        ok_(isinstance(mapped, container.__class__))
        for position, expected_type in enumerate(expected_types):
            ok_(isinstance(mapped[position], expected_type))

    # yet another example
    boxed = np.empty(1, dtype=object)
    boxed[0] = (0, 1)
    mapped = attrmap.to_literal(boxed, recurse=True)
    ok_(isinstance(mapped[0], tuple))

    #
    # with custom mapping
    attrmap = AttributeMap(map=map_custom)
    assert_array_equal(attrmap.to_numeric(literal), num_custom)
    assert_array_equal(attrmap.to_literal(num_custom), literal)

    # if not numeric nothing is mapped
    assert_array_equal(attrmap.to_numeric(num_custom), num_custom)
    # even if the map doesn't fit
    assert_array_equal(attrmap.to_numeric(num_default), num_default)

    # need to_numeric first
    attrmap = AttributeMap()
    assert_raises(RuntimeError, attrmap.to_literal, [1, 2, 3])

    # stupid args
    assert_raises(ValueError, AttributeMap, map=num_custom)

    # map mismatch
    attrmap = AttributeMap(map=map_custom)
    if __debug__:
        # checked only in __debug__
        assert_raises(KeyError, attrmap.to_numeric, literal_nonmatching)
    # needs reset and should work afterwards
    attrmap.clear()
    assert_array_equal(attrmap.to_numeric(literal_nonmatching), [2, 0, 1])

    # and now reverse
    attrmap = AttributeMap(map=map_custom)
    assert_raises(KeyError, attrmap.to_literal, num_default)

    # dict-like interface
    attrmap = AttributeMap()
    ok_([(key, value) for key, value in attrmap.items()] == [])
def test_attrmap():
    # Checks AttributeMap conversions between literal and numeric labels
    # (default and custom maps, recursive mapping with type preservation)
    # and the exceptions raised on misuse.
    map_default = {'eins': 0, 'zwei': 2, 'sieben': 1}
    map_custom = {'eins': 11, 'zwei': 22, 'sieben': 33}
    literal = ['eins', 'zwei', 'sieben', 'eins', 'sieben', 'eins']
    literal_nonmatching = ['uno', 'dos', 'tres']
    num_default = [0, 2, 1, 0, 1, 0]
    num_custom = [11, 22, 33, 11, 33, 11]

    # no custom mapping given: starts out empty and falsy
    mapper = AttributeMap()
    assert_false(mapper)
    ok_(len(mapper) == 0)
    assert_array_equal(mapper.to_numeric(literal), num_default)
    assert_array_equal(mapper.to_literal(num_default), literal)
    # the conversions above left three entries in the map
    ok_(mapper)
    ok_(len(mapper) == 3)

    #
    # Tests for recursive mapping + preserving datatype
    class array_view_type(np.ndarray):
        pass

    # nested input without recurse=True is expected to fail
    assert_raises(KeyError, mapper.to_literal, [(1, 2), 2, 0])

    nested_list = [(1, 2), 2, [0], np.array([0, 1]).view(array_view_type)]
    nested_tuple = tuple(nested_list)
    nested_array = np.array(nested_list, dtype=object)
    # element types that must survive the mapping, by position
    element_types = (tuple, str, list, array_view_type)

    for nested in (nested_list, nested_tuple, nested_array):
        converted = mapper.to_literal(nested, recurse=True)
        assert_equal(converted[0], ('sieben', 'zwei'))
        assert_equal(converted[1], 'zwei')
        assert_equal(converted[2], ['eins'])
        assert_array_equal(converted[3], ['eins', 'sieben'])

        # types of result and subsequences should be preserved
        ok_(isinstance(converted, nested.__class__))
        for index, element_type in enumerate(element_types):
            ok_(isinstance(converted[index], element_type))

    # yet another example
    holder = np.empty(1, dtype=object)
    holder[0] = (0, 1)
    converted = mapper.to_literal(holder, recurse=True)
    ok_(isinstance(converted[0], tuple))

    #
    # with custom mapping
    mapper = AttributeMap(map=map_custom)
    assert_array_equal(mapper.to_numeric(literal), num_custom)
    assert_array_equal(mapper.to_literal(num_custom), literal)

    # if not numeric nothing is mapped
    assert_array_equal(mapper.to_numeric(num_custom), num_custom)
    # even if the map doesn't fit
    assert_array_equal(mapper.to_numeric(num_default), num_default)

    # need to_numeric first
    mapper = AttributeMap()
    assert_raises(RuntimeError, mapper.to_literal, [1, 2, 3])

    # stupid args
    assert_raises(ValueError, AttributeMap, map=num_custom)

    # map mismatch
    mapper = AttributeMap(map=map_custom)
    if __debug__:
        # checked only in __debug__
        assert_raises(KeyError, mapper.to_numeric, literal_nonmatching)
    # needs reset and should work afterwards
    mapper.clear()
    assert_array_equal(mapper.to_numeric(literal_nonmatching), [2, 0, 1])

    # and now reverse
    mapper = AttributeMap(map=map_custom)
    assert_raises(KeyError, mapper.to_literal, num_default)

    # dict-like interface
    mapper = AttributeMap()
    ok_([(key, value) for key, value in mapper.iteritems()] == [])
def test_attrpermute():
    # Exercises AttributePermutator: constructor docstring content, plain
    # permutation of sample attributes, permutation restricted via `limit`,
    # multiple permutations via `generate`, feature-attribute permutation,
    # and the 'uattrs' (reassignment) strategy.

    # Was about to use borrowkwargs but didn't work out. Test doesn't hurt
    doc = AttributePermutator.__init__.__doc__
    assert_in('limit : ', doc)
    assert_not_in('collection : ', doc)

    ds = give_data()
    ds.sa['ids'] = range(len(ds))
    pristine_data = ds.samples.copy()
    permutation = AttributePermutator(['targets', 'ids'], assure=True)
    pds = permutation(ds)
    # should not touch the data
    assert_array_equal(pristine_data, pds.samples)
    # even keep the very same array
    assert_true(pds.samples.base is ds.samples)
    # there is no way that it can be the same attribute
    assert_false(np.all(pds.sa.ids == ds.sa.ids))
    # ids should reflect permutation setup
    assert_array_equal(pds.sa.targets, ds.sa.targets[pds.sa.ids])
    # other attribute should remain intact
    assert_array_equal(pds.sa.chunks, ds.sa.chunks)

    # now chunk-wise permutation
    permutation = AttributePermutator('ids', limit='chunks')
    pds = permutation(ds)
    # first ten should remain first ten
    assert_false(np.any(pds.sa.ids[:10] > 9))

    # verify that implausible assure=True would not work
    permutation = AttributePermutator('targets', limit='ids', assure=True)
    assert_raises(RuntimeError, permutation, ds)

    # same thing, but only permute single chunk
    permutation = AttributePermutator('ids', limit={'chunks': 3})
    pds = permutation(ds)
    # one chunk should change
    assert_false(np.any(pds.sa.ids[30:40] > 39))
    assert_false(np.any(pds.sa.ids[30:40] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # or a list of chunks
    permutation = AttributePermutator('ids', limit={'chunks': [3, 4]})
    pds = permutation(ds)
    # two chunks should change
    assert_false(np.any(pds.sa.ids[30:50] > 49))
    assert_false(np.any(pds.sa.ids[30:50] < 30))
    # the rest not
    assert_array_equal(pds.sa.ids[:30], range(30))

    # and now try generating more permutations
    nruns = 2
    permutation = AttributePermutator(['targets', 'ids'], assure=True,
                                      count=nruns)
    pds = list(permutation.generate(ds))
    assert_equal(len(pds), nruns)
    for p in pds:
        assert_false(np.all(p.sa.ids == ds.sa.ids))

    # permute feature attrs
    ds.fa['ids'] = range(ds.shape[1])
    permutation = AttributePermutator('fa.ids', assure=True)
    pds = permutation(ds)
    assert_false(np.all(pds.fa.ids == ds.fa.ids))

    # now chunk-wise uattrs strategy (reassignment)
    permutation = AttributePermutator('targets', limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things.
    # zip() results are materialized: bare zip iterators (Python 3) never
    # compare equal, which would make this check vacuous.
    assert_not_equal(list(zip(ds.targets)), list(zip(pds.targets)))
    # in each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        # we have only 1-to-1 mappings.
        # Must be assert_equal: assert_true(a, b) would take `b` as the
        # failure message and only test the truthiness of `a`.
        assert_equal(len(set(zip(otargets, ptargets))), len(set(otargets)))

    ds.sa['odds'] = ds.sa.ids % 2
    # test combinations
    permutation = AttributePermutator(['targets', 'odds'], limit='chunks',
                                      strategy='uattrs', assure=True)
    pds = permutation(ds)
    # Due to assure above -- we should have changed things
    assert_not_equal(list(zip(ds.targets, ds.sa.odds)),
                     list(zip(pds.targets, pds.sa.odds)))
    # In each chunk we should have unique remappings
    for c in ds.UC:
        chunk_idx = ds.C == c
        otargets, ptargets = ds.targets[chunk_idx], pds.sa.targets[chunk_idx]
        oodds, podds = ds.sa.odds[chunk_idx], pds.sa.odds[chunk_idx]
        # we still have the same targets
        assert_equal(set(ptargets), set(otargets))
        assert_equal(set(oodds), set(podds))
        # at the end we have the same mapping
        assert_equal(set(zip(otargets, oodds)), set(zip(ptargets, podds)))
def assert_all_different_permutations(pds):
    # Checks that `pds` holds `nruns` datasets whose `sa.ids` each differ
    # from the original `ds` and from every other entry in the list.
    # NOTE(review): `nruns` and `ds` are free names resolved from the
    # enclosing/module scope -- confirm they are defined where this is used.
    assert_equal(len(pds), nruns)
    for position, current in enumerate(pds):
        # must not reproduce the unpermuted order
        assert_false(np.all(current.sa.ids == ds.sa.ids))
        # only compare with later entries so each pair is checked once
        remaining = pds[position + 1:]
        for other in remaining:
            assert_false(np.all(current.sa.ids == other.sa.ids))