# NOTE: assumed common imports for the PyMVPA-style snippets below (module
# paths follow the PyMVPA 2.x layout; they are not part of the original
# excerpt). Fixtures such as `myarray` (an ndarray subclass used to check
# that subclasses survive) and the `datasets` dictionary are defined
# elsewhere in the suite.
import numpy as np
from numpy.testing import assert_array_equal

from mvpa2.testing.tools import (ok_, assert_equal, assert_true,
                                 assert_raises)
from mvpa2.base.dataset import DatasetError, vstack, hstack
from mvpa2.datasets.base import Dataset


def test_stack_add_attributes():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['ok'] = data0.sa['ok'] = np.arange(5)
    data1.fa['ok'] = data1.sa['ok'] = np.arange(5)
    data0.fa['nok'] = data0.sa['nok'] = [0]
    data1.fa['nok'] = data1.sa['nok'] = np.arange(5)

    # function, collection name, the other collection name
    for xstack, colname, ocolname in ((vstack, 'fa', 'sa'),
                                      (hstack, 'sa', 'fa')):
        for add_param in None, 'update', 'drop_nonunique':
            kw = {colname: add_param} if add_param else {}
            r = xstack((data0, data1), **kw)
            COL = lambda x: getattr(x, colname)
            col = COL(r)
            ocol = getattr(r, ocolname)
            # in any scenario, the other collection should have got
            # both names and be just fine
            assert_array_equal(ocol['nok'].value, [0] * 5 + list(range(5)))
            assert_array_equal(ocol['ok'].value, list(range(5)) * 2)
            if add_param in ('update',):
                # will be of the last dataset
                assert_array_equal(col['nok'].value, COL(data1)['nok'].value)
                assert_array_equal(col['ok'].value, COL(data1)['ok'].value)
            elif add_param in (None, 'drop_nonunique'):
                assert 'nok' not in col  # must be dropped since not unique
                # both the same but let's check ;)
                assert_array_equal(col['ok'].value, COL(data0)['ok'].value)
                assert_array_equal(col['ok'].value, COL(data1)['ok'].value)

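# A minimal standalone sketch (added for illustration, not part of the
# original suite) of the attribute-merge behaviour exercised above; it
# assumes the same vstack()/hstack() signatures used in the test:
def _demo_stack_attribute_merge():
    d0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    d1 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    d0.fa['roi'] = np.arange(5)  # identical across datasets -> "unique"
    d1.fa['roi'] = np.arange(5)
    # by default, vstack() keeps feature attributes that are unique
    # across the stacked datasets; fa='update' would instead take the
    # values of the last dataset
    merged = vstack((d0, d1))
    assert_array_equal(merged.fa.roi, np.arange(5))
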
def test_mergeds():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['one'] = np.ones(5)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1, chunks=1)
    data1.fa['one'] = np.zeros(5)
    data2 = Dataset.from_wizard(np.ones((3, 5)), targets=2, chunks=1)
    data3 = Dataset.from_wizard(np.ones((4, 5)), targets=2)
    data4 = Dataset.from_wizard(np.ones((2, 5)), targets=3, chunks=2)
    data4.fa['test'] = np.arange(5)

    # cannot merge if there are attributes missing in one of the datasets
    assert_raises(DatasetError, data1.append, data0)

    merged = data1.copy()
    merged.append(data2)

    ok_(merged.nfeatures == 5)
    l12 = [1] * 5 + [2] * 3
    l1 = [1] * 8
    ok_((merged.targets == l12).all())
    ok_((merged.chunks == l1).all())

    data_append = data1.copy()
    data_append.append(data2)

    ok_(data_append.nfeatures == 5)
    ok_((data_append.targets == l12).all())
    ok_((data_append.chunks == l1).all())

    #
    # appending
    #

    # we need the same samples attributes in both datasets
    assert_raises(DatasetError, data2.append, data3)

    #
    # vstacking
    #
    if __debug__:
        # tested only in __debug__
        assert_raises(ValueError, vstack, (data0, data1, data2, data3))
    datasets = (data1, data2, data4)
    merged = vstack(datasets)
    assert_equal(merged.shape,
                 (np.sum([len(ds) for ds in datasets]), data1.nfeatures))
    assert_true('test' in merged.fa)
    assert_array_equal(merged.sa.targets, [1] * 5 + [2] * 3 + [3] * 2)

    #
    # hstacking
    #
    assert_raises(ValueError, hstack, datasets)
    datasets = (data0, data1)
    merged = hstack(datasets)
    assert_equal(merged.shape,
                 (len(data1), np.sum([ds.nfeatures for ds in datasets])))
    assert_true('chunks' in merged.sa)
    assert_array_equal(merged.fa.one, [1] * 5 + [0] * 5)

def test_ex_from_masked():
    ds = Dataset.from_wizard(samples=np.atleast_2d(np.arange(5)).view(myarray),
                             targets=1, chunks=1)
    # simple sequence has to be a single pattern
    assert_equal(ds.nsamples, 1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    # check correct pattern layout (1x5)
    assert_array_equal(ds.samples, [[0, 1, 2, 3, 4]])

    # check for single label and origin
    assert_array_equal(ds.targets, [1])
    assert_array_equal(ds.chunks, [1])

    # now try adding a pattern with the wrong shape
    assert_raises(ValueError, vstack,
                  (ds, Dataset.from_wizard(np.ones((2, 3)),
                                           targets=1, chunks=1)))

    # now add two real patterns
    ds = vstack((ds, Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                         targets=2, chunks=2)))
    assert_equal(ds.nsamples, 3)
    assert_array_equal(ds.targets, [1, 2, 2])
    assert_array_equal(ds.chunks, [1, 2, 2])

    # test unique class labels
    ds = vstack((ds, Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                         targets=3, chunks=5)))
    assert_array_equal(ds.sa['targets'].unique, [1, 2, 3])

    # test wrong attributes length
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)),
                  targets=[1, 2, 3], chunks=2)
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)),
                  targets=[1, 2, 3, 4], chunks=[2, 2, 2])

    # now test one that is using from_masked
    ds = datasets['3dlarge']
    for a in ds.sa:
        assert_equal(len(ds.sa[a].value), len(ds))
    for a in ds.fa:
        assert_equal(len(ds.fa[a].value), ds.nfeatures)

def test_stack_add_dataset_attributes():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.a['one'] = np.ones(2)
    data0.a['two'] = 2
    data0.a['three'] = 'three'
    data0.a['common'] = list(range(10))
    data0.a['array'] = np.arange(10)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data1.a['one'] = np.ones(3)
    data1.a['two'] = 3
    data1.a['four'] = 'four'
    data1.a['common'] = list(range(10))
    data1.a['array'] = np.arange(10)

    vstacker = lambda x: vstack((data0, data1), a=x)
    hstacker = lambda x: hstack((data0, data1), a=x)

    add_params = (1, None, 'unique', 'uniques', 'all', 'drop_nonunique')

    for stacker in (vstacker, hstacker):
        for add_param in add_params:
            if add_param == 'unique':
                assert_raises(DatasetError, stacker, add_param)
                continue

            r = stacker(add_param)

            if add_param == 1:
                assert_array_equal(data1.a.one, r.a.one)
                assert_equal(r.a.two, 3)
                assert_equal(r.a.four, 'four')
                assert_true('three' not in r.a.keys())
                assert_true('array' in r.a.keys())
            elif add_param == 'uniques':
                assert_equal(set(r.a.keys()),
                             set(['one', 'two', 'three',
                                  'four', 'common', 'array']))
                assert_equal(r.a.two, (2, 3))
                assert_equal(r.a.four, ('four',))
            elif add_param == 'all':
                assert_equal(set(r.a.keys()),
                             set(['one', 'two', 'three',
                                  'four', 'common', 'array']))
                assert_equal(r.a.two, (2, 3))
                assert_equal(r.a.three, ('three', None))
            elif add_param == 'drop_nonunique':
                assert_equal(set(r.a.keys()),
                             set(['common', 'three', 'four', 'array']))
                assert_equal(r.a.three, 'three')
                assert_equal(r.a.four, 'four')
                assert_equal(r.a.common, list(range(10)))
                assert_array_equal(r.a.array, np.arange(10))

def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 10)), targets=range(5), chunks=1)
    for i in range(1, 5):
        ds.append(Dataset.from_wizard(np.ones((5, 10)) + i,
                                      targets=range(5), chunks=i + 1))
    # assign some feature attributes
    ds.fa['roi'] = np.repeat(np.arange(5), 2)
    ds.fa['lucky'] = np.arange(10) % 2
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets') == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    # dict.values() is a view in Python 3, hence the explicit sorted list
    ok_(sorted(sample.get_nsamples_per_attr('targets').values()) == [2] * 5)
    ok_((ds.sa['chunks'].unique == np.arange(1, 6)).all())

def test_origmask_extraction():
    origdata = np.random.standard_normal((10, 2, 4, 3))
    data = Dataset.from_wizard(origdata, targets=2, chunks=2)

    # check with custom mask
    sel = data[:, 5]
    ok_(sel.samples.shape[1] == 1)

def test_feature_masking():
    mask = np.zeros((5, 3), dtype='bool')
    mask[2, 1] = True
    mask[4, 0] = True
    data = Dataset.from_wizard(np.arange(60).reshape((4, 5, 3)),
                               targets=1, chunks=1, mask=mask)

    # check simple masking
    ok_(data.nfeatures == 2)

    # selection should be idempotent
    ok_(data[:, mask].nfeatures == data.nfeatures)
    # check that the correct features get selected
    assert_array_equal(data[:, 1].samples[:, 0], [12, 27, 42, 57])
    # XXX put back when coord -> fattr is implemented
    #ok_(tuple(data[:, 1].a.mapper.getInId(0)) == (4, 0))
    ok_(data[:, 1].a.mapper.forward1(mask).shape == (1,))

    # check sugarings
    # XXX put me back
    #self.assertTrue(np.all(data.I == data.origids))
    assert_array_equal(data.C, data.chunks)
    assert_array_equal(data.UC, np.unique(data.chunks))
    assert_array_equal(data.T, data.targets)
    assert_array_equal(data.UT, np.unique(data.targets))
    assert_array_equal(data.S, data.samples)
    assert_array_equal(data.O, data.mapper.reverse(data.samples))

def test_labelschunks_access():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = list(range(4))
    chunks = [1, 1, 2, 2]
    ds = Dataset.from_wizard(samples, labels, chunks)

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    # moreover they should point to the same thing
    ok_(ds.targets is ds.sa.targets)
    ok_(ds.targets is ds.sa['targets'].value)
    ok_(ds.chunks is ds.sa.chunks)
    ok_(ds.chunks is ds.sa['chunks'].value)

    # assignment should work at all levels including 1st
    ds.targets = chunks
    assert_array_equal(ds.targets, chunks)
    ok_(ds.targets is ds.sa.targets)
    ok_(ds.targets is ds.sa['targets'].value)

    # test broadcasting
    # but not for plain scalars
    assert_raises(ValueError, ds.set_attr, 'sa.bc', 5)
    # and not for a plain str
    assert_raises(TypeError, ds.set_attr, 'sa.bc', "mike")
    # but for any iterable of len == 1
    ds.set_attr('sa.bc', (5,))
    ds.set_attr('sa.dc', ["mike"])
    assert_array_equal(ds.sa.bc, [5] * len(ds))
    assert_array_equal(ds.sa.dc, ["mike"] * len(ds))

def get_data(self):
    data = np.random.standard_normal((100, 2, 2, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    return Dataset.from_wizard(samples=data, targets=labels, chunks=chunks)

def test_multidim_attrs():
    samples = np.arange(24).reshape(2, 3, 4)
    # have a dataset with two samples -- mapped from 2d into 1d
    # but have 2d labels and 3d chunks -- whatever that is
    ds = Dataset.from_wizard(samples.copy(),
                             targets=samples.copy(),
                             chunks=np.random.normal(size=(2, 10, 4, 2)))
    assert_equal(ds.nsamples, 2)
    assert_equal(ds.nfeatures, 12)
    assert_equal(ds.sa.targets.shape, (2, 3, 4))
    assert_equal(ds.sa.chunks.shape, (2, 10, 4, 2))

    # try slicing
    subds = ds[0]
    assert_equal(subds.nsamples, 1)
    assert_equal(subds.nfeatures, 12)
    assert_equal(subds.sa.targets.shape, (1, 3, 4))
    assert_equal(subds.sa.chunks.shape, (1, 10, 4, 2))

    # add multidim feature attr
    fattr = ds.mapper.forward(samples)
    assert_equal(fattr.shape, (2, 12))
    # should puke -- first axis is #samples
    assert_raises(ValueError, ds.fa.__setitem__, 'moresamples', fattr)
    # but that should be fine
    ds.fa['moresamples'] = fattr.T
    assert_equal(ds.fa.moresamples.shape, (12, 2))

def test_samples_shape():
    ds = Dataset.from_wizard(np.ones((10, 2, 3, 4)), targets=1, chunks=1)
    ok_(ds.samples.shape == (10, 24))

    # what happens to 1D samples
    ds = Dataset(np.arange(5))
    assert_equal(ds.shape, (5, 1))
    assert_equal(ds.nfeatures, 1)

def test_shape_conversion():
    ds = Dataset.from_wizard(np.arange(24).reshape((2, 3, 4)).view(myarray),
                             targets=1, chunks=1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    assert_equal(ds.nsamples, 2)
    assert_equal(ds.samples.shape, (2, 12))
    assert_array_equal(ds.samples, [list(range(12)), list(range(12, 24))])

def setUp(self):
    data = np.random.standard_normal((100, 3, 4, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    mask = np.ones((3, 4, 2), dtype='bool')
    mask[0, 0, 0] = 0
    mask[1, 3, 1] = 0
    self.dataset = Dataset.from_wizard(samples=data,
                                       targets=labels,
                                       chunks=chunks,
                                       mask=mask)

def transform(self, ds):
    # rebuild the full matrices from the flattened samples; copy_matrix()
    # and array_to_matrix() are project-specific helpers defined elsewhere
    data = np.dstack([copy_matrix(array_to_matrix(a)) for a in ds.samples])
    data = np.hstack([d for d in data[:, :]]).T

    attr = self._edit_attr(ds, data.shape)

    ds_ = Dataset.from_wizard(data)
    ds_ = add_attributes(ds_, attr)

    return ds_

def test_basic_datamapping():
    samples = np.arange(24).reshape((4, 3, 2)).view(myarray)

    ds = Dataset.from_wizard(samples)

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    # mapper should end up in the dataset (dict.has_key is gone in Python 3)
    ok_('mapper' in ds.a)

    # check correct mapping
    ok_(ds.nsamples == 4)
    ok_(ds.nfeatures == 6)

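# Illustration (added here, not part of the original suite; it assumes
# the forward1()/reverse1() mapper API used throughout these tests):
# the mapper stored in ds.a can round-trip a flat feature vector back
# into the original per-sample shape.
def _demo_mapper_roundtrip():
    samples = np.arange(24).reshape((4, 3, 2))
    ds = Dataset.from_wizard(samples)
    flat = ds.samples[0]                   # flattened sample, shape (6,)
    restored = ds.a.mapper.reverse1(flat)  # back to shape (3, 2)
    assert_equal(restored.shape, (3, 2))
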
def load_mat_ds(path, subj, folder, **kwargs):
    data = load_mat_data(path, subj, folder, **kwargs)

    # load attributes
    attr = load_attributes(path, subj, folder, **kwargs)
    attr, labels = edit_attr(attr, data.shape)

    ds = Dataset.from_wizard(data, attr.targets)
    ds = add_subjectname(ds, subj)
    ds = add_attributes(ds, attr)

    #ds.fa['roi_labels'] = labels
    ds.fa['matrix_values'] = np.ones_like(data[0])
    # sklearn's LabelEncoder turns the string names into integer chunks
    ds.sa['chunks'] = LabelEncoder().fit_transform(ds.sa['name'])

    return ds

def test_masked_featureselection():
    origdata = np.random.standard_normal((10, 2, 4, 3, 5)).view(myarray)
    data = Dataset.from_wizard(origdata, targets=2, chunks=2)

    unmasked = data.samples.copy()
    # array subclass survives
    ok_(isinstance(data.samples, myarray))

    # default must be no mask
    ok_(data.nfeatures == 120)
    ok_(data.a.mapper.forward1(origdata[0]).shape == (120,))

    # check that full mask uses all features
    # this uses auto-mapping of selection arrays in __getitem__
    sel = data[:, np.ones((2, 4, 3, 5), dtype='bool')]
    ok_(sel.nfeatures == data.samples.shape[1])
    ok_(data.nfeatures == 120)

    # check partial array mask
    partial_mask = np.zeros((2, 4, 3, 5), dtype='bool')
    partial_mask[0, 0, 2, 2] = 1
    partial_mask[1, 2, 2, 0] = 1

    sel = data[:, partial_mask]
    ok_(sel.nfeatures == 2)

    # check that feature selection does not change source data
    ok_(data.nfeatures == 120)
    ok_(data.a.mapper.forward1(origdata[0]).shape == (120,))

    # check selection with feature list
    sel = data[:, [0, 37, 119]]
    ok_(sel.nfeatures == 3)

    # check size of the masked samples
    ok_(sel.samples.shape == (10, 3))

    # check that the right features are selected
    assert_array_equal(unmasked[:, [0, 37, 119]], sel.samples)

def _update_ds(self, ds, X, y):
    ds_ = Dataset.from_wizard(X)

    samples_difference = len(y) - len(ds.targets)

    # regenerate each sample attribute to match the new number of samples;
    # sample_generator() is a project-specific helper defined elsewhere
    for key in ds.sa.keys():
        values = ds.sa[key].value
        values_ = sample_generator(key, values, samples_difference, y)
        u, c = np.unique(values_, return_counts=True)
        logger.debug("%s - samples per key: %s", key, str([u, c]))
        logger.debug(values_)
        ds_.sa[key] = values_

    return ds_

def generate_testing_datasets(specs):
    # Let's permute upon each invocation of the test, so we could possibly
    # trigger some funny cases
    nonbogus_pool = np.random.permutation([0, 1, 3, 5])

    datasets = {}

    # use a partitioner to flag odd/even samples as training and test
    ttp = OddEvenPartitioner(space='train', count=1)

    for kind, spec in specs.items():
        # set of univariate datasets
        for nlabels in [2, 3, 4]:
            basename = 'uni%d%s' % (nlabels, kind)
            nonbogus_features = nonbogus_pool[:nlabels]

            dataset = normal_feature_dataset(
                nlabels=nlabels,
                nonbogus_features=nonbogus_features,
                **spec)

            # full dataset
            datasets[basename] = list(ttp.generate(dataset))[0]

        # sample 3D
        total = 2 * spec['perlabel']
        nchunks = spec['nchunks']
        data = np.random.standard_normal((total, 3, 6, 6))
        labels = np.concatenate((np.repeat(0, spec['perlabel']),
                                 np.repeat(1, spec['perlabel'])))
        data[:, 1, 0, 0] += 2 * labels  # add some signal
        chunks = np.asarray(list(range(nchunks)) * (total // nchunks))
        mask = np.ones((3, 6, 6), dtype='bool')
        mask[0, 0, 0] = 0
        mask[1, 3, 2] = 0
        ds = Dataset.from_wizard(samples=data, targets=labels, chunks=chunks,
                                 mask=mask, space='myspace')
        # and to stress tests on manipulating sa/fa possibly containing
        # attributes of dtype object
        ds.sa['test_object'] = [['a'], [1, 2]] * (ds.nsamples // 2)
        datasets['3d%s' % kind] = ds

    # some additional datasets
    datasets['dumb2'] = dumb_feature_binary_dataset()
    datasets['dumb'] = dumb_feature_dataset()
    # dataset with a few invariant features
    _dsinv = dumb_feature_dataset()
    _dsinv.samples = np.hstack((_dsinv.samples,
                                np.zeros((_dsinv.nsamples, 1)),
                                np.ones((_dsinv.nsamples, 1))))
    datasets['dumbinv'] = _dsinv

    # datasets for regression testing
    datasets['sin_modulated'] = list(
        ttp.generate(multiple_chunks(sin_modulated, 4, 30, 1)))[0]
    # use the same full dataset for training
    datasets['sin_modulated_train'] = datasets['sin_modulated']
    datasets['sin_modulated_test'] = sin_modulated(30, 1, flat=True)

    # simple signal for linear regressors
    datasets['chirp_linear'] = multiple_chunks(chirp_linear,
                                               6, 50, 10, 2, 0.3, 0.1)
    datasets['chirp_linear_test'] = chirp_linear(20, 5, 2, 0.4, 0.1)

    datasets['wr1996'] = multiple_chunks(wr1996, 4, 50)
    datasets['wr1996_test'] = wr1996(50)

    datasets['hollow'] = Dataset(HollowSamples((40, 20)),
                                 sa={'targets': np.tile(['one', 'two'], 20)})

    return datasets

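# Hypothetical invocation sketch (added for illustration): the spec keys
# 'perlabel' and 'nchunks' are consumed directly above, and the whole spec
# is also forwarded to normal_feature_dataset(); any further keys would be
# an assumption about that function's signature.
#
#   specs = {'small': dict(perlabel=10, nchunks=5)}
#   testing_datasets = generate_testing_datasets(specs)
#   ds3d = testing_datasets['3dsmall']  # the masked 3D dataset built above
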
def test_chainmapper():
    # the chain needs at least one mapper
    assert_raises(ValueError, ChainMapper, [])
    # a typical first mapper is to flatten
    cm = ChainMapper([FlattenMapper()])

    # a few container checks
    assert_equal(len(cm), 1)
    assert_true(isinstance(cm[0], FlattenMapper))

    # now training
    # come up with data
    samples_shape = (2, 2, 4)
    data_shape = (4,) + samples_shape
    data = np.arange(np.prod(data_shape)).reshape(data_shape)
    target = [[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
              [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
              [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]]
    target = np.array(target)

    # if it is not trained it knows nothing
    cm.train(data)

    # a new mapper should appear when doing feature selection
    cm.append(StaticFeatureSelection(list(range(1, 16))))
    assert_equal(cm.forward1(data[0]).shape, (15,))
    assert_equal(len(cm), 2)
    # multiple slicing
    cm.append(StaticFeatureSelection([9, 14]))
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # check reproduction
    if __debug__:
        # debug mode needs a special test as it enhances the repr output
        # with module info and an id() appendix for objects
        import mvpa2
        cm_clone = eval(repr(cm))
        assert_equal('#'.join(repr(cm_clone).split('#')[:-1]),
                     '#'.join(repr(cm).split('#')[:-1]))
    else:
        cm_clone = eval(repr(cm))
        assert_equal(repr(cm_clone), repr(cm))

    # what happens if we retrain the whole beast on the same data as before
    cm.train(data)
    assert_equal(cm.forward1(data[0]).shape, (2,))
    assert_equal(len(cm), 3)

    # let's map something
    mdata = cm.forward(data)
    assert_array_equal(mdata, target[:, [10, 15]])
    # and back
    rdata = cm.reverse(mdata)
    # original shape
    assert_equal(rdata.shape, data.shape)
    # content as far as it could be restored
    assert_array_equal(rdata[rdata > 0], data[rdata > 0])
    assert_equal(np.sum(rdata > 0), 8)

    # let's construct a dataset with the mapper assigned and see
    # if sub-selecting a feature adjusts the trailing StaticFeatureSelection
    # appropriately
    ds_subsel = Dataset.from_wizard(data, mapper=cm)[:, 1]
    tail_sfs = ds_subsel.a.mapper[-1]
    assert_equal(repr(tail_sfs),
                 'StaticFeatureSelection(slicearg=array([14]))')