def test_bin_prop_ci():
    skip_if_no_external('scipy')
    n = 100
    succ_thresh = np.random.randint(n)
    acc = 1 - (float(succ_thresh) / n)
    bl = np.random.random(n) < acc
    ds = Dataset(bl)
    m95 = BinomialProportionCI()
    m50 = BinomialProportionCI(width=0.5)
    cids = m95(ds)
    assert_equal(cids.shape, (2, 1))
    # accuracy is in the CI: below the upper and above the lower boundary
    maxdist = cids.samples[1, 0] - acc
    mindist = acc - cids.samples[0, 0]
    # but allow for numerical uncertainty proportional to the sample size
    assert_true(maxdist > 0 or maxdist <= 1. / n)
    assert_true(mindist > 0 or mindist <= 1. / n)

    # more than one feature
    ds = Dataset(np.transpose([bl, np.logical_not(bl)]))
    ci95 = m95(ds)
    assert_equal(ci95.shape, (2, 2))
    # CIs should be inverse
    assert_array_almost_equal(1 - ci95.samples[0, ::-1], ci95.samples[1])
    ci50 = m50(ds)
    assert_array_almost_equal(1 - ci50.samples[0, ::-1], ci50.samples[1])
    # 50% interval is smaller than 95%
    assert_true(np.all(ci95.samples[0] < ci50.samples[0]))
    assert_true(np.all(ci95.samples[1] > ci50.samples[1]))
    assert_equal(list(ci50.sa.ci_boundary), ['lower', 'upper'])
def test_stack_add_attributes():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['ok'] = data0.sa['ok'] = np.arange(5)
    data1.fa['ok'] = data1.sa['ok'] = np.arange(5)
    data0.fa['nok'] = data0.sa['nok'] = [0]
    data1.fa['nok'] = data1.sa['nok'] = np.arange(5)

    # function, collection name, the other collection name
    for xstack, colname, ocolname in ((vstack, 'fa', 'sa'),
                                      (hstack, 'sa', 'fa')):
        for add_param in None, 'update', 'drop_nonunique':
            kw = {colname: add_param} if add_param else {}
            r = xstack((data0, data1), **kw)
            COL = lambda x: getattr(x, colname)
            col = COL(r)
            ocol = getattr(r, ocolname)
            # in any scenario, the other collection should have got
            # both names and be just fine
            assert_array_equal(ocol['nok'].value, [0] * 5 + list(range(5)))
            assert_array_equal(ocol['ok'].value, list(range(5)) * 2)
            if add_param in ('update',):
                # will be of the last dataset
                assert_array_equal(col['nok'].value, COL(data1)['nok'].value)
                assert_array_equal(col['ok'].value, COL(data1)['ok'].value)
            elif add_param in (None, 'drop_nonunique'):
                assert('nok' not in col)  # must be dropped since not unique
                # both the same but let's check ;)
                assert_array_equal(col['ok'].value, COL(data0)['ok'].value)
                assert_array_equal(col['ok'].value, COL(data1)['ok'].value)
def _frobenius_norm_and_merge(self, dss_connectomes, dss_response,
                              node_indices):
    # figure out which of the two types of data are larger
    if dss_response[0].shape[0] > dss_connectomes[0].shape[0]:
        larger = dss_response
        smaller = dss_connectomes
    else:
        larger = dss_connectomes
        smaller = dss_response
    node_ids = node_indices
    # find the normalization ratio based on which is larger
    norm_ratios = []
    for la, sm in zip(larger, smaller):
        laN = np.linalg.norm(la, ord='fro')
        smN = np.linalg.norm(sm, ord='fro')
        v = laN / smN
        norm_ratios.append(v)
    # normalize the smaller one and then merge the datasets
    merged_dss = []
    for la, sm, norm in zip(larger, smaller, norm_ratios):
        d_sm = sm.samples * norm
        merged = np.vstack((d_sm, la.samples))
        merged = Dataset(samples=merged)
        merged.fa['node_indices'] = node_ids.copy()
        merged_dss.append(merged)
    return merged_dss
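# A standalone sketch (not part of the class above) checking the scaling idea
# used in _frobenius_norm_and_merge: after multiplying the smaller matrix by
# ||larger||_F / ||smaller||_F, both matrices have the same Frobenius norm.
import numpy as np

la = np.random.randn(10, 4)
sm = np.random.randn(3, 4)
ratio = np.linalg.norm(la, ord='fro') / np.linalg.norm(sm, ord='fro')
assert np.isclose(np.linalg.norm(sm * ratio, ord='fro'),
                  np.linalg.norm(la, ord='fro'))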
def __call__(self, dataset):
    #if self.model == 'correlation':
    #    orig_ds = copy.deepcopy(dataset)
    #    zscore(orig_ds, chunks_attr=None)
    #    ref_ts = orig_ds[:, orig_ds.fa.roi_seed].samples
    #    corrs = np.mat(ref_ts).T * np.mat(orig_ds.samples) / orig_ds.nsamples
    #    corrs[np.isnan(corrs)] = 0
    #    corrs[abs(corrs) < self.cthresh] = 0
    #    corrs = corrs / np.sum(corrs)
    #    return Dataset(np.asarray(np.mat(orig_ds.samples) * corrs.T))
    #elif self.model == 'regression':
    X = np.mat(dataset[:, dataset.fa.roi_seed != True].samples)
    y = np.mat(dataset[:, dataset.fa.roi_seed == True].samples)
    try:
        Xi = np.linalg.pinv(X, 1e-5)
        r = y.T * X * Xi * y
        r = r[0, 0] ** 2
    except LinAlgError:
        r = -1000
    if r >= self.cthresh:
        if self.cthresh >= 0:
            ym = (y + r * (X * Xi * y)) / (1 + r)
        else:
            ym = (0.241275 * y + 0.758725 * (X * Xi * y))
        return Dataset(np.asarray(ym))
    else:
        return Dataset(np.asarray(y))
def _call(self, dataset=None):
    """Extract weights from SMLR classifier.

    SMLR always has weights available, so nothing has to be computed here.
    """
    clf = self.clf
    # transpose to have the number of features on the second axis (as usual)
    weights = clf.weights.T

    if __debug__:
        debug('SMLR',
              "Extracting weights for %d-class SMLR; result: min=%f max=%f"
              % (len(weights) + 1, np.min(weights), np.max(weights)))

    # limit the labels to the number of sensitivity sets, to deal
    # with the case of `fit_all_weights=False`
    ds = Dataset(weights,
                 sa={clf.get_space(): clf._ulabels[:len(weights)]})
    if clf.params.has_bias:
        ds.sa['biases'] = clf.biases
    return ds
def dense_connectivity_profile_isc(data):
    """Take the data and create a vertex-by-vertex full connectivity matrix
    for each subject, then perform ISC on the connectivity profiles.

    Parameters
    ----------
    data: an n_subjects-length list of (timeseries, features) datasets
        from which to compute a connectivity matrix.

    Returns
    -------
    all_results: a numpy array of shape (n_subjects, n_features) of ISC
        values.
    """
    from mvpa2.datasets.base import Dataset
    from mvpa2.mappers.fxy import FxyMapper

    conn_metric = lambda x, y: np.dot(x.samples, y.samples) / x.nsamples
    connectivity_mapper = FxyMapper(conn_metric)
    connectomes = np.ndarray((data.shape[0], data.shape[2], data.shape[2]),
                             dtype=float)
    for i, ds in enumerate(data):
        d = Dataset(ds)
        conn_targets = Dataset(samples=ds.T)
        connectivity_mapper.train(conn_targets)
        connectomes[i] = connectivity_mapper.forward(d)
        del conn_targets, d
    results = vertex_isc(connectomes)
    return results
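# Hypothetical usage sketch for dense_connectivity_profile_isc. Despite the
# docstring saying "list", the body indexes data.shape, so a 3D ndarray of
# shape (n_subjects, n_timepoints, n_features) is assumed here; vertex_isc()
# is assumed to be defined elsewhere in this module.
import numpy as np

data = np.random.randn(5, 100, 20)   # 5 subjects, 100 timepoints, 20 features
isc_values = dense_connectivity_profile_isc(data)
# per the docstring, isc_values should have shape (n_subjects, n_features)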
def _call(self, dataset):
    """Computes the average correlation in similarity structure
    across chunks."""
    chunks_attr = self.params.chunks_attr
    nchunks = len(dataset.sa[chunks_attr].unique)
    if nchunks < 2:
        raise StandardError("This measure calculates similarity consistency "
                            "across chunks and is not meaningful for "
                            "datasets with only one chunk.")
    dsms = []
    chunks = []
    for chunk in dataset.sa[chunks_attr].unique:
        data = np.atleast_2d(
            dataset.samples[dataset.sa[chunks_attr].value == chunk, :])
        if self.params.center_data:
            data = data - np.mean(data, 0)
        dsm = pdist(data, self.params.pairwise_metric)
        dsms.append(dsm)
        chunks.append(chunk)
    dsms = np.vstack(dsms)

    if self.params.consistency_metric == 'spearman':
        dsms = np.apply_along_axis(rankdata, 1, dsms)
    corrmat = np.corrcoef(dsms)
    if self.params.square:
        ds = Dataset(corrmat, sa={self.params.chunks_attr: chunks})
    else:
        ds = Dataset(squareform(corrmat, checks=False),
                     sa=dict(pairs=list(combinations(chunks, 2))))
    return ds
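# Standalone sketch (not part of the class) of the cross-chunk consistency
# computation above, on synthetic data: 3 chunks of 6 conditions x 20 features.
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import rankdata

chunk_data = [np.random.randn(6, 20) for _ in range(3)]
dsms = np.vstack([pdist(c, 'correlation') for c in chunk_data])
dsms = np.apply_along_axis(rankdata, 1, dsms)   # the 'spearman' variant
consistency = np.corrcoef(dsms)                 # chunk-by-chunk correlations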
def test_featuregroup_mapper():
    ds = Dataset(np.arange(24).reshape(3, 8))
    ds.fa['roi'] = [0, 1] * 4
    # just to check
    ds.sa['chunks'] = np.arange(3)

    # correct results
    csamples = [[3, 4], [11, 12], [19, 20]]

    m = mean_group_feature(['roi'])
    mds = m.forward(ds)
    assert_equal(mds.shape, (3, 2))
    assert_array_equal(mds.samples, csamples)
    assert_array_equal(mds.fa.roi, np.unique([0, 1] * 4))
    # SAs should simply remain the same
    assert_array_equal(mds.sa.chunks, np.arange(3))

    # now without grouping
    m = mean_feature()
    # forwarding just the samples should yield the same result
    assert_array_equal(m.forward(ds.samples), m.forward(ds).samples)

    # And when operating on a dataset with >1D samples, then operate
    # only across "features", i.e. 1st dimension
    ds = Dataset(np.arange(24).reshape(3, 2, 2, 2))
    mapped = ds.get_mapped(m)
    assert_array_equal(m.forward(ds.samples), mapped.samples)
    assert_array_equal(mapped.samples.shape, (3, 2, 2))
    assert_array_equal(mapped.samples, np.mean(ds.samples, axis=1))
    # and still could map back? ;) not ATM, so just to ensure consistency
    assert_raises(NotImplementedError,
                  mapped.a.mapper.reverse, mapped.samples)

    # but it should also work with standard 2d sample arrays
    ds = Dataset(np.arange(24).reshape(3, 8))
    mapped = ds.get_mapped(m)
    assert_array_equal(mapped.samples.shape, (3, 1))
def test_unique_stack():
    data = Dataset(np.reshape(np.arange(24), (4, 6)),
                   sa=dict(x=[0, 1, 0, 1]),
                   fa=dict(y=[x for x in 'abccba']))

    sa_stack = stack_by_unique_sample_attribute(data, 'x')
    assert_equal(sa_stack.shape, (2, 12))
    assert_array_equal(sa_stack.fa.x, [0] * 6 + [1] * 6)
    assert_array_equal(sa_stack.fa.y, [x for x in 'abccbaabccba'])

    fa_stack = stack_by_unique_feature_attribute(data, 'y')
    assert_equal(fa_stack.shape, (12, 2))
    assert_array_equal(fa_stack.sa.x, [0, 1] * 6)
    assert_array_equal(fa_stack.sa.y, [y for y in 'aaaabbbbcccc'])
    #assert_array_equal(fa_stack.fa.y, [''])

    # check values match the fa or sa
    for i in xrange(4):
        for j in xrange(6):
            d = data[i, j]
            for k, other in enumerate((sa_stack, fa_stack)):
                msk = other.samples == d.samples
                ii, jj = np.nonzero(msk)
                # find matching indices in other
                o = other[ii, jj]
                coll = [o.fa, o.sa][k]
                assert_equal(coll.x, d.sa.x)
                assert_equal(coll.y, d.fa.y)

    ystacker = lambda y: lambda x: stack_by_unique_feature_attribute(x, y)
    assert_raises(KeyError, ystacker('z'), data)

    data.fa['z'] = [z for z in '123451']
    assert_raises(ValueError, ystacker('z'), data)
def test_query_engine():
    data = np.arange(54)
    # indices in 3D
    ind = np.transpose((np.ones((3, 3, 3)).nonzero()))
    # sphere generator for 3 elements diameter
    sphere = ne.Sphere(1)
    # dataset with just one "space"
    ds = Dataset([data, data], fa={'s_ind': np.concatenate((ind, ind))})
    # and the query engine attaching the generator to the "index-space"
    qe = ne.IndexQueryEngine(s_ind=sphere)
    # cannot train since the engine does not know about the second space
    assert_raises(ValueError, qe.train, ds)
    # now do it again with a full spec
    ds = Dataset([data, data],
                 fa={'s_ind': np.concatenate((ind, ind)),
                     't_ind': np.repeat([0, 1], 27)})
    qe = ne.IndexQueryEngine(s_ind=sphere, t_ind=None)
    qe.train(ds)
    # internal representation check
    # YOH: invalid for new implementation with lookup tables (dictionaries)
    #assert_array_equal(qe._searcharray,
    #                   np.arange(54).reshape(qe._searcharray.shape) + 1)
    # should give us one corner, collapsing the 't_ind'
    assert_array_equal(qe(s_ind=(0, 0, 0)),
                       [0, 1, 3, 9, 27, 28, 30, 36])
    # directly specifying an index for 't_ind' without having an ROI
    # generator, should give the same corner, but just once
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=0), [0, 1, 3, 9])
    # just out of the mask -- no match
    assert_array_equal(qe(s_ind=(3, 3, 3)), [])
    # also out of the mask -- but single match
    assert_array_equal(qe(s_ind=(2, 2, 3), t_ind=1), [53])
    # query by id
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=0), qe[0])
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=[0, 1]),
                       qe(s_ind=(0, 0, 0)))
    # should not fail if t_ind is outside
    assert_array_equal(qe(s_ind=(0, 0, 0), t_ind=[0, 1, 10]),
                       qe(s_ind=(0, 0, 0)))
    # should fail if asked about some unknown thing
    assert_raises(ValueError, qe.__call__, s_ind=(0, 0, 0), buga=0)

    # Test by using some literal feature attribute
    ds.fa['lit'] = ['roi1', 'ro2', 'r3'] * 18
    # should work as well as before
    assert_array_equal(qe(s_ind=(0, 0, 0)), [0, 1, 3, 9, 27, 28, 30, 36])
    # should fail if asked about some unknown (yet) thing
    assert_raises(ValueError, qe.__call__, s_ind=(0, 0, 0), lit='roi1')
    # Create qe which can query literals as well
    qe_lit = ne.IndexQueryEngine(s_ind=sphere, t_ind=None, lit=None)
    qe_lit.train(ds)
    # should work as well as before
    assert_array_equal(qe_lit(s_ind=(0, 0, 0)),
                       [0, 1, 3, 9, 27, 28, 30, 36])
    # and subselect nicely -- only /3 ones
    assert_array_equal(qe_lit(s_ind=(0, 0, 0), lit='roi1'),
                       [0, 3, 9, 27, 30, 36])
    assert_array_equal(qe_lit(s_ind=(0, 0, 0), lit=['roi1', 'ro2']),
                       [0, 1, 3, 9, 27, 28, 30, 36])
def test_mergeds():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.fa['one'] = np.ones(5)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1, chunks=1)
    data1.fa['one'] = np.zeros(5)
    data2 = Dataset.from_wizard(np.ones((3, 5)), targets=2, chunks=1)
    data3 = Dataset.from_wizard(np.ones((4, 5)), targets=2)
    data4 = Dataset.from_wizard(np.ones((2, 5)), targets=3, chunks=2)
    data4.fa['test'] = np.arange(5)

    # cannot merge if there are attributes missing in one of the datasets
    assert_raises(DatasetError, data1.append, data0)

    merged = data1.copy()
    merged.append(data2)

    ok_(merged.nfeatures == 5)
    l12 = [1] * 5 + [2] * 3
    l1 = [1] * 8
    ok_((merged.targets == l12).all())
    ok_((merged.chunks == l1).all())

    data_append = data1.copy()
    data_append.append(data2)

    ok_(data_append.nfeatures == 5)
    ok_((data_append.targets == l12).all())
    ok_((data_append.chunks == l1).all())

    #
    # appending
    #

    # we need the same samples attributes in both datasets
    assert_raises(DatasetError, data2.append, data3)

    #
    # vstacking
    #
    if __debug__:
        # tested only in __debug__
        assert_raises(ValueError, vstack, (data0, data1, data2, data3))
    datasets = (data1, data2, data4)
    merged = vstack(datasets)
    assert_equal(merged.shape,
                 (np.sum([len(ds) for ds in datasets]), data1.nfeatures))
    assert_true('test' in merged.fa)
    assert_array_equal(merged.sa.targets, [1] * 5 + [2] * 3 + [3] * 2)

    #
    # hstacking
    #
    assert_raises(ValueError, hstack, datasets)
    datasets = (data0, data1)
    merged = hstack(datasets)
    assert_equal(merged.shape,
                 (len(data1), np.sum([ds.nfeatures for ds in datasets])))
    assert_true('chunks' in merged.sa)
    assert_array_equal(merged.fa.one, [1] * 5 + [0] * 5)
def _call(self, dataset):
    # XXX Hm... it might make sense to unify access functions
    # naming across our swig libsvm wrapper and sg access
    # functions for svm
    clf = self.clf
    sgsvm = clf.svm
    sens_labels = None
    if isinstance(sgsvm, shogun.Classifier.MultiClassSVM):
        sens, biases = [], []
        nsvms = sgsvm.get_num_svms()
        clabels = sorted(clf._attrmap.values())
        nclabels = len(clabels)
        sens_labels = []
        isvm = 0                        # index for svm among known
        for i in xrange(nclabels):
            for j in xrange(i + 1, nclabels):
                sgsvmi = sgsvm.get_svm(isvm)
                labels_tuple = (clabels[i], clabels[j])
                # Since we gave the labels in incremental order,
                # we always should be right - but it does not
                # hurt to check if set of labels is the same
                if __debug__ and _shogun_exposes_slavesvm_labels:
                    if not sgsvmi.get_labels():
                        # We need to call classify() so labels get assigned
                        # to the multiclass SVM
                        sgsvm.classify()
                    assert(set([sgsvmi.get_label(int(x))
                                for x in sgsvmi.get_support_vectors()])
                           == set(labels_tuple))
                sens1, bias = self.__sg_helper(sgsvmi)
                sens.append(sens1)
                biases.append(bias)
                sens_labels += [labels_tuple[::-1]]  # ??? positive first
                isvm += 1
        assert(len(sens) == nsvms)      # we should have covered all
    else:
        sens1, bias = self.__sg_helper(sgsvm)
        biases = np.atleast_1d(bias)
        sens = np.atleast_2d(sens1)
        if not clf.__is_regression__:
            assert(set(clf._attrmap.values()) == set([-1.0, 1.0]))
            assert(sens.shape[0] == 1)
            sens_labels = [(-1.0, 1.0)]

    ds = Dataset(np.atleast_2d(sens))
    if sens_labels is not None:
        if isinstance(sens_labels[0], tuple):
            # Need to have them in array of dtype object
            sens_labels = asobjarray(sens_labels)
        if len(clf._attrmap):
            sens_labels = clf._attrmap.to_literal(sens_labels, recurse=True)
        ds.sa[clf.get_space()] = sens_labels
    ds.sa['biases'] = biases
    return ds
def test_samples_shape():
    ds = Dataset.from_wizard(np.ones((10, 2, 3, 4)), targets=1, chunks=1)
    ok_(ds.samples.shape == (10, 24))

    # what happens to 1D samples
    ds = Dataset(np.arange(5))
    assert_equal(ds.shape, (5, 1))
    assert_equal(ds.nfeatures, 1)
def _call(self, dataset):
    # This code is based on SciPy's stats.f_oneway()
    # Copyright (c) Gary Strangman. All rights reserved.
    # License: BSD
    #
    # However, it got tweaked and optimized to better fit into PyMVPA.

    # number of groups
    targets_sa = dataset.sa[self.get_space()]
    labels = targets_sa.value
    ul = targets_sa.unique

    na = len(ul)
    bign = float(dataset.nsamples)
    alldata = dataset.samples

    # total squares of sums
    sostot = np.sum(alldata, axis=0)
    sostot *= sostot
    sostot /= bign

    # total sum of squares
    sstot = np.sum(alldata * alldata, axis=0) - sostot

    # between group sum of squares
    ssbn = 0
    for l in ul:
        # all samples for the respective label
        d = alldata[labels == l]
        sos = np.sum(d, axis=0)
        sos *= sos
        ssbn += sos / float(len(d))

    ssbn -= sostot
    # within
    sswn = sstot - ssbn

    # degrees of freedom
    dfbn = na - 1
    dfwn = bign - na

    # mean sums of squares
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # assure no NaNs -- otherwise, instead of a sane unittest failure
    # (check for NaNs), it leads to a crazy
    #   File "mtrand.pyx", line 1661, in mtrand.shuffle
    #   TypeError: object of type 'numpy.int64' has no len()
    # without any sane backtrace
    f[np.isnan(f)] = 0

    if externals.exists('scipy'):
        from scipy.stats import fprob
        return Dataset(f[np.newaxis], fa={'fprob': fprob(dfbn, dfwn, f)})
    else:
        return Dataset(f[np.newaxis])
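# Sanity sketch (not part of the class): the sums-of-squares arithmetic above
# reproduces scipy.stats.f_oneway for a single feature with three groups.
import numpy as np
from scipy.stats import f_oneway

g1, g2, g3 = np.random.randn(10), np.random.randn(10) + 1, np.random.randn(10)
allv = np.concatenate((g1, g2, g3))
sostot = allv.sum() ** 2 / allv.size
sstot = np.sum(allv * allv) - sostot
ssbn = sum(g.sum() ** 2 / g.size for g in (g1, g2, g3)) - sostot
sswn = sstot - ssbn
f_manual = (ssbn / 2.) / (sswn / (allv.size - 3.))   # dfbn=2, dfwn=27
assert np.isclose(f_manual, f_oneway(g1, g2, g3)[0])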
def test_ex_from_masked():
    ds = Dataset.from_wizard(samples=np.atleast_2d(np.arange(5)).view(myarray),
                             targets=1, chunks=1)
    # simple sequence has to be a single pattern
    assert_equal(ds.nsamples, 1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    # check correct pattern layout (1x5)
    assert_array_equal(ds.samples, [[0, 1, 2, 3, 4]])

    # check for single label and origin
    assert_array_equal(ds.targets, [1])
    assert_array_equal(ds.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(ValueError, vstack,
                  (ds, Dataset.from_wizard(np.ones((2, 3)),
                                           targets=1, chunks=1)))

    # now add two real patterns
    ds = vstack((ds, Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                         targets=2, chunks=2)))
    assert_equal(ds.nsamples, 3)
    assert_array_equal(ds.targets, [1, 2, 2])
    assert_array_equal(ds.chunks, [1, 2, 2])

    # test unique class labels
    ds = vstack((ds, Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                         targets=3, chunks=5)))
    assert_array_equal(ds.sa['targets'].unique, [1, 2, 3])

    # test wrong attributes length
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)),
                  targets=[1, 2, 3],
                  chunks=2)
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)),
                  targets=[1, 2, 3, 4],
                  chunks=[2, 2, 2])

    # now test one that is using from_masked
    ds = datasets['3dlarge']
    for a in ds.sa:
        assert_equal(len(ds.sa[a].value), len(ds))
    for a in ds.fa:
        assert_equal(len(ds.fa[a].value), ds.nfeatures)
def create_mvpa_dataset(aXData1, aXData2, chunks, labels):
    feat_list = []
    for x1, x2, chunk in zip(aXData1, aXData2, chunks):
        feat_list.append([x1, x2])

    data = Dataset(samples=feat_list)
    data.sa['id'] = range(0, len(labels))
    data.sa['chunks'] = chunks
    data.sa['targets'] = labels

    return data
def create_mvpa_dataset(aXData1, aXData2, aXData3, aXData4, chunks, labels):
    feat_list = []
    for x1, x2, x3, x4, chunk in zip(aXData1, aXData2, aXData3, aXData4,
                                     chunks):
        feat_list.append([x1, x2, x3, x4])

    data = Dataset(samples=feat_list)
    data.sa['id'] = range(0, len(labels))
    data.sa['chunks'] = chunks
    data.sa['targets'] = labels

    return data
def _get_test_dataset(include_nodes=True):
    # returns test dataset matching the contents of
    # _get_test_sample_node_data
    samples, nodes, _ = _get_test_sample_node_data()
    ds = Dataset(np.asarray(samples))

    if include_nodes:
        ds.fa['node_indices'] = np.asarray(nodes)

    nsamples = ds.nsamples
    ds.sa['intents'] = ['NIFTI_INTENT_NONE'] * nsamples

    return ds
def _call(self, dataset):
    data = dataset.samples
    if self.params.center_data:
        data = data - np.mean(data, 0)
    dsm = pdist(data, self.params.pairwise_metric)
    if self.params.comparison_metric == 'spearman':
        dsm = rankdata(dsm)
    rho, p = pearsonr(dsm, self.target_dsm)
    if self.params.corrcoef_only:
        return Dataset([rho], fa={'metrics': ['rho']})
    else:
        return Dataset([[rho, p]], fa={'metrics': ['rho', 'p']})
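# Standalone sketch of the comparison performed above, with synthetic data.
# Here both vectors are rank-transformed for the 'spearman' case (the method
# above ranks only the data DSM, assuming self.target_dsm was prepared
# accordingly).
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import pearsonr, rankdata

data = np.random.randn(8, 30)              # 8 conditions x 30 features
target_dsm = pdist(np.random.randn(8, 5))  # hypothetical model DSM
dsm = pdist(data, 'correlation')
rho, p = pearsonr(rankdata(dsm), rankdata(target_dsm))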
def _prep_h2a_data(self, response_data, node_indices):
    # wrap plain arrays into Datasets; rebinding the loop variable alone
    # would not update the list, so assign back by index
    for i, d in enumerate(response_data):
        if isinstance(d, np.ndarray):
            d = Dataset(d)
            d.fa['node_indices'] = node_indices.copy()
            response_data[i] = d
    connectivity_data = self._get_connectomes(response_data)
    h2a_input_data = self._frobenius_norm_and_merge(connectivity_data,
                                                    response_data,
                                                    node_indices)
    for d in h2a_input_data:
        d.fa['node_indices'] = node_indices.copy()
        zscore(d, chunks_attr=None)
    return h2a_input_data
def test_stack_add_dataset_attributes():
    data0 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data0.a['one'] = np.ones(2)
    data0.a['two'] = 2
    data0.a['three'] = 'three'
    data0.a['common'] = range(10)
    data0.a['array'] = np.arange(10)
    data1 = Dataset.from_wizard(np.ones((5, 5)), targets=1)
    data1.a['one'] = np.ones(3)
    data1.a['two'] = 3
    data1.a['four'] = 'four'
    data1.a['common'] = range(10)
    data1.a['array'] = np.arange(10)

    vstacker = lambda x: vstack((data0, data1), a=x)
    hstacker = lambda x: hstack((data0, data1), a=x)

    add_params = (1, None, 'unique', 'uniques', 'all', 'drop_nonunique')

    for stacker in (vstacker, hstacker):
        for add_param in add_params:
            if add_param == 'unique':
                assert_raises(DatasetError, stacker, add_param)
                continue

            r = stacker(add_param)

            if add_param == 1:
                assert_array_equal(data1.a.one, r.a.one)
                assert_equal(r.a.two, 3)
                assert_equal(r.a.four, 'four')
                assert_true('three' not in r.a.keys())
                assert_true('array' in r.a.keys())
            elif add_param == 'uniques':
                assert_equal(set(r.a.keys()),
                             set(['one', 'two', 'three',
                                  'four', 'common', 'array']))
                assert_equal(r.a.two, (2, 3))
                assert_equal(r.a.four, ('four',))
            elif add_param == 'all':
                assert_equal(set(r.a.keys()),
                             set(['one', 'two', 'three',
                                  'four', 'common', 'array']))
                assert_equal(r.a.two, (2, 3))
                assert_equal(r.a.three, ('three', None))
            elif add_param == 'drop_nonunique':
                assert_equal(set(r.a.keys()),
                             set(['common', 'three', 'four', 'array']))
                assert_equal(r.a.three, 'three')
                assert_equal(r.a.four, 'four')
                assert_equal(r.a.common, range(10))
                assert_array_equal(r.a.array, np.arange(10))
def test_cosmo_do_not_store_unsupported_datatype():
    ds = Dataset(np.zeros((0, 0)))

    class ArbitraryClass(object):
        pass

    ds.a['unused'] = ArbitraryClass()
    c = cosmo.map2cosmo(ds)
    assert_false('a' in c.keys())

    ds.a['foo'] = np.zeros((1,))
    c = cosmo.map2cosmo(ds)
    assert_true('a' in c.keys())
def _call(self, dataset):
    data = dataset.samples
    if self.center_data:
        data = data - np.mean(data, 0)
    dsm = pdist(data, self.pairwise_metric)
    if self.comparison_metric == 'spearman':
        dsm = rankdata(dsm)
    if self.partial_dsm is None:
        rho, p = pearsonr(dsm, self.target_dsm)
        return Dataset(np.array([rho]))
    else:
        rp = pcf3(dsm, self.target_dsm, self.partial_dsm)
        return Dataset(np.array([rp['rxy_z']]))
def _call(self, dataset):
    data = dataset.samples
    if self.center_data:
        data = data - np.mean(data, 0)
    dsm = pdist(data, self.pairwise_metric)
    if self.comparison_metric == 'spearman':
        dsm = rankdata(dsm)
    rho, p = pearsonr(dsm, self.target_dsm)
    if self.corrcoef_only:
        return Dataset(np.array([rho]))
    else:
        return Dataset(np.array([rho, p]))
def test_assign_sa():
    # https://github.com/PyMVPA/PyMVPA/issues/149
    ds = Dataset(np.arange(6).reshape((2, -1)), sa=dict(targets=range(2)))
    ds.sa['task'] = ds.sa['targets']
    # so it should be a new collectable now
    assert_equal(ds.sa['task'].name, 'task')
    assert_equal(ds.sa['targets'].name, 'targets')
    # this lead to issue reported in 149
    assert('task' in ds.sa.keys())
    assert('targets' in ds.sa.keys())
    ds1 = ds[:, 1]
    assert('task' in ds1.sa.keys())
    assert('targets' in ds1.sa.keys())  # issue reported in 149
    assert_equal(ds1.sa['task'].name, 'task')
    assert_equal(ds1.sa['targets'].name, 'targets')
def test_labelpermutation_randomsampling():
    ds = Dataset.from_wizard(np.ones((5, 10)), targets=range(5), chunks=1)
    for i in xrange(1, 5):
        ds.append(Dataset.from_wizard(np.ones((5, 10)) + i,
                                      targets=range(5), chunks=i + 1))
    # assign some feature attributes
    ds.fa['roi'] = np.repeat(np.arange(5), 2)
    ds.fa['lucky'] = np.arange(10) % 2
    # use subclass for testing if it would survive
    ds.samples = ds.samples.view(myarray)

    ok_(ds.get_nsamples_per_attr('targets')
        == {0: 5, 1: 5, 2: 5, 3: 5, 4: 5})
    sample = ds.random_samples(2)
    ok_(sample.get_nsamples_per_attr('targets').values() == [2, 2, 2, 2, 2])
    ok_((ds.sa['chunks'].unique == range(1, 6)).all())
def _call(self, dataset):
    # just for the beauty of it
    X = self._design

    # precomputed transformation is not yet done
    if self._inv_design is None:
        self._inv_ip = (X.T * X).I
        self._inv_design = self._inv_ip * X.T

    # get parameter estimations for all features at once
    # (betas x features)
    betas = self._inv_design * dataset.samples

    # charge state
    self.ca.pe = pe = betas.T.A

    # if betas and no z-stats are desired return them right away
    if not self._voi == 'pe' or self.ca.is_enabled('zstat'):
        # compute residuals
        residuals = X * betas
        residuals -= dataset.samples

        # estimates of the parameter variance and compute zstats
        # assumption of mean(E) == 0 and equal variance
        # XXX next lines ignore off-diagonal elements and hence covariance
        # between regressors. The humble being writing these lines asks the
        # god of statistics for forgiveness, because it knows not what it does
        diag_ip = np.diag(self._inv_ip)
        # (features x betas)
        beta_vars = np.array([r.var() * diag_ip for r in residuals.T])
        # (parameter x feature)
        zstat = pe / np.sqrt(beta_vars)

        # charge state
        self.ca.zstat = zstat

    if self._voi == 'pe':
        # return as (beta x feature)
        result = Dataset(pe.T)
    elif self._voi == 'zstat':
        # return as (zstat x feature)
        result = Dataset(zstat.T)
    else:
        # we shall never get to this point
        raise ValueError, \
              "Unknown variable of interest '%s'" % str(self._voi)
    result.sa['regressor'] = np.arange(len(result))
    return result
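# Minimal OLS sketch (plain numpy, not part of the class) mirroring the math
# above for a synthetic design with an intercept and one linear regressor:
import numpy as np

X = np.column_stack((np.ones(20), np.arange(20.)))   # design matrix
y = np.dot(X, [2.0, 0.5]) + np.random.randn(20) * 0.1
inv_ip = np.linalg.inv(np.dot(X.T, X))
betas = np.dot(inv_ip, np.dot(X.T, y))               # parameter estimates
resid = y - np.dot(X, betas)
beta_vars = resid.var() * np.diag(inv_ip)            # same diagonal shortcut
zstat = betas / np.sqrt(beta_vars)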
def test_h5py_io(dsfile):
    skip_if_no_external('h5py')

    # store random dataset to file
    ds = datasets['3dlarge']
    ds.save(dsfile)

    # reload and check for identity
    ds2 = Dataset.from_hdf5(dsfile)

    assert_array_equal(ds.samples, ds2.samples)
    for attr in ds.sa:
        assert_array_equal(ds.sa[attr].value, ds2.sa[attr].value)
    for attr in ds.fa:
        assert_array_equal(ds.fa[attr].value, ds2.fa[attr].value)
    assert_true(len(ds.a.mapper), 2)
    # since we have no __equal__ do at least some comparison
    assert_equal(repr(ds.a.mapper), repr(ds2.a.mapper))

    if __debug__:
        # debug mode needs special test as it enhances the repr output
        # with module info and id() appendix for objects
        #
        # INCORRECT slicing (:-1) since without any hash it results in an
        # empty list -- moreover we seem to not report ids with # any longer
        #
        #assert_equal('#'.join(repr(ds.a.mapper).split('#')[:-1]),
        #             '#'.join(repr(ds2.a.mapper).split('#')[:-1]))
        pass
def test_multidim_attrs():
    samples = np.arange(24).reshape(2, 3, 4)
    # have a dataset with two samples -- mapped from 2d into 1d
    # but have 2d labels and 3d chunks -- whatever that is
    ds = Dataset.from_wizard(samples.copy(),
                             targets=samples.copy(),
                             chunks=np.random.normal(size=(2, 10, 4, 2)))
    assert_equal(ds.nsamples, 2)
    assert_equal(ds.nfeatures, 12)
    assert_equal(ds.sa.targets.shape, (2, 3, 4))
    assert_equal(ds.sa.chunks.shape, (2, 10, 4, 2))

    # try slicing
    subds = ds[0]
    assert_equal(subds.nsamples, 1)
    assert_equal(subds.nfeatures, 12)
    assert_equal(subds.sa.targets.shape, (1, 3, 4))
    assert_equal(subds.sa.chunks.shape, (1, 10, 4, 2))

    # add multidim feature attr
    fattr = ds.mapper.forward(samples)
    assert_equal(fattr.shape, (2, 12))
    # should puke -- first axis is #samples
    assert_raises(ValueError, ds.fa.__setitem__, 'moresamples', fattr)
    # but that should be fine
    ds.fa['moresamples'] = fattr.T
    assert_equal(ds.fa.moresamples.shape, (12, 2))
def cosmo_dataset(cosmo):
    '''Construct a Dataset from CoSMoMVPA format

    Parameters
    ----------
    cosmo: str or Dataset-like or dict
        If a str it is treated as a filename of a .mat file with a matlab
        struct used in CoSMoMVPA, i.e. a struct with fields .samples, .sa,
        .fa, and .a. If a dict it is treated like the result from scipy's
        loadmat of a matlab struct used in CoSMoMVPA.

    Returns
    -------
    ds : Dataset
        PyMVPA Dataset object with values in .samples, .fa, .sa and .a
        based on the input
    '''
    if isinstance(cosmo, basestring):
        # load file
        cosmo = _loadmat_internal(cosmo)

    # do some sanity checks
    _check_cosmo_dataset(cosmo)

    # store samples
    args = dict(samples=cosmo['samples'])

    # set dataset, feature and sample attributes
    args.update(_attributes_cosmo2dict(cosmo))

    # build dataset using samples, fa, sa and a arguments
    return Dataset(**args)
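# Hypothetical usage, assuming 'cosmo_ds.mat' holds a CoSMoMVPA struct:
#
#   ds = cosmo_dataset('cosmo_ds.mat')
#
# or, starting from an already-loaded dict:
#
#   from scipy.io import loadmat
#   ds = cosmo_dataset(loadmat('cosmo_ds.mat'))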
def test_labelschunks_access():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = range(4)
    chunks = [1, 1, 2, 2]
    ds = Dataset.from_wizard(samples, labels, chunks)

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    # moreover they should point to the same thing
    ok_(ds.targets is ds.sa.targets)
    ok_(ds.targets is ds.sa['targets'].value)
    ok_(ds.chunks is ds.sa.chunks)
    ok_(ds.chunks is ds.sa['chunks'].value)

    # assignment should work at all levels including 1st
    ds.targets = chunks
    assert_array_equal(ds.targets, chunks)
    ok_(ds.targets is ds.sa.targets)
    ok_(ds.targets is ds.sa['targets'].value)

    # test broadcasting
    # but not for plain scalars
    assert_raises(ValueError, ds.set_attr, 'sa.bc', 5)
    # and not for plain plain str
    assert_raises(TypeError, ds.set_attr, 'sa.bc', "mike")
    # but for any iterable of len == 1
    ds.set_attr('sa.bc', (5,))
    ds.set_attr('sa.dc', ["mike"])
    assert_array_equal(ds.sa.bc, [5] * len(ds))
    assert_array_equal(ds.sa.dc, ["mike"] * len(ds))
def get_data(self):
    data = np.random.standard_normal((100, 2, 2, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    return Dataset.from_wizard(samples=data, targets=labels, chunks=chunks)
def _call(self, dataset):
    """Computes featurewise f-scores using compound comparisons."""
    targets_sa = dataset.sa[self.get_space()]
    orig_labels = targets_sa.value
    labels = orig_labels.copy()

    # Lets create a very shallow copy of a dataset with just
    # samples and targets_attr
    dataset_mod = Dataset(dataset.samples,
                          sa={self.get_space(): labels})
    results = []
    for ul in targets_sa.unique:
        labels[orig_labels == ul] = 1
        labels[orig_labels != ul] = 2
        f_ds = OneWayAnova._call(self, dataset_mod)
        if 'fprob' in f_ds.fa:
            # rename the fprob attribute to something label specific
            # to survive final aggregation stage
            f_ds.fa['fprob_' + str(ul)] = f_ds.fa.fprob
            del f_ds.fa['fprob']
        results.append(f_ds)

    results = vstack(results)
    results.sa[self.get_space()] = targets_sa.unique
    return results
def test_addaxis():
    from mvpa2.mappers.shape import AddAxisMapper
    ds = Dataset(np.arange(24).reshape(2, 3, 4),
                 sa={'testsa': np.arange(2)},
                 fa={'testfa': np.arange(3)})
    ds0 = AddAxisMapper(pos=0)(ds)
    assert_array_equal(ds0.shape, (1,) + ds.shape)
    # sas have extra dimension
    assert_array_equal(ds0.sa.testsa[0], ds.sa.testsa)
    # fas are duplicated
    assert_array_equal(ds0.fa.testfa[0], ds0.fa.testfa[1])
    ds1 = AddAxisMapper(pos=1)(ds)
    assert_array_equal(ds1.shape, (2, 1, 3, 4))
    # same sample attribute
    assert_equal(ds1.sa, ds.sa)
    # fas have extra dimension
    assert_array_equal(ds1.fa.testfa[0], ds.fa.testfa)
    ds2 = AddAxisMapper(pos=2)(ds)
    assert_array_equal(ds2.shape, (2, 3, 1, 4))
    # no change to attribute collections
    assert_equal(ds2.sa, ds.sa)
    assert_equal(ds2.fa, ds.fa)
    # append an axis
    ds3 = AddAxisMapper(pos=3)(ds)
    assert_array_equal(ds3.shape, ds.shape + (1,))
    # reverse indexing
    ds_1 = AddAxisMapper(pos=-1)(ds)
    assert_array_equal(ds3.samples, ds_1.samples)
    assert_equal(ds3.sa, ds_1.sa)
    assert_equal(ds3.fa, ds_1.fa)
    # add multiple axes
    ds4 = AddAxisMapper(pos=4)(ds)
    assert_array_equal(ds4.shape, ds.shape + (1, 1))
def test_h5py_io():
    skip_if_no_external('h5py')

    tempdir = tempfile.mkdtemp()

    # store random dataset to file
    ds = datasets['3dlarge']
    ds.save(os.path.join(tempdir, 'plain.hdf5'))

    # reload and check for identity
    ds2 = Dataset.from_hdf5(os.path.join(tempdir, 'plain.hdf5'))

    assert_array_equal(ds.samples, ds2.samples)
    for attr in ds.sa:
        assert_array_equal(ds.sa[attr].value, ds2.sa[attr].value)
    for attr in ds.fa:
        assert_array_equal(ds.fa[attr].value, ds2.fa[attr].value)
    assert_true(len(ds.a.mapper), 2)
    # since we have no __equal__ do at least some comparison
    if __debug__:
        # debug mode needs special test as it enhances the repr output
        # with module info and id() appendix for objects
        assert_equal('#'.join(repr(ds.a.mapper).split('#')[:-1]),
                     '#'.join(repr(ds2.a.mapper).split('#')[:-1]))
    else:
        assert_equal(repr(ds.a.mapper), repr(ds2.a.mapper))

    # cleanup temp dir
    shutil.rmtree(tempdir, ignore_errors=True)
def test_origmask_extraction():
    origdata = np.random.standard_normal((10, 2, 4, 3))
    data = Dataset.from_wizard(origdata, targets=2, chunks=2)

    # check with custom mask
    sel = data[:, 5]
    ok_(sel.samples.shape[1] == 1)
def test_feature_masking():
    mask = np.zeros((5, 3), dtype='bool')
    mask[2, 1] = True
    mask[4, 0] = True
    data = Dataset.from_wizard(np.arange(60).reshape((4, 5, 3)),
                               targets=1, chunks=1, mask=mask)

    # check simple masking
    ok_(data.nfeatures == 2)

    # selection should be idempotent
    ok_(data[:, mask].nfeatures == data.nfeatures)
    # check that correct feature get selected
    assert_array_equal(data[:, 1].samples[:, 0], [12, 27, 42, 57])
    # XXX put back when coord -> fattr is implemented
    #ok_(tuple(data[:, 1].a.mapper.getInId(0)) == (4, 0))
    ok_(data[:, 1].a.mapper.forward1(mask).shape == (1,))

    # check sugarings
    # XXX put me back
    #self.failUnless(np.all(data.I == data.origids))
    assert_array_equal(data.C, data.chunks)
    assert_array_equal(data.UC, np.unique(data.chunks))
    assert_array_equal(data.T, data.targets)
    assert_array_equal(data.UT, np.unique(data.targets))
    assert_array_equal(data.S, data.samples)
    assert_array_equal(data.O, data.mapper.reverse(data.samples))
def test_mean_removal():
    test_array = np.array([[0, 0.5, 1, 1.5],
                           [2, 2.5, 3, 3.5],
                           [3, 3.5, 4, 4.5],
                           [5, 5.5, 6, 6.5],
                           [7, 7.5, 8, 8.5]])
    test_dataset = Dataset(test_array)

    desired_result = np.array([[-0.75, -0.25, 0.25, 0.75],
                               [-0.75, -0.25, 0.25, 0.75],
                               [-0.75, -0.25, 0.25, 0.75],
                               [-0.75, -0.25, 0.25, 0.75],
                               [-0.75, -0.25, 0.25, 0.75]])

    mr = MeanRemoval(in_place=False)
    mr_inplace = MeanRemoval(in_place=True)
    mr_fx = subtract_mean_feature()
    functions = (mr, mr_inplace, mr_fx)

    for function in functions:
        assert_true(np.array_equal(function(test_array.copy()),
                                   desired_result), function)
    for function in functions:
        assert_true(np.array_equal(function(test_dataset.copy()).samples,
                                   desired_result))

    random_array = np.random.rand(50, 1000)
    assert_true(np.array_equal(mr_fx(random_array.copy()),
                               mr(random_array.copy())))
    assert_true(np.array_equal(mr_fx(random_array.copy()),
                               mr_inplace(random_array.copy())))
    # corner cases
    int_arr = np.array([1, 2, 3, 4, 5])
    desired = int_arr.astype(float) - int_arr.mean()
    assert_array_equal(mr.forward1(int_arr), desired)
    # or list
    assert_array_equal(mr.forward1(list(int_arr)), desired)
    # missing value -> NaN just like mean() would do
    nan_arr = np.array([1, 2, np.nan, 4, 5])
    assert_array_equal(mr.forward1(nan_arr), [np.nan] * len(int_arr))
    # but with a masked array it works as intended, i.e. just like mean()
    nan_arr = np.ma.array(nan_arr, mask=np.isnan(nan_arr))
    nan_arr_dm = desired.copy()
    nan_arr_dm[2] = np.nan
    assert_array_equal(mr.forward1(nan_arr), nan_arr_dm)
def test_ex_from_masked():
    ds = Dataset.from_wizard(samples=np.atleast_2d(np.arange(5)).view(myarray),
                             targets=1, chunks=1)
    # simple sequence has to be a single pattern
    assert_equal(ds.nsamples, 1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    # check correct pattern layout (1x5)
    assert_array_equal(ds.samples, [[0, 1, 2, 3, 4]])

    # check for single label and origin
    assert_array_equal(ds.targets, [1])
    assert_array_equal(ds.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(DatasetError, ds.append,
                  Dataset.from_wizard(np.ones((2, 3)), targets=1, chunks=1))

    # now add two real patterns
    ds.append(Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                  targets=2, chunks=2))
    assert_equal(ds.nsamples, 3)
    assert_array_equal(ds.targets, [1, 2, 2])
    assert_array_equal(ds.chunks, [1, 2, 2])

    # test unique class labels
    ds.append(Dataset.from_wizard(np.random.standard_normal((2, 5)),
                                  targets=3, chunks=5))
    assert_array_equal(ds.sa['targets'].unique, [1, 2, 3])

    # test wrong attributes length
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)),
                  targets=[1, 2, 3],
                  chunks=2)
    assert_raises(ValueError, Dataset.from_wizard,
                  np.random.standard_normal((4, 2, 3, 4)),
                  targets=[1, 2, 3, 4],
                  chunks=[2, 2, 2])

    # now test one that is using from_masked
    ds = datasets['3dlarge']
    for a in ds.sa:
        assert_equal(len(ds.sa[a].value), len(ds))
    for a in ds.fa:
        assert_equal(len(ds.fa[a].value), ds.nfeatures)
def test_shape_conversion():
    ds = Dataset.from_wizard(np.arange(24).reshape((2, 3, 4)).view(myarray),
                             targets=1, chunks=1)
    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    assert_equal(ds.nsamples, 2)
    assert_equal(ds.samples.shape, (2, 12))
    assert_array_equal(ds.samples, [range(12), range(12, 24)])
def _forward_dataset(self, ds):
    chunks_attr = self.__chunks_attr
    mds = Dataset([])
    mds.a = ds.a
    # mds.sa = ds.sa
    # mds.fa = ds.fa
    if chunks_attr is None:
        # global kmeans
        mds.samples = self._kmeans(ds.samples).labels_
        print max(mds.samples)
    else:
        # per chunk kmeans
        # initialize the output once, outside the loop, and copy so the
        # input samples stay untouched
        mds.samples = ds.samples[0, :].copy()
        for c in ds.sa[chunks_attr].unique:
            slicer = np.where(ds.sa[chunks_attr].value == c)[0]
            mds.samples[slicer] = self._kmeans(ds.samples[slicer]).labels_
    return mds
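# Sketch of the clustering step above, assuming self._kmeans wraps
# sklearn.cluster.KMeans: fitting exposes integer cluster labels via .labels_.
import numpy as np
from sklearn.cluster import KMeans

X = np.random.randn(30, 5)
labels = KMeans(n_clusters=3).fit(X).labels_   # one cluster label per sample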
def setUp(self):
    data = np.random.standard_normal((100, 3, 4, 2))
    labels = np.concatenate((np.repeat(0, 50), np.repeat(1, 50)))
    chunks = np.repeat(range(5), 10)
    chunks = np.concatenate((chunks, chunks))
    mask = np.ones((3, 4, 2), dtype="bool")
    mask[0, 0, 0] = 0
    mask[1, 3, 1] = 0
    self.dataset = Dataset.from_wizard(samples=data,
                                       targets=labels,
                                       chunks=chunks,
                                       mask=mask)
def _call(self, ds):
    test_ds = self._prepare_ds(ds)
    if test_ds.nsamples != self._train_ds.nsamples:
        raise ValueError('Datasets should have same sample size for '
                         'dissimilarity, nsamples for train: %d, test: %d'
                         % (self._train_ds.nsamples, test_ds.nsamples))
    # Call actual distance metric
    distds = cdist(self._train_ds.samples, test_ds.samples,
                   metric=self.params.pairwise_metric,
                   **self.params.pairwise_metric_kwargs)
    # Make target pairs
    sa_dict = dict()
    for k in self._train_ds.sa:
        if k in test_ds.sa:
            sa_dict[k] = list(product(self._train_ds.sa.get(k).value,
                                      test_ds.sa.get(k).value))
    distds = Dataset(samples=distds.ravel()[:, None], sa=sa_dict)
    return distds
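# Standalone sketch of the cdist call above: pairwise distances between the
# rows of two sample matrices, flattened into a single-column dataset shape.
import numpy as np
from scipy.spatial.distance import cdist

train = np.random.randn(4, 10)
test = np.random.randn(4, 10)
dist = cdist(train, test, metric='correlation')   # shape (4, 4)
column = dist.ravel()[:, None]                    # shape (16, 1)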
def test_icamapper():
    # data: 40 sample feature line in 2d space (40x2; samples x features)
    samples = np.vstack([np.arange(40.) for i in range(2)]).T
    samples -= samples.mean()
    samples += np.random.normal(size=samples.shape, scale=0.1)
    ndlin = Dataset(samples)

    pm = ICAMapper()
    try:
        pm.train(ndlin.copy())
        assert_equal(pm.proj.shape, (2, 2))

        p = pm.forward(ndlin.copy())
        assert_equal(p.shape, (40, 2))
        # check that the mapped data can be fully recovered by 'reverse()'
        assert_array_almost_equal(pm.reverse(p), ndlin)
    except mdp.NodeException:
        # do not puke if the ICA did not converge at all -- that is not our
        # fault but MDP's
        pass
def test_npz_io(dsfile):
    # store random dataset to file
    ds = datasets['3dlarge'].copy()
    ds.a.pop('mapper')  # can't be saved
    ds.to_npz(dsfile)

    # reload and check for identity
    ds2 = Dataset.from_npz(dsfile)
    assert_datasets_equal(ds, ds2)
    assert_array_equal(ds.samples, ds2.samples)

    # But if we try to save with mapper -- it just gets ignored
    # (warning is issued)
    datasets['3dlarge'].to_npz(dsfile)
    ds2_ = Dataset.from_npz(dsfile)
    assert_datasets_equal(ds2, ds2_)
def test_pcamapper():
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    ndlin = Dataset(np.concatenate([np.arange(40)
                                    for i in range(20)]).reshape(20, -1).T)

    pm = PCAMapper()
    # train PCA
    assert_raises(mdp.NodeException, pm.train, ndlin)
    ndlin.samples = ndlin.samples.astype('float')
    ndlin_noise = ndlin.copy()
    ndlin_noise.samples += np.random.random(size=ndlin.samples.shape)
    # we have no variance for more than one PCA component, hence just one
    # actual non-zero eigenvalue
    assert_raises(mdp.NodeException, pm.train, ndlin)
    pm.train(ndlin_noise)
    assert_equal(pm.proj.shape, (20, 20))
    # now project data into PCA space
    p = pm.forward(ndlin.samples)
    assert_equal(p.shape, (40, 20))
    # check that the mapped data can be fully recovered by 'reverse()'
    assert_array_almost_equal(pm.reverse(p), ndlin)