def intersubject_correlation(dss, reference_ds=0):
    """Compute voxelwise inter-subject time series correlations.

    Correlations are computed in a pairwise fashion (every combination of
    two datasets) and separately for every feature.

    Parameters
    ----------
    dss : list of Dataset or array-like
      Input data. Items that are not a Dataset are wrapped into one.
      All datasets must have the same shape.
    reference_ds : int, optional
      Index into `dss` of the dataset whose feature attributes (``fa``) and
      dataset attributes (``a``) are inherited by the result
      [Default: first dataset in the list].

    Returns
    -------
    Dataset
      One sample per pair of input datasets and one feature per input
      feature. The sample attribute ``pairs`` holds, for each sample, the
      indices (into `dss`) of the two datasets that were correlated.
    """
    # Wrap anything that is not yet a Dataset (e.g. plain ndarrays)
    dss = [ds if isinstance(ds, Dataset) else Dataset(ds) for ds in dss]
    reference = dss[reference_ds]
    ds_shape = reference.shape
    n_features = ds_shape[1]

    for ds in dss:
        assert ds.shape == ds_shape

    # Compute time series correlation per feature per subject pair.
    # Build real lists instead of using map(): on Python 3 map() is lazy and
    # np.asarray() would end up with an object array of iterators.
    correlations = np.asarray([
        [pearson_correlation(a, b)
         for a, b in zip(ds1.samples.T, ds2.samples.T)]
        for ds1, ds2 in combinations(dss, 2)
    ])

    # Resulting correlation map inherits attributes of reference data set
    correlations_ds = Dataset(correlations, fa=reference.fa, a=reference.a)
    correlations_ds.sa['pairs'] = list(combinations(range(len(dss)), 2))

    # n * (n - 1) is always even, so integer division is exact
    assert correlations_ds.shape[0] == len(dss) * (len(dss) - 1) // 2
    assert correlations_ds.shape[1] == n_features

    return correlations_ds
def test_surf_ring_queryengine(self):
    """Check SurfaceRingQueryEngine against a brute-force ring selection."""
    # two-layer surface: a 4x5 plane merged with a slightly shifted copy
    plane = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)
    s2 = surf.merge(plane, (plane + (.01, 0, 0)))

    ds = Dataset(samples=np.arange(20)[np.newaxis],
                 fa=dict(node_indices=np.arange(39, 0, -2)))
    # add more features (with shared node indices)
    ds3 = hstack((ds, ds, ds))

    radius = 2.5
    inner_radius = 1.0

    # an inner_radius that is >= radius must be rejected
    assert_raises(ValueError,
                  lambda: queryengine.SurfaceRingQueryEngine(
                      surface=s2, inner_radius=2.5, radius=radius))

    metrics = ('euclidean', 'dijkstra', 'euclidean', 'dijkstra')
    center_flags = [True, False] * 2
    for distance_metric, include_center in zip(metrics, center_flags):
        qe = queryengine.SurfaceRingQueryEngine(
            surface=s2,
            radius=radius,
            inner_radius=inner_radius,
            distance_metric=distance_metric,
            include_center=include_center)

        # untrained qe should give errors
        assert_raises(ValueError, lambda: qe.ids)
        assert_raises(ValueError, lambda: qe.query_byid(0))

        # node index out of bounds should give error
        ds_ = ds.copy()
        ds_.fa.node_indices[0] = 100
        assert_raises(ValueError, lambda: qe.train(ds_))

        # lack of node indices should give error
        ds_.fa.pop('node_indices')
        assert_raises(ValueError, lambda: qe.train(ds_))

        # train the qe
        qe.train(ds3)

        for node in np.arange(-1, s2.nvertices + 1):
            if not (0 <= node < s2.nvertices):
                # ids outside of the surface cannot be queried
                assert_raises(KeyError, lambda: qe.query_byid(node))
                continue

            feature_ids = np.asarray(qe.query_byid(node))

            # node indices relative to ds
            base_ids = feature_ids[feature_ids < 20]

            # should have multiples of 20
            replicated = (base_ids[np.newaxis].T + [0, 20, 40]).ravel()
            assert_equal(set(feature_ids), set(replicated))

            node_indices = s2.circlearound_n2d(
                node, radius, distance_metric or 'dijkstra')

            # brute force: features whose node lies within the outer radius
            # but beyond the inner one
            fa_indices = []
            for fa_index, inode in enumerate(ds3.fa.node_indices):
                if inode in node_indices \
                        and node_indices[inode] > inner_radius:
                    fa_indices.append(fa_index)

            if include_center and node in ds3.fa.node_indices:
                fa_indices += np.where(
                    ds3.fa.node_indices == node)[0].tolist()

            assert_equal(set(feature_ids), set(fa_indices))
def test_zscore_withoutchunks():
    """Smoke test for z-scoring a dataset with `chunks_attr=None`."""
    # see https://github.com/PyMVPA/PyMVPA/issues/26
    from mvpa2.datasets import Dataset

    ds = Dataset(np.arange(32).reshape((8, -1)),
                 sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)

    # compare against a freshly built array, so the reference cannot be
    # affected by whatever zscore() did to the dataset's samples
    untouched = np.arange(32).reshape((8, -1))
    assert np.any(ds.samples != untouched)

    # summary must still be computable
    assert ds.summary() is not None
def test_zscore_withoutchunks():
    """Smoke test: zscore() with `chunks_attr=None` changes the samples."""
    # covers the problems reported in
    # https://github.com/PyMVPA/PyMVPA/issues/26
    from mvpa2.datasets import Dataset

    shape = (8, -1)
    ds = Dataset(np.arange(32).reshape(shape), sa=dict(targets=range(8)))
    zscore(ds, chunks_attr=None)

    # a new reference array is created on purpose (not shared with ds)
    changed = ds.samples != np.arange(32).reshape(shape)
    assert np.any(changed)

    summary = ds.summary()
    assert summary is not None
def _proc_block(self, block, ds, measure):
    """Compute `measure` on the ROI around every center in `block`.

    Helper that captures the part of the computation that can be
    parallelized.

    Parameters
    ----------
    block : sequence
      ROI center ids; each one is looked up in the query engine.
    ds : Dataset
      Dataset the ROIs are sliced from (feature-wise).
    measure : callable
      Called with each ROI dataset.

    Returns
    -------
    tuple
      ``(results, roi_sizes)`` -- the list of per-ROI results, and the list
      of ROI sizes (``None`` unless the ``roi_sizes`` conditional attribute
      is enabled).
    """
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))

    roi_sizes = [] if self.ca.is_enabled('roi_sizes') else None
    results = []

    # put rois around all features in the dataset and compute the
    # measure within them
    for i, center in enumerate(block):
        # feature ids of all features in the ROI, as reported by the
        # query engine
        roi_fids = self._queryengine[center]

        if __debug__ and debug_slc_:
            debug('SLC_',
                  'For %r query returned ids %r' % (center, roi_fids))

        # slice the dataset
        roi = ds[:, roi_fids]

        if self.__add_center_fa:
            # flag the ROI seed in a boolean feature attribute
            seed_mask = np.zeros(roi.nfeatures, dtype='bool')
            seed_mask[roi_fids.index(center)] = True
            roi.fa[self.__add_center_fa] = seed_mask

        # compute the datameasure
        res = measure(roi)

        if self.ca.is_enabled('roi_feature_ids'):
            if not is_datasetlike(res):
                res = Dataset(np.atleast_1d(res))
            # remember the ROI feature ids for later aggregation
            res.a['roi_feature_ids'] = roi_fids

        results.append(res)

        # store the size of the roi dataset
        if roi_sizes is not None:
            roi_sizes.append(roi.nfeatures)

        if __debug__:
            percent = float(i + 1) / len(block) * 100
            debug('SLC',
                  "Doing %i ROIs: %i (%i features) [%i%%]"
                  % (len(block), center + 1, roi.nfeatures, percent),
                  cr=True)

    return results, roi_sizes
def run(args):
    """Build a dataset from the source named in `args` and save it as HDF5.

    The first available source among ``args.txt_data``, ``args.npy_data``
    and ``args.mri_data`` is used; if none is given the dataset is loaded
    from ``args.data``. Common attribute options and optional motion
    parameters are applied before the dataset is written to
    ``args.output``.
    """
    from mvpa2.base.hdf5 import h5save

    ds = None
    if args.txt_data is not None:
        verbose(1, "Load data from TXT file '%s'" % args.txt_data)
        ds = Dataset(_load_from_txt(args.txt_data))
    elif args.npy_data is not None:
        verbose(1, "Load data from NPY file '%s'" % args.npy_data)
        ds = Dataset(_load_from_npy(args.npy_data))
    elif args.mri_data is not None:
        verbose(1, "Load data from MRI image(s) %s" % args.mri_data)
        from mvpa2.datasets.mri import fmri_dataset
        vol_attr = dict()
        if args.add_vol_attr is not None:
            # XXX add a way to use the mapper of an existing dataset to
            # add a volume attribute without having to load the entire
            # mri data again
            vol_attr = dict(args.add_vol_attr)
            if len(args.add_vol_attr) != len(vol_attr):
                # the dict dropped entries, i.e. a name was given twice
                warning("--vol-attr option with duplicate attribute name: "
                        "check arguments!")
            verbose(2, "Add volumetric feature attributes: %s" % vol_attr)
        ds = fmri_dataset(args.mri_data, mask=args.mask, add_fa=vol_attr)

    if ds is not None:
        if args.data is not None:
            verbose(1,
                    'ignoring dataset input in favor of other data source '
                    '-- remove either one to disambiguate')
    elif args.data is None:
        raise RuntimeError('no data source specific')
    else:
        ds = hdf2ds(args.data)[0]

    # act on all attribute options
    ds = process_common_dsattr_opts(ds, args)

    if args.add_fsl_mcpar is not None:
        from mvpa2.misc.fsl.base import McFlirtParams
        mc_par = McFlirtParams(args.add_fsl_mcpar)
        for param in mc_par:
            sa_name = 'mc_' + param
            verbose(2,
                    "Add motion regressor as sample attribute '%s'"
                    % sa_name)
            ds.sa[sa_name] = mc_par[param]

    verbose(3, "Dataset summary %s" % (ds.summary()))

    # and store -- make sure the file name carries the HDF5 extension
    outfilename = args.output
    if not outfilename.endswith('.hdf5'):
        outfilename = outfilename + '.hdf5'
    verbose(1, "Save dataset to '%s'" % outfilename)
    h5save(outfilename, ds, mkdir=True, compression=args.hdf5_compression)
def _forward_dataset(self, ds):
    """Apply the stored function to `ds` and the stored training dataset.

    The training dataset is passed as first argument if
    ``self._train_as_1st`` is set, otherwise as second. A result that is
    not a Dataset is wrapped into one.
    """
    if self._train_as_1st:
        fx_args = (self._ds_train, ds)
    else:
        fx_args = (ds, self._ds_train)
    out = self._fx(*fx_args)

    if isinstance(out, Dataset):
        return out

    # wrap output in a dataset
    try:
        return Dataset(out)
    except ValueError:
        # not a sequence?
        return Dataset([out])
def _level3(self, datasets):
    """Train one mapper per dataset onto the final common space.

    Returns the list of trained mappers (copies of ``params.alignment``).
    If the ``residual_errors`` conditional attribute is enabled, the norm
    of the difference between each forwarded dataset and the common space
    is stored in it.
    """
    params = self.params            # for quicker access ;)
    # create a mapper per dataset
    mappers = [deepcopy(params.alignment) for _ in datasets]

    # key different from level-2; the common space is uniform
    #temp_commonspace = commonspace

    track_residuals = self.ca['residual_errors'].enabled
    residuals = None
    if track_residuals:
        residuals = np.zeros((1, len(datasets)))
        # the array is filled in below, after it was handed to the dataset
        self.ca.residual_errors = Dataset(samples=residuals)

    # start from original input datasets again
    for i, (mapper, ds) in enumerate(zip(mappers, datasets)):
        if __debug__:
            debug('HPAL_', "Level 3: ds #%i" % i)

        # retrain mapper on final common space
        ds.sa[mapper.get_space()] = self.commonspace
        mapper.train(ds)
        # remove common space attribute again to save on memory
        del ds.sa[mapper.get_space()]

        if residuals is None:
            continue
        # obtain final projection
        projected = mapper.forward(ds.samples)
        residuals[0, i] = np.linalg.norm(projected - self.commonspace)

    return mappers
def test_pval():
    """Test threshold map computation and the p-value transformation."""

    def not_inplace_shuffle(x):
        # np.apply_along_axis needs a return value; random.shuffle works
        # in-place and returns None
        x = list(x)
        random.shuffle(x)
        return x

    # 20 columns, each holding a shuffled copy of 0..99999.
    # list() is needed because a Python 3 range cannot be multiplied.
    x = list(range(100000)) * 20
    x = np.array(x)
    x = x.reshape(20, 100000)
    x = x.T
    x = np.apply_along_axis(not_inplace_shuffle, axis=0, arr=x)

    expected_result = [100000 - 100000 * 0.001] * 20

    thresholds = gct.get_thresholding_map(x, p=0.001)
    assert_array_equal(thresholds, expected_result)

    # works with datasets too
    dsthresholds = gct.get_thresholding_map(Dataset(x), p=0.001)
    assert_almost_equal(thresholds, dsthresholds)

    assert_raises(ValueError, gct.get_thresholding_map, x, p=0.00000001)

    # materialize the range, so the callee gets a real sequence on Python 3
    # as it did on Python 2
    x = list(range(0, 100, 5))
    null_dist = np.repeat(1, 100).astype(float)[None]
    pvals = gct._transform_to_pvals(x, null_dist)
    desired_output = np.array([
        1, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55,
        0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05,
    ])
    assert_array_almost_equal(desired_output, pvals)
def test_repeater():
    """Repeater yields `reps` datasets, each tagged with its index."""
    reps = 4
    repeater = Repeater(reps, space='OMG')
    generated = list(repeater.generate(Dataset([0, 1])))

    assert_equal(len(generated), reps)
    # the repetition counter is stored in the dataset attribute 'OMG'
    for count, ds in enumerate(generated):
        assert_equal(ds.a.OMG, count)
def _call(self, dataset):
    """Run all analyzers on `dataset` and combine their sensitivities.

    The combined result is a single dataset in which the sample attribute
    named by ``self._sa_attr`` holds the index of the analyzer each sample
    came from. The result is also stored in ``self.ca.sensitivities``.
    """
    sensitivities = []
    for ind, analyzer in enumerate(self.__analyzers):
        if __debug__:
            debug("SA",
                  "Computing sensitivity for SA#%d:%s" % (ind, analyzer))
        sensitivities.append(analyzer(dataset))

    if __debug__:
        debug("SA",
              "Returning %d sensitivities from %s"
              % (len(sensitivities), self.__class__.__name__))

    sa_attr = self._sa_attr
    if not isinstance(sensitivities[0], AttrDataset):
        # plain values: wrap them all into one dataset
        sensitivities = Dataset(
            sensitivities,
            sa={sa_attr: np.arange(len(sensitivities))})
    else:
        # datasets: label the samples of each with its analyzer index
        # and stack them
        for i, sens in enumerate(sensitivities):
            sens.sa[sa_attr] = np.repeat(i, len(sens))
        sensitivities = vstack(sensitivities)

    self.ca.sensitivities = sensitivities

    return sensitivities
def psc(ds, **kwargs):
    """Apply a `PSCMapper` to a `Dataset` or `ndarray`.

    A `PSCMapper` is created, trained on the data and used to forward-map
    it. If the mapped result is a `Dataset`, the mapper is appended to it.
    Nothing is returned.

    Parameters
    ----------
    ds : Dataset or ndarray
      The data to process. Anything that is not a `Dataset` is wrapped
      into one for training only.
    **kwargs
      Passed on to the `PSCMapper` constructor.

    Notes
    -----
    The private ``_secret_inplace_zscore`` flag is set on the mapper before
    use. NOTE(review): the flag name refers to z-scoring -- confirm that
    `PSCMapper` honors it and really operates in-place.
    """
    mapper = PSCMapper(**kwargs)
    mapper._secret_inplace_zscore = True

    # train
    train_ds = ds if isinstance(ds, Dataset) else Dataset(ds)
    mapper.train(train_ds)

    # map the original input (not the wrapped one)
    mapped = mapper.forward(ds)

    # and append the mapper to the dataset
    if isinstance(mapped, Dataset):
        mapped._append_mapper(mapper)
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe*, a software for analysing event-related
    potentials (ERP), which was developed at the Max-Planck Institute for
    Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe

    Parameters
    ----------
    samples : str or EEPBin instance
      Either the filename of an EEP file, or an EEPBin instance providing
      the samples data.
    targets, chunks : sequence or scalar or None
      Passed through to `Dataset.from_channeltimeseries()`.

    Returns
    -------
    Dataset

    Raises
    ------
    ValueError
      If `samples` is neither a string nor an EEPBin instance.
    """
    if isinstance(samples, EEPBin):
        # already loaded
        eeg = samples
    elif isinstance(samples, str):
        # open the eep file
        eeg = EEPBin(samples)
    else:
        raise ValueError("eep_dataset takes the filename of an "
                         "EEP file or a EEPBin object as 'samples' argument.")

    # init dataset
    return Dataset.from_channeltimeseries(
        eeg.data,
        targets=targets,
        chunks=chunks,
        t0=eeg.t0,
        dt=eeg.dt,
        channelids=eeg.channels)
def _sl_call(self, dataset, roi_ids, nproc):
    """Classical generic searchlight implementation

    Parameters
    ----------
    dataset : Dataset
      Dataset the searchlight is run on.
    roi_ids : sequence
      Ids of the ROI centers to process.
    nproc : int or None
      If larger than 1, the ROI centers are split into blocks which are
      processed in child processes via `pprocess`; otherwise everything is
      computed in the current process.

    Returns
    -------
    Dataset
      Whatever ``self.results_fx`` produced, wrapped into a Dataset if it
      was not dataset-like already.
    """
    assert (self.results_backend in ('native', 'hdf5'))

    # compute
    if nproc is not None and nproc > 1:
        # split all target ROIs centers into `nproc` equally sized blocks
        nproc_needed = min(len(roi_ids), nproc)
        nblocks = nproc_needed if self.nblocks is None else self.nblocks
        roi_blocks = np.array_split(roi_ids, nblocks)

        # the next block sets up the infrastructure for parallel computing
        # this can easily be changed into a ParallelPython loop, if we
        # decide to have a PP job server in PyMVPA
        import pprocess
        p_results = pprocess.Map(limit=nproc_needed)
        if __debug__:
            debug('SLC',
                  "Starting off %s child processes for nblocks=%i"
                  % (nproc_needed, nblocks))
        compute = p_results.manage(pprocess.MakeParallel(self._proc_block))
        for iblock, block in enumerate(roi_blocks):
            # should we maybe deepcopy the measure to have a unique and
            # independent one per process?
            seed = mvpa2.get_random_seed()
            compute(block, dataset, copy.copy(self.__datameasure),
                    seed=seed, iblock=iblock)
    else:
        # otherwise collect the results in an 1-item list
        p_results = [
            self._proc_block(roi_ids, dataset, self.__datameasure)
        ]

    # Finally collect and possibly process results
    # p_results here is either a generator from pprocess.Map or a list.
    # In case of a generator it allows to process results as they become
    # available
    result_ds = self.results_fx(
        sl=self,
        dataset=dataset,
        roi_ids=roi_ids,
        results=self.__handle_all_results(p_results))

    # Assure having a dataset (for paranoid ones)
    if not is_datasetlike(result_ds):
        try:
            result_a = np.atleast_1d(result_ds)
        except ValueError as e:
            # `except ... as` is valid on both Python 2.6+ and Python 3
            if 'setting an array element with a sequence' in str(e):
                # try forcing object array.  Happens with
                # test_custom_results_fx_logic on numpy 1.4.1 on Debian
                # squeeze
                result_a = np.array(result_ds, dtype=object)
            else:
                raise
        result_ds = Dataset(result_a)

    # hand the assembled result back to the caller; without this the whole
    # computation above would be discarded
    return result_ds
def test_sifter_with_balancing():
    """Sifter keeps only partitions with balanced testing targets."""
    # extended previous test which was already
    # "... somewhat duplicating the doctest"
    sample_attrs = {
        'chunks': [0, 1, 2, 3, 4, 5],
        'targets': ['c', 'c', 'c', 'p', 'p', 'p'],
    }
    ds = Dataset(samples=np.arange(12).reshape((-1, 2)), sa=sample_attrs)

    # Without sifter -- just to assure that we do get all of them
    # i.e. 6*5*4*3/(4!) = 15
    par = ChainNode([NFoldPartitioner(cvtype=4, attr='chunks')])
    assert_equal(len(list(par.generate(ds))), 15)

    # an unknown sifting specification has to be rejected
    assert_raises(
        ValueError,
        lambda x: list(Sifter([('targets', dict(wrong=1))]).generate(x)),
        ds)

    # so we will take 4 chunks out of the available 6, but would care only
    # about those partitions where we have balanced number of 'c' and 'p'
    # entries
    balanced_targets = dict(uvalues=['c', 'p'], balanced=True)
    par = ChainNode([
        NFoldPartitioner(cvtype=4, attr='chunks'),
        Sifter([('partitions', 2), ('targets', balanced_targets)]),
    ])
    dss = list(par.generate(ds))
    # print [ x[x.sa.partitions==2].sa.targets for x in dss ]
    assert_equal(len(dss), 9)

    for ds_ in dss:
        partitions = ds_.sa.partitions
        testing = ds[partitions == 2]
        assert_array_equal(np.unique(testing.sa.targets), ['c', 'p'])
        # and we still have both targets present in training
        training = ds[partitions == 1]
        assert_array_equal(np.unique(training.sa.targets), ['c', 'p'])
def test_exclude_targets_combinations_subjectchunks():
    """Each partition tests exactly one (subject, chunk) pair."""
    partitioner = ChainNode(
        [NFoldPartitioner(attr='subjects'),
         ExcludeTargetsCombinationsPartitioner(
             k=1, targets_attr='chunks', space='partitions')],
        space='partitions')

    # targets do not need even to be defined!
    ds = Dataset(np.arange(18).reshape(9, 2),
                 sa={'chunks': np.arange(9) // 3,
                     'subjects': np.arange(9) % 3})
    dss = list(partitioner.generate(ds))
    assert_equal(len(dss), 9)

    testing_subjs, testing_chunks = [], []
    for ds_ in dss:
        testing_partition = ds_.sa.partitions == 2
        training_partition = ds_.sa.partitions == 1
        # must be scalars -- so implicit test here: .item() raises a
        # ValueError unless there is exactly one unique value.
        # (.item() replaces np.asscalar(), which is gone from recent numpy)
        testing_subj = np.unique(ds_.sa.subjects[testing_partition]).item()
        testing_subjs.append(testing_subj)
        testing_chunk = np.unique(ds_.sa.chunks[testing_partition]).item()
        testing_chunks.append(testing_chunk)
        # and those must not appear for training
        ok_(testing_subj not in ds_.sa.subjects[training_partition])
        ok_(testing_chunk not in ds_.sa.chunks[training_partition])

    # and we should have gone through all chunks/subjs pairs
    testing_pairs = set(zip(testing_subjs, testing_chunks))
    assert_equal(len(testing_pairs), 9)
    # yoh: equivalent to set(itertools.product(range(3), range(3))))
    #      but .product is N/A for python2.5
    assert_equal(testing_pairs, set(zip(*np.where(np.ones((3, 3))))))
def _forward_dataset(self, ds):
    """Return a new Dataset with an extra axis at position ``self._pos``.

    Sample and feature attributes are adjusted when the new axis is
    inserted at position 0 or 1; dataset attributes are passed on as is.
    """
    ndim = len(ds.shape)
    pos = self._pos
    if pos < 0:
        # support negative/reverse indices
        pos = ndim + 1 + pos

    # select all prior axes, but at most all existing axes ...
    n_keep = min(pos, ndim)
    # ... and as many new axes as necessary afterwards
    n_new = max(1, pos + 1 - ndim)
    slicer = [slice(None)] * n_keep + [None] * n_new

    # there are two special cases that require modification of feature
    # attributes
    if pos == 0:
        # prepend an axis to all sample attributes
        out_sa = {attr: ds.sa[attr].value[None] for attr in ds.sa}
        # prepend an axis to all FAs and repeat for each previous sample
        out_fa = {attr: np.repeat(ds.fa[attr].value[None], len(ds), axis=0)
                  for attr in ds.fa}
    elif pos == 1:
        # prepend an axis to all feature attributes
        out_fa = {attr: ds.fa[attr].value[None] for attr in ds.fa}
        out_sa = ds.sa
    else:
        out_sa, out_fa = ds.sa, ds.fa

    return Dataset(ds.samples[tuple(slicer)],
                   sa=out_sa, fa=out_fa, a=ds.a)
def _call(self, dataset):
    """Extract weights from GPR

    The weights are computed from the classifier's training features and
    its ``_alpha``. If the ``variances`` conditional attribute is enabled
    it is filled as well. `dataset` itself is not used.

    Returns
    -------
    Dataset
      The weights as an (at least) 2D samples array.
    """
    clf = self.clf
    train_fv = clf._train_fv

    kernel = clf.kernel
    Sigma_p = 1.0 if isinstance(kernel, LinearKernel) \
        else kernel.params.Sigma_p

    weights = Ndot(Sigma_p, Ndot(train_fv.T, clf._alpha))

    if self.ca.is_enabled('variances'):
        # super ugly formulas that can be quite surely improved:
        L_inv = np.linalg.inv(clf._L)
        Kyinv = Ndot(L_inv.T, L_inv)
        # XXX in such lengthy matrix manipulations you might better off
        #     using np.matrix where * is a matrix product
        projected = Ndot(train_fv.T,
                         Ndot(Kyinv, Ndot(train_fv, Sigma_p)))
        self.ca.variances = Ndiag(Sigma_p - Ndot(Sigma_p, projected))

    return Dataset(np.atleast_2d(weights))
def eep_dataset(samples, targets=None, chunks=None):
    """Create a dataset using an EEP binary file as source.

    EEP files are used by *eeprobe*, a software for analysing event-related
    potentials (ERP), which was developed at the Max-Planck Institute for
    Cognitive Neuroscience in Leipzig, Germany.

      http://www.ant-neuro.com/products/eeprobe

    `samples` is either the filename of an EEP file or an EEPBin instance;
    anything else raises a ValueError. `targets` and `chunks` are handed
    to `Dataset.from_channeltimeseries()` unchanged.
    """
    is_filename = isinstance(samples, str)
    if not is_filename and not isinstance(samples, EEPBin):
        raise ValueError("eep_dataset takes the filename of an "
                         "EEP file or a EEPBin object as 'samples' argument.")

    # open the eep file if only its name was given
    eb = EEPBin(samples) if is_filename else samples

    # init dataset
    ds = Dataset.from_channeltimeseries(eb.data,
                                        targets=targets,
                                        chunks=chunks,
                                        t0=eb.t0,
                                        dt=eb.dt,
                                        channelids=eb.channels)
    return ds
def get_pairwise_accuracies(ds, stat='acc', pairs=None, select=None,
                            space='targets'):
    """Extract pair-wise classification performances as a dataset

    Converts a dataset of classifications for all pairs of classifiers
    (e.g. obtained from raw_predictions_ds of a MulticlassClassifier) into
    a dataset of performances for each pair of stimuli categories. I.e.
    only the pair-wise results where the target label matches one of the
    targets in the pair will contribute to the count.

    Parameters
    ----------
    ds : Dataset
      Its samples and the sample attribute named by `space` are handed to
      `get_pairwise_hits_misses`.
    stat : {'acc', 'hits_misses'}, optional
      Statistic to report: the fraction of hits, or the raw hits and
      misses counts.
    pairs :
      Pairs of targets corresponding to the last dimension in the
      provided dataset
    select : list, optional
      Deal only with those targets listed here, omitting the others
    space : str, optional
      Name of the sample attribute holding the targets; also used to store
      the pairs in the result.

    Returns
    -------
    Dataset
      With the pairs in ``sa[space]`` and the statistic name(s) in
      ``fa.stat``.

    Raises
    ------
    NotImplementedError
      For any other value of `stat`.
    """
    pairs, counts = get_pairwise_hits_misses(
        ds.samples, ds.sa[space].value, pairs=pairs, select=select)
    counts = np.array(counts)

    if stat == 'acc':
        hits = counts[:, 0].astype(float)
        stat_values = hits / counts.sum(axis=1)
        stat_fa = [stat]
    elif stat == 'hits_misses':
        stat_values = counts
        stat_fa = ['hits', 'misses']
    else:
        raise NotImplementedError("%s statistic not there yet" % stat)

    return Dataset(stat_values, sa={space: pairs}, fa={'stat': stat_fa})
def test_bayes_confusion_hyp():
    """Exercise BayesConfusionHypothesis on a small 4-class confusion."""
    from mvpa2.clfs.transerror import BayesConfusionHypothesis
    labels = ['A', 'B', 'C', 'D']
    conf = Dataset(np.array([[10, 0, 5, 5],
                             [0, 10, 5, 5],
                             [5, 5, 10, 0],
                             [5, 5, 0, 10]]),
                   sa={'labels': labels})
    bayes = BayesConfusionHypothesis(labels_attr='labels')
    skip_if_no_external('scipy')        # uses factorial from scipy.misc
    hyptest = bayes(conf)

    # by default comes with all hypothesis and posterior probs
    assert_equal(hyptest.shape, (15, 2))
    assert_array_equal(hyptest.fa.stat, ['log(p(C|H))', 'log(p(H|C))'])

    # check order of hypothesis (coarse)
    all_merged = [['A', 'B', 'C', 'D']]
    all_separate = [['A'], ['B'], ['C'], ['D']]
    assert_array_equal(hyptest.sa.hypothesis[0], all_merged)
    assert_array_equal(hyptest.sa.hypothesis[-1], all_separate)

    # now with limited hypothesis (given with literal labels), set and in
    # non-log scale
    hypotheses = [
        [['A', 'B', 'C', 'D']],
        [['A', 'C'], ['B', 'D']],
        [['A', 'D'], ['B', 'C']],
        [['A'], ['B'], ['C'], ['D']],
    ]
    bayes = BayesConfusionHypothesis(labels_attr='labels',
                                     log=False,
                                     hypotheses=hypotheses)
    hyptest = bayes(conf)

    # also with custom hyp the post-probs must add up to 1
    post_prob = hyptest.samples[:, 1]
    assert_almost_equal(np.sum(post_prob), 1)
    # in this particular case ...
    difference = post_prob[3] - np.sum(post_prob[1:3])
    assert difference < 0.02
def get_bold():
    """Generate a small two-feature dataset with noisy synthetic signals.

    The first feature is the baseline plus a scaled, downsampled model
    time course plus Gaussian noise; the second is baseline plus noise
    only. The downsampled model is stored in the sample attribute
    ``model``.
    """
    # TODO add second model
    hrf_x = np.linspace(0, 25, 250)
    hrf = double_gamma_hrf(hrf_x) - single_gamma_hrf(hrf_x, 0.8, 1, 0.05)

    samples = 1200
    exp_time = np.linspace(0, 120, samples)    # not used below

    # stick function with a 1 at every event onset
    fast_er = np.zeros(samples)
    fast_er[np.array([50, 240, 340, 590, 640, 940, 960])] = 1

    # high-resolution model, trimmed to the length of the experiment
    model_hr = np.convolve(fast_er, hrf)[:samples]

    tr = 2.0
    n_lr = int(samples / tr / 10)
    model_lr = signal.resample(model_hr, n_lr, window='ham')

    ## moderate noise level
    baseline = 800
    # noise is drawn for the signal feature first, then for the noise-only
    # feature
    wsignal = baseline + 8.0 * model_lr + np.random.randn(n_lr) * 4.0
    nsignal = baseline + np.random.randn(n_lr) * 4.0

    return Dataset(samples=np.array([wsignal, nsignal]).T,
                   sa={'model': model_lr})
def _forward_dataset_helper(self, ds):
    """Resample `ds` along the samples axis into a new Dataset.

    Feature and dataset attributes are reused from `ds`. Sample attributes
    are dropped, sub-sampled or resampled, depending on the configured
    attribute strategy; any other strategy raises a ValueError. If a
    position attribute is configured, the new sample positions are stored
    under that name.
    """
    # local bindings
    num = self.__num
    window = self.__window_args
    position_attr = self.__position_attr

    if position_attr is None:
        # we know nothing about samples position
        pos = None
        rsamples = resample(ds.samples, num, t=None, window=window)
    else:
        # we know something about sample position
        pos = ds.sa[position_attr].value
        rsamples, pos = resample(ds.samples, num, t=pos, window=window)

    # new dataset that reuses that feature and dataset attributes of the
    # source
    mds = Dataset(rsamples, fa=ds.fa, a=ds.a)

    # the tricky part is what to do with the samples attributes, since their
    # number has changed
    strategy = self.__attr_strategy
    if strategy == 'remove':
        # nothing to be done
        pass
    elif strategy == 'sample':
        # pick every step-th value, at most `num` of them
        step = int(len(ds) / num)
        mds.sa.update({k: ds.sa[k].value[0::step][:num] for k in ds.sa})
    elif strategy == 'resample':
        # resample the attributes themselves
        sa = {}
        for k in ds.sa:
            v = ds.sa[k].value
            if pos is None:
                sa[k] = resample(v, num, t=None, window=window)
            elif k != position_attr:
                # with positions given, resample() returns a pair; only the
                # resampled values are kept.
                # (position attr will be handled separately at the end)
                sa[k] = resample(v, num, t=pos, window=window)[0]
        # inject them all
        mds.sa.update(sa)
    else:
        raise ValueError("Unkown attribute handling strategy '%s'."
                         % self.__attr_strategy)

    if pos is not None:
        # we got the new sample positions and can store them
        mds.sa[position_attr] = pos
    return mds
def test_resample():
    """Test FFTResampleMapper with position attribute and with chunks."""
    time = np.linspace(0, 2 * np.pi, 100)
    signals = np.vstack((np.sin(time), np.cos(time))).T
    ds = Dataset(signals,
                 sa={'time': time,
                     'section': np.repeat(range(10), 10)})
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    window = ('gauss', 50)
    rm = FFTResampleMapper(num,
                           window=window,
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    mds_partial = rm.forward(ds[0::10])
    # despite different input sampling should yield the same output
    # timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should
    # be very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0, 1], len(ds))
    rm = FFTResampleMapper(num,
                           attr_strategy='sample',
                           chunks_attr='chunks',
                           window=window)
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10), 2))
    # each individual chunks should be identical to previous dataset
    first_chunk, second_chunk = mcds.samples[:10], mcds.samples[10:]
    assert_array_almost_equal(mds.samples, first_chunk)
    assert_array_almost_equal(mds.samples, second_chunk)
def test_random_affine_transformation():
    """Inverting the reported distortion must recover the original data."""
    ds = Dataset.from_wizard(np.random.randn(8, 3, 2))
    ds_d = random_affine_transformation(ds)

    # undo the distortion step by step using the reported parameters:
    # shift, then scale, then rotation
    unshifted = ds_d.samples - ds_d.a.random_shift
    unscaled = unshifted / ds_d.a.random_scale
    restored = np.dot(unscaled, ds_d.a.random_rotation.T)

    assert_array_almost_equal(restored, ds.samples)
def test_linear_kernel(self):
    """Simplistic testing of linear kernel"""
    # 10 identical samples [0, 1, 2, 3, 4]; the dot product of any two of
    # them is 0 + 1 + 4 + 9 + 16 = 30
    row = list(range(5))
    d1 = Dataset(np.asarray([row] * 10, dtype=float))

    lk = npK.LinearKernel()
    lk.compute(d1)

    shape_ok = lk._k.shape == (10, 10)
    self.assertTrue(shape_ok,
                    "Failure computing LinearKernel (Size mismatch)")
    values_ok = (lk._k == 30).all()
    self.assertTrue(values_ok, "Failure computing LinearKernel")
def test_random_affine_transformation():
    """The reported shift, scale and rotation must invert the distortion."""
    original = Dataset.from_wizard(np.random.randn(8, 3, 2))
    distorted = random_affine_transformation(original)

    # compare original to the inverse of the distortion using reported
    # parameters
    params = distorted.a
    normalized = (distorted.samples - params.random_shift) \
        / params.random_scale
    assert_array_almost_equal(
        np.dot(normalized, params.random_rotation.T),
        original.samples)
def test_cached_kernel(self):
    """CachedKernel recomputes only when data or parameters change."""
    nchunks = 5
    n = 50 * nchunks
    d = Dataset(np.random.randn(n, 132))
    d.sa.chunks = np.random.randint(nchunks, size=n)

    # We'll compare against an Rbf just because it has a parameter to change
    rk = npK.RbfKernel(sigma=1.5)

    # Assure two kernels are independent for this test
    ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
    ck.compute(d)  # Initial cache of all data

    self.assertTrue(ck._recomputed,
                    'CachedKernel was not initially computed')

    # Try some splitting
    for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
        rk.compute(chunk)
        ck.compute(chunk)
        self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
        # assertFalse instead of failIf: the latter is a deprecated alias
        # which no longer exists in recent Python versions
        self.assertFalse(ck._recomputed,
                         "CachedKernel incorrectly recomputed it's kernel")

    # Test what happens when a parameter changes
    ck.params.sigma = 3.5
    ck.compute(d)
    self.assertTrue(ck._recomputed,
                    "CachedKernel doesn't recompute on kernel change")
    rk.params.sigma = 3.5
    rk.compute(d)
    self.assertTrue(np.all(rk._k == ck._k),
                    'Cached and rbf kernels disagree after kernel change')

    # Now test handling new data
    d2 = Dataset(np.random.randn(32, 43))
    ck.compute(d2)
    self.assertTrue(
        ck._recomputed,
        "CachedKernel did not automatically recompute new data")
    ck.compute(d)
    self.assertTrue(ck._recomputed,
                    "CachedKernel did not recompute old data which had\n" + \
                    "previously been computed, but had the cache overriden")
def __process_roi(self, ds, roi_feature_id, measure, assure_dataset):
    """Compute `measure` on the ROI around a single center feature.

    Parameters
    ----------
    ds : Dataset
      Dataset the ROI is sliced from (feature-wise).
    roi_feature_id
      Id of the ROI center; used to query the query engine.
    measure : callable
      Called with the ROI dataset.
    assure_dataset : bool
      If True, a result that is not dataset-like is wrapped into a
      Dataset.

    Returns
    -------
    tuple
      ``(res, roi)`` -- the result of the measure and the ROI dataset it
      was computed on.
    """
    # retrieve the feature ids of all features in the ROI from the query
    # engine
    roi_specs = self._queryengine[roi_feature_id]

    if __debug__:
        debug('SLC_',
              'For %r query returned roi_specs %r'
              % (roi_feature_id, roi_specs))

    specs_are_dataset = is_datasetlike(roi_specs)
    if specs_are_dataset:
        # TODO: unittest
        assert (len(roi_specs) == 1)
        roi_fids = roi_specs.samples[0]
    else:
        roi_fids = roi_specs

    # slice the dataset
    roi = ds[:, roi_fids]

    if specs_are_dataset:
        # carry over the feature attributes that came with the ROI specs.
        # .items() (instead of the Python 2 only .iteritems()) --
        # NOTE(review): assumes the `fa` collection is dict-like; confirm
        for n, v in roi_specs.fa.items():
            roi.fa[n] = v

    if self.__add_center_fa:
        # add fa to indicate ROI seed if requested
        roi_seed = np.zeros(roi.nfeatures, dtype='bool')
        if roi_feature_id in roi_fids:
            # go through a list, so that ids which do not come as a list
            # (e.g. taken from the samples of dataset-like specs) work too
            roi_seed[list(roi_fids).index(roi_feature_id)] = True
        else:
            warning("Center feature attribute id %s not found"
                    % roi_feature_id)
        roi.fa[self.__add_center_fa] = roi_seed

    # compute the datameasure and store in results
    res = measure(roi)

    if assure_dataset and not is_datasetlike(res):
        res = Dataset(np.atleast_1d(res))
    if self.ca.is_enabled('roi_feature_ids'):
        # add roi feature ids to intermediate result dataset for later
        # aggregation
        res.a['roi_feature_ids'] = roi_fids
    if self.ca.is_enabled('roi_sizes'):
        res.a['roi_sizes'] = roi.nfeatures
    if self.ca.is_enabled('roi_center_ids'):
        res.a['roi_center_ids'] = roi_feature_id
    return res, roi
def _call(self, dataset):
    """Return a one-element object Dataset holding a dict.

    The dict maps ``'d'`` to a 5x5 array filled with the sum of `dataset`.
    """
    # For performance measures -- increase to 50-200
    # np.sum here is just to get some meaningful value in
    # them
    #return np.ones(shape=(2, 2))*np.sum(dataset)
    payload = {'d': np.ones(shape=(5, 5)) * np.sum(dataset)}
    return Dataset(np.array([payload], dtype=object))
def _call(self, ds):
    """Apply ``self.fx`` to the samples of `ds` and its targets.

    The targets are taken from the sample attribute named by
    ``self.space``. They are converted to numeric values if
    ``self.numeric`` is set, or if it is None and the targets have a
    byte-string dtype. With ``self.uni`` set, the function is called once
    per feature, otherwise once with all samples.

    Returns
    -------
    Dataset
      The function output with a leading axis added, reusing the feature
      attributes of `ds`.
    """
    y = ds.sa[self.space].value

    convert = self.numeric or (self.numeric is None
                               and y.dtype.char == 'S')
    if convert:
        y = AttributeMap().to_numeric(y)

    # TODO:
    if self.uni:
        # one call per feature (column of the samples array)
        out = np.array([self.fx(feat, y) for feat in ds.samples.T])
    else:
        out = self.fx(ds.samples, y)

    return Dataset(out[None], fa=ds.fa)
def _fill_in_scattered_results(sl, dataset, roi_ids, results):
    """Average ROI results over all features they were computed from.

    This requires the searchlight conditional attribute 'roi_feature_ids'
    to be enabled.

    Every result is added onto all features listed in its
    ``roi_feature_ids``; the sums are then divided by the number of
    results each feature received. That number is stored in the feature
    attribute ``observations``. If the results carry a ``null_prob``
    feature attribute it is averaged the same way. `sl` and `roi_ids` are
    not used.
    """
    import numpy as np
    from mvpa2.datasets import Dataset

    resmap = None
    probmap = None
    for resblock in results:
        for res in resblock:
            if resmap is None:
                # first result: prepare the containers
                nfeatures = dataset.nfeatures
                dtype = res.samples.dtype
                resmap = np.zeros((len(res), nfeatures), dtype=dtype)
                if 'null_prob' in res.fa:
                    # initialize the prob map also with zeroes, as p=0 can
                    # never happen as an empirical result
                    prob_shape = (nfeatures,) + res.fa.null_prob.shape[1:]
                    probmap = np.zeros(prob_shape, dtype=dtype)

                observ_counter = np.zeros(nfeatures, dtype=int)

            # project the result onto all features -- love broadcasting!
            #print "averaging"
            fids = res.a.roi_feature_ids
            resmap[:, fids] += res.samples
            if probmap is not None:
                probmap[fids] += res.fa.null_prob
            # increment observation counter for all relevant features
            observ_counter[fids] += 1

    # when all results have been added up average them according to the
    # number of observations
    observ_mask = observ_counter > 0
    n_observed = observ_counter[observ_mask]
    resmap[:, observ_mask] /= n_observed
    result_ds = Dataset(resmap, fa={'observations': observ_counter})

    if probmap is not None:
        # transpose to make broadcasting work -- creates a view, so in-place
        # modification still does the job
        probmap.T[:, observ_mask] /= n_observed
        result_ds.fa['null_prob'] = probmap.squeeze()

    if 'mapper' in dataset.a:
        import copy
        result_ds.a['mapper'] = copy.copy(dataset.a.mapper)
    return result_ds
def _fill_in_scattered_results(sl, dataset, roi_ids, results):
    """Scatter ROI results back onto features and average them.

    This requires the searchlight conditional attribute 'roi_feature_ids'
    to be enabled.

    The samples of every result are accumulated on the features named in
    its ``roi_feature_ids`` and finally divided by the per-feature number
    of contributions (stored as feature attribute ``observations``). A
    ``null_prob`` feature attribute of the results, if present, is handled
    alike. The arguments `sl` and `roi_ids` are accepted but not used.
    """
    import numpy as np
    from mvpa2.datasets import Dataset

    resmap = None
    probmap = None
    for resblock in results:
        for res in resblock:
            if resmap is None:
                # prepare the result container
                resmap = np.zeros((len(res), dataset.nfeatures),
                                  dtype=res.samples.dtype)
                if 'null_prob' in res.fa:
                    # initialize the prob map also with zeroes, as p=0 can
                    # never happen as an empirical result
                    trailing_dims = res.fa.null_prob.shape[1:]
                    probmap = np.zeros(
                        (dataset.nfeatures,) + trailing_dims,
                        dtype=res.samples.dtype)

                observ_counter = np.zeros(dataset.nfeatures, dtype=int)

            target_features = res.a.roi_feature_ids
            # project the result onto all features -- love broadcasting!
            resmap[:, target_features] += res.samples
            if probmap is not None:
                probmap[target_features] += res.fa.null_prob
            # increment observation counter for all relevant features
            observ_counter[target_features] += 1

    # when all results have been added up average them according to the
    # number of observations
    observ_mask = observ_counter > 0
    resmap[:, observ_mask] /= observ_counter[observ_mask]

    feature_attrs = {'observations': observ_counter}
    result_ds = Dataset(resmap, fa=feature_attrs)

    if probmap is not None:
        # transpose to make broadcasting work -- creates a view, so in-place
        # modification still does the job
        probmap.T[:, observ_mask] /= observ_counter[observ_mask]
        result_ds.fa['null_prob'] = probmap.squeeze()

    if 'mapper' in dataset.a:
        import copy
        result_ds.a['mapper'] = copy.copy(dataset.a.mapper)
    return result_ds
def test_1d_multispace_searchlight(self):
    """Searchlights spanning two 1-D coordinate spaces pick the right ROIs."""
    ds = Dataset([np.arange(6)])
    ds.fa['coord1'] = np.repeat(np.arange(3), 2)
    # add a second space to the dataset
    ds.fa['coord2'] = np.tile(np.arange(2), 3)

    def measure(roi):
        # describe an ROI by the '+'-joined values of its first sample
        return "+".join([str(value) for value in roi.samples[0]])

    # (radius in coord1, radius in coord2, expected per-feature ROI labels)
    cases = (
        # simply select each feature once
        (0, 0, [['0', '1', '2', '3', '4', '5']]),
        (0, 1, [['0+1', '0+1', '2+3', '2+3', '4+5', '4+5']]),
        (1, 0, [['0+2', '1+3', '0+2+4', '1+3+5', '2+4', '3+5']]),
    )
    for radius1, radius2, expected in cases:
        qe = IndexQueryEngine(coord1=Sphere(radius1),
                              coord2=Sphere(radius2))
        res = Searchlight(measure, qe, nproc=1)(ds)
        assert_array_equal(res.samples, expected)
def resting_dmn(sub, ses, in_file=None,
                lh_ctx_file=None, rh_ctx_file=None, sc_file=None,
                schedule_file=None):
    """Seed-based connectivity of the resting-state scan of one session.

    The scan whose filename contains 'RESTING' is loaded, slow drifts are
    removed by subtracting a Gaussian-smoothed copy of the time series, and
    a GLM with the mean seed-ROI time course plus a constant is fitted to
    all features.

    Parameters
    ----------
    sub : int
      Subject number (used for the aparc lookup and the output filename).
    ses : int
      Session number; it must be present in the second column of the
      schedule, otherwise ``ValueError`` is raised.
    in_file : list of str or None
      Time series files. If None, the ``lh_ctx_file``/``rh_ctx_file``/
      ``sc_file`` triplets are used instead.
    lh_ctx_file, rh_ctx_file, sc_file : list of str or None
      Parallel file lists, passed to ``wb_to_tss`` when ``in_file`` is None.
    schedule_file : str
      Text schedule read with ``np.loadtxt`` (first row skipped).

    Returns
    -------
    out_file : str
      Absolute path of the ``.npz`` file holding ``contrast`` and
      ``mean_roi_ts``.
    inf : str
      The input file that was selected as the resting scan.

    Raises
    ------
    ValueError
      If ``ses`` is not listed in the schedule.
    IndexError
      If no filename contains 'RESTING'.
    """
    # Imports are function-local in the original; the module itself is
    # needed as well, because `pipe_hbn_ssi.proc_dir` is referenced below.
    import pipe_hbn_ssi
    from pipe_hbn_ssi import wb_to_tss
    import os
    import sys
    import numpy as np
    core_path = '/home/bpinsard/data/projects/CoRe'
    if core_path not in sys.path:
        # avoid growing sys.path on every call
        sys.path.append(core_path)
    import core.mvpa.dataset as cds
    from nipy.modalities.fmri.glm import GeneralLinearModel
    import scipy.ndimage
    from mvpa2.datasets import Dataset

    sched = np.loadtxt(
        schedule_file,
        converters={0: int, 1: int, 2: str, 3: int, 4: str, 5: str},
        # `np.object` was only an alias of the builtin and has been removed
        # from NumPy
        dtype=object,
        skiprows=1)
    # The index itself is not used further; the lookup is kept because it
    # raises ValueError for a session that is missing from the schedule.
    sched[:, 1].tolist().index(ses)

    candidates = lh_ctx_file if in_file is None else in_file
    resting = [i for i, n in enumerate(candidates) if 'RESTING' in n]
    if not resting:
        raise IndexError(
            "no file with 'RESTING' in its name for sub %s ses %s"
            % (sub, ses))
    scan_no = resting[0]

    inf = candidates[scan_no]
    print(inf)
    if in_file is None:
        ds = Dataset(wb_to_tss(lh_ctx_file[scan_no],
                               rh_ctx_file[scan_no],
                               sc_file[scan_no]))
    else:
        ds = cds.ds_from_ts(in_file[scan_no])
    # remove slow drifts: subtract a temporally smoothed copy of the data
    ds.samples -= scipy.ndimage.gaussian_filter1d(
        ds.samples, sigma=8, axis=0, truncate=2)

    seed_roi = 9
    cds.add_aparc_ba_fa(
        ds, sub,
        pproc_tpl=os.path.join(pipe_hbn_ssi.proc_dir, 'moco_multiband',
                               'surface_32k', '_sub_%d'))
    # NOTE(review): the 11100/12100 offsets presumably select the seed
    # label in the left/right hemisphere -- confirm against the atlas.
    roi_mask = np.logical_or(ds.fa.aparc == seed_roi + 11100,
                             ds.fa.aparc == seed_roi + 12100)
    mean_roi_ts = ds.samples[:, roi_mask].mean(1)
    mean_roi_ts -= mean_roi_ts.mean()

    # design matrix: demeaned seed time course + constant
    mtx = np.asarray([mean_roi_ts, np.ones(ds.nsamples)]).T
    glm = GeneralLinearModel(mtx)
    glm.fit(ds.samples, model='ols')
    contrast = glm.contrast([1, 0], contrast_type='t')

    out_file = os.path.abspath(
        'sub%d_ses%d_connectivity_results.npz' % (sub, ses))
    np.savez_compressed(out_file, contrast=contrast,
                        mean_roi_ts=mean_roi_ts)
    return out_file, inf
def _call(self, dataset):
    """Computes featurewise I-RELIEF weights.

    The weight vector ``self.w`` is re-estimated iteratively until the
    summed absolute change between two iterations drops below
    ``self.threshold``.

    Parameters
    ----------
    dataset
      Provides ``samples`` (first axis: samples, second axis: features)
      and ``targets``.

    Returns
    -------
    Dataset
      A single-sample dataset holding the final weight vector.
    """
    samples = dataset.samples
    NS, NF = samples.shape[:2]
    # NOTE(review): when self.w_guess is not None it is never copied into
    # self.w here, so a previously stored self.w (if any) is used instead
    # -- confirm whether the guess should initialize the weights.
    if self.w_guess is None:
        self.w = np.ones(NF, 'd')
    # do normalization in all cases to be safe :)
    self.w = self.w / (self.w**2).sum()
    # per-sample index sets; presumably M = "misses" and H = "hits" as in
    # RELIEF -- see compute_M_H for the actual definition
    M, H = self.compute_M_H(dataset.targets)
    while True:
        # kernel length scale is feature-wise, inversely scaled by the
        # current weights
        self.k = self.kernel(length_scale=self.kernel_width / self.w)
        d_w_k = self.k.computed(samples).as_raw_np()
        # set d_w_k to zero where distance=0 (i.e. kernel ==
        # 1.0), otherwise I-RELIEF could not converge.
        # XXX Note that kernel==1 for distance=0 only for
        # exponential kernels!! IMPROVE
        d_w_k[np.abs(d_w_k - 1.0) < 1.0e-15] = 0.0
        ni = np.zeros(NF, 'd')
        for n in range(NS):
            # d_w_k[n,n] could be omitted since == 0.0
            # nan_to_num guards the 0/0 cases of the ratios below
            gamma_n = 1.0 - np.nan_to_num(d_w_k[n, M[n]].sum() \
                / (d_w_k[n, :].sum()-d_w_k[n, n]))
            # kernel values normalized to sum to one within M[n] / H[n]
            alpha_n = np.nan_to_num(d_w_k[n, M[n]] / (d_w_k[n, M[n]].sum()))
            beta_n = np.nan_to_num(d_w_k[n, H[n]] / (d_w_k[n, H[n]].sum()))
            # weighted mean absolute feature-wise difference to M[n] / H[n]
            m_n = (np.abs(samples[n, :] - samples[M[n], :]) \
                * alpha_n[:, None]).sum(0)
            h_n = (np.abs(samples[n, :] - samples[H[n], :]) \
                * beta_n[:, None]).sum(0)
            ni += gamma_n * (m_n - h_n)
        ni = ni / NS
        ni_plus = np.clip(ni, 0.0, np.inf)  # set all negative elements to zero
        # rescale to unit Euclidean norm (all-zero vectors yield zeros via
        # nan_to_num)
        w_new = np.nan_to_num(ni_plus / (np.sqrt((ni_plus**2).sum())))
        change = np.abs(w_new - self.w).sum()
        if __debug__ and 'IRELIEF' in debug.active:
            debug(
                'IRELIEF',
                "change=%.4f max=%f min=%.4f mean=%.4f std=%.4f #nan=%d" %
                (change, w_new.max(), w_new.min(), w_new.mean(),
                 w_new.std(), np.isnan(w_new).sum()))
        # update weights:
        self.w = w_new
        if change < self.threshold:
            break
    return Dataset(self.w[np.newaxis])
def test_resample():
    """FFTResampleMapper downsamples plain and chunked datasets alike."""
    time = np.linspace(0, 2*np.pi, 100)
    waves = np.vstack((np.sin(time), np.cos(time))).T
    sample_attrs = {'time': time,
                    'section': np.repeat(range(10), 10)}
    ds = Dataset(waves, sa=sample_attrs)
    assert_equal(ds.shape, (100, 2))

    # downsample
    num = 10
    rm = FFTResampleMapper(num,
                           window=('gauss', 50),
                           position_attr='time',
                           attr_strategy='sample')
    mds = rm.forward(ds)
    assert_equal(mds.shape, (num, ds.nfeatures))
    # didn't change the orig
    assert_equal(len(ds), 100)

    # check position-based resampling
    ds_partial = ds[0::10]
    mds_partial = rm.forward(ds_partial)
    # despite different input sampling should yield the same output
    # timepoints
    assert_array_almost_equal(mds.sa.time, mds_partial.sa.time)
    # exclude the first points to prevent edge effects, but the data should
    # be very similar too
    assert_array_almost_equal(mds.samples[2:], mds_partial.samples[2:],
                              decimal=2)
    # simple sample of sa's should give meaningful stuff
    assert_array_equal(mds.sa.section, range(10))

    # and now for a dataset with chunks
    cds = vstack([ds.copy(), ds.copy()])
    cds.sa['chunks'] = np.repeat([0,1], len(ds))
    rm = FFTResampleMapper(num,
                           attr_strategy='sample',
                           chunks_attr='chunks',
                           window=('gauss', 50))
    mcds = rm.forward(cds)
    assert_equal(mcds.shape, (20, 2))
    assert_array_equal(mcds.sa.section, np.tile(range(10),2))
    # each individual chunks should be identical to previous dataset
    first_chunk = mcds.samples[:10]
    second_chunk = mcds.samples[10:]
    assert_array_almost_equal(mds.samples, first_chunk)
    assert_array_almost_equal(mds.samples, second_chunk)
def intersubject_correlation(dss, reference_ds=0):
    """
    Computes voxelwise inter-subject time series correlation
    in a pairwise fashion for a list of Datasets. Datasets
    must all be the same shape. Resulting dataset of pairwise
    correlations will inherit Dataset attributes from
    reference data set [Default: first data set in list].

    Parameters
    ----------
    dss : list of Dataset or array-like
      One entry per subject; anything that is not a Dataset is wrapped
      in one.
    reference_ds : int
      Index of the entry whose shape all others must match and whose
      feature and dataset attributes are copied to the output.

    Returns
    -------
    Dataset
      One sample per subject pair (in ``combinations`` order, recorded in
      the ``pairs`` sample attribute) and one feature per input feature.
    """
    # Wrap plain arrays; isinstance also lets Dataset subclasses through
    # untouched
    dss = [ds if isinstance(ds, Dataset) else Dataset(ds) for ds in dss]
    reference = dss[reference_ds]
    ds_shape = reference.shape
    n_features = ds_shape[1]

    for ds in dss:
        assert ds.shape == ds_shape

    # Compute time series correlation per feature per subject pair.
    # Iterating the transposed samples walks the features directly, which
    # works on both Python 2 and 3 (the former `xrange` loop does not).
    correlations = [
        [pearson_correlation(ts1, ts2)
         for ts1, ts2 in zip(ds1.samples.T, ds2.samples.T)]
        for ds1, ds2 in combinations(dss, 2)]

    # Resulting correlation map inherits attributes of reference data set
    correlations_ds = Dataset(correlations,
                              fa=reference.fa,
                              a=reference.a)
    correlations_ds.sa['pairs'] = list(combinations(range(len(dss)), 2))

    n_subjects = len(dss)
    assert correlations_ds.shape[0] == n_subjects * (n_subjects - 1) // 2
    assert correlations_ds.shape[1] == n_features

    return correlations_ds
def _proc_block(self, block, ds, measure, seed=None, iblock='main'):
    """Little helper to capture the parts of the computation that can be
    parallelized

    Parameters
    ----------
    block
      Sequence of ROI center ids; the query engine is asked for one ROI
      per element.
    ds
      Dataset that is sliced feature-wise for every ROI.
    measure
      Callable applied to every ROI dataset.
    seed
      RNG seed. Should be provided e.g. in child process invocations
      to guarantee that they all seed differently to not keep generating
      the same sequences due to reusing the same copy of numpy's RNG
    iblock
      Critical for generating non-colliding temp filenames in case
      of hdf5 backend. Otherwise RNGs of different processes might
      collide in their temporary file names leading to problems.

    Returns
    -------
    The list of per-ROI results (after optional post-processing) for the
    'native' backend, or the name of the HDF5 file they were stored in
    for the 'hdf5' backend.
    """
    if seed is not None:
        mvpa2.seed(seed)
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))
    results = []
    store_roi_feature_ids = self.ca.is_enabled('roi_feature_ids')
    store_roi_sizes = self.ca.is_enabled('roi_sizes')
    store_roi_center_ids = self.ca.is_enabled('roi_center_ids')
    # attributes can only be attached to datasets, so plain measure output
    # has to be wrapped if any of them is requested
    assure_dataset = any([store_roi_feature_ids,
                          store_roi_sizes,
                          store_roi_center_ids])
    # put rois around all features in the dataset and compute the
    # measure within them
    for i, f in enumerate(block):
        # retrieve the feature ids of all features in the ROI from the query
        # engine
        roi_specs = self._queryengine[f]
        if __debug__ and debug_slc_:
            debug('SLC_', 'For %r query returned roi_specs %r'
                  % (f, roi_specs))
        if is_datasetlike(roi_specs):
            # TODO: unittest
            assert(len(roi_specs) == 1)
            roi_fids = roi_specs.samples[0]
        else:
            roi_fids = roi_specs
        # slice the dataset
        roi = ds[:, roi_fids]
        if is_datasetlike(roi_specs):
            # carry over the feature attributes the query engine provided
            # NOTE(review): iteritems() exists only on Python 2 mappings
            for n, v in roi_specs.fa.iteritems():
                roi.fa[n] = v
        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            if f in roi_fids:
                # NOTE(review): .index() requires a list-like roi_fids;
                # in the dataset-like branch above roi_fids is a row of
                # `samples` -- confirm it supports .index()
                roi_seed[roi_fids.index(f)] = True
            else:
                warning("Center feature attribute id %s not found" % f)
            roi.fa[self.__add_center_fa] = roi_seed
        # compute the datameasure and store in results
        res = measure(roi)
        if assure_dataset and not is_datasetlike(res):
            res = Dataset(np.atleast_1d(res))
        if store_roi_feature_ids:
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        if store_roi_sizes:
            res.a['roi_sizes'] = roi.nfeatures
        if store_roi_center_ids:
            res.a['roi_center_ids'] = f
        results.append(res)
        if __debug__:
            debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]" \
                % (len(block),
                   f + 1,
                   roi.nfeatures,
                   float(i + 1) / len(block) * 100,), cr=True)
    if self.results_postproc_fx:
        if __debug__:
            debug('SLC', "Post-processing %d results in proc_block using %s"
                  % (len(results), self.results_postproc_fx))
        results = self.results_postproc_fx(results)
    if self.results_backend == 'native':
        pass  # nothing special
    elif self.results_backend == 'hdf5':
        # store results in a temporary file and return a filename
        # NOTE(review): tempfile.mktemp only picks a name and does not
        # create the file, hence the reliance on distinct seeds/iblock
        results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                       suffix='-%s.hdf5' % iblock)
        if __debug__:
            debug('SLC', "Storing results into %s" % results_file)
        h5save(results_file, results)
        if __debug__:
            debug('SLC_', "Results stored")
        results = results_file
    else:
        raise RuntimeError("Must not reach this point")
    return results
def test_surf_queryengine(self, qefn):
    """SurfaceQueryEngine: construction, HDF5 round-trips, and queries.

    ``qefn`` is supplied by the caller but not read: every HDF5
    round-trip writes to its own temporary file, which is deleted again
    right after loading.
    """
    s = surf.generate_plane((0, 0, 0), (0, 1, 0), (0, 0, 1), 4, 5)
    # add second layer
    s2 = surf.merge(s, (s + (.01, 0, 0)))
    ds = Dataset(samples=np.arange(20)[np.newaxis],
                 fa=dict(node_indices=np.arange(39, 0, -2)))
    # add more features (with shared node indices)
    ds3 = hstack((ds, ds, ds))
    radius = 2.5
    has_h5py = externals.exists('h5py')

    def h5_roundtrip(obj):
        # mkstemp leaves deleting the file to the caller; doing it in
        # `finally` keeps failed runs from littering the temp directory
        fd, fname = tempfile.mkstemp('qe.hdf5', 'test')
        os.close(fd)
        try:
            h5save(fname, obj)
            return h5load(fname)
        finally:
            os.remove(fname)

    # Note: sweepargs is not used to avoid re-generating the same
    # surface and dataset multiple times.
    for distance_metric in ('euclidean', 'dijkstra', '<illegal>', None):
        builder = lambda: queryengine.SurfaceQueryEngine(s2, radius,
                                                         distance_metric)
        if distance_metric in ('<illegal>', None):
            assert_raises(ValueError, builder)
            continue

        qe = builder()

        # test i/o and ensure that the untrained instance is not trained
        if has_h5py:
            qe = h5_roundtrip(qe)

        # untrained qe should give errors
        assert_raises(ValueError, lambda: qe.ids)
        assert_raises(ValueError, lambda: qe.query_byid(0))

        # node index out of bounds should give error
        ds_ = ds.copy()
        ds_.fa.node_indices[0] = 100
        assert_raises(ValueError, lambda: qe.train(ds_))

        # lack of node indices should give error
        ds_.fa.pop('node_indices')
        assert_raises(ValueError, lambda: qe.train(ds_))

        # train the qe
        qe.train(ds3)

        # test i/o and ensure that the loaded instance is trained
        if has_h5py:
            qe = h5_roundtrip(qe)

        for node in np.arange(-1, s2.nvertices + 1):
            if node < 0 or node >= s2.nvertices:
                assert_raises(KeyError, lambda: qe.query_byid(node))
                continue

            feature_ids = np.asarray(qe.query_byid(node))

            # node indices relative to ds
            base_ids = feature_ids[feature_ids < 20]

            # should have multiples of 20
            assert_equal(set(feature_ids),
                         set((base_ids[np.newaxis].T +
                              [0, 20, 40]).ravel()))

            node_indices = list(s2.circlearound_n2d(
                node, radius, distance_metric or 'dijkstra'))

            # `inode` rather than `node`: a Python 2 list comprehension
            # would otherwise overwrite the loop variable
            fa_indices = [fa_index for fa_index, inode
                          in enumerate(ds3.fa.node_indices)
                          if inode in node_indices]
            assert_equal(set(feature_ids), set(fa_indices))

        # smoke tests
        assert_true('SurfaceQueryEngine' in '%s' % qe)
        assert_true('SurfaceQueryEngine' in '%r' % qe)
def _proc_block(self, block, ds, measure, iblock='main'):
    """Run the measure on the ROIs of one block of center ids.

    This is the part of the computation that can be parallelized.

    Parameters
    ----------
    block
      Sequence of ROI center ids to process.
    ds
      Dataset that gets sliced feature-wise for each ROI.
    measure
      Callable applied to each ROI dataset.
    iblock
      Critical for generating non-colliding temp filenames in case
      of hdf5 backend. Otherwise RNGs of different processes might
      collide in their temporary file names leading to problems.

    Returns
    -------
    tuple
      ``(results, roi_sizes)``: the result list (or the HDF5 filename
      holding it) and the list of ROI sizes, which is None unless the
      'roi_sizes' conditional attribute is enabled.
    """
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))
    roi_sizes = [] if self.ca.is_enabled('roi_sizes') else None
    results = []
    # build an ROI around every center in the block and evaluate the
    # measure on it
    for pos, center in enumerate(block):
        # ask the query engine which features make up this ROI
        roi_fids = self._queryengine[center]
        if __debug__ and debug_slc_:
            debug('SLC_',
                  'For %r query returned ids %r' % (center, roi_fids))
        # restrict the dataset to the ROI features
        roi = ds[:, roi_fids]
        if self.__add_center_fa:
            # flag the seed feature of the ROI if requested
            seed_flags = np.zeros(roi.nfeatures, dtype='bool')
            seed_flags[roi_fids.index(center)] = True
            roi.fa[self.__add_center_fa] = seed_flags
        # evaluate the measure on the ROI
        res = measure(roi)
        if self.ca.is_enabled('roi_feature_ids'):
            if not is_datasetlike(res):
                res = Dataset(np.atleast_1d(res))
            # keep the ROI feature ids with the intermediate result for
            # later aggregation
            res.a['roi_feature_ids'] = roi_fids
        results.append(res)
        # remember how large this ROI was
        if roi_sizes is not None:
            roi_sizes.append(roi.nfeatures)
        if __debug__:
            progress = float(pos + 1) / len(block) * 100
            debug('SLC',
                  "Doing %i ROIs: %i (%i features) [%i%%]"
                  % (len(block), center + 1, roi.nfeatures, progress,),
                  cr=True)

    backend = self.results_backend
    if backend == 'hdf5':
        # dump the results into a temporary file and hand back its name
        results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                       suffix='-%s.hdf5' % iblock)
        if __debug__:
            debug('SLC', "Storing results into %s" % results_file)
        h5save(results_file, results)
        if __debug__:
            debug('SLC_', "Results stored")
        results = results_file
    elif backend != 'native':
        # 'native' needs nothing special; anything else is a bug
        raise RuntimeError("Must not reach this point")
    return results, roi_sizes
def movie_dataset(
        subj, preproc=None,
        base_path=os.curdir,
        fname_tmpl='sub-%(subj)s/ses-movie/func/sub-%(subj)s_ses-movie_task-movie_run-%(run)i_recording-eyegaze_physio.tsv.gz'):
    """
    Load eyegaze recordings from all runs and merge into a consecutive
    timeseries

    When merging, intersegment-overlap is removed.

    Parameters
    ----------
    subj : str
      Subject code.
    preproc : callable or None
      Callable to preprocess a record array of the raw timeseries. The
      record array has the fields 'x', 'y', 'pupil', and 'movie_frame'.
      It needs to return a record array with the same fields and must not
      change the sampling rate or number of samples.
    base_path : path
      Base directory for input file discovery.
    fname_tmpl : str
      Template expression to match input files. Support dict expansion with
      'subj' and 'run' keys.

    Returns
    -------
    Dataset
      Samples are the (x, y, pupil) columns, named in ``fa.name``. Sample
      attributes: ``movie_frame`` (frame ID within the entire unsegmented
      movie), ``movie_run_frame`` (frame ID as recorded within the run) and
      ``movie_run`` (1-based run number). Dataset attributes:
      ``sampling_rate`` (eyegaze sampling rate in Hz) and ``movie_fps``.
    """
    # in frames (hand-verified by re-assembling in kdenlive -- using MELT
    # underneath)
    seg_offsets = (0, 22150, 43802, 65304, 89305, 112007, 133559, 160261)
    movie_fps = 25.0
    eyegaze_sr = 1000.0  # Hz
    intersegment_overlap = 400  # frames
    # number of eyegaze samples spanned by the overlap (loop-invariant)
    overlap_samples = int(intersegment_overlap / movie_fps * eyegaze_sr)
    # derived from seg_offsets instead of a hard-coded index, so that both
    # cannot get out of sync
    last_seg = len(seg_offsets) - 1

    segments = []
    for seg, offset in enumerate(seg_offsets):
        # NOTE(review): np.recfromcsv is deprecated as of NumPy 2.0; a
        # replacement should be verified against real recordings first.
        raw = np.recfromcsv(
            os.path.join(base_path, fname_tmpl % dict(subj=subj, run=seg + 1)),
            delimiter='\t',
            names=('x', 'y', 'pupil', 'movie_frame'))
        if preproc is not None:
            raw = preproc(raw)
        # glue together to form a dataset
        ds = Dataset(np.array((raw.x, raw.y, raw.pupil)).T,
                     sa=dict(movie_frame=raw.movie_frame))
        ds.sa['movie_run_frame'] = ds.sa.movie_frame.copy()
        # turn into movie frame ID for the entire unsegmented movie
        ds.sa.movie_frame += offset
        # truncate segment time series to remove overlap
        if seg < last_seg:
            # cut the end in a safe distance to the actual end, but inside
            # the overlap
            ds = ds[:-overlap_samples]
        if seg > 0:
            # cut the beginning to have a seamless start after the previous
            # segment
            ds = ds[ds.sa.movie_frame > segments[-1].sa.movie_frame.max()]
        ds.sa['movie_run'] = [seg + 1] * len(ds)
        segments.append(ds)
    ds = vstack(segments)
    # column names
    ds.fa['name'] = ('x', 'y', 'pupil')
    ds.a['sampling_rate'] = eyegaze_sr
    ds.a['movie_fps'] = movie_fps
    return ds
def test_datasetmapping():
    """BoxcarMapper forward/reverse keeps samples and attributes aligned."""
    # 6 samples, 4X2 features
    data = np.arange(48).reshape(6,4,2)
    sample_attrs = {'timepoints': np.arange(6),
                    'multidim': data.copy()}
    ds = Dataset(data, sa=sample_attrs, fa={'fid': np.arange(4)})

    # with overlapping and non-overlapping boxcars
    startpoints = [0, 1, 4]
    boxlength = 2
    n_boxes = len(startpoints)
    bm = BoxcarMapper(startpoints, boxlength, space='boxy')
    # train is critical
    bm.train(ds)
    boxed = bm.forward(ds)
    assert_equal(len(boxed), n_boxes)
    assert_equal(boxed.nfeatures, boxlength)

    # all samples attributes remain, but they can be rotated/compressed
    # into multidimensional attributes
    assert_equal(sorted(boxed.sa.keys()),
                 ['boxy_onsetidx'] + sorted(ds.sa.keys()))
    assert_equal(boxed.sa.multidim.shape,
                 (n_boxes, boxlength) + ds.shape[1:])
    assert_equal(boxed.sa.timepoints.shape, (n_boxes, boxlength))
    expected_timepoints = np.array([(s, s+1) for s in startpoints])
    assert_array_equal(boxed.sa.timepoints.flatten(),
                       expected_timepoints.flatten())
    assert_array_equal(boxed.sa.boxy_onsetidx, startpoints)
    # feature attributes also get rotated and broadcasted
    assert_array_equal(boxed.fa.fid, [ds.fa.fid, ds.fa.fid])
    # and finally there is a new one
    assert_array_equal(boxed.fa.boxy_offsetidx, range(boxlength))

    # now see how it works on reverse()
    restored = bm.reverse(boxed)
    # we got at least something of all original attributes back
    assert_equal(sorted(restored.sa.keys()), sorted(ds.sa.keys()))
    assert_equal(sorted(restored.fa.keys()), sorted(ds.fa.keys()))
    # it is not possible to reconstruct the full samples array
    # some samples even might show up multiple times (when there are
    # overlapping boxcars)
    expected_samples = np.array(
        [[[ 0, 1], [ 2, 3], [ 4, 5], [ 6, 7]],
         [[ 8, 9], [10, 11], [12, 13], [14, 15]],
         [[ 8, 9], [10, 11], [12, 13], [14, 15]],
         [[16, 17], [18, 19], [20, 21], [22, 23]],
         [[32, 33], [34, 35], [36, 37], [38, 39]],
         [[40, 41], [42, 43], [44, 45], [46, 47]]])
    assert_array_equal(restored.samples, expected_samples)
    assert_array_equal(restored.sa.timepoints, [0, 1, 1, 2, 4, 5])
    assert_array_equal(restored.sa.multidim,
                       ds.sa.multidim[restored.sa.timepoints])
    # but feature attributes should be fully recovered
    assert_array_equal(restored.fa.fid, ds.fa.fid)

    # popular dataset configuration (double flatten + boxcar)
    chain = ChainMapper([FlattenMapper(), bm, FlattenMapper()])
    chain.train(ds)
    bflat = ds.get_mapped(chain)
    assert_equal(bflat.shape,
                 (n_boxes, boxlength * np.prod(ds.shape[1:])))
    # add attributes
    bflat.fa['testfa'] = np.arange(bflat.nfeatures)
    bflat.sa['testsa'] = np.arange(bflat.nsamples)

    # now try to go back
    bflatrev = bflat.mapper.reverse(bflat)
    # data should be same again, as far as the boxcars match
    assert_array_equal(ds.samples[:2], bflatrev.samples[:2])
    assert_array_equal(ds.samples[-2:], bflatrev.samples[-2:])
    # feature axis should match
    assert_equal(ds.shape[1:], bflatrev.shape[1:])
def test_polydetrend():
    """PolyDetrendMapper: whole-dataset, chunk-wise and time-aware detrending."""
    samples_forwhole = np.array([[1.0, 2, 3, 4, 5, 6],
                                 [-2.0, -4, -6, -8, -10, -12]], ndmin=2).T

    samples_forchunks = np.array([[1.0, 2, 3, 3, 2, 1],
                                  [-2.0, -4, -6, -6, -4, -2]], ndmin=2).T

    chunks = [0, 0, 0, 1, 1, 1]

    ds = Dataset(samples_forwhole)

    # this one will auto-train the mapper on first use
    dm = PolyDetrendMapper(polyord=1, space='police')
    mds = dm.forward(ds)

    # features are linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials
    assert_array_equal(mds.sa.police, np.arange(len(ds)))

    # hackish way to get the previous regressors into a dataset
    ds.sa['opt_reg_const'] = dm._regs[:, 0]
    ds.sa['opt_reg_lin'] = dm._regs[:, 1]
    # using these precomputed regressors, we should get the same result as
    # before even if we do not generate a regressor for linear
    dm_optreg = PolyDetrendMapper(polyord=0,
                                  opt_regs=['opt_reg_const', 'opt_reg_lin'])
    mds_optreg = dm_optreg.forward(ds)
    assert_array_almost_equal(mds_optreg, np.zeros(mds.shape))

    ds = Dataset(samples_forchunks)
    # 'constant' detrending removes the mean
    mds = PolyDetrendMapper(polyord=0).forward(ds)
    assert_array_almost_equal(
        mds.samples,
        samples_forchunks - np.mean(samples_forchunks, axis=0))
    # if there is no GLOBAL linear trend it should be identical to mean
    # removal even if trying to remove linear
    mds2 = PolyDetrendMapper(polyord=1).forward(ds)
    assert_array_almost_equal(mds, mds2)

    # chunk-wise detrending
    ds = dataset_wizard(samples_forchunks, chunks=chunks)
    dm = PolyDetrendMapper(chunks_attr='chunks', polyord=1, space='police')
    mds = dm.forward(ds)
    # features are chunkswise linear trends, so detrending should remove all
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))
    # we get the information where each sample is assumed to be in the
    # space spanned by the polynomials, which is the identical linspace in
    # both chunks
    # (list(...) is required: a Python 3 range cannot be multiplied)
    assert_array_equal(mds.sa.police, list(range(3)) * 2)
    # non-matching number of samples cannot be mapped
    assert_raises(ValueError, dm.forward, ds[:-1])
    # however, if the dataset knows about the space it is possible
    ds.sa['police'] = mds.sa.police
    # XXX this should be
    #   mds2 = dm(ds[1:-1])
    #   assert_array_equal(mds[1:-1], mds2)
    # XXX but right now is
    assert_raises(NotImplementedError, dm.forward, ds[1:-1])

    # Detrend must preserve the size of dataset
    assert_equal(mds.shape, ds.shape)

    # small additional test for break points
    # although they are no longer there
    ds = dataset_wizard(np.array([[1.0, 2, 3, 1, 2, 3]], ndmin=2).T,
                        targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1).forward(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # test of different polyord on each chunk
    target_mixed = np.array([[-1.0, 0, 1, 0, 0, 0],
                             [2.0, 0, -2, 0, 0, 0]], ndmin=2).T
    ds = dataset_wizard(samples_forchunks.copy(),
                        targets=chunks, chunks=chunks)
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=[0, 1]).forward(ds)
    assert_array_almost_equal(mds, target_mixed)

    # test irregular spacing of samples, but with corrective time info
    samples_forwhole = np.array([[1.0, 4, 6, 8, 2, 9],
                                 [-2.0, -8, -12, -16, -4, -18]], ndmin=2).T
    ds = Dataset(samples_forwhole, sa={'time': samples_forwhole[:, 0]})
    # linear detrending that makes use of temporal info from dataset
    dm = PolyDetrendMapper(polyord=1, space='time')
    mds = dm.forward(ds)
    assert_array_almost_equal(mds.samples, np.zeros(mds.shape))

    # and now the same stuff, but with chunking and ordered by time
    samples_forchunks = np.array([[1.0, 3, 3, 2, 2, 1],
                                  [-2.0, -6, -6, -4, -4, -2]], ndmin=2).T
    chunks = [0, 1, 0, 1, 0, 1]
    time = [4, 4, 12, 8, 8, 12]
    ds = Dataset(samples_forchunks.copy(),
                 sa={'chunks': chunks, 'time': time})
    mds = PolyDetrendMapper(chunks_attr='chunks', polyord=1,
                            space='time').forward(ds)

    # the whole thing must not affect the source data
    assert_array_equal(ds, samples_forchunks)
    # but if done inplace that is no longer true
    poly_detrend(ds, chunks_attr='chunks', polyord=1, space='time')
    assert_array_equal(ds, mds)