def hdf2ds(fnames):
    """Load dataset(s) from an HDF5 file

    Parameters
    ----------
    fnames : list(str)
      Names of the input HDF5 files

    Returns
    -------
    list(Dataset)
      All dataset-like elements in all given HDF5 files (in order of
      appearance). If any given HDF5 file contains non-Dataset elements
      they are silently ignored. If no given HDF5 file contains any
      dataset, an empty list is returned.
    """
    from mvpa2.base.hdf5 import h5load
    dss = []
    for fname in fnames:
        content = h5load(fname)
        if is_datasetlike(content):
            dss.append(content)
        else:
            for c in content:
                if is_datasetlike(c):
                    dss.append(c)
    return dss
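# Usage sketch for hdf2ds (hedged: 'subj1.hdf5' and 'subj2.hdf5' are
# hypothetical files previously written with mvpa2.base.hdf5's h5save;
# any non-Dataset content in the files is silently skipped):
dss = hdf2ds(['subj1.hdf5', 'subj2.hdf5'])
for ds in dss:
    print ds.shape  # every returned element is Dataset-like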
def forward(self, data):
    """Map data from input to output space.

    Parameters
    ----------
    data : Dataset-like, (at least 2D)-array-like
      Typically this is a `Dataset`, but it might also be a plain data
      array, or even something completely different(TM) that is supported
      by a subclass' implementation. If such an object is Dataset-like it
      is handled by a dedicated method that also transforms dataset
      attributes if necessary. If an array-like is passed, it has to be
      at least two-dimensional, with the first axis separating samples
      or observations. For single samples `forward1()` might be more
      appropriate.
    """
    if is_datasetlike(data):
        if __debug__:
            debug('MAP', "Forward-map %s-shaped dataset through '%s'."
                         % (data.shape, self))
        return self._forward_dataset(data)
    else:
        if hasattr(data, 'ndim') and data.ndim < 2:
            raise ValueError(
                'Mapper.forward() only support mapping of data with '
                'at least two dimensions, where the first axis '
                'separates samples/observations. Consider using '
                'Mapper.forward1() instead.')
        if __debug__:
            debug('MAP', "Forward-map data through '%s'." % (self))
        return self._forward_data(data)
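# Usage sketch for forward() (hedged: 'mapper' stands for any trained
# Mapper subclass instance; plain arrays must be at least 2D, with samples
# along the first axis):
import numpy as np
data = np.random.randn(10, 5)       # 10 samples x 5 features
mapped = mapper.forward(data)       # maps the whole 2D array
single = mapper.forward1(data[0])   # forward1() handles a single 1D sample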
def _sl_call(self, dataset, roi_ids, nproc):
    """Classical generic searchlight implementation
    """
    assert(self.results_backend in ('native', 'hdf5'))
    # compute
    if nproc is not None and nproc > 1:
        # split all target ROIs centers into `nproc` equally sized blocks
        nproc_needed = min(len(roi_ids), nproc)
        nblocks = nproc_needed \
            if self.nblocks is None else self.nblocks
        roi_blocks = np.array_split(roi_ids, nblocks)

        # the next block sets up the infrastructure for parallel computing
        # this can easily be changed into a ParallelPython loop, if we
        # decide to have a PP job server in PyMVPA
        import pprocess
        p_results = pprocess.Map(limit=nproc_needed)
        if __debug__:
            debug('SLC', "Starting off %s child processes for nblocks=%i"
                  % (nproc_needed, nblocks))
        compute = p_results.manage(pprocess.MakeParallel(self._proc_block))
        for iblock, block in enumerate(roi_blocks):
            # should we maybe deepcopy the measure to have a unique and
            # independent one per process?
            seed = mvpa2.get_random_seed()
            compute(block, dataset, copy.copy(self.__datameasure),
                    seed=seed, iblock=iblock)
    else:
        # otherwise collect the results in an 1-item list
        p_results = [
            self._proc_block(roi_ids, dataset, self.__datameasure)]

    # Finally collect and possibly process results
    # p_results here is either a generator from pprocess.Map or a list.
    # In case of a generator it allows to process results as they become
    # available
    result_ds = self.results_fx(
        sl=self,
        dataset=dataset,
        roi_ids=roi_ids,
        results=self.__handle_all_results(p_results))

    # Assure having a dataset (for paranoid ones)
    if not is_datasetlike(result_ds):
        try:
            result_a = np.atleast_1d(result_ds)
        except ValueError, e:
            if 'setting an array element with a sequence' in str(e):
                # try forcing object array. Happens with
                # test_custom_results_fx_logic on numpy 1.4.1 on Debian
                # squeeze
                result_a = np.array(result_ds, dtype=object)
            else:
                raise
        result_ds = Dataset(result_a)

    return result_ds
def p(self, x, return_tails=False, **kwargs):
    """Returns the p-value for values of `x`.

    Returned values are determined left, right, or from any tail
    depending on the constructor setting.

    In case a `FeaturewiseMeasure` was used to estimate the
    distribution the method returns an array. In that case `x` can be
    a scalar value or an array of a matching shape.
    """
    peas = _pvalue(x, self.cdf, self.rcdf, self.__tail,
                   return_tails=return_tails, **kwargs)

    if is_datasetlike(x):
        # return the p-values in a dataset as well and assign the input
        # dataset attributes to the return dataset too
        pds = x.copy(deep=False)
        if return_tails:
            pds.samples = peas[0]
            return pds, peas[1]
        else:
            pds.samples = peas
            return pds
    return peas
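# Usage sketch for p() (hedged: 'null_dist' stands for a trained
# distribution estimator exposing this method, e.g. an MCNullDist-like
# object, and 'scores' for a hypothetical Dataset of measure values):
pvals = null_dist.p(scores)                            # Dataset in, Dataset of p-values out
pvals, tails = null_dist.p(scores, return_tails=True)  # optionally also get the tails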
def results_fx(sl=None, dataset=None, roi_ids=None, results=None):
    """It will "process" the results by removing those files
    generated inside the measure
    """
    res = []
    print_("READY")
    for x in results:
        ok_(isinstance(x, list))
        res.append(x)
        print_("R: ", x)
        for r in x:
            # Can happen if we requested those .ca's enabled
            # -- then automagically _proc_block would wrap
            # results in a dataset... Originally detected by
            # running with MVPA_DEBUG=.* which triggered
            # enabling all ca's
            if is_datasetlike(r):
                r = np.asscalar(r.samples)
            os.unlink(r)  # remove generated file
        print_("WAITING")

    results_ds = hstack(sum(res, []))

    # store the center ids as a feature attribute since we use
    # them for testing
    results_ds.fa['center_ids'] = roi_ids
    return results_ds
def train(self, ds):
    """
    The default implementation calls ``_pretrain()``, ``_train()``, and
    finally ``_posttrain()``.

    Parameters
    ----------
    ds: Dataset
      Training dataset.

    Returns
    -------
    None
    """
    got_ds = is_datasetlike(ds)

    # TODO remove first condition if all Learners get only datasets
    if got_ds and (ds.nfeatures == 0 or len(ds) == 0):
        raise DegenerateInputError(
            "Cannot train learner on degenerate data %s" % ds)
    if __debug__:
        debug(
            "LRN",
            "Training learner %(lrn)s on dataset %(dataset)s",
            msgargs={'lrn': self, 'dataset': ds})

    self._pretrain(ds)

    # remember the time when started training
    t0 = time.time()
    if got_ds:
        # things might have happened during pretraining
        if ds.nfeatures > 0:
            self._train(ds)
        else:
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("LRN",
                      "No features present for training, no actual training "
                      "is called")
    else:
        # in this case we claim to have no idea and simply try to train
        self._train(ds)

    # store timing
    self.ca.training_time = time.time() - t0

    # and post-proc
    self._posttrain(ds)

    # finally flag as trained
    self._set_trained()
    if __debug__:
        debug(
            "LRN",
            "Finished training learner %(lrn)s on dataset %(dataset)s",
            msgargs={'lrn': self, 'dataset': ds})
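# Usage sketch for train() (hedged: 'clf' stands for any concrete
# Learner/Classifier instance and 'ds' for a non-degenerate training
# Dataset with targets; predict() is only available on classifiers):
clf.train(ds)
predictions = clf.predict(ds.samples)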
def compute(self, ds1, ds2=None):
    """Generic computation of any kernel

    Assumptions:

    - ds1, ds2 are either datasets or arrays,
    - presumably 2D (not checked nor enforced here)
    - _compute takes ndarrays. If your kernel needs datasets,
      override compute
    """
    if is_datasetlike(ds1):
        ds1 = ds1.samples
    if ds2 is None:
        ds2 = ds1
    elif is_datasetlike(ds2):
        ds2 = ds2.samples
    # TODO: assure 2D shape
    self._compute(ds1, ds2)
def train(self, ds):
    """
    The default implementation calls ``_pretrain()``, ``_train()``, and
    finally ``_posttrain()``.

    Parameters
    ----------
    ds: Dataset
      Training dataset.

    Returns
    -------
    None
    """
    got_ds = is_datasetlike(ds)

    # TODO remove first condition if all Learners get only datasets
    if got_ds and (ds.nfeatures == 0 or len(ds) == 0):
        raise DegenerateInputError(
            "Cannot train learner on degenerate data %s" % ds)
    if __debug__:
        debug("LRN", "Training learner %(lrn)s on dataset %(dataset)s",
              msgargs={'lrn': self, 'dataset': ds})
    self._pretrain(ds)

    # remember the time when started training
    t0 = time.time()

    if got_ds:
        # things might have happened during pretraining
        if ds.nfeatures > 0:
            result = self._train(ds)
        else:
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("LRN",
                      "No features present for training, no actual training "
                      "is called")
            result = None
    else:
        # in this case we claim to have no idea and simply try to train
        result = self._train(ds)

    # store timing
    self.ca.training_time = time.time() - t0

    # and post-proc
    result = self._posttrain(ds)

    # finally flag as trained
    self._set_trained()
    if __debug__:
        debug("LRN",
              "Finished training learner %(lrn)s on dataset %(dataset)s",
              msgargs={'lrn': self, 'dataset': ds})
def __process_roi(self, ds, roi_feature_id, measure, assure_dataset):
    # retrieve the feature ids of all features in the ROI from the query
    # engine
    roi_specs = self._queryengine[roi_feature_id]
    if __debug__:
        debug('SLC_',
              'For %r query returned roi_specs %r'
              % (roi_feature_id, roi_specs))
    if is_datasetlike(roi_specs):
        # TODO: unittest
        assert(len(roi_specs) == 1)
        roi_fids = roi_specs.samples[0]
    else:
        roi_fids = roi_specs

    # slice the dataset
    roi = ds[:, roi_fids]

    if is_datasetlike(roi_specs):
        for n, v in roi_specs.fa.iteritems():
            roi.fa[n] = v

    if self.__add_center_fa:
        # add fa to indicate ROI seed if requested
        roi_seed = np.zeros(roi.nfeatures, dtype='bool')
        if roi_feature_id in roi_fids:
            roi_seed[roi_fids.index(roi_feature_id)] = True
        else:
            warning("Center feature attribute id %s not found"
                    % roi_feature_id)
        roi.fa[self.__add_center_fa] = roi_seed

    # compute the datameasure and store in results
    res = measure(roi)
    if assure_dataset and not is_datasetlike(res):
        res = Dataset(np.atleast_1d(res))
    if self.ca.is_enabled('roi_feature_ids'):
        # add roi feature ids to intermediate result dataset for later
        # aggregation
        res.a['roi_feature_ids'] = roi_fids
    if self.ca.is_enabled('roi_sizes'):
        res.a['roi_sizes'] = roi.nfeatures
    if self.ca.is_enabled('roi_center_ids'):
        res.a['roi_center_ids'] = roi_feature_id
    return res, roi
def _proc_block(self, block, ds, measure):
    """Little helper to capture the parts of the computation that can
    be parallelized
    """
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))
    if self.ca.is_enabled('roi_sizes'):
        roi_sizes = []
    else:
        roi_sizes = None
    results = []
    # put rois around all features in the dataset and compute the
    # measure within them
    for i, f in enumerate(block):
        # retrieve the feature ids of all features in the ROI from the
        # query engine
        roi_fids = self._queryengine[f]

        if __debug__ and debug_slc_:
            debug('SLC_', 'For %r query returned ids %r' % (f, roi_fids))

        # slice the dataset
        roi = ds[:, roi_fids]

        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            roi_seed[roi_fids.index(f)] = True
            roi.fa[self.__add_center_fa] = roi_seed

        # compute the datameasure and store in results
        res = measure(roi)
        if self.ca.is_enabled('roi_feature_ids'):
            if not is_datasetlike(res):
                res = Dataset(np.atleast_1d(res))
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        results.append(res)

        # store the size of the roi dataset
        if not roi_sizes is None:
            roi_sizes.append(roi.nfeatures)

        if __debug__:
            debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]"
                  % (len(block), f + 1, roi.nfeatures,
                     float(i + 1) / len(block) * 100,), cr=True)

    return results, roi_sizes
def _get_sl_connectomes(self, seed_means, qe_all, datasets, inode,
                        connectivity_mapper):
    # For each SL, computing connectivity of features to seed means
    sl_connectomes = []
    # Looping over each subject
    for seed_mean, qe_, sd in zip(seed_means, qe_all, datasets):
        connectivity_mapper.train(seed_mean)
        sl_ids = qe_[inode]
        if is_datasetlike(sl_ids):
            assert(sl_ids.nsamples == 1)
            sl_ids = sl_ids.samples[0, :].tolist()
        sl_connectomes.append(connectivity_mapper.forward(sd[:, sl_ids]))
    return sl_connectomes
def _predict(self, data):
    l = len(self._ulabels)
    # oh those lovely random estimates, for now just an estimate
    # per sample. Since we are random after all -- keep it random
    self.ca.estimates = np.random.normal(size=len(data))
    if is_datasetlike(data) and self.params.same:
        # decide on mapping between original labels
        labels_map = dict(
            (t, rt) for t, rt in zip(
                self._ulabels, self._ulabels[npr.randint(0, l, size=l)]))
        return [labels_map[t] for t in data.sa[self.get_space()].value]
    else:
        # random one per each
        return self._ulabels[npr.randint(0, l, size=len(data))]
def _posttrain(self, dataset):
    """Functionality post training

    For instance -- computing confusion matrix.

    Parameters
    ----------
    dataset : Dataset
      Data which was used for training
    """
    super(Classifier, self)._posttrain(dataset)

    ca = self.ca

    # needs to be assigned first since below we use predict
    self.__trainednfeatures = dataset.nfeatures

    if __debug__ and 'CHECK_TRAINED' in debug.active:
        self.__trainedidhash = dataset.idhash

    if ca.is_enabled('training_stats') and \
            not ca.is_set('training_stats'):
        # we should not store predictions for training data,
        # it is confusing imho (yoh)
        ca.change_temporarily(disable_ca=["predictions"])
        if self.params.retrainable:
            # we would need to recheck if data is the same,
            # XXX think if there is a way to make this all
            # efficient. For now, probably, retrainable
            # classifiers have no chance but not to use
            # training_stats... sad
            self.__changedData_isset = False
        predictions = self.predict(dataset)
        ca.reset_changed_temporarily()
        targets = dataset.sa[self.get_space()].value
        if is_datasetlike(predictions) and (self.get_space() in predictions.fa):
            # e.g. in case of pair-wise uncombined results - provide
            # stats per each of the targets pairs
            prediction_targets = predictions.fa[self.get_space()].value
            ca.training_stats = dict(
                (t, self.__summary_class__(
                    targets=targets,
                    predictions=predictions.samples[:, i]))
                for i, t in enumerate(prediction_targets))
        else:
            ca.training_stats = self.__summary_class__(
                targets=targets, predictions=predictions)
def hstack(datasets):
    """Stacks datasets horizontally (appending features).

    Sample attribute collections are merged incrementally, attribute with
    identical keys overwriting previous ones in the stacked dataset.
    All datasets must have an identical set of feature attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    #
    # XXX Use CombinedMapper in here whenever it comes back
    #

    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        # we might get a list of 1Ds that would yield wrong results when
        # turned into a dict (would run along samples-axis)
        return AttrDataset(np.atleast_2d(np.hstack(datasets)))

    if __debug__:
        target = sorted(datasets[0].fa.keys())
        if not np.all([sorted(ds.fa.keys()) == target for ds in datasets]):
            raise ValueError("Feature attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of samples
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=1)

    stacked_fa = {}
    for attr in datasets[0].fa:
        stacked_fa[attr] = np.concatenate(
            [ds.fa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, fa=stacked_fa)

    for ds in datasets:
        merged.sa.update(ds.sa)

    return merged
def vstack(datasets):
    """Stacks datasets vertically (appending samples).

    Feature attribute collections are merged incrementally, attribute with
    identical keys overwriting previous ones in the stacked dataset.
    All datasets must have an identical set of sample attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset. If all input datasets have common dataset
    attributes that are also valid for the stacked dataset, they can be
    moved into the output dataset like this::

      ds_merged = vstack((ds1, ds2, ds3))
      ds_merged.a.update(ds1.a)

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        return AttrDataset(np.vstack(datasets))

    if __debug__:
        target = sorted(datasets[0].sa.keys())
        if not np.all([sorted(ds.sa.keys()) == target for ds in datasets]):
            raise ValueError("Sample attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of features
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=0)

    stacked_sa = {}
    for attr in datasets[0].sa:
        stacked_sa[attr] = np.concatenate(
            [ds.sa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, sa=stacked_sa)

    for ds in datasets:
        merged.fa.update(ds.fa)

    return merged
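# Usage sketch for vstack()/hstack() (hedged: ds1 and ds2 are hypothetical
# datasets with matching sample attribute keys for vstack and matching
# feature attribute keys for hstack):
ds_v = vstack((ds1, ds2))    # appends samples; feature counts must match
ds_h = hstack((ds1, ds2))    # appends features; sample counts must match
ds_v.a.update(ds1.a)         # optionally carry over dataset attributes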
def _get_hypesvs(self, sl_connectomes, local_common_model=None):
    '''
    Hyperalign connectomes and return mappers
    and trained SVDMapper of common space.

    Parameters
    ----------
    sl_connectomes: a list of connectomes to hyperalign
    local_common_model: a reference common model to be used.

    Returns
    -------
    a tuple (sl_hmappers, svm, local_common_model)
    sl_hmappers: a list of mappers corresponding to input list in that order.
    svm: an SVD mapper based on the input data. If given a common model,
        this is None.
    local_common_model: If local_common_model is provided as input, this
        will be None. Otherwise, local_common_model will be computed here
        and returned.
    '''
    # TODO Should we z-score sl_connectomes?
    return_model = False if self.params.save_model is None else True
    if local_common_model is not None:
        ha = Hyperalignment(level2_niter=0)
        if not is_datasetlike(local_common_model):
            local_common_model = Dataset(samples=local_common_model)
        ha.train([local_common_model])
        sl_hmappers = ha(sl_connectomes)
        return sl_hmappers, None, None
    ha = Hyperalignment()
    sl_hmappers = ha(sl_connectomes)
    sl_connectomes = [slhm.forward(slc)
                      for slhm, slc in zip(sl_hmappers, sl_connectomes)]
    _ = [zscore(slc, chunks_attr=None) for slc in sl_connectomes]
    sl_connectomes = np.dstack(sl_connectomes).mean(axis=-1)
    svm = SVDMapper(force_train=True)
    svm.train(sl_connectomes)
    if return_model:
        local_common_model = svm.forward(sl_connectomes)
    else:
        local_common_model = None
    return sl_hmappers, svm, local_common_model
def reverse(self, data):
    """Reverse-map data from output back into input space.

    Parameters
    ----------
    data : Dataset-like, anything
      Typically this is a `Dataset`, but it might also be a plain data
      array, or even something completely different(TM) that is supported
      by a subclass' implementation. If such an object is Dataset-like it
      is handled by a dedicated method that also transforms dataset
      attributes if necessary.
    """
    if is_datasetlike(data):
        if __debug__:
            debug('MAP', "Reverse-map %s-shaped dataset through '%s'."
                         % (data.shape, self))
        return self._reverse_dataset(data)
    else:
        if __debug__:
            debug('MAP', "Reverse-map data through '%s'." % (self))
        return self._reverse_data(data)
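# Usage sketch for reverse() (hedged: complements forward(); 'mapper' is any
# trained, invertible Mapper and 'mapped' its forward-mapped output):
reconstructed = mapper.reverse(mapped)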
def p(self, x, return_tails=False, **kwargs):
    """Returns the p-value for values of `x`.

    Returned values are determined left, right, or from any tail
    depending on the constructor setting.

    In case a `FeaturewiseMeasure` was used to estimate the
    distribution the method returns an array. In that case `x` can be
    a scalar value or an array of a matching shape.
    """
    peas = _pvalue(x, self.cdf, self.__tail,
                   return_tails=return_tails, **kwargs)

    if is_datasetlike(x):
        # return the p-values in a dataset as well and assign the input
        # dataset attributes to the return dataset too
        pds = x.copy(deep=False)
        if return_tails:
            pds.samples = peas[0]
            return pds, peas[1]
        else:
            pds.samples = peas
            return pds
    return peas
def wrap_samples(obj, data, *args, **kwargs):
    if is_datasetlike(data):
        return fx(obj, data, *args, **kwargs)
    else:
        return fx(obj, Dataset(data), *args, **kwargs)
def hstack(datasets, a=None, sa='drop_nonunique'):
    """Stacks datasets horizontally (appending features).

    All datasets must have an identical set of feature attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    See the `a` argument documentation for transferring dataset attributes,
    and the `sa` argument for sample attributes -- by default sample
    attributes which differ in any input dataset from the others are
    dropped.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.
    a: {'unique','drop_nonunique','uniques','all'} or True or False or None (default: None)
      Indicates which dataset attributes from datasets are stored
      in merged_dataset. If an int k, then the dataset attributes from
      datasets[k] are taken. If 'unique' then it is assumed that any
      attribute common to more than one dataset in datasets is unique;
      if not an exception is raised. If 'drop_nonunique' then as 'unique',
      except that exceptions are not raised. If 'uniques' then, for each
      attribute, any unique value across the datasets is stored in a tuple
      in merged_datasets. If 'all' then each attribute present in any
      dataset across datasets is stored as a tuple in merged_datasets;
      missing values are replaced by None. If None (the default) then no
      attributes are stored in merged_dataset. True is equivalent to
      'drop_nonunique'. False is equivalent to None.
    sa: {'update', 'drop_nonunique'}, (default: 'drop_nonunique')
      Indicates which sample attributes are stored in the merged dataset.
      If 'update' - attributes are updated while growing the dataset.
      If 'drop_nonunique', an attribute is dropped from the dataset if its
      value differs across datasets for any sample.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    #
    # XXX Use CombinedMapper in here whenever it comes back
    #
    if not len(datasets):
        raise ValueError('concatenation of zero-length sequences is impossible')
    if not len(datasets) > 1:
        # trivial hstack
        return datasets[0]
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        # we might get a list of 1Ds that would yield wrong results when
        # turned into a dict (would run along samples-axis)
        return AttrDataset(np.atleast_2d(np.hstack(datasets)))

    if __debug__:
        target = sorted(datasets[0].fa.keys())
        if not np.all([sorted(ds.fa.keys()) == target for ds in datasets]):
            raise ValueError("Feature attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of samples
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=1)

    stacked_fa = {}
    for attr in datasets[0].fa:
        stacked_fa[attr] = np.concatenate(
            [ds.fa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, fa=stacked_fa)

    _stack_add_equal_attributes(merged, datasets, sa, 'sa')
    _stack_add_equal_dataset_attributes(merged, datasets, a)
    return merged
def _train(self, source):
    params = self.params
    # Since it is unsupervised, we don't care about labels
    datas = ()
    odatas = ()
    means = ()
    shapes = ()

    assess_residuals = __debug__ and 'MAP_' in debug.active

    target = source.sa[self.get_space()].value

    for i, ds in enumerate((source, target)):
        if is_datasetlike(ds):
            data = np.asarray(ds.samples)
        else:
            data = ds
        if assess_residuals:
            odatas += (data,)
        if self._demean:
            if i == 0:
                mean = self._offset_in
            else:
                mean = data.mean(axis=0)
            data = data - mean
        else:
            # no demeaning === zero means
            mean = np.zeros(shape=data.shape[1:])
        means += (mean,)
        datas += (data,)
        shapes += (data.shape,)

    # shortcuts for sizes
    sn, sm = shapes[0]
    tn, tm = shapes[1]

    # Check the sizes
    if sn != tn:
        raise ValueError, "Data for both spaces should have the same " \
              "number of samples. Got %d in source and %d in target space" \
              % (sn, tn)

    # Sums of squares
    ssqs = [np.sum(d ** 2, axis=0) for d in datas]

    # XXX check for being invariant?
    #     needs to be tuned up properly and not raise but handle
    for i in xrange(2):
        if np.all(ssqs[i] <= np.abs((np.finfo(datas[i].dtype).eps
                                     * sn * means[i]) ** 2)):
            raise ValueError, "For now do not handle invariant in time datasets"

    norms = [np.sqrt(np.sum(ssq)) for ssq in ssqs]
    normed = [data / norm for (data, norm) in zip(datas, norms)]

    # add new blank dimensions to source space if needed
    if sm < tm:
        normed[0] = np.hstack((normed[0], np.zeros((sn, tm - sm))))

    if sm > tm:
        if params.reduction:
            normed[1] = np.hstack((normed[1], np.zeros((sn, sm - tm))))
        else:
            raise ValueError, "reduction=False, so mapping from " \
                  "higher dimensionality " \
                  "source space is not supported. Source space had %d " \
                  "while target %d dimensions (features)" % (sm, tm)

    source, target = normed
    if params.oblique:
        # Just do silly linear system of equations ;) or naive
        # inverse problem
        if sn == sm and tm == 1:
            T = np.linalg.solve(source, target)
        else:
            T = np.linalg.lstsq(source, target,
                                rcond=params.oblique_rcond)[0]
        ss = 1.0
    else:
        # Orthogonal transformation
        # figure out optimal rotation
        if params.svd == 'numpy':
            U, s, Vh = np.linalg.svd(np.dot(target.T, source),
                                     full_matrices=False)
        elif params.svd == 'scipy':
            # would raise exception if not present
            externals.exists('scipy', raise_=True)
            import scipy
            U, s, Vh = scipy.linalg.svd(np.dot(target.T, source),
                                        full_matrices=False)
        elif params.svd == 'dgesvd':
            from mvpa2.support.lapack_svd import svd as dgesvd
            U, s, Vh = dgesvd(np.dot(target.T, source),
                              full_matrices=True, algo='svd')
        else:
            raise ValueError('Unknown type of svd %r' % (params.svd))
        T = np.dot(Vh.T, U.T)

        if not params.reflection:
            # then we need to assure that it is only rotation
            # "recipe" from
            # http://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
            # for more and info and original references, see
            # http://dx.doi.org/10.1007%2FBF02289451
            s_new = np.ones_like(s)
            s_new[-1] = np.linalg.det(T)
            T = np.dot(Vh.T * s_new, U.T)

        # figure out scale and final translation
        if not params.reflection:
            ss = np.sum(s_new * s)
        else:
            ss = np.sum(s)

    # if we were to collect standardized distance
    # std_d = 1 - sD**2

    # select out only relevant dimensions
    if sm != tm:
        T = T[:sm, :tm]

    self._scale = scale = ss * norms[1] / norms[0]

    # Assign projection
    if self.params.scaling:
        proj = scale * T
    else:
        proj = T
    self._proj = proj

    if self._demean:
        self._offset_out = means[1]

    if __debug__ and 'MAP_' in debug.active:
        # compute the residuals
        res_f = self.forward(odatas[0])
        d_f = np.linalg.norm(odatas[1] - res_f) / np.linalg.norm(odatas[1])
        res_r = self.reverse(odatas[1])
        d_r = np.linalg.norm(odatas[0] - res_r) / np.linalg.norm(odatas[0])
        debug('MAP_', "%s, residuals are forward: %g,"
                      " reverse: %g" % (repr(self), d_f, d_r))
def _proc_block(self, block, ds, measure, iblock='main'):
    """Little helper to capture the parts of the computation that can
    be parallelized

    Parameters
    ----------
    iblock
      Critical for generating non-colliding temp filenames in case of hdf5
      backend. Otherwise RNGs of different processes might collide in their
      temporary file names leading to problems.
    """
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))
    results = []
    store_roi_feature_ids = self.ca.is_enabled('roi_feature_ids')
    store_roi_sizes = self.ca.is_enabled('roi_sizes')
    assure_dataset = store_roi_feature_ids or store_roi_sizes
    # put rois around all features in the dataset and compute the
    # measure within them
    for i, f in enumerate(block):
        # retrieve the feature ids of all features in the ROI from the
        # query engine
        roi_fids = self._queryengine[f]

        if __debug__ and debug_slc_:
            debug('SLC_', 'For %r query returned ids %r' % (f, roi_fids))

        # slice the dataset
        roi = ds[:, roi_fids]

        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            roi_seed[roi_fids.index(f)] = True
            roi.fa[self.__add_center_fa] = roi_seed

        # compute the datameasure and store in results
        res = measure(roi)
        if assure_dataset and not is_datasetlike(res):
            res = Dataset(np.atleast_1d(res))
        if store_roi_feature_ids:
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        if store_roi_sizes:
            res.a['roi_sizes'] = roi.nfeatures
        results.append(res)

        if __debug__:
            debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]"
                  % (len(block), f + 1, roi.nfeatures,
                     float(i + 1) / len(block) * 100,), cr=True)

    if self.results_backend == 'native':
        pass  # nothing special
    elif self.results_backend == 'hdf5':
        # store results in a temporary file and return a filename
        results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                       suffix='-%s.hdf5' % iblock)
        if __debug__:
            debug('SLC', "Storing results into %s" % results_file)
        h5save(results_file, results)
        if __debug__:
            debug('SLC_', "Results stored")
        results = results_file
    else:
        raise RuntimeError("Must not reach this point")
    return results
def vstack(datasets, a=None):
    """Stacks datasets vertically (appending samples).

    Feature attribute collections are merged incrementally, attribute with
    identical keys overwriting previous ones in the stacked dataset.
    All datasets must have an identical set of sample attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset. If all input datasets have common dataset
    attributes that are also valid for the stacked dataset, they can be
    moved into the output dataset like this::

      ds_merged = vstack((ds1, ds2, ds3))
      ds_merged.a.update(ds1.a)

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.
    a: {'unique','drop_nonunique','uniques','all'} or True or False or None (default: None)
      Indicates which dataset attributes from datasets are stored
      in merged_dataset. If an int k, then the dataset attributes from
      datasets[k] are taken. If 'unique' then it is assumed that any
      attribute common to more than one dataset in datasets is unique;
      if not an exception is raised. If 'drop_nonunique' then as 'unique',
      except that exceptions are not raised. If 'uniques' then, for each
      attribute, any unique value across the datasets is stored in a tuple
      in merged_datasets. If 'all' then each attribute present in any
      dataset across datasets is stored as a tuple in merged_datasets;
      missing values are replaced by None. If None (the default) then no
      attributes are stored in merged_dataset. True is equivalent to
      'drop_nonunique'. False is equivalent to None.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        return AttrDataset(np.vstack(datasets))

    if __debug__:
        target = sorted(datasets[0].sa.keys())
        if not np.all([sorted(ds.sa.keys()) == target for ds in datasets]):
            raise ValueError("Sample attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of features
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=0)

    stacked_sa = {}
    for attr in datasets[0].sa:
        stacked_sa[attr] = np.concatenate(
            [ds.sa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, sa=stacked_sa)

    for ds in datasets:
        merged.fa.update(ds.fa)

    _stack_add_equal_dataset_attributes(merged, datasets, a)
    return merged
def hstack(datasets, a=None):
    """Stacks datasets horizontally (appending features).

    Sample attribute collections are merged incrementally, attribute with
    identical keys overwriting previous ones in the stacked dataset.
    All datasets must have an identical set of feature attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.
    a: {'unique','drop_nonunique','uniques','all'} or True or False or None (default: None)
      Indicates which dataset attributes from datasets are stored
      in merged_dataset. If an int k, then the dataset attributes from
      datasets[k] are taken. If 'unique' then it is assumed that any
      attribute common to more than one dataset in datasets is unique;
      if not an exception is raised. If 'drop_nonunique' then as 'unique',
      except that exceptions are not raised. If 'uniques' then, for each
      attribute, any unique value across the datasets is stored in a tuple
      in merged_datasets. If 'all' then each attribute present in any
      dataset across datasets is stored as a tuple in merged_datasets;
      missing values are replaced by None. If None (the default) then no
      attributes are stored in merged_dataset. True is equivalent to
      'drop_nonunique'. False is equivalent to None.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    #
    # XXX Use CombinedMapper in here whenever it comes back
    #

    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        # we might get a list of 1Ds that would yield wrong results when
        # turned into a dict (would run along samples-axis)
        return AttrDataset(np.atleast_2d(np.hstack(datasets)))

    if __debug__:
        target = sorted(datasets[0].fa.keys())
        if not np.all([sorted(ds.fa.keys()) == target for ds in datasets]):
            raise ValueError("Feature attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of samples
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=1)

    stacked_fa = {}
    for attr in datasets[0].fa:
        stacked_fa[attr] = np.concatenate(
            [ds.fa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, fa=stacked_fa)

    for ds in datasets:
        merged.sa.update(ds.sa)

    _stack_add_equal_dataset_attributes(merged, datasets, a)
    return merged
def test_from_wizard():
    samples = np.arange(12).reshape((4, 3)).view(myarray)
    labels = range(4)
    chunks = [1, 1, 2, 2]

    ds = Dataset(samples, sa={'targets': labels, 'chunks': chunks})
    ds.init_origids('both')
    first = ds.sa.origids
    # now do again and check that they get regenerated
    ds.init_origids('both')
    assert_false(first is ds.sa.origids)
    assert_array_equal(first, ds.sa.origids)

    ok_(is_datasetlike(ds))
    ok_(not is_datasetlike(labels))

    # array subclass survives
    ok_(isinstance(ds.samples, myarray))

    ## XXX stuff that needs thought:

    # ds.sa (empty) has this in the public namespace:
    #   add, get, getvalue, has_key, is_set, items, listing, name, names
    #   owner, remove, reset, setvalue, which_set
    # maybe we need some form of leightweightCollection?

    assert_array_equal(ds.samples, samples)
    assert_array_equal(ds.sa.targets, labels)
    assert_array_equal(ds.sa.chunks, chunks)

    # same should work for shortcuts
    assert_array_equal(ds.targets, labels)
    assert_array_equal(ds.chunks, chunks)

    ok_(sorted(ds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(ds.fa.keys()) == ['origids'])
    # add some more
    ds.a['random'] = 'blurb'

    # check stripping attributes from a copy
    cds = ds.copy()  # full copy
    ok_(sorted(cds.sa.keys()) == ['chunks', 'origids', 'targets'])
    ok_(sorted(cds.fa.keys()) == ['origids'])
    ok_(sorted(cds.a.keys()) == ['random'])
    cds = ds.copy(sa=[], fa=[], a=[])  # plain copy
    ok_(cds.sa.keys() == [])
    ok_(cds.fa.keys() == [])
    ok_(cds.a.keys() == [])
    cds = ds.copy(sa=['targets'], fa=None, a=['random'])  # partial copy
    ok_(cds.sa.keys() == ['targets'])
    ok_(cds.fa.keys() == ['origids'])
    ok_(cds.a.keys() == ['random'])

    # there is not necessarily a mapper present
    ok_(not ds.a.has_key('mapper'))

    # has to complain about misshaped samples attributes
    assert_raises(ValueError, Dataset.from_wizard, samples, labels + labels)

    # check that we actually have attributes of the expected type
    ok_(isinstance(ds.sa['targets'], ArrayCollectable))

    # the dataset will take care of not adding stupid stuff
    assert_raises(ValueError, ds.sa.__setitem__, 'stupid', np.arange(3))
    assert_raises(ValueError, ds.fa.__setitem__, 'stupid', np.arange(4))
    # or change proper attributes to stupid shapes
    try:
        ds.sa.targets = np.arange(3)
    except ValueError:
        pass
    else:
        ok_(False, msg="Assigning value with improper shape to attribute "
                       "did not raise exception.")
def hstack(datasets, a=None):
    """Stacks datasets horizontally (appending features).

    Sample attribute collections are merged incrementally, attribute with
    identical keys overwriting previous ones in the stacked dataset.
    All datasets must have an identical set of feature attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.
    a: {'unique','drop_nonunique','uniques','all'} or True or False or None (default: None)
      Indicates which dataset attributes from datasets are stored
      in merged_dataset. If an int k, then the dataset attributes from
      datasets[k] are taken. If 'unique' then it is assumed that any
      attribute common to more than one dataset in datasets is unique;
      if not an exception is raised. If 'drop_nonunique' then as 'unique',
      except that exceptions are not raised. If 'uniques' then, for each
      attribute, any unique value across the datasets is stored in a tuple
      in merged_datasets. If 'all' then each attribute present in any
      dataset across datasets is stored as a tuple in merged_datasets;
      missing values are replaced by None. If None (the default) then no
      attributes are stored in merged_dataset. True is equivalent to
      'drop_nonunique'. False is equivalent to None.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    #
    # XXX Use CombinedMapper in here whenever it comes back
    #
    if not len(datasets):
        raise ValueError('concatenation of zero-length sequences is impossible')
    if not len(datasets) > 1:
        # trivial hstack
        return datasets[0]
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        # we might get a list of 1Ds that would yield wrong results when
        # turned into a dict (would run along samples-axis)
        return AttrDataset(np.atleast_2d(np.hstack(datasets)))

    if __debug__:
        target = sorted(datasets[0].fa.keys())
        if not np.all([sorted(ds.fa.keys()) == target for ds in datasets]):
            raise ValueError("Feature attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of samples
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=1)

    stacked_fa = {}
    for attr in datasets[0].fa:
        stacked_fa[attr] = np.concatenate(
            [ds.fa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, fa=stacked_fa)

    for ds in datasets:
        merged.sa.update(ds.sa)

    _stack_add_equal_dataset_attributes(merged, datasets, a)
    return merged
def _train(self, source):
    params = self.params
    # Since it is unsupervised, we don't care about labels
    datas = ()
    odatas = ()
    means = ()
    shapes = ()

    assess_residuals = __debug__ and 'MAP_' in debug.active

    target = source.sa[self.get_space()].value

    for i, ds in enumerate((source, target)):
        if is_datasetlike(ds):
            data = np.asarray(ds.samples)
        else:
            data = ds
        if assess_residuals:
            odatas += (data,)
        if self._demean:
            if i == 0:
                mean = self._offset_in
            else:
                mean = data.mean(axis=0)
            data = data - mean
        else:
            # no demeaning === zero means
            mean = np.zeros(shape=data.shape[1:])
        means += (mean,)
        datas += (data,)
        shapes += (data.shape,)

    # shortcuts for sizes
    sn, sm = shapes[0]
    tn, tm = shapes[1]

    # Check the sizes
    if sn != tn:
        raise ValueError, "Data for both spaces should have the same " \
              "number of samples. Got %d in source and %d in target space" \
              % (sn, tn)

    # Sums of squares
    ssqs = [np.sum(d ** 2, axis=0) for d in datas]

    # XXX check for being invariant?
    #     needs to be tuned up properly and not raise but handle
    for i in xrange(2):
        if np.all(ssqs[i] <= np.abs((np.finfo(datas[i].dtype).eps
                                     * sn * means[i]) ** 2)):
            raise ValueError, "For now do not handle invariant in time datasets"

    norms = [np.sqrt(np.sum(ssq)) for ssq in ssqs]
    normed = [data / norm for (data, norm) in zip(datas, norms)]

    # add new blank dimensions to source space if needed
    if sm < tm:
        normed[0] = np.hstack((normed[0], np.zeros((sn, tm - sm))))

    if sm > tm:
        if params.reduction:
            normed[1] = np.hstack((normed[1], np.zeros((sn, sm - tm))))
        else:
            raise ValueError, "reduction=False, so mapping from " \
                  "higher dimensionality " \
                  "source space is not supported. Source space had %d " \
                  "while target %d dimensions (features)" % (sm, tm)

    source, target = normed
    if params.oblique:
        # Just do silly linear system of equations ;) or naive
        # inverse problem
        if sn == sm and tm == 1:
            T = np.linalg.solve(source, target)
        else:
            T = np.linalg.lstsq(source, target,
                                rcond=params.oblique_rcond)[0]
        ss = 1.0
    else:
        # Orthogonal transformation
        # figure out optimal rotation
        if params.svd == 'numpy':
            U, s, Vh = np.linalg.svd(np.dot(target.T, source),
                                     full_matrices=False)
        elif params.svd == 'scipy':
            # would raise exception if not present
            externals.exists('scipy', raise_=True)
            import scipy
            U, s, Vh = scipy.linalg.svd(np.dot(target.T, source),
                                        full_matrices=False)
        elif params.svd == 'dgesvd':
            from mvpa2.support.lapack_svd import svd as dgesvd
            U, s, Vh = dgesvd(np.dot(target.T, source),
                              full_matrices=True, algo='svd')
        else:
            raise ValueError('Unknown type of svd %r' % (params.svd))
        T = np.dot(Vh.T, U.T)

        if not params.reflection:
            # then we need to assure that it is only rotation
            # "recipe" from
            # http://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem
            # for more and info and original references, see
            # http://dx.doi.org/10.1007%2FBF02289451
            nsv = len(s)
            s[:-1] = 1
            s[-1] = np.linalg.det(T)
            T = np.dot(U[:, :nsv] * s, Vh)

        # figure out scale and final translation
        # XXX with reflection False -- not sure if here or there or anywhere...
        ss = sum(s)

    # if we were to collect standardized distance
    # std_d = 1 - sD**2

    # select out only relevant dimensions
    if sm != tm:
        T = T[:sm, :tm]

    self._scale = scale = ss * norms[1] / norms[0]

    # Assign projection
    if self.params.scaling:
        proj = scale * T
    else:
        proj = T
    self._proj = proj

    if self._demean:
        self._offset_out = means[1]

    if __debug__ and 'MAP_' in debug.active:
        # compute the residuals
        res_f = self.forward(odatas[0])
        d_f = np.linalg.norm(odatas[1] - res_f) / np.linalg.norm(odatas[1])
        res_r = self.reverse(odatas[1])
        d_r = np.linalg.norm(odatas[0] - res_r) / np.linalg.norm(odatas[0])
        debug('MAP_', "%s, residuals are forward: %g,"
                      " reverse: %g" % (repr(self), d_f, d_r))
def _proc_block(self, block, datasets, featselhyper, queryengines,
                seed=None, iblock='main'):
    if seed is not None:
        mvpa2.seed(seed)
    if __debug__:
        debug('SLC', 'Starting computing block for %i elements' % len(block))
    bar = ProgressBar()
    projections = [csc_matrix((self.nfeatures, self.nfeatures),
                              dtype=self.params.dtype)
                   for isub in range(self.ndatasets)]
    for i, node_id in enumerate(block):
        # retrieve the feature ids of all features in the ROI from the
        # query engine

        # Find the neighborhood for that selected nearest node
        roi_feature_ids_all = [qe[node_id] for qe in queryengines]
        # handling queryengines that return AttrDatasets
        for isub in range(len(roi_feature_ids_all)):
            if is_datasetlike(roi_feature_ids_all[isub]):
                # making sure queryengine returned proper shaped output
                assert(roi_feature_ids_all[isub].nsamples == 1)
                roi_feature_ids_all[isub] = \
                    roi_feature_ids_all[isub].samples[0, :].tolist()
        if len(roi_feature_ids_all) == 1:
            # just one was provided to be "broadcasted"
            roi_feature_ids_all *= len(datasets)
        # if qe returns zero-sized ROI for any subject, pass...
        if any(len(x) == 0 for x in roi_feature_ids_all):
            continue
        # selecting neighborhood for all subject for hyperalignment
        ds_temp = [sd[:, ids]
                   for sd, ids in zip(datasets, roi_feature_ids_all)]
        if self.force_roi_seed:
            roi_seed = np.array(
                roi_feature_ids_all[self.params.ref_ds]) == node_id
            ds_temp[self.params.ref_ds].fa['roi_seed'] = roi_seed
        if __debug__:
            msg = 'ROI (%i/%i), %i features' % (
                i + 1, len(block), ds_temp[self.params.ref_ds].nfeatures)
            debug('SLC', bar(float(i + 1) / len(block), msg), cr=True)
        hmappers = featselhyper(ds_temp)
        assert(len(hmappers) == len(datasets))
        roi_feature_ids_ref_ds = roi_feature_ids_all[self.params.ref_ds]
        for isub, roi_feature_ids in enumerate(roi_feature_ids_all):
            if not self.params.combine_neighbormappers:
                I = roi_feature_ids
                #J = [roi_feature_ids[node_id]] * len(roi_feature_ids)
                J = [node_id] * len(roi_feature_ids)
                V = hmappers[isub].tolist()
                if np.isscalar(V):
                    V = [V]
            else:
                I, J, V = [], [], []
                for f2, roi_feature_id_ref_ds in enumerate(
                        roi_feature_ids_ref_ds):
                    I += roi_feature_ids
                    J += [roi_feature_id_ref_ds] * len(roi_feature_ids)
                    V += hmappers[isub][:, f2].tolist()
            proj = coo_matrix(
                (V, (I, J)),
                shape=(max(self.nfeatures, max(I) + 1),
                       max(self.nfeatures, max(J) + 1)),
                dtype=self.params.dtype)
            proj = proj.tocsc()
            # Cleaning up the current subject's projections to free up memory
            hmappers[isub] = [[] for _ in hmappers]
            projections[isub] = projections[isub] + proj

    if self.params.results_backend == 'native':
        return projections
    elif self.params.results_backend == 'hdf5':
        # store results in a temporary file and return a filename
        results_file = mktemp(prefix=self.params.tmp_prefix,
                              suffix='-%s.hdf5' % iblock)
        if __debug__:
            debug('SLC', "Storing results into %s" % results_file)
        h5save(results_file, projections)
        if __debug__:
            debug('SLC_', "Results stored")
        return results_file
    else:
        raise RuntimeError("Must not reach this point")
def vstack(datasets, a=None, fa='drop_nonunique'):
    """Stacks datasets vertically (appending samples).

    All datasets must have an identical set of sample attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    See `a` argument documentation for transferring dataset attributes,
    and `fa` argument for feature attributes -- by default feature
    attributes which differ in any input dataset from the others would be
    dropped.

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.
    a: {'unique','drop_nonunique','uniques','all'} or True or False or None (default: None)
      Indicates which dataset attributes from datasets are stored
      in merged_dataset. If an int k, then the dataset attributes from
      datasets[k] are taken. If 'unique' then it is assumed that any
      attribute common to more than one dataset in datasets is unique;
      if not an exception is raised. If 'drop_nonunique' then as 'unique',
      except that exceptions are not raised. If 'uniques' then, for each
      attribute, any unique value across the datasets is stored in a tuple
      in merged_datasets. If 'all' then each attribute present in any
      dataset across datasets is stored as a tuple in merged_datasets;
      missing values are replaced by None. If None (the default) then no
      attributes are stored in merged_dataset. True is equivalent to
      'drop_nonunique'. False is equivalent to None.
    fa: {'update', 'drop_nonunique'}, (default: 'drop_nonunique')
      Indicates which feature attributes are stored in the merged dataset.
      If 'update' - attributes are updated while growing the dataset.
      If 'drop_nonunique', attribute would be dropped from the dataset if
      its value differs across datasets for any feature.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    if not len(datasets):
        raise ValueError('concatenation of zero-length sequences is impossible')
    if not len(datasets) > 1:
        # trivial vstack
        return datasets[0]
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        return AttrDataset(np.vstack(datasets))

    if __debug__:
        target = sorted(datasets[0].sa.keys())
        if not np.all([sorted(ds.sa.keys()) == target for ds in datasets]):
            raise ValueError("Sample attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of features
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=0)

    stacked_sa = {}
    for attr in datasets[0].sa:
        stacked_sa[attr] = np.concatenate(
            [ds.sa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, sa=stacked_sa)

    _stack_add_equal_attributes(merged, datasets, fa, 'fa')
    _stack_add_equal_dataset_attributes(merged, datasets, a)
    return merged
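# Usage sketch for the `a`/`fa` arguments (hedged: ds1 and ds2 are
# hypothetical datasets): with the default fa='drop_nonunique' any feature
# attribute whose values disagree between the stacked datasets is silently
# dropped, while fa='update' keeps overwriting with the latest values.
merged = vstack((ds1, ds2), a='drop_nonunique', fa='update')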
def vstack(datasets, a=None):
    """Stacks datasets vertically (appending samples).

    Feature attribute collections are merged incrementally, attribute with
    identical keys overwriting previous ones in the stacked dataset.
    All datasets must have an identical set of sample attributes (matching
    keys, not values), otherwise a ValueError will be raised.
    No dataset attributes from any source dataset will be transferred into
    the stacked dataset. If all input datasets have common dataset
    attributes that are also valid for the stacked dataset, they can be
    moved into the output dataset like this::

      ds_merged = vstack((ds1, ds2, ds3))
      ds_merged.a.update(ds1.a)

    Parameters
    ----------
    datasets : tuple
      Sequence of datasets to be stacked.
    a: {'unique','drop_nonunique','uniques','all'} or True or False or None (default: None)
      Indicates which dataset attributes from datasets are stored
      in merged_dataset. If an int k, then the dataset attributes from
      datasets[k] are taken. If 'unique' then it is assumed that any
      attribute common to more than one dataset in datasets is unique;
      if not an exception is raised. If 'drop_nonunique' then as 'unique',
      except that exceptions are not raised. If 'uniques' then, for each
      attribute, any unique value across the datasets is stored in a tuple
      in merged_datasets. If 'all' then each attribute present in any
      dataset across datasets is stored as a tuple in merged_datasets;
      missing values are replaced by None. If None (the default) then no
      attributes are stored in merged_dataset. True is equivalent to
      'drop_nonunique'. False is equivalent to None.

    Returns
    -------
    AttrDataset (or respective subclass)
    """
    if not len(datasets):
        raise ValueError('concatenation of zero-length sequences is impossible')
    if not len(datasets) > 1:
        # trivial vstack
        return datasets[0]
    # fall back to numpy if it is not a dataset
    if not is_datasetlike(datasets[0]):
        return AttrDataset(np.vstack(datasets))

    if __debug__:
        target = sorted(datasets[0].sa.keys())
        if not np.all([sorted(ds.sa.keys()) == target for ds in datasets]):
            raise ValueError("Sample attributes collections of to be stacked "
                             "datasets have varying attributes.")
    # will puke if not equal number of features
    stacked_samp = np.concatenate([ds.samples for ds in datasets], axis=0)

    stacked_sa = {}
    for attr in datasets[0].sa:
        stacked_sa[attr] = np.concatenate(
            [ds.sa[attr].value for ds in datasets], axis=0)
    # create the dataset
    merged = datasets[0].__class__(stacked_samp, sa=stacked_sa)

    for ds in datasets:
        merged.fa.update(ds.fa)

    _stack_add_equal_dataset_attributes(merged, datasets, a)
    return merged
def _proc_block(self, block, ds, measure, seed=None, iblock='main'):
    """Little helper to capture the parts of the computation that can
    be parallelized

    Parameters
    ----------
    seed
      RNG seed. Should be provided e.g. in child process invocations to
      guarantee that they all seed differently to not keep generating the
      same sequencies due to reusing the same copy of numpy's RNG
    block
      Critical for generating non-colliding temp filenames in case of hdf5
      backend. Otherwise RNGs of different processes might collide in their
      temporary file names leading to problems.
    """
    if seed is not None:
        mvpa2.seed(seed)
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))
    results = []
    store_roi_feature_ids = self.ca.is_enabled('roi_feature_ids')
    store_roi_sizes = self.ca.is_enabled('roi_sizes')
    store_roi_center_ids = self.ca.is_enabled('roi_center_ids')
    assure_dataset = any([store_roi_feature_ids,
                          store_roi_sizes,
                          store_roi_center_ids])
    # put rois around all features in the dataset and compute the
    # measure within them
    for i, f in enumerate(block):
        # retrieve the feature ids of all features in the ROI from the
        # query engine
        roi_specs = self._queryengine[f]

        if __debug__ and debug_slc_:
            debug('SLC_',
                  'For %r query returned roi_specs %r' % (f, roi_specs))

        if is_datasetlike(roi_specs):
            # TODO: unittest
            assert(len(roi_specs) == 1)
            roi_fids = roi_specs.samples[0]
        else:
            roi_fids = roi_specs

        # slice the dataset
        roi = ds[:, roi_fids]

        if is_datasetlike(roi_specs):
            for n, v in roi_specs.fa.iteritems():
                roi.fa[n] = v

        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            if f in roi_fids:
                roi_seed[roi_fids.index(f)] = True
            else:
                warning("Center feature attribute id %s not found" % f)
            roi.fa[self.__add_center_fa] = roi_seed

        # compute the datameasure and store in results
        res = measure(roi)
        if assure_dataset and not is_datasetlike(res):
            res = Dataset(np.atleast_1d(res))
        if store_roi_feature_ids:
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        if store_roi_sizes:
            res.a['roi_sizes'] = roi.nfeatures
        if store_roi_center_ids:
            res.a['roi_center_ids'] = f
        results.append(res)

        if __debug__:
            debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]"
                  % (len(block), f + 1, roi.nfeatures,
                     float(i + 1) / len(block) * 100,), cr=True)

    if self.results_postproc_fx:
        if __debug__:
            debug('SLC', "Post-processing %d results in proc_block using %s"
                  % (len(results), self.results_postproc_fx))
        results = self.results_postproc_fx(results)
    if self.results_backend == 'native':
        pass  # nothing special
    elif self.results_backend == 'hdf5':
        # store results in a temporary file and return a filename
        results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                       suffix='-%s.hdf5' % iblock)
        if __debug__:
            debug('SLC', "Storing results into %s" % results_file)
        h5save(results_file, results)
        if __debug__:
            debug('SLC_', "Results stored")
        results = results_file
    else:
        raise RuntimeError("Must not reach this point")
    return results
def _proc_block(self, block, ds, measure, iblock='main'):
    """Little helper to capture the parts of the computation that can
    be parallelized

    Parameters
    ----------
    iblock
      Critical for generating non-colliding temp filenames in case of hdf5
      backend. Otherwise RNGs of different processes might collide in their
      temporary file names leading to problems.
    """
    if __debug__:
        debug_slc_ = 'SLC_' in debug.active
        debug('SLC',
              "Starting computing block for %i elements" % len(block))
    if self.ca.is_enabled('roi_sizes'):
        roi_sizes = []
    else:
        roi_sizes = None
    results = []
    # put rois around all features in the dataset and compute the
    # measure within them
    for i, f in enumerate(block):
        # retrieve the feature ids of all features in the ROI from the
        # query engine
        roi_fids = self._queryengine[f]

        if __debug__ and debug_slc_:
            debug('SLC_', 'For %r query returned ids %r' % (f, roi_fids))

        # slice the dataset
        roi = ds[:, roi_fids]

        if self.__add_center_fa:
            # add fa to indicate ROI seed if requested
            roi_seed = np.zeros(roi.nfeatures, dtype='bool')
            roi_seed[roi_fids.index(f)] = True
            roi.fa[self.__add_center_fa] = roi_seed

        # compute the datameasure and store in results
        res = measure(roi)
        if self.ca.is_enabled('roi_feature_ids'):
            if not is_datasetlike(res):
                res = Dataset(np.atleast_1d(res))
            # add roi feature ids to intermediate result dataset for later
            # aggregation
            res.a['roi_feature_ids'] = roi_fids
        results.append(res)

        # store the size of the roi dataset
        if not roi_sizes is None:
            roi_sizes.append(roi.nfeatures)

        if __debug__:
            debug('SLC', "Doing %i ROIs: %i (%i features) [%i%%]"
                  % (len(block), f + 1, roi.nfeatures,
                     float(i + 1) / len(block) * 100,), cr=True)

    if self.results_backend == 'native':
        pass  # nothing special
    elif self.results_backend == 'hdf5':
        # store results in a temporary file and return a filename
        results_file = tempfile.mktemp(prefix=self.tmp_prefix,
                                       suffix='-%s.hdf5' % iblock)
        if __debug__:
            debug('SLC', "Storing results into %s" % results_file)
        h5save(results_file, results)
        if __debug__:
            debug('SLC_', "Results stored")
        results = results_file
    else:
        raise RuntimeError("Must not reach this point")
    return results, roi_sizes