def match_variables(self, pattern, return_type='name', match_type='unix'):
    """Return columns whose names match the provided pattern.

    Parameters
    ----------
    pattern : str, list
        One or more patterns to match all variable names against;
        interpreted according to match_type.
    return_type : {'name', 'variable'}
        What to return. Must be one of:
        'name': Returns a list of names of matching variables.
        'variable': Returns a list of Variable objects whose names match.
    match_type : str
        Matching approach to use. Either 'regex' (full-blown regular
        expression matching) or 'unix' (unix-style pattern matching via
        the fnmatch module).

    Returns
    -------
    A list of all matching variables or variable names.
    """
    pattern = listify(pattern)
    results = []
    for patt in pattern:
        if match_type.lower().startswith('re'):
            patt = re.compile(patt)
            vars_ = [v for v in self.variables.keys() if patt.search(v)]
        else:
            vars_ = fnmatch.filter(list(self.variables.keys()), patt)
        if return_type.startswith('var'):
            vars_ = [self.variables[v] for v in vars_]
        results.extend(vars_)
    return results

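# Usage sketch (not from the original source): assumes a collection `coll`
# containing variables named 'RT', 'RT_log', and 'trial_type'; all names
# here are purely illustrative.
#
# coll.match_variables('RT*')                           # ['RT', 'RT_log'] (unix-style)
# coll.match_variables('^RT$', match_type='regex')      # ['RT']
# coll.match_variables('RT*', return_type='variable')   # list of Variable objects
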
def _transform(self, var, query, by=None):

    if by is None:
        by = []

    names = [var.name] + listify(by)

    # assure ordered dict so we have consistent (if not correct) operation,
    # because later we ask for name_map.values
    # pandas .query can't handle non-identifiers in variable names, so we
    # need to replace them in both the variable names and the query string.
    name_map = odict((n, re.sub('[^a-zA-Z0-9_]+', '_', n)) for n in names)
    for k, v in name_map.items():
        query = query.replace(k, v)

    data = pd.concat([self.collection[n].values for n in names],
                     axis=1, sort=True)
    # Make sure we can use integer index
    data = data.reset_index(drop=True)
    data.columns = list(name_map.values())
    data = data.query(query)

    # Truncate target variable to retained rows
    var.select_rows(data.index.values)

    return var

def merge(cls, variables, name=None, **kwargs):
    ''' Merge/concatenate a list of variables along the row axis.

    Args:
        variables (list): A list of Variables to merge.
        name (str): Optional name to assign to the output Variable. By
            default, uses the same name as the input variables.
        kwargs: Optional keyword arguments to pass onto the class-specific
            merge() call. See merge_variables docstring for details.

    Returns:
        A single BIDSVariable of the same class as the input variables.

    Notes: see merge_variables docstring for additional details.
    '''
    variables = listify(variables)
    if len(variables) == 1:
        return variables[0]

    var_names = set([v.name for v in variables])
    if len(var_names) > 1:
        raise ValueError("Columns with different names cannot be merged. "
                         "Column names provided: %s" % var_names)

    if name is None:
        name = variables[0].name

    return cls._merge(variables, name, **kwargs)

def _regex_replace_variables(self, args):
    """For each argument named in args, interpret the values set in the
    argument as regex patterns to potentially be replaced with variables
    that match the pattern.
    """
    args = listify(args)

    if 'variables' in args:
        args.remove('variables')
        variables = True
    else:
        variables = False

    # Ensure all keyword arguments user wants to scan are valid
    missing = set(args) - set(self.kwargs.keys())
    if missing:
        raise ValueError("Arguments '%s' specified for regex-based "
                         "variable name replacement, but were not found "
                         "among keyword arguments." % missing)

    def _replace_arg_values(names):
        # listify() first so a single string isn't iterated char-by-char
        variables = listify(names)
        variables = [self.collection.match_variables(c) for c in variables]
        variables = itertools.chain(*variables)
        return list(set(variables))

    # 'variables' is stored separately, so handle it separately
    if variables:
        self.variables = _replace_arg_values(self.variables)

    for arg in args:
        self.kwargs[arg] = _replace_arg_values(self.kwargs[arg])

def merge_collections(collections, force_dense=False, sampling_rate='auto'):
    ''' Merge two or more collections at the same level of analysis.

    Args:
        collections (list): List of Collections to merge.
        sampling_rate (int, str): Sampling rate to use if it becomes
            necessary to resample DenseRunVariables. Either an integer or
            'auto' (see merge_variables docstring for further explanation).

    Returns:
        A BIDSVariableCollection or BIDSRunVariableCollection, depending
        on the type of the input collections.
    '''
    if len(listify(collections)) == 1:
        return collections

    levels = set([c.level for c in collections])
    if len(levels) > 1:
        raise ValueError("At the moment, it's only possible to merge "
                         "Collections at the same level of analysis. You "
                         "passed collections at levels: %s." % levels)

    variables = list(chain(*[c.variables.values() for c in collections]))
    cls = collections[0].__class__

    variables = cls.merge_variables(variables, sampling_rate=sampling_rate)

    if isinstance(collections[0], BIDSRunVariableCollection):
        return cls(variables, sampling_rate)

    return cls(variables)

def _transform(self, var, query, by=None):

    if by is None:
        by = []

    names = [var.name] + listify(by)

    # pandas .query can't handle non-identifiers in variable names, so we
    # need to replace them in both the variable names and the query string.
    name_map = {n: re.sub('[^a-zA-Z0-9_]+', '_', n) for n in names}
    for k, v in name_map.items():
        query = query.replace(k, v)

    data = pd.concat([self.collection[c].values for c in names],
                     axis=1, sort=True)
    # Make sure we can use integer index
    data = data.reset_index(drop=True)
    data.columns = list(name_map.values())
    data = data.query(query)

    # Truncate target variable to retained rows
    var.select_rows(data.index.values)

    return var

def _transform(self, var, by, drop_orig=True):

    if not isinstance(var, SimpleVariable):
        self._densify_variables()

    # Set up all the splitting variables as a DF. Note that variables in
    # 'by' can be either regular variables, or entities in the index--so
    # we need to check both places.
    all_variables = self._variables
    by_variables = [
        all_variables[v].values if v in all_variables
        else var.index[v].reset_index(drop=True)
        for v in listify(by)
    ]
    group_data = pd.concat(by_variables, axis=1)
    group_data.columns = listify(by)

    # For sparse data, we need to set up a 1D grouper
    if isinstance(var, SimpleVariable):
        # Create single grouping variable by combining all 'by' variables
        if group_data.shape[1] == 1:
            group_labels = group_data.iloc[:, 0].values
        else:
            group_rows = group_data.astype(str).values.tolist()
            group_labels = ['_'.join(r) for r in group_rows]

        result = var.split(group_labels)

    # For dense data, use patsy to create design matrix, then multiply
    # it by target variable
    else:
        group_data = group_data.astype(str)
        formula = '0+' + '*'.join(listify(by))
        dm = dmatrix(formula, data=group_data, return_type='dataframe')
        result = var.split(dm)

    if drop_orig:
        self.collection.variables.pop(var.name)

    return result

def merge_collections(collections, sampling_rate='highest', output_level=None):
    """Merge two or more collections at the same level of analysis.

    Parameters
    ----------
    collections : list
        List of Collections to merge.
    sampling_rate : int or str
        Sampling rate to use if it becomes necessary to resample
        DenseRunVariables. Either an integer or 'highest' (see
        merge_variables docstring for further explanation).
    output_level : str, optional
        Assign a new level (e.g., 'run', 'subject', etc.) to the merged
        collection. If None, the current level is retained.

    Returns
    -------
    BIDSVariableCollection or BIDSRunVariableCollection
        Result type depends on the type of the input collections.
    """
    collections = listify(collections)
    if len(collections) == 1:
        return collections[0]

    levels = set([c.level for c in collections])
    if len(levels) > 1:
        raise ValueError("At the moment, it's only possible to merge "
                         "Collections at the same level of analysis. You "
                         "passed collections at levels: %s." % levels)

    variables = list(chain(*[c.variables.values() for c in collections]))
    cls = collections[0].__class__

    variables = cls.merge_variables(variables, sampling_rate=sampling_rate)

    if isinstance(collections[0], BIDSRunVariableCollection):
        # 'auto' was renamed to 'highest' circa 0.10, but check for both
        if sampling_rate in {'auto', 'highest'}:
            rates = [
                var.sampling_rate for var in variables
                if isinstance(var, DenseRunVariable)
            ]
            sampling_rate = rates[0] if rates else None
        return cls(variables, sampling_rate)

    # For non-run collections, we may need to set a different output level
    coll = cls(variables)
    if output_level is not None:
        coll.level = output_level
    return coll

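# Usage sketch (illustrative, not from the original source): `coll_run1` and
# `coll_run2` stand in for two run-level collections loaded elsewhere for the
# same subject.
#
# merged = merge_collections([coll_run1, coll_run2], sampling_rate='highest')
# sorted(merged.variables)   # union of the variable names from both runs
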
def _transform(self, var, by):

    if not isinstance(var, SimpleVariable):
        self._densify_variables()

    # Set up all the splitting variables as a DF. Note that variables in
    # 'by' can be either regular variables, or entities in the index--so
    # we need to check both places.
    all_variables = self._variables
    by_variables = [all_variables[v].values if v in all_variables
                    else var.index[v].reset_index(drop=True)
                    for v in listify(by)]
    group_data = pd.concat(by_variables, axis=1, sort=True)
    group_data.columns = listify(by)

    # Use patsy to create splitting design matrix
    group_data = group_data.astype(str)
    formula = '0+' + ':'.join(listify(by))
    dm = dmatrix(formula, data=group_data, return_type='dataframe')
    dm.columns = [col.replace(':', '.') for col in dm.columns]

    return var.split(dm)

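# Standalone sketch of the patsy step used above: a '0+' formula (no
# intercept) turns each level of the grouping variable into its own 0/1
# indicator column; with multiple grouping variables, ':' crosses them.
# All names below are illustrative.
import pandas as pd
from patsy import dmatrix

group_data = pd.DataFrame({'condition': ['a', 'b', 'a', 'b']}).astype(str)
dm = dmatrix('0+condition', data=group_data, return_type='dataframe')
# dm now has one indicator column per level: 'condition[a]' and 'condition[b]'
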
def _transform(self, var, by, drop_orig=True):

    if not isinstance(var, SimpleVariable):
        self._densify_variables()

    # Set up all the splitting variables as a DF. Note that variables in
    # 'by' can be either regular variables, or entities in the index--so
    # we need to check both places.
    all_variables = self._variables
    by_variables = [all_variables[v].values if v in all_variables
                    else var.index[v].reset_index(drop=True)
                    for v in listify(by)]
    group_data = pd.concat(by_variables, axis=1, sort=True)
    group_data.columns = listify(by)

    # For sparse data, we need to set up a 1D grouper
    if isinstance(var, SimpleVariable):
        # Create single grouping variable by combining all 'by' variables
        if group_data.shape[1] == 1:
            group_labels = group_data.iloc[:, 0].values
        else:
            group_rows = group_data.astype(str).values.tolist()
            group_labels = ['_'.join(r) for r in group_rows]

        result = var.split(group_labels)

    # For dense data, use patsy to create design matrix, then multiply
    # it by target variable
    else:
        group_data = group_data.astype(str)
        formula = '0+' + '*'.join(listify(by))
        dm = dmatrix(formula, data=group_data, return_type='dataframe')
        result = var.split(dm)

    if drop_orig:
        self.collection.variables.pop(var.name)

    return result

def _clone_columns(self):
    ''' Deep copy all columns the transformation touches. This prevents us
    from unnecessarily overwriting existing columns. '''

    # Always clone the target columns
    self._columns = {c: self.collection[c].clone() for c in self.cols}

    if not self._columns_used:
        return

    # Loop over argument names and clone all column names in each one
    for var in self._columns_used:
        for c in listify(self.kwargs.get(var, [])):
            self._columns[c] = deepcopy(self.collection[c])

def _transform(self, col, other):

    other = listify(other)

    # Set up X matrix and slice into it based on target column indices
    X = np.array([self._columns[c].values.values.squeeze()
                  for c in other]).T
    X = X[col.index, :]
    assert len(X) == len(col)
    y = col.values
    _aX = np.c_[np.ones(len(y)), X]
    coefs, resids, rank, s = np.linalg.lstsq(_aX, y)
    result = pd.DataFrame(y - X.dot(coefs[1:]), index=col.index)
    return result

def _transform(self, var, other):

    other = listify(other)

    # Set up X matrix and slice into it based on target variable indices
    X = np.array([self._variables[c].values.values.squeeze()
                  for c in other]).T
    X = X[var.index, :]
    assert len(X) == len(var)
    y = var.values
    _aX = np.c_[np.ones(len(y)), X]
    coefs, resids, rank, s = np.linalg.lstsq(_aX, y, rcond=None)
    result = pd.DataFrame(y - X.dot(coefs[1:]), index=var.index)
    return result

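# Self-contained sketch of the residualization step above on plain NumPy
# arrays (all names are illustrative): fit y on an intercept plus the
# confound matrix X, then subtract only the part explained by X.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))                 # two confound regressors
y = 2.0 * X[:, 0] + rng.normal(size=100)      # signal contaminated by X[:, 0]

aX = np.c_[np.ones(len(y)), X]                # prepend intercept column
coefs, *_ = np.linalg.lstsq(aX, y, rcond=None)
residuals = y - X.dot(coefs[1:])              # intercept contribution is kept
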
def fit(self, spatialimage):
    r"""
    Generate the interpolation matrix (and the VSM with it).

    Implements Eq. :math:`\eqref{eq:1}`, interpolating :math:`f(\mathbf{s})`
    for all voxels in the target-image's extent.

    Returns
    -------
    updated : :obj:`bool`
        ``True`` if the internal field representation was fit, ``False``
        if cache was valid and will be reused.

    """
    # Calculate the physical coordinates of target grid
    if isinstance(spatialimage, (str, bytes, Path)):
        spatialimage = nb.load(spatialimage)

    if self.shifts is not None:
        newaff = spatialimage.affine
        newshape = spatialimage.shape

        if np.all(newshape == self.shifts.shape) and np.allclose(
                newaff, self.shifts.affine):
            return False

    weights = []
    coeffs = []

    # Generate tensor-product B-Spline weights
    for level in listify(self.coeffs):
        self.xfm.reference = spatialimage
        moved_cs = level.__class__(
            level.dataobj, self.xfm.matrix @ level.affine, level.header)
        wmat = grid_bspline_weights(spatialimage, moved_cs)
        weights.append(wmat)
        coeffs.append(level.get_fdata(dtype="float32").reshape(-1))

    # Interpolate the VSM (voxel-shift map)
    vsm = np.zeros(spatialimage.shape[:3], dtype="float32")
    vsm = (np.squeeze(np.hstack(coeffs).T) @ sparse_vstack(weights)).reshape(
        vsm.shape)

    # Cache
    self.shifts = nb.Nifti1Image(vsm, spatialimage.affine, None)
    self.shifts.header.set_intent("estimate", name="Voxel shift")
    self.shifts.header.set_xyzt_units("mm")

    return True

def load_variables(layout, levels=None, merge=False, target=None, **kwargs):
    ''' A convenience wrapper for one or more load_*_variables() calls.

    Args:
        layout (BIDSLayout): BIDSLayout containing variable files.
        levels (str, list): Level or list of levels to load variables for.
            Valid values are 'time', 'run', 'session', and 'subject'.
        merge (bool): If True, the requested levels are merged into a
            single BIDSVariableCollection before returning. Ignored if
            only one level is requested.
        target (str): If merge=True, target indicates the level that
            defines the granularity of the result. See merge_collections
            for further explanation.
        kwargs: Optional keyword arguments to pass onto the individual
            load_*_variables() calls.

    Returns:
        If only a single level is passed, or merge is True, a single
        BIDSVariableCollection. If a list of levels is passed and merge is
        False, a dict is returned, with level names in keys and
        BIDSVariableCollections in values.
    '''
    ALL_LEVELS = ['time', 'run', 'session', 'subject']

    if levels is None:
        levels = ALL_LEVELS

    _levels = listify(levels)

    func_map = {
        'time': load_event_variables,
        'run': load_run_variables,
        'session': load_session_variables,
        'subject': load_subject_variables
    }

    bad_levels = set(_levels) - set(ALL_LEVELS)
    if bad_levels:
        raise ValueError("Invalid level names: %s" % bad_levels)

    collections = [func_map[l](layout, **kwargs) for l in _levels]

    if len(collections) == 1:
        return collections[0]

    # if merge:
    #     return merge_collections(collections, target=target)

    return dict(zip(_levels, collections))

def _densify_variables(self):

    variables = []

    for var in self._densify:

        if var == 'variables':
            variables.extend(self.variables)
        else:
            variables.extend(listify(self.kwargs.get(var, [])))

    for v in variables:
        var = self._variables[v]
        if isinstance(var, SparseRunVariable):
            sr = self.collection.sampling_rate
            self._variables[v] = var.to_dense(sr)

def _replace_arg_values(values):
    is_iter = isinstance(values, (list, tuple))
    values = listify(values)
    result = []
    # Only try to match strings containing a relevant special character
    for v in values:
        if isinstance(v, str) and re.search(r'[\*\?\[\]]', v):
            result.append(self.collection.match_variables(v))
        else:
            result.append([v])

    result = list(itertools.chain(*result))

    # Don't return a list unless we have to
    if is_iter or len(result) > 1:
        return result
    return result[0]

def _densify_columns(self):

    from bids.analysis.variables import SparseEventColumn

    cols = []

    for var in self._densify:

        if var == 'cols':
            cols.extend(self.cols)
        else:
            cols.extend(listify(self.kwargs.get(var, [])))

    for c in cols:
        col = self._columns[c]
        if isinstance(col, SparseEventColumn):
            self._columns[c] = col.to_dense()

def merge_collections(collections, force_dense=False, sampling_rate='auto'):
    """Merge two or more collections at the same level of analysis.

    Parameters
    ----------
    collections : list
        List of Collections to merge.
    sampling_rate : int or str
        Sampling rate to use if it becomes necessary to resample
        DenseRunVariables. Either an integer or 'auto' (see
        merge_variables docstring for further explanation).

    Returns
    -------
    BIDSVariableCollection or BIDSRunVariableCollection
        Result type depends on the type of the input collections.
    """
    if len(listify(collections)) == 1:
        return collections

    levels = set([c.level for c in collections])
    if len(levels) > 1:
        raise ValueError("At the moment, it's only possible to merge "
                         "Collections at the same level of analysis. You "
                         "passed collections at levels: %s." % levels)

    variables = list(chain(*[c.variables.values() for c in collections]))
    cls = collections[0].__class__

    variables = cls.merge_variables(variables, sampling_rate=sampling_rate)

    if isinstance(collections[0], BIDSRunVariableCollection):
        if sampling_rate == 'auto':
            rates = [
                var.sampling_rate for var in variables
                if isinstance(var, DenseRunVariable)
            ]
            sampling_rate = rates[0] if rates else None
        return cls(variables, sampling_rate)

    return cls(variables)

def _check_categorical_columns(self):
    ''' Ensure that all columns are numeric unless they are explicitly
    allowed to be categorical. '''

    # Collect column names to pass through
    pass_thru = []
    if self._allow_categorical is not None:
        for arg in self._allow_categorical:
            keys = self.cols if arg == 'cols' else self.kwargs.get(arg, [])
            pass_thru.extend(listify(keys))
    pass_thru = list(set(pass_thru))

    for name, col in self._columns.items():
        if name not in pass_thru:
            if col.values.values.dtype.kind not in 'bifc':
                msg = ("The %s transformation does not allow column '%s' "
                       "to be categorical. Either pass a different column, "
                       "or explicitly convert to a set of binary "
                       "indicators via the 'factor' transformation.")
                raise ValueError(msg % (self.__class__.__name__, name))

def _clone_variables(self):
    ''' Deep copy all variables the transformation touches. This prevents
    us from unnecessarily overwriting existing variables. '''

    # Always clone the target variables
    self._variables = {v: self.collection[v].clone()
                       for v in self.variables}

    if not self._variables_used:
        return

    # Loop over argument names and clone all variable names in each one
    for var in self._variables_used:
        for v in listify(self.kwargs.get(var, [])):
            # Kludge: we need to allow entity variables to be passed as
            # names even though they don't exist as separate variables
            if (v not in self.collection.variables and
                    v in ['task', 'run', 'session', 'subject']):
                continue
            self._variables[v] = deepcopy(self.collection[v])

def merge_collections(collections, force_dense=False, sampling_rate='auto'):
    ''' Merge two or more collections at the same level of analysis.

    Args:
        collections (list): List of Collections to merge.
        sampling_rate (int, str): Sampling rate to use if it becomes
            necessary to resample DenseRunVariables. Either an integer or
            'auto' (see merge_variables docstring for further explanation).

    Returns:
        A BIDSVariableCollection or BIDSRunVariableCollection, depending
        on the type of the input collections.
    '''
    if len(listify(collections)) == 1:
        return collections

    levels = set([c.level for c in collections])
    if len(levels) > 1:
        raise ValueError("At the moment, it's only possible to merge "
                         "Collections at the same level of analysis. You "
                         "passed collections at levels: %s." % levels)

    variables = list(chain(*[c.variables.values() for c in collections]))
    cls = collections[0].__class__

    variables = cls.merge_variables(variables, sampling_rate=sampling_rate)

    if isinstance(collections[0], BIDSRunVariableCollection):
        if sampling_rate == 'auto':
            rates = [var.sampling_rate for var in variables
                     if isinstance(var, DenseRunVariable)]
            sampling_rate = rates[0] if rates else None
        return cls(variables, sampling_rate)

    return cls(variables)

def _load_tsv_variables(layout, type_, dataset=None, columns=None,
                        prepend_type=False, **selectors):
    ''' Reads variables from scans.tsv, sessions.tsv, and participants.tsv.

    Args:
        layout (BIDSLayout): The BIDSLayout to use.
        type_ (str): The type of file to read from. Must be one of 'scans',
            'sessions', or 'participants'.
        dataset (NodeIndex): A BIDS NodeIndex container. If None, a new one
            is initialized.
        columns (list): Optional list of names specifying which columns in
            the files to return. If None, all columns are returned.
        prepend_type (bool): If True, variable names are prepended with the
            type name (e.g., 'age' becomes 'participants.age').
        selectors (dict): Optional keyword arguments passed onto the
            BIDSLayout instance's get() method; can be used to constrain
            which data are loaded.

    Returns:
        A NodeIndex instance.
    '''

    # Sanitize the selectors: only keep entities at current level or above
    remap = {'scans': 'run', 'sessions': 'session',
             'participants': 'subject'}
    level = remap[type_]
    valid_entities = BASE_ENTITIES[:BASE_ENTITIES.index(level)]
    layout_kwargs = {k: v for k, v in selectors.items()
                     if k in valid_entities}

    if dataset is None:
        dataset = NodeIndex()

    files = layout.get(extensions='.tsv', return_type='file', type=type_,
                       **layout_kwargs)

    for f in files:

        f = layout.files[f]
        _data = pd.read_table(f.path, sep='\t')

        # Entities can be defined either within the first column of the .tsv
        # file (for entities that vary by row), or from the full file path
        # (for entities constant over all rows in the file). We extract both
        # and store them in the main DataFrame alongside other variables (as
        # they'll be extracted when the Column is initialized anyway).
        for ent_name, ent_val in f.entities.items():
            if ent_name in BASE_ENTITIES:
                _data[ent_name] = ent_val

        # Handling is a bit more convoluted for scans.tsv, because the first
        # column contains the run filename, which we also need to parse.
        if type_ == 'scans':
            image = _data['filename']
            _data = _data.drop('filename', axis=1)
            dn = f.dirname
            paths = [join(dn, p) for p in image.values]
            ent_recs = [layout.files[p].entities for p in paths
                        if p in layout.files]
            ent_cols = pd.DataFrame.from_records(ent_recs)
            _data = pd.concat([_data, ent_cols], axis=1)
            # It's possible to end up with duplicate entity columns this way
            _data = _data.T.drop_duplicates().T

        # The BIDS spec requires ID columns to be named 'session_id',
        # 'run_id', etc., and IDs begin with entity prefixes (e.g.,
        # 'sub-01'). To ensure consistent internal handling, we strip these
        # suffixes and prefixes.
        elif type_ == 'sessions':
            _data = _data.rename(columns={'session_id': 'session'})
            _data['session'] = _data['session'].str.replace('ses-', '')

        elif type_ == 'participants':
            _data = _data.rename(columns={'participant_id': 'subject'})
            _data['subject'] = _data['subject'].str.replace('sub-', '')

        # Filter rows on all selectors
        comm_cols = list(set(_data.columns) & set(selectors.keys()))
        for col in comm_cols:
            vals = listify(selectors.get(col))
            _data = _data.query('%s in @vals' % col)

        level = {'scans': 'session', 'sessions': 'subject',
                 'participants': 'dataset'}[type_]
        node = dataset.get_or_create_node(level, f.entities)

        ent_cols = list(set(ALL_ENTITIES) & set(_data.columns))
        amp_cols = list(set(_data.columns) - set(ent_cols))

        if columns is not None:
            amp_cols = list(set(amp_cols) & set(columns))

        for col_name in amp_cols:

            # Rename columns: values must be in 'amplitude'
            df = _data.loc[:, [col_name] + ent_cols]
            df.columns = ['amplitude'] + ent_cols

            if prepend_type:
                col_name = '%s.%s' % (type_, col_name)

            node.add_variable(SimpleVariable(col_name, df, type_))

    return dataset

def load_variables(layout, types=None, levels=None, skip_empty=True,
                   **kwargs):
    ''' A convenience wrapper for one or more load_*_variables() calls.

    Args:
        layout (BIDSLayout): BIDSLayout containing variable files.
        types (str, list): Types of variables to retrieve. All valid values
            reflect the filename stipulated in the BIDS spec for each kind
            of variable. Valid values include: 'events', 'physio', 'stim',
            'scans', 'participants', 'sessions', and 'confounds'.
        levels (str, list): Optional level(s) of variables to load. Valid
            values are 'run', 'session', 'subject', or 'dataset'. This is
            simply a shorthand way to specify types--e.g., 'run' will be
            converted to types=['events', 'physio', 'stim', 'confounds'].
        skip_empty (bool): Whether or not to skip empty Variables (i.e.,
            where there are no rows/records in a file after applying any
            filtering operations like dropping NaNs).
        kwargs: Optional keyword arguments to pass onto the individual
            load_*_variables() calls.

    Returns:
        A NodeIndex instance.

    Example:
        >>> load_variables(layout, ['events', 'physio'], subject='01')
        # returns all variables stored in _events.tsv and _physio.tsv.gz
        # files for runs that belong to subject with id '01'.
    '''

    TYPES = ['events', 'physio', 'stim', 'scans', 'participants',
             'sessions', 'confounds']

    types = listify(types)

    if types is None:
        if levels is not None:
            types = []
            lev_map = {
                'run': ['events', 'physio', 'stim', 'confounds'],
                'session': ['scans'],
                'subject': ['sessions'],
                'dataset': ['participants']
            }
            [types.extend(lev_map[l]) for l in listify(levels)]
        else:
            types = TYPES

    bad_types = set(types) - set(TYPES)
    if bad_types:
        raise ValueError("Invalid variable types: %s" % bad_types)

    dataset = NodeIndex()

    run_types = list({'events', 'physio', 'stim', 'confounds'} - set(types))
    type_flags = {t: False for t in run_types}
    if len(type_flags) < 4:
        _kwargs = kwargs.copy()
        _kwargs.update(type_flags)
        dataset = _load_time_variables(layout, dataset, **_kwargs)

    for t in ({'scans', 'sessions', 'participants'} & set(types)):
        dataset = _load_tsv_variables(layout, t, dataset, **kwargs)

    return dataset

def load_variables(layout, types=None, levels=None, skip_empty=True,
                   dataset=None, scope='all', **kwargs):
    """A convenience wrapper for one or more load_*_variables() calls.

    Parameters
    ----------
    layout : :obj:`bids.layout.BIDSLayout`
        BIDSLayout containing variable files.
    types : str or list
        Types of variables to retrieve. All valid values reflect the
        filename stipulated in the BIDS spec for each kind of variable.
        Valid values include: 'events', 'physio', 'stim', 'scans',
        'participants', 'sessions', and 'regressors'.
    levels : str or list
        Optional level(s) of variables to load. Valid values are 'run',
        'session', 'subject', or 'dataset'. This is simply a shorthand way
        to specify types--e.g., 'run' will be converted to
        types=['events', 'physio', 'stim', 'regressors'].
    skip_empty : bool
        Whether or not to skip empty Variables (i.e., where there are no
        rows/records in a file after applying any filtering operations
        like dropping NaNs).
    dataset : NodeIndex
        An existing NodeIndex container to store the loaded data in. Can
        be used to iteratively construct a dataset that contains otherwise
        heterogeneous sets of variables. If None, a new NodeIndex is used.
    scope : str or list
        The scope of the space to search for variables. See docstring for
        BIDSLayout for details and valid predefined values.
    kwargs : dict
        Optional keyword arguments to pass onto the individual
        load_*_variables() calls.

    Returns
    -------
    A NodeIndex instance.

    Examples
    --------
    >>> load_variables(layout, ['events', 'physio'], subject='01')  # doctest: +SKIP
    # returns all variables stored in _events.tsv and _physio.tsv.gz files
    # for runs that belong to subject with id '01'.
    """

    TYPES = [
        'events', 'physio', 'stim', 'scans', 'participants', 'sessions',
        'regressors'
    ]

    types = listify(types)

    if types is None:
        if levels is not None:
            types = []
            lev_map = {
                'run': ['events', 'physio', 'stim', 'regressors'],
                'session': ['scans'],
                'subject': ['sessions'],
                'dataset': ['participants']
            }
            [types.extend(lev_map[l.lower()]) for l in listify(levels)]
        else:
            types = TYPES

    bad_types = set(types) - set(TYPES)
    if bad_types:
        raise ValueError("Invalid variable types: %s" % bad_types)

    dataset = dataset or NodeIndex()

    run_types = list({'events', 'physio', 'stim', 'regressors'} - set(types))
    type_flags = {t: False for t in run_types}
    if len(type_flags) < 4:
        _kwargs = kwargs.copy()
        _kwargs.update(type_flags)
        dataset = _load_time_variables(layout, dataset, scope=scope,
                                       **_kwargs)

    for t in ({'scans', 'sessions', 'participants'} & set(types)):
        kwargs.pop('suffix', None)  # suffix is always one of values above
        dataset = _load_tsv_variables(layout, t, dataset, scope=scope,
                                      **kwargs)

    return dataset

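# Usage sketch (illustrative, not from the original source): `layout` stands
# in for an existing BIDSLayout; entity filters such as subject='01' are
# forwarded to the underlying load_*_variables() calls.
#
# index = load_variables(layout, types=['events', 'participants'], subject='01')
# run_nodes = index.get_nodes(level='run')   # one node per matching run
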
class FieldmapEstimation:
    """
    Represent fieldmap estimation strategies.

    This class provides a consistent interface to all types of fieldmap
    estimation strategies. The actual type of method for estimation is
    inferred from the ``sources`` input, and collects all the available
    metadata.

    """

    sources = attr.ib(
        default=None,
        converter=lambda v: [
            FieldmapFile(f) if not isinstance(f, FieldmapFile) else f
            for f in listify(v)
        ],
        repr=lambda v: f"<{len(v)} files>",
    )
    """File path or list of paths indicating the source data to estimate a fieldmap."""

    method = attr.ib(init=False, default=EstimatorType.UNKNOWN,
                     on_setattr=_type_setter)
    """Flag indicating the estimator type inferred from the input sources."""

    bids_id = attr.ib(default=None, kw_only=True, type=str,
                      on_setattr=_id_setter)
    """The unique ``B0FieldIdentifier`` field of this fieldmap."""

    _wf = attr.ib(init=False, default=None, repr=False)
    """Internal pointer to a workflow."""

    def __attrs_post_init__(self):
        """Determine the intended fieldmap estimation type and check for data completeness."""
        suffix_list = [f.suffix for f in self.sources]
        suffix_set = set(suffix_list)

        # Fieldmap option 1: actual field-mapping sequences
        fmap_types = suffix_set.intersection(
            ("fieldmap", "phasediff", "phase1", "phase2"))
        if len(fmap_types) > 1 and fmap_types - set(("phase1", "phase2")):
            raise TypeError(
                f"Incompatible suffixes found: <{','.join(fmap_types)}>.")

        if fmap_types:
            sources = sorted(
                str(f.path)
                for f in self.sources
                if f.suffix in ("fieldmap", "phasediff", "phase1", "phase2")
            )

            # Automagically add the corresponding phase2 file if missing as argument
            missing_phases = ("phase1" not in fmap_types,
                              "phase2" not in fmap_types)
            if sum(missing_phases) == 1:
                mis_ph = "phase1" if missing_phases[0] else "phase2"
                hit_ph = "phase2" if missing_phases[0] else "phase1"
                new_source = sources[0].replace(hit_ph, mis_ph)
                self.sources.append(FieldmapFile(new_source))
                sources.insert(int(missing_phases[1]), new_source)

            # Set method, this cannot be undone
            self.method = MODALITIES[fmap_types.pop()]

            # Determine the name of the corresponding (first) magnitude file(s)
            magnitude = f"magnitude{'' if self.method == EstimatorType.MAPPED else '1'}"
            if magnitude not in suffix_set:
                try:
                    self.sources.append(FieldmapFile(
                        sources[0]
                        .replace("fieldmap", "magnitude")
                        .replace("diff", "1")
                        .replace("phase", "magnitude")
                    ))
                except Exception:
                    raise ValueError(
                        "A fieldmap or phase-difference estimation type was found, "
                        f"but an anatomical reference ({magnitude} file) is missing."
                    )

            # Check presence and try to find (if necessary) the second magnitude file
            if (self.method == EstimatorType.PHASEDIFF
                    and "magnitude2" not in suffix_set):
                try:
                    self.sources.append(FieldmapFile(
                        sources[-1].replace("diff", "2").replace("phase", "magnitude")
                    ))
                except Exception:
                    if "phase2" in suffix_set:
                        raise ValueError(
                            "A phase-difference estimation (phase1/2) type was found, "
                            "but an anatomical reference (magnitude2 file) is missing."
                        )
        # Fieldmap option 2: PEPOLAR (and fieldmap-less or ANAT)
        # IMPORTANT NOTE: fieldmap-less approaches can be considered PEPOLAR with RO = 0.0s
        pepolar_types = suffix_set.intersection(("bold", "dwi", "epi", "sbref"))
        anat_types = suffix_set.intersection(("T1w", "T2w"))
        _pepolar_estimation = (
            len([f for f in suffix_list
                 if f in ("bold", "dwi", "epi", "sbref")]) > 1
        )

        if _pepolar_estimation and not anat_types:
            self.method = MODALITIES[pepolar_types.pop()]
            _pe = set(f.metadata["PhaseEncodingDirection"] for f in self.sources)
            if len(_pe) == 1:
                raise ValueError(
                    f"Only one phase-encoding direction <{_pe.pop()}> found across sources."
                )
        elif anat_types:
            self.method = MODALITIES[anat_types.pop()]

            if not pepolar_types:
                raise ValueError(
                    "Only anatomical sources were found, cannot estimate fieldmap.")

        if self.method == EstimatorType.UNKNOWN:
            # No method has been identified -> fail.
            raise ValueError("Insufficient sources to estimate a fieldmap.")

        intents_meta = set(
            el
            for f in self.sources
            for el in listify(f.metadata.get("IntendedFor") or [])
        )

        # Register this estimation method
        if not self.bids_id:
            # If not manually set, try to get it from BIDS metadata
            bids_ids = set([
                f.metadata.get("B0FieldIdentifier")
                for f in self.sources
                if f.metadata.get("B0FieldIdentifier")
            ])
            if len(bids_ids) > 1:
                raise ValueError(
                    f"Multiple ``B0FieldIdentifier`` set: <{', '.join(bids_ids)}>")
            elif bids_ids:
                object.__setattr__(self, "bids_id", bids_ids.pop())
            else:
                bids_id = _estimators.add(self.paths())
                object.__setattr__(self, "bids_id", bids_id)
                for intent_file in intents_meta:
                    _intents[intent_file].add(bids_id)
                return

        _estimators[self.bids_id] = self.paths()
        for intent_file in intents_meta:
            _intents[intent_file].add(self.bids_id)

    def paths(self):
        """Return a tuple of paths that are sorted."""
        return tuple(sorted(str(f.path) for f in self.sources))

    def get_workflow(self, **kwargs):
        """Build the estimation workflow corresponding to this instance."""
        if self._wf is not None:
            return self._wf

        # Override workflow name
        kwargs["name"] = f"wf_{self.bids_id}"

        if self.method in (EstimatorType.MAPPED, EstimatorType.PHASEDIFF):
            from .workflows.fit.fieldmap import init_fmap_wf

            kwargs["mode"] = str(self.method).rpartition(".")[-1].lower()
            self._wf = init_fmap_wf(**kwargs)
            self._wf.inputs.inputnode.magnitude = [
                str(f.path)
                for f in self.sources
                if f.suffix.startswith("magnitude")
            ]
            self._wf.inputs.inputnode.fieldmap = [
                (str(f.path), f.metadata)
                for f in self.sources
                if f.suffix in ("fieldmap", "phasediff", "phase2", "phase1")
            ]
        elif self.method == EstimatorType.PEPOLAR:
            from .workflows.fit.pepolar import init_topup_wf

            self._wf = init_topup_wf(**kwargs)
        elif self.method == EstimatorType.ANAT:
            from .workflows.fit.syn import init_syn_sdc_wf

            self._wf = init_syn_sdc_wf(**kwargs)

        return self._wf

def _align_variables(self, variables):
    """Checks whether the specified variables have aligned indexes. This
    implies either that all variables are dense, or that all variables are
    sparse and have exactly the same onsets and durations. If variables are
    not aligned and force = True, all variables will be forced to dense
    format in order to ensure alignment.
    """
    if self._aligned_required is None or self._aligned_required == 'none':
        return

    def _align(variables):
        # If any variable is dense, all variables must be dense
        sparse = [c for c in variables if isinstance(c, SparseRunVariable)]
        if len(sparse) < len(variables):
            if sparse:
                msg = ("Found a mix of dense and sparse variables. May "
                       "cause problems for some transformations.")
                warnings.warn(msg)
        # If all are sparse, durations, onsets, and index must match
        # perfectly for all
        else:

            def get_col_data(col):
                return np.c_[col.values.index, col.duration, col.onset]

            def compare_variables(a, b):
                return len(a) == len(b) and np.allclose(a, b)

            # Compare 1st col with each of the others
            fc = get_col_data(variables[0])
            if not all([compare_variables(fc, get_col_data(c))
                        for c in variables[1:]]):
                if self._aligned_required == 'force_dense':
                    msg = ("Forcing all sparse variables to dense in "
                           "order to ensure proper alignment.")
                    sr = self.collection.sampling_rate
                    variables = [c.to_dense(sr) for c in variables]
                    warnings.warn(msg)
                else:
                    raise ValueError(
                        "Misaligned sparse variables found. "
                        "To force variables into alignment by densifying, "
                        "set dense=True in the Transformation arguments.")

    _aligned_variables = True if not self._aligned_variables \
        else self._aligned_variables
    _aligned_variables = [
        listify(self.kwargs[v])
        for v in listify(_aligned_variables) if v in self.kwargs
    ]
    _aligned_variables = list(itertools.chain(*_aligned_variables))
    _aligned_variables = [
        self.collection[c] for c in _aligned_variables if c
    ]

    if _aligned_variables and self._loopable:
        for c in variables:
            # TODO: should clone all variables in align_variables before
            # alignment to prevent conversion to dense in any given
            # iteration having side effects. This could be an issue if,
            # e.g., some vars in 'variables' are dense and some are sparse.
            _align([c] + _aligned_variables)
    else:
        _align(listify(variables) + _aligned_variables)

def _run_interface(self, runtime):
    # Ready the output folder
    base_directory = runtime.cwd
    if isdefined(self.inputs.base_directory):
        base_directory = self.inputs.base_directory
    base_directory = Path(base_directory).absolute()
    out_path = base_directory / self.out_path_base
    out_path.mkdir(exist_ok=True, parents=True)

    # Ensure we have a list
    in_file = listify(self.inputs.in_file)

    # Read in the dictionary of metadata
    if isdefined(self.inputs.meta_dict):
        meta = self.inputs.meta_dict
        # inputs passed in construction take priority
        meta.update(self._metadata)
        self._metadata = meta

    # Initialize entities with those from the source file.
    in_entities = [
        parse_file_entities(str(relative_to_root(source_file)))
        for source_file in self.inputs.source_file
    ]
    out_entities = {
        k: v for k, v in in_entities[0].items()
        if all(ent.get(k) == v for ent in in_entities[1:])
    }
    for drop_entity in listify(self.inputs.dismiss_entities or []):
        out_entities.pop(drop_entity, None)

    # Override extension with that of the input file(s)
    out_entities["extension"] = [
        # _splitext does not accept .surf.gii (for instance)
        "".join(Path(orig_file).suffixes).lstrip(".")
        for orig_file in in_file
    ]

    compress = listify(self.inputs.compress) or [None]
    if len(compress) == 1:
        compress = compress * len(in_file)
    for i, ext in enumerate(out_entities["extension"]):
        if compress[i] is not None:
            ext = regz.sub("", ext)
            out_entities["extension"][i] = f"{ext}.gz" if compress[i] else ext

    # Override entities with those set as inputs
    for key in self._allowed_entities:
        value = getattr(self.inputs, key)
        if value is not None and isdefined(value):
            out_entities[key] = value

    # Clean up native resolution with space
    if out_entities.get("resolution") == "native" and out_entities.get("space"):
        out_entities.pop("resolution", None)

    if len(set(out_entities["extension"])) == 1:
        out_entities["extension"] = out_entities["extension"][0]

    # Insert custom (non-BIDS) entities from allowed_entities.
    custom_entities = set(out_entities.keys()) - set(BIDS_DERIV_ENTITIES)
    patterns = BIDS_DERIV_PATTERNS
    if custom_entities:
        # Example: f"{key}-{{{key}}}" -> "task-{task}"
        custom_pat = "_".join(f"{key}-{{{key}}}"
                              for key in sorted(custom_entities))
        patterns = [
            pat.replace("_{suffix", "_".join(("", custom_pat, "{suffix")))
            for pat in patterns
        ]

    # Prepare SimpleInterface outputs object
    self._results["out_file"] = []
    self._results["compression"] = []
    self._results["fixed_hdr"] = [False] * len(in_file)

    dest_files = build_path(out_entities, path_patterns=patterns)
    if not dest_files:
        raise ValueError(f"Could not build path with entities {out_entities}.")

    # Make sure the interpolated values is embedded in a list, and check
    dest_files = listify(dest_files)
    if len(in_file) != len(dest_files):
        raise ValueError(f"Input files ({len(in_file)}) not matched "
                         f"by interpolated patterns ({len(dest_files)}).")

    for i, (orig_file, dest_file) in enumerate(zip(in_file, dest_files)):
        out_file = out_path / dest_file
        out_file.parent.mkdir(exist_ok=True, parents=True)
        self._results["out_file"].append(str(out_file))
        self._results["compression"].append(_copy_any(orig_file, str(out_file)))

        is_nifti = out_file.name.endswith(
            (".nii", ".nii.gz")
        ) and not out_file.name.endswith((".dtseries.nii", ".dtseries.nii.gz"))
        data_dtype = self.inputs.data_dtype or DEFAULT_DTYPES[self.inputs.suffix]
        if is_nifti and any((self.inputs.check_hdr, data_dtype)):
            # Do not use mmap; if we need to access the data at all, it will
            # be to rewrite, risking a BusError
            nii = nb.load(out_file, mmap=False)

            if self.inputs.check_hdr:
                hdr = nii.header
                curr_units = tuple([
                    None if u == "unknown" else u
                    for u in hdr.get_xyzt_units()
                ])
                curr_codes = (int(hdr["qform_code"]), int(hdr["sform_code"]))

                # Default to mm, use sec if data type is bold
                units = (
                    curr_units[0] or "mm",
                    "sec" if out_entities["suffix"] == "bold" else None,
                )
                xcodes = (1, 1)  # Derivative in its original scanner space
                if self.inputs.space:
                    xcodes = ((4, 4) if self.inputs.space in STANDARD_SPACES
                              else (2, 2))

                if curr_codes != xcodes or curr_units != units:
                    self._results["fixed_hdr"][i] = True
                    hdr.set_qform(nii.affine, xcodes[0])
                    hdr.set_sform(nii.affine, xcodes[1])
                    hdr.set_xyzt_units(*units)

                    # Rewrite file with new header
                    overwrite_header(nii, out_file)

            if data_dtype == "source":  # match source dtype
                try:
                    data_dtype = nb.load(
                        self.inputs.source_file[0]).get_data_dtype()
                except Exception:
                    LOGGER.warning(
                        f"Could not get data type of file {self.inputs.source_file[0]}"
                    )
                    data_dtype = None

            if data_dtype:
                if self.inputs.check_hdr:
                    # load updated NIfTI
                    nii = nb.load(out_file, mmap=False)
                data_dtype = np.dtype(data_dtype)
                orig_dtype = nii.get_data_dtype()
                if orig_dtype != data_dtype:
                    LOGGER.warning(
                        f"Changing {out_file} dtype from {orig_dtype} to {data_dtype}"
                    )
                    # coerce dataobj to new data dtype
                    if np.issubdtype(data_dtype, np.integer):
                        new_data = np.rint(nii.dataobj).astype(data_dtype)
                    else:
                        new_data = np.asanyarray(nii.dataobj, dtype=data_dtype)
                    # and set header to match
                    nii.set_data_dtype(data_dtype)
                    nii = nii.__class__(new_data, nii.affine, nii.header)
                    nii.to_filename(out_file)

    if len(self._results["out_file"]) == 1:
        meta_fields = self.inputs.copyable_trait_names()
        self._metadata.update({
            k: getattr(self.inputs, k)
            for k in meta_fields if k not in self._static_traits
        })
        if self._metadata:
            out_file = Path(self._results["out_file"][0])
            # 1.3.x hack
            # For dtseries, we have been generating weird non-BIDS JSON files.
            # We can safely keep producing them to avoid breaking derivatives,
            # but only the existing keys should keep going into them.
            if out_file.name.endswith(".dtseries.nii"):
                legacy_metadata = {}
                for key in ("grayordinates", "space", "surface",
                            "surface_density", "volume"):
                    if key in self._metadata:
                        legacy_metadata[key] = self._metadata.pop(key)
                if legacy_metadata:
                    sidecar = (out_file.parent
                               / f"{_splitext(str(out_file))[0]}.json")
                    sidecar.write_text(
                        dumps(legacy_metadata, sort_keys=True, indent=2))
            # The future: the extension is the first . and everything after
            sidecar = (out_file.parent
                       / f"{out_file.name.split('.', 1)[0]}.json")
            sidecar.write_text(dumps(self._metadata, sort_keys=True, indent=2))
            self._results["out_meta"] = str(sidecar)
    return runtime

def __init__(self, fields=None, undef_fields=False, **inputs):
    super(ReadSidecarJSON, self).__init__(**inputs)
    self._fields = listify(fields or [])
    self._undef_fields = undef_fields

def __attrs_post_init__(self):
    """Determine the intended fieldmap estimation type and check for data completeness."""
    suffix_list = [f.suffix for f in self.sources]
    suffix_set = set(suffix_list)

    # Fieldmap option 1: actual field-mapping sequences
    fmap_types = suffix_set.intersection(
        ("fieldmap", "phasediff", "phase1", "phase2"))
    if len(fmap_types) > 1 and fmap_types - set(("phase1", "phase2")):
        raise TypeError(
            f"Incompatible suffixes found: <{','.join(fmap_types)}>.")

    if fmap_types:
        sources = sorted(
            str(f.path)
            for f in self.sources
            if f.suffix in ("fieldmap", "phasediff", "phase1", "phase2")
        )

        # Automagically add the corresponding phase2 file if missing as argument
        missing_phases = ("phase1" not in fmap_types,
                          "phase2" not in fmap_types)
        if sum(missing_phases) == 1:
            mis_ph = "phase1" if missing_phases[0] else "phase2"
            hit_ph = "phase2" if missing_phases[0] else "phase1"
            new_source = sources[0].replace(hit_ph, mis_ph)
            self.sources.append(FieldmapFile(new_source))
            sources.insert(int(missing_phases[1]), new_source)

        # Set method, this cannot be undone
        self.method = MODALITIES[fmap_types.pop()]

        # Determine the name of the corresponding (first) magnitude file(s)
        magnitude = f"magnitude{'' if self.method == EstimatorType.MAPPED else '1'}"
        if magnitude not in suffix_set:
            try:
                self.sources.append(FieldmapFile(
                    sources[0]
                    .replace("fieldmap", "magnitude")
                    .replace("diff", "1")
                    .replace("phase", "magnitude")
                ))
            except Exception:
                raise ValueError(
                    "A fieldmap or phase-difference estimation type was found, "
                    f"but an anatomical reference ({magnitude} file) is missing."
                )

        # Check presence and try to find (if necessary) the second magnitude file
        if (self.method == EstimatorType.PHASEDIFF
                and "magnitude2" not in suffix_set):
            try:
                self.sources.append(FieldmapFile(
                    sources[-1].replace("diff", "2").replace("phase", "magnitude")
                ))
            except Exception:
                if "phase2" in suffix_set:
                    raise ValueError(
                        "A phase-difference estimation (phase1/2) type was found, "
                        "but an anatomical reference (magnitude2 file) is missing."
                    )

    # Fieldmap option 2: PEPOLAR (and fieldmap-less or ANAT)
    # IMPORTANT NOTE: fieldmap-less approaches can be considered PEPOLAR with RO = 0.0s
    pepolar_types = suffix_set.intersection(("bold", "dwi", "epi", "sbref"))
    anat_types = suffix_set.intersection(("T1w", "T2w"))
    _pepolar_estimation = (
        len([f for f in suffix_list
             if f in ("bold", "dwi", "epi", "sbref")]) > 1
    )

    if _pepolar_estimation and not anat_types:
        self.method = MODALITIES[pepolar_types.pop()]
        _pe = set(f.metadata["PhaseEncodingDirection"] for f in self.sources)
        if len(_pe) == 1:
            raise ValueError(
                f"Only one phase-encoding direction <{_pe.pop()}> found across sources."
            )
    elif anat_types:
        self.method = MODALITIES[anat_types.pop()]

        if not pepolar_types:
            raise ValueError(
                "Only anatomical sources were found, cannot estimate fieldmap.")

    if self.method == EstimatorType.UNKNOWN:
        # No method has been identified -> fail.
raise ValueError("Insufficient sources to estimate a fieldmap.") intents_meta = set( el for f in self.sources for el in listify(f.metadata.get("IntendedFor") or [])) # Register this estimation method if not self.bids_id: # If not manually set, try to get it from BIDS metadata bids_ids = set([ f.metadata.get("B0FieldIdentifier") for f in self.sources if f.metadata.get("B0FieldIdentifier") ]) if len(bids_ids) > 1: raise ValueError( f"Multiple ``B0FieldIdentifier`` set: <{', '.join(bids_ids)}>" ) elif bids_ids: object.__setattr__(self, "bids_id", bids_ids.pop()) else: bids_id = _estimators.add(self.paths()) object.__setattr__(self, "bids_id", bids_id) for intent_file in intents_meta: _intents[intent_file].add(bids_id) return _estimators[self.bids_id] = self.paths() for intent_file in intents_meta: _intents[intent_file].add(self.bids_id)
def _load_tsv_variables(layout, suffix, dataset=None, columns=None,
                        prepend_type=False, scope='all', **selectors):
    """Reads variables from scans.tsv, sessions.tsv, and participants.tsv.

    Parameters
    ----------
    layout : :obj:`bids.layout.BIDSLayout`
        The BIDSLayout to use.
    suffix : str
        The suffix of file to read from. Must be one of 'scans',
        'sessions', or 'participants'.
    dataset : NodeIndex
        A BIDS NodeIndex container. If None, a new one is initialized.
    columns : list
        Optional list of names specifying which columns in the files to
        return. If None, all columns are returned.
    prepend_type : bool
        If True, variable names are prepended with the type name (e.g.,
        'age' becomes 'participants.age').
    scope : str or list
        The scope of the space to search for variables. See docstring for
        BIDSLayout for details and valid predefined values.
    selectors : dict
        Optional keyword arguments passed onto the BIDSLayout instance's
        get() method; can be used to constrain which data are loaded.

    Returns
    -------
    A NodeIndex instance.
    """

    # Sanitize the selectors: only keep entities at current level or above
    remap = {'scans': 'run', 'sessions': 'session',
             'participants': 'subject'}
    level = remap[suffix]
    valid_entities = BASE_ENTITIES[:BASE_ENTITIES.index(level)]
    layout_kwargs = {k: v for k, v in selectors.items()
                     if k in valid_entities}

    if dataset is None:
        dataset = NodeIndex()

    files = layout.get(extension='.tsv', suffix=suffix, scope=scope,
                       **layout_kwargs)

    for f in files:

        _data = f.get_df(include_timing=False)

        # Entities can be defined either within the first column of the .tsv
        # file (for entities that vary by row), or from the full file path
        # (for entities constant over all rows in the file). We extract both
        # and store them in the main DataFrame alongside other variables (as
        # they'll be extracted when the BIDSVariable is initialized anyway).
        for ent_name, ent_val in f.entities.items():
            if ent_name in ALL_ENTITIES:
                _data[ent_name] = ent_val

        # Handling is a bit more convoluted for scans.tsv, because the first
        # column contains the run filename, which we also need to parse.
        if suffix == 'scans':

            # Suffix is guaranteed to be present in each filename, so drop
            # the constant column with value 'scans' to make way for it and
            # prevent two 'suffix' columns.
            _data.drop(columns=['suffix'], inplace=True)

            image = _data['filename']
            _data = _data.drop('filename', axis=1)
            dn = f._dirname
            paths = [str(dn / p) for p in image.values]
            ent_recs = [dict(layout.files[p].entities) for p in paths
                        if p in layout.files]
            ent_cols = pd.DataFrame.from_records(ent_recs)

            # Remove entity columns found in both DFs
            dupes = list(set(ent_cols.columns) & set(_data.columns))
            to_drop = ['extension'] + dupes
            ent_cols.drop(columns=to_drop, inplace=True)

            _data = pd.concat([_data, ent_cols], axis=1, sort=True)

        # The BIDS spec requires ID columns to be named 'session_id',
        # 'run_id', etc., and IDs begin with entity prefixes (e.g.,
        # 'sub-01'). To ensure consistent internal handling, we strip these
        # suffixes and prefixes.
        elif suffix == 'sessions':
            _data = _data.rename(columns={'session_id': 'session'})
            _data['session'] = _data['session'].str.replace('ses-', '')

        elif suffix == 'participants':
            _data = _data.rename(columns={'participant_id': 'subject'})
            _data['subject'] = _data['subject'].str.replace('sub-', '')

        def make_patt(x, regex_search=False):
            patt = '%s' % x
            if isinstance(x, (int, float)):
                # allow for leading zeros if a number was specified
                # regardless of regex_search
                patt = '0*' + patt
            if not regex_search:
                patt = '^%s$' % patt
            return patt

        # Filter rows on all selectors
        comm_cols = list(set(_data.columns) & set(selectors.keys()))
        for col in comm_cols:
            ent_patts = [make_patt(x, regex_search=layout.regex_search)
                         for x in listify(selectors.get(col))]
            patt = '|'.join(ent_patts)

            _data = _data[_data[col].str.contains(patt)]

        level = {'scans': 'session', 'sessions': 'subject',
                 'participants': 'dataset'}[suffix]
        node = dataset.get_or_create_node(level, f.entities)

        ent_cols = list(set(ALL_ENTITIES) & set(_data.columns))
        amp_cols = list(set(_data.columns) - set(ent_cols))

        if columns is not None:
            amp_cols = list(set(amp_cols) & set(columns))

        for col_name in amp_cols:

            # Rename columns: values must be in 'amplitude'
            df = _data.loc[:, [col_name] + ent_cols]
            df.columns = ['amplitude'] + ent_cols

            if prepend_type:
                col_name = '%s.%s' % (suffix, col_name)

            node.add_variable(SimpleVariable(name=col_name, data=df,
                                             source=suffix))

    return dataset

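# Illustration of what the nested make_patt() helper above returns (it is
# local to _load_tsv_variables and not importable on its own):
#
# make_patt('01')                            ->  '^01$'       (exact match)
# make_patt(1)                               ->  '^0*1$'      (tolerate leading zeros)
# make_patt('run-0[12]', regex_search=True)  ->  'run-0[12]'  (used as-is)
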
def _flatten(inlist):
    from bids.utils import listify

    return [el for items in listify(inlist) for el in listify(items)]

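# Quick illustration of the one-level flattening behavior:
#
# _flatten([1, [2, 3], 'a'])  ->  [1, 2, 3, 'a']
# _flatten('xy')              ->  ['xy']   (strings and scalars are wrapped, not split)
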