def connect_input(self, spec_name, node, node_input, format=None,
                  **kwargs):
    """
    Connects an analysis fileset spec as an input to the provided node

    Parameters
    ----------
    spec_name : str
        Name of the analysis data spec (or one of the IDs from the
        iterator nodes, 'subject_id' or 'visit_id') to connect to the
        node
    node : arcana.Node
        The node to connect the input to
    node_input : str
        Name of the input on the node to connect the fileset spec to
    format : FileFormat | None
        The file format the input is expected in. If it differs from the
        format in the data spec or of the analysis input then an implicit
        conversion is performed. If None the file format in the data spec
        is assumed
    """
    if spec_name in self.analysis.ITERFIELDS:
        self._iterator_conns[spec_name].append((node, node_input, format))
    else:
        name = self._map_name(spec_name, self._input_map)
        if name not in self.analysis.data_spec_names():
            raise ArcanaDesignError(
                "Proposed input '{}' to {} is not a valid spec name ('{}')"
                .format(name, self._error_msg_loc,
                        "', '".join(self.analysis.data_spec_names())))
        self._check_valid_trait_name(node, node_input, 'input')
        self._input_conns[name].append((node, node_input, format, kwargs))

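# Hypothetical usage sketch (not part of the original module): inside a
# pipeline constructor, 'connect_input' wires a data spec (or one of the
# implicit iterators) straight onto a node input, optionally requesting an
# implicit format conversion. The spec/node names below are assumptions
# used for illustration only.
#
#     pipeline.connect_input('magnitude', preproc, 'in_file',
#                            nifti_gz_format)
#     pipeline.connect_input('subject_id', report, 'subject_id')
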
def branch(self, name, values=None):  # noqa: E501 @UnusedVariable
    """
    Checks whether the given switch matches the value(s) provided

    Parameters
    ----------
    name : str
        The name of the parameter to retrieve
    values : str | list[str] | None
        The value(s) of the switch to match if a non-boolean switch
    """
    if isinstance(values, basestring):
        values = [values]
    spec = self.param_spec(name)
    if not isinstance(spec, SwitchSpec):
        raise ArcanaUsageError(
            "{} is a standard parameter not a switch".format(spec))
    switch = self._get_parameter(name)
    if spec.is_boolean:
        if values is not None:
            raise ArcanaDesignError(
                "Should not provide values ({}) to boolean switch "
                "'{}' in {}".format(
                    values, name, self._param_error_location))
        in_branch = switch.value
    else:
        if values is None:
            raise ArcanaDesignError(
                "Value(s) need(s) to be provided to non-boolean switch"
                " '{}' in {}".format(
                    name, self._param_error_location))
        values = set(values)
        # Register parameter as being used by the pipeline
        unrecognised_values = values - set(spec.choices)
        if unrecognised_values:
            raise ArcanaDesignError(
                "Provided value(s) ('{}') for switch '{}' in {} "
                "is not a valid option ('{}')".format(
                    "', '".join(unrecognised_values), name,
                    self._param_error_location,
                    "', '".join(spec.choices)))
        in_branch = switch.value in values
        if not in_branch:
            try:
                in_branch = spec.fallbacks[switch.value] in values
            except KeyError:
                pass
    return in_branch

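# Hypothetical usage sketch: 'branch' is typically called inside pipeline
# constructors to select between processing variants, with
# 'unhandled_branch' raising on switch values the constructor does not
# cover. The switch names and values below are assumptions for
# illustration.
#
#     if self.branch('preproc_denoise'):
#         ...  # add denoising nodes (boolean switch)
#     if self.branch('bet_method', 'optibet'):
#         ...  # add optiBET-specific nodes (multi-choice switch)
#     elif self.branch('bet_method', 'fsl_bet'):
#         ...  # add standard BET nodes
#     else:
#         self.unhandled_branch('bet_method')
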
def _list_outputs(self):
    outputs = self._outputs().get()
    if len(self.inputs.subject_ids) != len(self.inputs.inlist):
        raise ArcanaDesignError(
            "Length of subject IDs ({}) doesn't match that of input items "
            "({})".format(len(self.inputs.subject_ids),
                          len(self.inputs.inlist)))
    if len(self.inputs.visit_ids) != len(self.inputs.inlist):
        raise ArcanaDesignError(
            "Length of visit IDs ({}) doesn't match that of input items "
            "({})".format(len(self.inputs.visit_ids),
                          len(self.inputs.inlist)))
    session_ids = list(zip(self.inputs.subject_ids, self.inputs.visit_ids))
    index = session_ids.index(
        (self.inputs.subject_id, self.inputs.visit_id))
    outputs['out'] = self.inputs.inlist[index]
    return outputs

def _list_outputs(self):
    outputs = self.output_spec().get()
    # Connect iterables (i.e. subject_id and visit_id)
    subject_id = (self.inputs.subject_id
                  if isdefined(self.inputs.subject_id) else None)
    visit_id = (self.inputs.visit_id
                if isdefined(self.inputs.visit_id) else None)
    missing_inputs = []
    # Collate input checksums into a dictionary
    input_checksums = {
        n: getattr(self.inputs, n + CHECKSUM_SUFFIX)
        for n in self._pipeline_input_filesets}
    input_checksums.update({
        n: getattr(self.inputs, n + FIELD_SUFFIX)
        for n in self._pipeline_input_fields})
    output_checksums = {}
    with ExitStack() as stack:
        # Connect to the set of repositories that the collections come
        # from
        for repository in self.repositories:
            stack.enter_context(repository)
        for fileset_slice in self.fileset_collections:
            fileset = fileset_slice.item(subject_id, visit_id)
            path = getattr(self.inputs, fileset_slice.name + PATH_SUFFIX)
            if not isdefined(path):
                if fileset.name in self._required:
                    missing_inputs.append(fileset.name)
                continue  # skip the upload for this fileset
            fileset.path = path  # Push to repository
            output_checksums[fileset.name] = fileset.checksums
        for field_slice in self.field_collections:
            field = field_slice.item(subject_id, visit_id)
            value = getattr(self.inputs, field_slice.name + FIELD_SUFFIX)
            if not isdefined(value):
                if field.name in self._required:
                    missing_inputs.append(field.name)
                continue  # skip the upload for this field
            field.value = value  # Push to repository
            output_checksums[field.name] = field.value
        # Add input and output checksums to provenance record and sink to
        # all repositories that have received data (typically only one)
        prov = copy(self._prov)
        prov['inputs'] = input_checksums
        prov['outputs'] = output_checksums
        record = Record(self._pipeline_name, self.frequency, subject_id,
                        visit_id, self._from_analysis, prov)
        for dataset in self.datasets:
            dataset.put_record(record)
    if missing_inputs:
        raise ArcanaDesignError(
            "Required derivatives '{}' were not created by upstream "
            "nodes connected to sink {}".format(
                "', '".join(missing_inputs), self))
    # Return cache file paths
    outputs['checksums'] = output_checksums
    return outputs

def _check_valid_trait_name(self, node, trait_name, conn_type):
    if conn_type == 'output':
        trait_spec = node.interface._outputs()
    elif conn_type == 'input':
        trait_spec = node.interface.inputs
    else:
        assert False, "unknown conn_type {}".format(conn_type)
    valid_trait_names = trait_spec.trait_names()
    if trait_name not in valid_trait_names:
        raise ArcanaDesignError(
            ("Node {} '{}' is not a valid trait of {} used for '{}' "
             "node of '{}' pipeline. Valid traits are '{}'").format(
                 conn_type, trait_name, node.interface, node.name,
                 self.name, "', '".join(valid_trait_names)))

def unhandled_branch(self, name):
    """
    Convenience method for raising an exception if a pipeline doesn't
    handle a particular switch value

    Parameters
    ----------
    name : str
        Name of the switch whose current value hasn't been handled
    """
    raise ArcanaDesignError(
        "'{}' value of '{}' switch in {} is not handled".format(
            self.parameter(name), name, self._param_error_location))

def __init__(self, name, desc=None, optional=False, default=None):
    if optional and default is not None:
        raise ArcanaUsageError(
            "'optional' doesn't make sense for specs ('{}') with default "
            "values".format(name))
    self._desc = desc
    self._analysis = None
    self._optional = optional
    # Check that the default slice-like object matches the frequency of
    # the spec and take a copy so it can't be modified externally
    if default is not None:
        if default.frequency != self.frequency:
            raise ArcanaDesignError(
                "Frequency of default slice-like object passed to "
                "'{}' spec ('{}'), does not match spec ('{}')".format(
                    name, default.frequency, self.frequency))
        default = deepcopy(default)
    self._default = default

def connect_output(self, spec_name, node, node_output, format=None,
                   **kwargs):
    """
    Connects an output of a node to an analysis fileset spec

    Parameters
    ----------
    spec_name : str
        Name of the analysis fileset spec to connect to
    node : arcana.Node
        The node to connect the output from
    node_output : str
        Name of the output on the node to connect to the fileset
    format : FileFormat | None
        The file format the output is returned in. If it differs from the
        format in the data spec then an implicit conversion is performed.
        If None it is assumed to be returned in the file format of the
        data spec
    """
    name = self._map_name(spec_name, self._output_map)
    if name not in self.analysis.data_spec_names():
        raise ArcanaDesignError(
            "Proposed output '{}' to {} is not a valid spec name ('{}')"
            .format(name, self._error_msg_loc,
                    "', '".join(self.analysis.data_spec_names())))
    if name in self._output_conns:
        prev_node, prev_node_output, _, _ = self._output_conns[name]
        logger.info(
            "Reassigning '{}' output from {}:{} to {}:{} in {}".format(
                name, prev_node.name, prev_node_output,
                node.name, node_output, self._error_msg_loc))
    self._check_valid_trait_name(node, node_output, 'output')
    self._output_conns[name] = (node, node_output, format, kwargs)

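# Hypothetical usage sketch: 'connect_output' maps a node output back onto
# a fileset spec of the analysis, performing an implicit format conversion
# if requested. The spec and node names are assumptions for illustration.
#
#     pipeline.connect_output('preproc', reorient, 'out_file',
#                             nifti_gz_format)
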
def _unwrap_maps(self, name_maps, name, analysis=None, **inner_maps):
    """
    Unwraps potentially nested name-mapping dictionaries to get values
    for name, input_map, output_map and analysis. Used in __init__.

    Parameters
    ----------
    name_maps : dict
        A dictionary containing the name_maps to apply to the values
    name : str
        Name passed from inner pipeline constructor
    analysis : Analysis
        The analysis to bind the pipeline to. Will be overridden by any
        values in the mods dict
    inner_maps : dict[str, dict[str, str]]
        Input and output maps from inner pipeline constructors

    Returns
    -------
    name : str
        Potentially modified name of the pipeline
    analysis : Analysis
        Potentially modified analysis
    maps : dict[str, dict[str, str]]
        Potentially modified input and output maps
    """
    # Set values of name and analysis
    name = name_maps.get('name', name)
    name = name_maps.get('prefix', '') + name
    analysis = name_maps.get('analysis', analysis)
    # Flatten input and output maps, combining maps from inner nests with
    # those in the "mods" dictionary
    maps = {}
    for mtype in ('input_map', 'output_map'):
        try:
            inner_map = inner_maps[mtype]
        except KeyError:
            try:
                maps[mtype] = name_maps[mtype]  # Only outer map
            except KeyError:
                pass  # No maps
        else:
            try:
                outer_map = name_maps[mtype]
            except KeyError:
                maps[mtype] = inner_map  # Only inner map
            else:
                # Work through different combinations of inner and outer
                # map types (i.e. str & str, str & dict, dict & str, and
                # dict & dict) and combine into a single map
                if isinstance(outer_map, basestring):
                    if isinstance(inner_map, basestring):
                        # Concatenate prefixes
                        maps[mtype] = outer_map + inner_map
                    elif isinstance(inner_map, dict):
                        # Add outer_map prefix to all values in inner map
                        # dictionary
                        maps[mtype] = {
                            k: outer_map + v for k, v in inner_map.items()}
                    else:
                        raise ArcanaDesignError(
                            "Unrecognised type for name map in '{}' "
                            "pipeline can be str or dict[str,str]: {}"
                            .format(name, inner_map))
                elif isinstance(outer_map, dict):
                    if isinstance(inner_map, basestring):
                        # Strip inner map prefix from outer dictionary
                        # (which should have the prefix included). This
                        # should be an unlikely case I imagine
                        maps[mtype] = {
                            k[len(inner_map):]: v
                            for k, v in outer_map.items()}
                    elif isinstance(inner_map, dict):
                        # Chain outer_map dictionary to inner map
                        # dictionary
                        maps[mtype] = deepcopy(outer_map)
                        maps[mtype].update({
                            k: outer_map.get(v, v)
                            for k, v in inner_map.items()})
                    else:
                        raise ArcanaDesignError(
                            "Unrecognised type for name map in '{}' "
                            "pipeline can be str or dict[str,str]: {}"
                            .format(name, inner_map))
                else:
                    raise ArcanaDesignError(
                        "Unrecognised type for name map in '{}' "
                        "pipeline can be str or dict[str,str]: {}".format(
                            name, outer_map))
    try:
        outer_maps = name_maps['name_maps']
    except KeyError:
        pass
    else:
        name, analysis, maps = self._unwrap_maps(
            outer_maps, name=name, analysis=analysis, **maps)
    return name, analysis, maps

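# Illustrative sketch of how nested name maps combine (the values are
# assumptions for the example): an outer string map acts as a prefix that
# is applied to the values of an inner dictionary map, while 'prefix' is
# prepended to the pipeline name.
#
#     name_maps = {'prefix': 'coreg_', 'input_map': 'coreg_'}   # outer
#     inner_maps = {'input_map': {'in_file': 'magnitude'}}      # inner
#     # _unwrap_maps(name_maps, name, **inner_maps) returns the name with
#     # 'coreg_' prepended and
#     # maps['input_map'] == {'in_file': 'coreg_magnitude'}
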
def add(self, name, interface, inputs=None, outputs=None,
        requirements=None, wall_time=None, annotations=None, **kwargs):
    """
    Adds a processing Node to the pipeline

    Parameters
    ----------
    name : str
        Name for the node
    interface : nipype.Interface
        The interface to use for the node
    inputs : dict[str, (str, FileFormat) | (Node, str)]
        Connections from inputs of the pipeline and outputs of other
        nodes to inputs of this node. The keys of the dictionary are the
        field names and the values are 2-tuples containing either the
        name of the data spec and the data format it is expected in for
        pipeline inputs, or the sending Node and the name of an output of
        the sending Node. Note that pipeline inputs can be specified
        outside this method using the 'connect_input' method and
        connections between nodes with the 'connect' method.
    outputs : dict[str, (str, FileFormat)]
        Connections to outputs of the pipeline from fields of the
        interface. The keys of the dictionary are the names of the data
        specs that will be written to and the values are the interface
        field name and the data format it is produced in. Note that
        output connections can also be specified using the
        'connect_output' method.
    requirements : list(Requirement)
        List of packages required for the node to run (default: [])
    wall_time : float
        Time required to execute the node in minutes (default: 1)
    mem_gb : int
        Required memory for the node in GB
    n_procs : int
        Preferred number of threads to run the node on (default: 1)
    annotations : dict[str, *]
        Additional annotations to add to the node, which may be used by
        the Processor node to optimise execution (e.g. 'gpu': True)
    iterfield : str
        Name of field to be passed an iterable to iterate over. If
        present, a MapNode will be created instead of a regular node
    joinsource : str
        Name of the iterator field to join. Typically one of the implicit
        iterators (i.e. Analysis.SUBJECT_ID or Analysis.VISIT_ID) to join
        over the subjects and/or visits
    joinfield : str
        Name of the field to pass the joined list to when creating a
        JoinNode

    Returns
    -------
    node : Node
        The Node object that has been added to the pipeline
    """
    if annotations is None:
        annotations = {}
    if requirements is None:
        requirements = []
    if wall_time is None:
        wall_time = self.analysis.processor.default_wall_time
    if 'mem_gb' not in kwargs or kwargs['mem_gb'] is None:
        kwargs['mem_gb'] = self.analysis.processor.default_mem_gb
    if 'iterfield' in kwargs:
        if 'joinfield' in kwargs or 'joinsource' in kwargs:
            raise ArcanaDesignError(
                "Cannot provide both joinsource and iterfield when "
                "attempting to add '{}' node to {}".format(
                    name, self._error_msg_loc))
        node_cls = self.analysis.environment.node_types['map']
    elif 'joinsource' in kwargs or 'joinfield' in kwargs:
        if not ('joinfield' in kwargs and 'joinsource' in kwargs):
            raise ArcanaDesignError(
                "Both joinsource and joinfield kwargs are required to "
                "create a JoinNode '{}' (see {})".format(
                    name, self._error_msg_loc))
        joinsource = kwargs['joinsource']
        if joinsource in self.analysis.ITERFIELDS:
            self._iterator_joins.add(joinsource)
        node_cls = self.analysis.environment.node_types['join']
        # Prepend name of pipeline of joinsource to match name of nodes
        kwargs['joinsource'] = '{}_{}'.format(self.name, joinsource)
    else:
        node_cls = self.analysis.environment.node_types['base']
    # Create node
    node = node_cls(self.analysis.environment,
                    interface,
                    name="{}_{}".format(self._name, name),
                    requirements=requirements,
                    wall_time=wall_time,
                    annotations=annotations,
                    **kwargs)
    # Ensure node is added to workflow
    self._workflow.add_nodes([node])
    # Connect inputs, outputs and internal connections
    if inputs is not None:
        if not isinstance(inputs, dict):
            raise ArcanaDesignError(
                "inputs of {} node in {} needs to be a dictionary "
                "(not {})".format(name, self, inputs))
        for node_input, connect_from in inputs.items():
            if isinstance(connect_from[0], basestring):
                input_spec, input_format = connect_from
                self.connect_input(input_spec, node, node_input,
                                   input_format)
            else:
                conn_node, conn_field = connect_from
                self.connect(conn_node, conn_field, node, node_input)
    if outputs is not None:
        if not isinstance(outputs, dict):
            raise ArcanaDesignError(
                "outputs of {} node in {} needs to be a dictionary "
                "(not {})".format(name, self, outputs))
        for output_spec, (node_output, output_format) in outputs.items():
            self.connect_output(output_spec, node, node_output,
                                output_format)
    return node

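# Hypothetical usage sketch (interface, spec and requirement names follow
# the patterns used by pipeline constructors in this package and are
# assumptions here): the 'inputs' dict mixes pipeline-spec connections
# (str, FileFormat) with inter-node connections (Node, field), and
# 'outputs' maps interface fields onto data specs.
#
#     preproc = pipeline.add(
#         'dwipreproc',
#         DWIPreproc(),
#         inputs={
#             'grad_fsl': (grad_fsl, 'out'),               # from a node
#             'in_file': ('magnitude', nifti_gz_format)},  # from a spec
#         outputs={
#             'eddy_parameters': ('eddy_par', eddy_par_format)},
#         requirements=[mrtrix_req.v('3.0rc3'), fsl_req.v('5.0.10')],
#         wall_time=60)
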
def pipeline(self, getter_name, required_outputs=None,
             pipeline_args=None):
    """
    Returns a pipeline from the study by getting the method corresponding
    to the given name and checking that the required outputs are
    generated given the parameters of the study

    Parameters
    ----------
    getter_name : str
        Name of the method that constructs the pipeline
    required_outputs : list[str] | None
        The list of outputs that are expected of the pipeline
    pipeline_args : dict[str, *] | None
        Any arguments that should be passed to the method to produce the
        pipeline.
    """
    if pipeline_args is None:
        pipeline_args = ()
    elif isinstance(pipeline_args, dict):
        pipeline_args = tuple(pipeline_args.items())
    try:
        pipeline = self._pipelines_cache[(getter_name, pipeline_args)]
    except KeyError:
        try:
            getter = getattr(self, getter_name)
        except AttributeError:
            raise ArcanaDesignError(
                "There is no pipeline constructor method named '{}' "
                "present in '{}' study".format(getter_name, self))
        self._pipeline_to_generate = getter_name
        try:
            pipeline = getter(**dict(pipeline_args))
        except ArcanaMissingDataException as e:
            e.msg += ("{}, which is required as an input when calling the "
                      "pipeline constructor method '{}' to create a "
                      "pipeline to produce '{}'".format(
                          e, getter_name, "', '".join(required_outputs)))
            raise e
        finally:
            self._pipeline_to_generate = None
        if pipeline is None:
            raise ArcanaDesignError(
                "'{}' pipeline constructor in {} is missing a return "
                "statement (should return a Pipeline object)".format(
                    getter_name, self))
        elif not isinstance(pipeline, Pipeline):
            raise ArcanaDesignError(
                "'{}' pipeline constructor in {} doesn't return a Pipeline"
                " object ({})".format(getter_name, self, pipeline))
        # Check to see if the pipeline is equivalent to previously
        # generated pipelines (i.e. if two getter methods return
        # equivalent pipelines) and whether any outputs are to be
        # generated twice by different pipelines within the same workflow
        for prev_pipeline in self._pipelines_cache.values():
            if pipeline == prev_pipeline:
                pipeline = prev_pipeline
                break
            elif any(o in prev_pipeline.outputs for o in pipeline.outputs):
                raise ArcanaDesignError(
                    "'{}' outputs are produced by more than one pipeline "
                    "({} and {})".format(
                        set(pipeline.output_names).intersection(
                            prev_pipeline.output_names),
                        prev_pipeline, pipeline))
        self._pipelines_cache[(getter_name, pipeline_args)] = pipeline
    if required_outputs is not None:
        # Check that the required outputs are created with the given
        # parameters
        missing_outputs = (set(required_outputs)
                           - set(pipeline.output_names))
        if missing_outputs:
            raise ArcanaOutputNotProducedException(
                "Required output(s) '{}' will not be created by the '{}' "
                "pipeline constructed by '{}' method in {} given the "
                "missing study inputs:\n{}\n"
                "and the provided switches:\n{}".format(
                    "', '".join(missing_outputs), pipeline.name,
                    getter_name, self,
                    "\n".join(self.missing_inputs),
                    '\n'.join('{}={}'.format(s.name, s.value)
                              for s in self.switches)))
    return pipeline

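# Hypothetical usage sketch ('preprocess_pipeline' and 'preproc' are
# assumed names): retrieving a pipeline by the name of its constructor
# method and checking that it will produce the required outputs.
#
#     pipeline = study.pipeline('preprocess_pipeline',
#                               required_outputs=['preproc'])
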
def __new__(metacls, name, bases, dct):  # @NoSelf @UnusedVariable
    if not any(issubclass(b, Study) for b in bases):
        raise ArcanaUsageError(
            "StudyMetaClass can only be used for classes that "
            "have Study as a base class")
    try:
        add_data_specs = dct['add_data_specs']
    except KeyError:
        add_data_specs = []
    try:
        add_param_specs = dct['add_param_specs']
    except KeyError:
        add_param_specs = []
    combined_attrs = set()
    combined_data_specs = {}
    combined_param_specs = {}
    for base in reversed(bases):
        # Get the combined class dictionary including base dicts,
        # excluding auto-added properties for data and parameter specs
        combined_attrs.update(
            a for a in dir(base)
            if (not issubclass(base, Study) or a not in base.spec_names()))
        # TODO: need to check that fields are not overridden by filesets
        #       and vice-versa
        try:
            combined_data_specs.update(
                (d.name, d) for d in base.data_specs())
        except AttributeError:
            pass  # Not a Study class
        try:
            combined_param_specs.update(
                (p.name, p) for p in base.param_specs())
        except AttributeError:
            pass  # Not a Study class
    combined_attrs.update(list(dct.keys()))
    combined_data_specs.update((d.name, d) for d in add_data_specs)
    combined_param_specs.update((p.name, p) for p in add_param_specs)
    # Check that the pipeline names in data specs correspond to a
    # pipeline method in the class and that, if a pipeline is called with
    # arguments (for parameterizing a range of metrics for example), then
    # the same argument names are consistent across every call.
    pipeline_arg_names = {}
    for spec in add_data_specs:
        if spec.derived:
            if spec.pipeline_getter in ('pipeline', 'new_pipeline'):
                raise ArcanaDesignError(
                    "Cannot use the names 'pipeline' or 'new_pipeline' "
                    "('{}') for the name of a pipeline constructor in "
                    "class {} as it clashes with the base methods to "
                    "create pipelines".format(spec.pipeline_getter, name))
            if spec.pipeline_getter not in combined_attrs:
                raise ArcanaDesignError(
                    "Pipeline to generate '{}', '{}', is not present"
                    " in '{}' class".format(spec.name,
                                            spec.pipeline_getter, name))
            try:
                if pipeline_arg_names[
                        spec.pipeline_getter] != spec.pipeline_arg_names:
                    raise ArcanaDesignError(
                        "Inconsistent pipeline argument names used for "
                        "'{}' pipeline getter {} and {}".format(
                            spec.pipeline_getter,
                            pipeline_arg_names[spec.pipeline_getter],
                            spec.pipeline_arg_names))
            except KeyError:
                pipeline_arg_names[
                    spec.pipeline_getter] = spec.pipeline_arg_names
    # Check for name clashes between data and parameter specs
    spec_name_clashes = (set(combined_data_specs)
                         & set(combined_param_specs))
    if spec_name_clashes:
        raise ArcanaDesignError(
            "'{}' names both data and parameter specs in '{}' class"
            .format("', '".join(spec_name_clashes), name))
    reserved_clashes = [n for n in combined_data_specs
                        if n in Study.ITERFIELDS]
    if reserved_clashes:
        raise ArcanaDesignError(
            "'{}' data spec names clash with reserved names in {}".format(
                "', '".join(reserved_clashes), name))
    dct['_data_specs'] = combined_data_specs
    dct['_param_specs'] = combined_param_specs
    if '__metaclass__' not in dct:
        dct['__metaclass__'] = metacls
    # Append description of Study parameters to the class docstring
    try:
        docstring = dct['__doc__']
    except KeyError:
        docstring = '{} Study class'.format(name)
    if 'Parameters' not in docstring:
        docstring += Study.__doc__
    dct['__doc__'] = docstring
    return super(StudyMetaClass, metacls).__new__(metacls, name, bases,
                                                  dct)

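# Hypothetical usage sketch (the spec classes, formats and names are
# assumptions for illustration): StudyMetaClass collects 'add_data_specs'
# and 'add_param_specs' from the class body and all Study bases, and
# checks that derived specs name a valid pipeline constructor method.
#
#     class ExampleStudy(Study, metaclass=StudyMetaClass):
#
#         add_data_specs = [
#             InputFilesetSpec('magnitude', nifti_gz_format),
#             FilesetSpec('preproc', nifti_gz_format,
#                         'preprocess_pipeline')]
#
#         add_param_specs = [
#             SwitchSpec('preproc_denoise', False)]
#
#         def preprocess_pipeline(self, **name_maps):
#             ...
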
def preprocess_pipeline(self, **name_maps):  # @UnusedVariable @IgnorePep8
    """
    Performs a series of FSL preprocessing steps, including Eddy and Topup

    Parameters
    ----------
    phase_dir : str{AP|LR|IS}
        The phase encode direction
    """
    # Determine whether we can correct for distortion, i.e. if reference
    # scans are provided
    # Include all references
    references = [fsl_cite, eddy_cite, topup_cite, distort_correct_cite]
    if self.branch('preproc_denoise'):
        references.extend(dwidenoise_cites)

    pipeline = self.new_pipeline(
        name='preprocess',
        name_maps=name_maps,
        desc="Preprocess dMRI studies using distortion correction",
        references=references)

    # Create nodes to convert the gradients to FSL format
    if self.input('magnitude').format == dicom_format:
        extract_grad = pipeline.add(
            "extract_grad",
            ExtractFSLGradients(),
            inputs={
                'in_file': ('magnitude', dicom_format)},
            outputs={
                'bvecs_file': ('grad_dirs', fsl_bvecs_format),
                'bvals_file': ('bvalues', fsl_bvals_format)},
            requirements=[mrtrix_req.v('3.0rc3')])
        grad_fsl_kwargs = {
            'connect': {'in1': (extract_grad, 'bvecs_file'),
                        'in2': (extract_grad, 'bvals_file')}}
    elif self.provided('grad_dirs') and self.provided('bvalues'):
        grad_fsl_kwargs = {
            'inputs': {'in1': ('grad_dirs', fsl_bvecs_format),
                       'in2': ('bvalues', fsl_bvals_format)}}
    else:
        raise ArcanaDesignError(
            "Either input 'magnitude' image needs to be in DICOM format "
            "or gradient directions and b-values need to be explicitly "
            "provided to {}".format(self))

    # Gradient merge node
    grad_fsl = pipeline.add(
        "grad_fsl",
        MergeTuple(2),
        **grad_fsl_kwargs)

    # Denoise the dwi-scan
    if self.branch('preproc_denoise'):
        # Run denoising
        denoise = pipeline.add(
            'denoise',
            DWIDenoise(),
            inputs={
                'in_file': ('magnitude', nifti_gz_format)},
            requirements=[mrtrix_req.v('3.0rc3')])

        # Calculate residual noise
        subtract_operands = pipeline.add(
            'subtract_operands',
            Merge(2),
            inputs={
                'in1': ('magnitude', nifti_gz_format)},
            connect={
                'in2': (denoise, 'noise')})

        pipeline.add(
            'subtract',
            MRCalc(
                operation='subtract'),
            connect={
                'operands': (subtract_operands, 'out')},
            outputs={
                'out_file': ('noise_residual', mrtrix_format)},
            requirements=[mrtrix_req.v('3.0rc3')])

    # Preproc kwargs
    preproc_kwargs = {}

    if (self.provided('dwi_reference')
            or self.provided('reverse_phase')):
        # Extract b=0 volumes
        dwiextract = pipeline.add(
            'dwiextract',
            ExtractDWIorB0(
                bzero=True,
                out_ext='.nii.gz'),
            inputs={
                'in_file': ('magnitude', dicom_format)},
            requirements=[mrtrix_req.v('3.0rc3')])

        # Get first b=0 from dwi b=0 volumes
        mrconvert = pipeline.add(
            "mrconvert",
            MRConvert(
                coord=(3, 0)),
            connect={
                'in_file': (dwiextract, 'out_file')},
            requirements=[mrtrix_req.v('3.0rc3')])

        # Concatenate extracted forward rpe with reverse rpe
        mrcat = pipeline.add(
            'mrcat',
            MRCat(),
            inputs={
                'second_scan': (('dwi_reference'
                                 if self.provided('dwi_reference')
                                 else 'reverse_phase'), mrtrix_format)},
            connect={
                'first_scan': (mrconvert, 'out_file')},
            requirements=[mrtrix_req.v('3.0rc3')])

        # Create node to assign the right PED to the diffusion
        prep_dwi = pipeline.add(
            'prepare_dwi',
            PrepareDWI(),
            inputs={
                'pe_dir': ('ped', float),
                'ped_polarity': ('pe_angle', float)})

        preproc_kwargs['rpe_pair'] = True
        distortion_correction = True
        preproc_conns = {'connect': {'se_epi': (mrcat, 'out_file')}}
    else:
        distortion_correction = False
        preproc_kwargs['rpe_none'] = True
        preproc_conns = {}

    if self.parameter('preproc_pe_dir') is not None:
        preproc_kwargs['pe_dir'] = self.parameter('preproc_pe_dir')

    preproc = pipeline.add(
        'dwipreproc',
        DWIPreproc(
            no_clean_up=True,
            out_file_ext='.nii.gz',
            # FIXME: Need to determine this programmatically
            # eddy_parameters = '--data_is_shelled '
            temp_dir='dwipreproc_tempdir',
            **preproc_kwargs),
        connect={
            'grad_fsl': (grad_fsl, 'out')},
        outputs={
            'eddy_parameters': ('eddy_par', eddy_par_format)},
        requirements=[mrtrix_req.v('3.0rc3'), fsl_req.v('5.0.10')],
        wall_time=60,
        **preproc_conns)
    if self.branch('preproc_denoise'):
        pipeline.connect(denoise, 'out_file', preproc, 'in_file')
    else:
        pipeline.connect_input('magnitude', preproc, 'in_file',
                               nifti_gz_format)
    if distortion_correction:
        pipeline.connect(prep_dwi, 'pe', preproc, 'pe_dir')

    # Create node to reorient preproc out_file
    pipeline.add(
        'fslreorient2std',
        fsl.utils.Reorient2Std(),
        connect={
            'in_file': (preproc, 'out_file')},
        outputs={
            'out_file': ('preproc', nifti_gz_format)},
        requirements=[fsl_req.v('5.0.9')])

    return pipeline

def _connect_pipeline(self, pipeline, required_outputs, workflow,
                      subject_inds, visit_inds, filter_array, force=False):
    """
    Connects a pipeline to an overarching workflow that sets up iterators
    over subjects|visits present in the repository (if required) and
    repository source and sink nodes

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to connect
    required_outputs : set[str] | None
        The outputs required to be produced by this pipeline. If None all
        are deemed to be required
    workflow : nipype.pipeline.engine.Workflow
        The overarching workflow to connect the pipeline to
    subject_inds : dict[str, int]
        A mapping of subject ID to row index in the filter array
    visit_inds : dict[str, int]
        A mapping of visit ID to column index in the filter array
    filter_array : 2-D numpy.array[bool]
        A two-dimensional boolean array, where rows correspond to
        subjects and columns correspond to visits in the repository. True
        values represent a combination of subject & visit ID to include
        in the current round of processing. Note that if the 'force' flag
        is not set, sessions won't be reprocessed unless the saved
        provenance doesn't match that of the given pipeline.
    force : bool | 'all'
        A flag to force the processing of all sessions in the filter
        array, regardless of whether the parameters|pipeline used to
        generate existing data matches the given pipeline
    """
    if self.reprocess == 'force':
        force = True
    # Close off construction of the pipeline and create the input and
    # output nodes and provenance dictionary
    pipeline.cap()
    # Prepend prerequisite pipelines to complete workflow if they need
    # to be (re)processed
    final_nodes = []
    # The array that represents the subject/visit pairs for which any
    # prerequisite pipeline will be (re)processed, and which therefore
    # needs to be included in the processing of the current pipeline. Row
    # indices correspond to subjects and column indices to visits
    prqs_to_process_array = np.zeros((len(subject_inds), len(visit_inds)),
                                     dtype=bool)
    # The array that represents the subject/visit pairs for which any
    # prerequisite pipeline will be skipped due to missing inputs. Row
    # indices correspond to subjects and column indices to visits
    prqs_to_skip_array = np.zeros((len(subject_inds), len(visit_inds)),
                                  dtype=bool)
    for getter_name in pipeline.prerequisites:
        prereq = pipeline.study.pipeline(getter_name)
        if prereq.to_process_array.any():
            final_nodes.append(prereq.node('final'))
            prqs_to_process_array |= prereq.to_process_array
        prqs_to_skip_array |= prereq.to_skip_array
    # Get list of sessions that need to be processed (i.e. if
    # they don't contain the outputs of this pipeline)
    to_process_array, to_protect_array, to_skip_array = self._to_process(
        pipeline, required_outputs, prqs_to_process_array,
        prqs_to_skip_array, filter_array, subject_inds, visit_inds, force)
    # Store the arrays signifying which nodes to process, protect or skip
    # so they can be passed to downstream pipelines
    pipeline.to_process_array = to_process_array
    pipeline.to_protect_array = to_protect_array
    pipeline.to_skip_array = to_skip_array
    # Check to see if there are any sessions to process
    if not to_process_array.any():
        raise ArcanaNoRunRequiredException(
            "No sessions to process for '{}' pipeline".format(
                pipeline.name))
    # Set up workflow to run the pipeline, loading and saving from the
    # repository
    workflow.add_nodes([pipeline._workflow])
    # If prerequisite pipelines need to be processed, connect their
    # "final" nodes to the initial node of this pipeline to ensure that
    # they are all processed before this pipeline is run.
    if final_nodes:
        prereqs = pipeline.add('prereqs', Merge(len(final_nodes)))
        for i, final_node in enumerate(final_nodes, start=1):
            workflow.connect(final_node, 'out', prereqs, 'in{}'.format(i))
    else:
        prereqs = None
    # Construct iterator structure over subjects and sessions to be
    # processed
    iter_nodes = self._iterate(pipeline, to_process_array, subject_inds,
                               visit_inds)
    sources = {}
    # Loop through each frequency present in the pipeline inputs and
    # create a corresponding source node
    for freq in pipeline.input_frequencies:
        try:
            inputs = list(pipeline.frequency_inputs(freq))
        except ArcanaMissingDataException as e:
            raise ArcanaMissingDataException(
                str(e) + ", which is required for pipeline '{}'".format(
                    pipeline.name))
        inputnode = pipeline.inputnode(freq)
        sources[freq] = source = pipeline.add(
            '{}_source'.format(freq),
            RepositorySource(i.collection for i in inputs),
            inputs=({'prereqs': (prereqs, 'out')}
                    if prereqs is not None else {}))
        # Connect iter_nodes to source and input nodes
        for iterator in pipeline.iterators(freq):
            pipeline.connect(iter_nodes[iterator], iterator, source,
                             iterator)
            pipeline.connect(source, iterator, inputnode, iterator)
        for input in inputs:
            pipeline.connect(source, input.suffixed_name, inputnode,
                             input.name)
    deiter_nodes = {}

    def deiter_node_sort_key(it):
        """
        If there are two iter_nodes (i.e. both subject and visit ID) and
        one depends on the other (i.e. if the visit IDs per subject vary
        and vice-versa) we need to ensure that the dependent iterator is
        deiterated (joined) first.
        """
        return iter_nodes[it].itersource is None

    # Connect all outputs to the repository sink, creating a new sink for
    # each frequency level (i.e. 'per_session', 'per_subject',
    # 'per_visit', or 'per_study')
    for freq in pipeline.output_frequencies:
        outputs = list(pipeline.frequency_outputs(freq))
        if pipeline.iterators(freq) - pipeline.iterators():
            raise ArcanaDesignError(
                "Doesn't make sense to output '{}', which are of '{}' "
                "frequency, when the pipeline only iterates over '{}'"
                .format("', '".join(o.name for o in outputs), freq,
                        "', '".join(pipeline.iterators())))
        outputnode = pipeline.outputnode(freq)
        # Connect filesets/fields to the sink node, skipping outputs
        # that are study inputs
        to_connect = {o.suffixed_name: (outputnode, o.name)
                      for o in outputs if o.is_spec}
        # Connect iterators to sink node
        to_connect.update(
            {i: (iter_nodes[i], i) for i in pipeline.iterators()})
        # Connect checksums/values from sources to sink node in order to
        # save in provenance, joining where necessary
        for input_freq in pipeline.input_frequencies:
            checksums_to_connect = [
                i.checksum_suffixed_name
                for i in pipeline.frequency_inputs(input_freq)]
            if not checksums_to_connect:
                # Rare case of a pipeline with no inputs, only iter_nodes,
                # which will only occur in unittests in all likelihood
                continue
            # Loop over iterators that need to be joined, i.e. that are
            # present in the input frequency but not the output frequency,
            # and create join nodes
            source = sources[input_freq]
            for iterator in (pipeline.iterators(input_freq)
                             - pipeline.iterators(freq)):
                join = pipeline.add(
                    '{}_to_{}_{}_checksum_join'.format(
                        input_freq, freq, iterator),
                    IdentityInterface(checksums_to_connect),
                    inputs={tc: (source, tc)
                            for tc in checksums_to_connect},
                    joinsource=iterator,
                    joinfield=checksums_to_connect)
                source = join
            to_connect.update(
                {tc: (source, tc) for tc in checksums_to_connect})
        # Add sink node
        sink = pipeline.add(
            '{}_sink'.format(freq),
            RepositorySink((o.collection for o in outputs), pipeline,
                           required_outputs),
            inputs=to_connect)
        # "De-iterate" (join) over iterators to get back to a single child
        # node by the time we connect to the final node of the pipeline.
        # Set the sink as the default deiterator if there are no iterators
        # to deiterate (i.e. per_study), or use it as the upstream node to
        # connect the first deiterator for every frequency
        deiter_nodes[freq] = sink  # for per_study the "deiterator" == sink
        for iterator in sorted(pipeline.iterators(freq),
                               key=deiter_node_sort_key):
            # Connect to previous deiterator or sink
            # NB: we only need to keep a reference to the last one in the
            # chain in order to connect with the "final" node, so we can
            # overwrite the entry in the 'deiter_nodes' dict
            deiter_nodes[freq] = pipeline.add(
                '{}_{}_deiter'.format(freq, iterator),
                IdentityInterface(['checksums']),
                inputs={'checksums': (deiter_nodes[freq], 'checksums')},
                joinsource=iterator,
                joinfield='checksums')
    # Create a final node, which is used to connect with downstream
    # pipelines
    pipeline.add(
        'final',
        Merge(len(deiter_nodes)),
        inputs={'in{}'.format(i): (di, 'checksums')
                for i, di in enumerate(deiter_nodes.values(), start=1)})

def push_on_stack(pipeline, filt_array, req_outputs, downstream=()):
    """
    Push a pipeline onto the stack of pipelines to be processed,
    detecting common upstream pipelines and resolving them to a single
    pipeline

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to add to the stack
    filt_array : 2-D numpy.ndarray
        An array of sessions that are to be processed for the given
        pipeline
    req_outputs : list[str]
        The outputs of the pipeline that are actually required
        (non-required outputs can be ignored if they are not generated)
    downstream : tuple[Pipeline]
        The pipelines directly downstream of the pipeline to be added.
        Used to detect circular dependencies
    """
    if req_outputs is None:
        req_outputs = pipeline.output_names
    # Check downstream pipelines for circular dependencies
    downstream_pipelines = [p for p, _ in downstream]
    if pipeline in downstream_pipelines:
        recur_index = downstream_pipelines.index(pipeline)
        raise ArcanaDesignError(
            "{} cannot be a dependency of itself. Call-stack:\n{}"
            .format(
                pipeline,
                '\n'.join(
                    '{} ({})'.format(p, ', '.join(ro))
                    for p, ro in (((pipeline, req_outputs),)
                                  + downstream[:(recur_index + 1)]))))
    if pipeline.name in stack:
        # Pop the pipeline from the stack in order to add it to the end
        # of the stack and ensure it is run before all downstream
        # pipelines
        prev_pipeline, prev_req_outputs, prev_filt_array = stack.pop(
            pipeline.name)
        if pipeline is not prev_pipeline and pipeline != prev_pipeline:
            raise ArcanaDesignError(
                "Attempting to run two different pipelines with the "
                "same name, {} and {}".format(pipeline, prev_pipeline))
        # Combine required outputs and filter array with previous
        # references to the pipeline
        req_outputs = copy(req_outputs)
        req_outputs.update(prev_req_outputs)
        filt_array = filt_array | prev_filt_array
    # If the pipeline to process contains summary outputs (i.e.
    # 'per_subject|visit|study' frequency), then we need to "dialate" the
    # filter array to include IDs across the scope of the study, e.g.
    # all subjects for per_visit, or all visits for per_subject.
    output_freqs = set(pipeline.output_frequencies)
    dialated_filt_array = self._dialate_array(filt_array, pipeline.joins)
    added = dialated_filt_array ^ filt_array
    if added.any():
        filt_array = dialated_filt_array
        # Invert the index dictionaries to get index-to-ID maps
        inv_subject_inds = {v: k for k, v in subject_inds.items()}
        inv_visit_inds = {v: k for k, v in visit_inds.items()}
        logger.warning(
            "Dialated filter array used to process '{}' pipeline to "
            "include {} subject/visit IDs due to its '{}' summary "
            "outputs".format(
                pipeline.name,
                ', '.join('({},{})'.format(inv_subject_inds[s],
                                           inv_visit_inds[v])
                          for s, v in zip(*np.nonzero(added))),
                "' and '".join(output_freqs)))
    stack[pipeline.name] = pipeline, req_outputs, filt_array
    # Recursively add all prerequisites to the stack
    try:
        for (prq_getter,
             prq_req_outputs) in pipeline.prerequisites.items():
            prereq = pipeline.study.pipeline(prq_getter, prq_req_outputs)
            push_on_stack(prereq, filt_array, prq_req_outputs,
                          ((pipeline, req_outputs),) + downstream)
    except (ArcanaMissingDataException,
            ArcanaOutputNotProducedException) as e:
        e.msg += ("\nwhich are required as inputs to the '{}' "
                  "pipeline to produce '{}'".format(
                      pipeline.name, "', '".join(req_outputs)))
        raise e
