def get_field(self, field):
    """
    Retrieve the value of the field from the repository
    """
    # Load fields JSON, locking to prevent read/write conflicts.
    # It would be better to only check whether the file is locked, to allow
    # concurrent reads, but that isn't possible with multi-process locks
    # (in my understanding at least).
    fpath = self.fields_json_path(field)
    try:
        with InterProcessLock(fpath + self.LOCK_SUFFIX,
                              logger=logger), open(fpath, 'r') as f:
            dct = json.load(f)
        val = dct[field.name]
        if field.array:
            val = [field.dtype(v) for v in val]
        else:
            val = field.dtype(val)
    except (KeyError, IOError) as e:
        try:
            # Check that the IOError wasn't just because of a missing file
            if e.errno != errno.ENOENT:
                raise
        except AttributeError:
            pass
        raise ArcanaMissingDataException(
            "{} does not exist in the local repository {}".format(
                field.name, self))
    return val
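
# Hedged sketch of the read pattern above, not Arcana API: it assumes the
# fields JSON written by the corresponding put_field is a flat mapping of
# field names to JSON values, and that the lock suffix is '.lock' (the real
# values come from fields_json_path() and LOCK_SUFFIX).
import json

from fasteners import InterProcessLock


def read_field_value(fpath, name):
    # Hold the inter-process lock while reading so a concurrent writer
    # cannot replace the JSON file mid-read
    with InterProcessLock(fpath + '.lock'), open(fpath, 'r') as f:
        return json.load(f)[name]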
def bound_spec(self, name):
    """
    Returns an input selector or derived spec bound to the study, i.e.
    where the repository tree is checked for existing outputs

    Parameters
    ----------
    name : str
        The name of a fileset or field spec
    """
    # If the provided "name" is actually a data item or parameter then
    # replace it with its name.
    if isinstance(name, BaseData):
        name = name.name
    try:
        bound = self._inputs[name]
    except KeyError:
        # Get the spec from the class
        spec = self.data_spec(name)
        if not spec.derived and spec.default is None:
            raise ArcanaMissingDataException(
                "Input (i.e. non-generated) data '{}' "
                "was not supplied when the study '{}' was "
                "initiated".format(name, self.name))
        else:
            try:
                bound = self._bound_specs[name]
            except KeyError:
                bound = self._bound_specs[name] = spec.bind(self)
    return bound
def mni_template_resolution(self):
    if self.parameter('mni_template_resolution') is not None:
        res = self.parameter('mni_template_resolution')
    else:
        raise ArcanaMissingDataException(
            "Automatic detection of dataset resolution is not implemented "
            "yet, please specify resolution of default MNI templates "
            "manually via 'mni_template_resolution' parameter")
    return res
def get_fileset(self, fileset):
    """
    Return the path of the fileset (and any side-car files) from the
    repository
    """
    # Don't need to cache the fileset as it is already local, as long as
    # the path is set
    if fileset._path is None:
        primary_path = self.fileset_path(fileset)
        aux_files = fileset.format.default_aux_file_paths(primary_path)
        if not op.exists(primary_path):
            raise ArcanaMissingDataException(
                "{} does not exist in {}".format(fileset, self))
        for aux_name, aux_path in aux_files.items():
            if not op.exists(aux_path):
                raise ArcanaMissingDataException(
                    "{} is missing '{}' side car in {}".format(
                        fileset, aux_name, self))
    else:
        primary_path = fileset.path
        aux_files = fileset.aux_files
    return primary_path, aux_files
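
# Illustrative sketch only (the paths and the 'json' side-car key are
# hypothetical, standing in for whatever format.default_aux_file_paths
# returns): for a fileset whose path is not yet set, the repository resolves
# the primary file and its side cars and returns both, e.g.
example_primary_path = '/repo/subj01/visit01/dwi.nii.gz'
example_aux_files = {'json': '/repo/subj01/visit01/dwi.json'}
# Both the primary path and every side-car path must exist on disk, otherwise
# ArcanaMissingDataException is raised.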
def _connect_pipeline(self, pipeline, required_outputs, workflow,
                      subject_inds, visit_inds, filter_array, force=False):
    """
    Connects a pipeline to an overarching workflow that sets up iterators
    over subjects|visits present in the repository (if required) and
    repository source and sink nodes

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to connect
    required_outputs : set[str] | None
        The outputs required to be produced by this pipeline. If None all
        are deemed to be required
    workflow : nipype.pipeline.engine.Workflow
        The overarching workflow to connect the pipeline to
    subject_inds : dict[str, int]
        A mapping of subject ID to row index in the filter array
    visit_inds : dict[str, int]
        A mapping of visit ID to column index in the filter array
    filter_array : 2-D numpy.array[bool]
        A two-dimensional boolean array, where rows correspond to subjects
        and columns correspond to visits in the repository. True values
        represent a combination of subject & visit ID to include in the
        current round of processing. Note that if the 'force' flag is not
        set, sessions won't be reprocessed unless the saved provenance
        doesn't match that of the given pipeline.
    force : bool | 'all'
        A flag to force the processing of all sessions in the filter
        array, regardless of whether the parameters|pipeline used to
        generate existing data matches the given pipeline
    """
    if self.reprocess == 'force':
        force = True
    # Close-off construction of the pipeline and created, input and output
    # nodes and provenance dictionary
    pipeline.cap()
    # Prepend prerequisite pipelines to complete workflow if they need
    # to be (re)processed
    final_nodes = []
    # The array that represents the subject/visit pairs for which any
    # prerequisite pipeline will be (re)processed, and which therefore
    # needs to be included in the processing of the current pipeline. Row
    # indices correspond to subjects and column indices to visits
    prqs_to_process_array = np.zeros((len(subject_inds), len(visit_inds)),
                                     dtype=bool)
    # The array that represents the subject/visit pairs for which any
    # prerequisite pipeline will be skipped due to missing inputs. Row
    # indices correspond to subjects and column indices to visits
    prqs_to_skip_array = np.zeros((len(subject_inds), len(visit_inds)),
                                  dtype=bool)
    for getter_name in pipeline.prerequisites:
        prereq = pipeline.study.pipeline(getter_name)
        if prereq.to_process_array.any():
            final_nodes.append(prereq.node('final'))
            prqs_to_process_array |= prereq.to_process_array
        prqs_to_skip_array |= prereq.to_skip_array
    # Get list of sessions that need to be processed (i.e. if
    # they don't contain the outputs of this pipeline)
    to_process_array, to_protect_array, to_skip_array = self._to_process(
        pipeline, required_outputs, prqs_to_process_array,
        prqs_to_skip_array, filter_array, subject_inds, visit_inds, force)
    # Store the arrays signifying which nodes to process, protect or skip
    # so they can be passed to downstream pipelines
    pipeline.to_process_array = to_process_array
    pipeline.to_protect_array = to_protect_array
    pipeline.to_skip_array = to_skip_array
    # Check to see if there are any sessions to process
    if not to_process_array.any():
        raise ArcanaNoRunRequiredException(
            "No sessions to process for '{}' pipeline".format(
                pipeline.name))
    # Set up workflow to run the pipeline, loading and saving from the
    # repository
    workflow.add_nodes([pipeline._workflow])
    # If prerequisite pipelines need to be processed, connect their
    # "final" nodes to the initial node of this pipeline to ensure that
    # they are all processed before this pipeline is run.
    if final_nodes:
        prereqs = pipeline.add('prereqs', Merge(len(final_nodes)))
        for i, final_node in enumerate(final_nodes, start=1):
            workflow.connect(final_node, 'out', prereqs, 'in{}'.format(i))
    else:
        prereqs = None
    # Construct iterator structure over subjects and sessions to be
    # processed
    iter_nodes = self._iterate(pipeline, to_process_array, subject_inds,
                               visit_inds)
    sources = {}
    # Loop through each frequency present in the pipeline inputs and
    # create a corresponding source node
    for freq in pipeline.input_frequencies:
        try:
            inputs = list(pipeline.frequency_inputs(freq))
        except ArcanaMissingDataException as e:
            raise ArcanaMissingDataException(
                str(e) + ", which is required for pipeline '{}'".format(
                    pipeline.name))
        inputnode = pipeline.inputnode(freq)
        sources[freq] = source = pipeline.add(
            '{}_source'.format(freq),
            RepositorySource(i.collection for i in inputs),
            inputs=({'prereqs': (prereqs, 'out')}
                    if prereqs is not None else {}))
        # Connect iter_nodes to source and input nodes
        for iterator in pipeline.iterators(freq):
            pipeline.connect(iter_nodes[iterator], iterator, source,
                             iterator)
            pipeline.connect(source, iterator, inputnode, iterator)
        for input in inputs:
            pipeline.connect(source, input.suffixed_name, inputnode,
                             input.name)
    deiter_nodes = {}

    def deiter_node_sort_key(it):
        """
        If there are two iter_nodes (i.e. both subject and visit ID) and
        one depends on the other (i.e. if the visit IDs per subject vary
        and vice-versa) we need to ensure that the dependent iterator is
        deiterated (joined) first.
        """
        return iter_nodes[it].itersource is None

    # Connect all outputs to the repository sink, creating a new sink for
    # each frequency level (i.e 'per_session', 'per_subject', 'per_visit',
    # or 'per_study')
    for freq in pipeline.output_frequencies:
        outputs = list(pipeline.frequency_outputs(freq))
        if pipeline.iterators(freq) - pipeline.iterators():
            raise ArcanaDesignError(
                "Doesn't make sense to output '{}', which are of '{}' "
                "frequency, when the pipeline only iterates over '{}'"
                .format("', '".join(o.name for o in outputs), freq,
                        "', '".join(pipeline.iterators())))
        outputnode = pipeline.outputnode(freq)
        # Connect filesets/fields to the sink node, skipping outputs that
        # are study inputs
        to_connect = {o.suffixed_name: (outputnode, o.name)
                      for o in outputs if o.is_spec}
        # Connect iterators to sink node
        to_connect.update(
            {i: (iter_nodes[i], i) for i in pipeline.iterators()})
        # Connect checksums/values from sources to sink node in order to
        # save in provenance, joining where necessary
        for input_freq in pipeline.input_frequencies:
            checksums_to_connect = [
                i.checksum_suffixed_name
                for i in pipeline.frequency_inputs(input_freq)]
            if not checksums_to_connect:
                # Rare case of a pipeline with no inputs, only iter_nodes,
                # that will only occur in unittests in all likelihood
                continue
            # Loop over iterators that need to be joined, i.e. that are
            # present in the input frequency but not the output frequency,
            # and create join nodes
            source = sources[input_freq]
            for iterator in (pipeline.iterators(input_freq) -
                             pipeline.iterators(freq)):
                join = pipeline.add(
                    '{}_to_{}_{}_checksum_join'.format(
                        input_freq, freq, iterator),
                    IdentityInterface(checksums_to_connect),
                    inputs={tc: (source, tc)
                            for tc in checksums_to_connect},
                    joinsource=iterator,
                    joinfield=checksums_to_connect)
                source = join
            to_connect.update(
                {tc: (source, tc) for tc in checksums_to_connect})
        # Add sink node
        sink = pipeline.add(
            '{}_sink'.format(freq),
            RepositorySink((o.collection for o in outputs), pipeline,
                           required_outputs),
            inputs=to_connect)
        # "De-iterate" (join) over iterators to get back to a single node
        # by the time we connect to the final node of the pipeline. Use
        # the sink as the default deiterator if there are no iterators to
        # join over (i.e. per_study), or as the upstream node to connect
        # the first deiterator for each frequency
        deiter_nodes[freq] = sink  # for per_study the "deiterator" == sink
        for iterator in sorted(pipeline.iterators(freq),
                               key=deiter_node_sort_key):
            # Connect to previous deiterator or sink
            # NB: we only need to keep a reference to the last one in the
            # chain in order to connect with the "final" node, so we can
            # overwrite the entry in the 'deiter_nodes' dict
            deiter_nodes[freq] = pipeline.add(
                '{}_{}_deiter'.format(freq, iterator),
                IdentityInterface(['checksums']),
                inputs={'checksums': (deiter_nodes[freq], 'checksums')},
                joinsource=iterator,
                joinfield='checksums')
    # Create a final node, which is used to connect with downstream
    # pipelines
    pipeline.add(
        'final',
        Merge(len(deiter_nodes)),
        inputs={'in{}'.format(i): (di, 'checksums')
                for i, di in enumerate(deiter_nodes.values(), start=1)})
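
# Hedged sketch of the bookkeeping arrays used by _connect_pipeline (the IDs
# are made up): rows index subjects, columns index visits, and subject_inds /
# visit_inds map IDs onto those indices.
import numpy as np

subject_inds = {'subj01': 0, 'subj02': 1}
visit_inds = {'visit01': 0, 'visit02': 1}
filter_array = np.zeros((len(subject_inds), len(visit_inds)), dtype=bool)
# Restrict this round of processing to subj02's 'visit01' session
filter_array[subject_inds['subj02'], visit_inds['visit01']] = True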
def intensity_normalisation_pipeline(self, **name_maps):
    if self.num_sessions < 2:
        raise ArcanaMissingDataException(
            "Cannot normalise intensities of DWI images as study only "
            "contains a single session")
    elif self.num_sessions < self.RECOMMENDED_NUM_SESSIONS_FOR_INTENS_NORM:
        logger.warning(
            "The number of sessions in the study ({}) is less than the "
            "recommended number for intensity normalisation ({}). The "
            "results may be unreliable".format(
                self.num_sessions,
                self.RECOMMENDED_NUM_SESSIONS_FOR_INTENS_NORM))
    pipeline = self.new_pipeline(
        name='intensity_normalization',
        desc="Corrects for B1 field inhomogeneity",
        citations=[mrtrix_req.v('3.0rc3')],
        name_maps=name_maps)

    mrconvert = pipeline.add(
        'mrconvert',
        MRConvert(
            out_ext='.mif'),
        inputs={
            'in_file': (self.series_preproc_spec_name, nifti_gz_format),
            'grad_fsl': self.fsl_grads(pipeline)},
        requirements=[mrtrix_req.v('3.0rc3')])

    # Pair subject and visit ids together, expanding so they can be
    # joined and chained together
    session_ids = pipeline.add(
        'session_ids',
        utility.IdentityInterface(
            ['subject_id', 'visit_id']),
        inputs={
            'subject_id': (Study.SUBJECT_ID, int),
            'visit_id': (Study.VISIT_ID, int)})

    # Set up join nodes
    join_fields = ['dwis', 'masks', 'subject_ids', 'visit_ids']
    join_over_subjects = pipeline.add(
        'join_over_subjects',
        utility.IdentityInterface(
            join_fields),
        inputs={
            'masks': (self.brain_mask_spec_name, nifti_gz_format),
            'dwis': (mrconvert, 'out_file'),
            'subject_ids': (session_ids, 'subject_id'),
            'visit_ids': (session_ids, 'visit_id')},
        joinsource=self.SUBJECT_ID,
        joinfield=join_fields)

    join_over_visits = pipeline.add(
        'join_over_visits',
        Chain(
            join_fields),
        inputs={
            'dwis': (join_over_subjects, 'dwis'),
            'masks': (join_over_subjects, 'masks'),
            'subject_ids': (join_over_subjects, 'subject_ids'),
            'visit_ids': (join_over_subjects, 'visit_ids')},
        joinsource=self.VISIT_ID,
        joinfield=join_fields)

    # Intensity normalization
    intensity_norm = pipeline.add(
        'dwiintensitynorm',
        DWIIntensityNorm(),
        inputs={
            'in_files': (join_over_visits, 'dwis'),
            'masks': (join_over_visits, 'masks')},
        outputs={
            'norm_intens_fa_template': ('fa_template',
                                        mrtrix_image_format),
            'norm_intens_wm_mask': ('wm_mask', mrtrix_image_format)},
        requirements=[mrtrix_req.v('3.0rc3')])

    # Set up expand nodes
    pipeline.add(
        'expand', SelectSession(),
        inputs={
            'subject_ids': (join_over_visits, 'subject_ids'),
            'visit_ids': (join_over_visits, 'visit_ids'),
            'inlist': (intensity_norm, 'out_files'),
            'subject_id': (Study.SUBJECT_ID, int),
            'visit_id': (Study.VISIT_ID, int)},
        outputs={
            'norm_intensity': ('item', mrtrix_image_format)})

    # Connect inputs
    return pipeline
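
# Hedged sketch of the gather/select pattern used above (file names and IDs
# are made up): after joining over subjects and then visits, dwiintensitynorm
# sees flat lists covering every session, and SelectSession is assumed to pick
# the element whose (subject_id, visit_id) pair matches the current iteration.
subject_ids = ['subj01', 'subj01', 'subj02', 'subj02']
visit_ids = ['visit01', 'visit02', 'visit01', 'visit02']
out_files = ['norm1.mif', 'norm2.mif', 'norm3.mif', 'norm4.mif']
index = list(zip(subject_ids, visit_ids)).index(('subj02', 'visit01'))
selected = out_files[index]  # 'norm3.mif' becomes this session's output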