Example 1
    def get_field(self, field):
        """
        Retrieve the value of the field from the repository
        """
        # Load fields JSON, locking to prevent read/write conflicts
        # It would be better to allow concurrent reads and only block on
        # writes, but that isn't possible with multi-process locks (in my
        # understanding at least).
        fpath = self.fields_json_path(field)
        try:
            with InterProcessLock(fpath + self.LOCK_SUFFIX,
                                  logger=logger), open(fpath, 'r') as f:
                dct = json.load(f)
            val = dct[field.name]
            if field.array:
                val = [field.dtype(v) for v in val]
            else:
                val = field.dtype(val)
        except (KeyError, IOError) as e:
            try:
                # Check to see if the IOError wasn't just because of a
                # missing file
                if e.errno != errno.ENOENT:
                    raise
            except AttributeError:
                pass
            raise ArcanaMissingDataException(
                "{} does not exist in the local repository {}".format(
                    field.name, self))
        return val
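
A minimal usage sketch (not part of the listing): it assumes a repository object `repo` exposing the `get_field` method above and a `field` spec, and falls back to a default value when the field is missing. The `arcana.exceptions` import path is an assumption and may differ between Arcana versions.

from arcana.exceptions import ArcanaMissingDataException  # path may vary


def get_field_or_default(repo, field, default=None):
    # `repo` and `field` are hypothetical stand-ins for the local
    # repository and field spec used in the example above
    try:
        return repo.get_field(field)
    except ArcanaMissingDataException:
        # The field has not been stored in the local repository yet
        return default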
Example 2
    def bound_spec(self, name):
        """
        Returns an input selector or derived spec bound to the study, i.e.
        where the repository tree is checked for existing outputs

        Parameters
        ----------
        name : Str
            A name of a fileset or field
        """
        # If the provided "name" is actually a data item or parameter then
        # replace it with its name.
        if isinstance(name, BaseData):
            name = name.name
        try:
            bound = self._inputs[name]
        except KeyError:
            # Get the spec from the class
            spec = self.data_spec(name)
            if not spec.derived and spec.default is None:
                raise ArcanaMissingDataException(
                    "Input (i.e. non-generated) data '{}' "
                    "was not supplied when the study '{}' was "
                    "initiated".format(name, self.name))
            else:
                try:
                    bound = self._bound_specs[name]
                except KeyError:
                    bound = self._bound_specs[name] = spec.bind(self)
        return bound
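
A hedged sketch of how a caller might use `bound_spec` to test whether a named input is usable (supplied, defaulted, or derivable) before building an optional branch of a study; `study` is a hypothetical Study instance and the import path is assumed.

from arcana.exceptions import ArcanaMissingDataException  # path may vary


def has_usable_spec(study, name):
    # True if `name` was supplied as an input, has a default, or is derived
    try:
        study.bound_spec(name)
    except ArcanaMissingDataException:
        return False
    return True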
Example 3
    def mni_template_resolution(self):
        if self.parameter('mni_template_resolution') is not None:
            res = self.parameter('mni_template_resolution')
        else:
            raise ArcanaMissingDataException(
                "Automatic detection of dataset resolution is not implemented "
                "yet; please specify the resolution of the default MNI "
                "templates manually via the 'mni_template_resolution' "
                "parameter")
        return res
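
Since automatic detection of the template resolution is not implemented, the exception can be caught to apply an explicit fallback. A hypothetical sketch, assuming a `study` object exposing the method above; the 2 mm fallback is only an example value and the import path is assumed.

from arcana.exceptions import ArcanaMissingDataException  # path may vary


def template_resolution_or(study, fallback=2):
    # Return the configured MNI template resolution, else `fallback` (in mm)
    try:
        return study.mni_template_resolution()
    except ArcanaMissingDataException:
        return fallback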
Example 4
    def get_fileset(self, fileset):
        """
        Return the local path of the fileset (and its auxiliary files) from
        the repository
        """
        # Don't need to cache the fileset as it is already local, as long
        # as the path is set
        if fileset._path is None:
            primary_path = self.fileset_path(fileset)
            aux_files = fileset.format.default_aux_file_paths(primary_path)
            if not op.exists(primary_path):
                raise ArcanaMissingDataException(
                    "{} does not exist in {}".format(fileset, self))
            for aux_name, aux_path in aux_files.items():
                if not op.exists(aux_path):
                    raise ArcanaMissingDataException(
                        "{} is missing '{}' side car in {}".format(
                            fileset, aux_name, self))
        else:
            primary_path = fileset.path
            aux_files = fileset.aux_files
        return primary_path, aux_files
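
A hedged sketch of a pre-flight check built on `get_fileset`: it reports whether the primary file and all of its side-car files exist locally. `repo` and `fileset` are hypothetical stand-ins and the import path is assumed.

from arcana.exceptions import ArcanaMissingDataException  # path may vary


def fileset_is_available(repo, fileset):
    # True only if the primary path and every side-car file exist locally
    try:
        repo.get_fileset(fileset)
    except ArcanaMissingDataException:
        return False
    return True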
Example 5
    def _connect_pipeline(self,
                          pipeline,
                          required_outputs,
                          workflow,
                          subject_inds,
                          visit_inds,
                          filter_array,
                          force=False):
        """
        Connects a pipeline to an overarching workflow that sets up iterators
        over subjects|visits present in the repository (if required) and
        repository source and sink nodes

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline to connect
        required_outputs : set[str] | None
            The outputs required to be produced by this pipeline. If None, all
            are deemed to be required
        workflow : nipype.pipeline.engine.Workflow
            The overarching workflow to connect the pipeline to
        subject_inds : dict[str, int]
            A mapping of subject ID to row index in the filter array
        visit_inds : dict[str, int]
            A mapping of visit ID to column index in the filter array
        filter_array : 2-D numpy.array[bool]
            A two-dimensional boolean array, where rows correspond to
            subjects and columns correspond to visits in the repository. True
            values represent a combination of subject & visit ID to include
            in the current round of processing. Note that if the 'force'
            flag is not set, sessions won't be reprocessed unless the
            saved provenance doesn't match that of the given pipeline.
        force : bool | 'all'
            A flag to force the processing of all sessions in the filter
            array, regardless of whether the parameters|pipeline used
            to generate existing data matches the given pipeline
        """
        if self.reprocess == 'force':
            force = True
        # Close off construction of the pipeline and create its input and
        # output nodes and provenance dictionary
        pipeline.cap()
        # Prepend prerequisite pipelines to complete workflow if they need
        # to be (re)processed
        final_nodes = []
        # The array that represents the subject/visit pairs for which any
        # prerequisite pipeline will be (re)processed, and which therefore
        # needs to be included in the processing of the current pipeline. Row
        # indices correspond to subjects and column indices visits
        prqs_to_process_array = np.zeros((len(subject_inds), len(visit_inds)),
                                         dtype=bool)
        # The array that represents the subject/visit pairs for which any
        # prerequisite pipeline will be skipped due to missing inputs. Row
        # indices correspond to subjects and column indices visits
        prqs_to_skip_array = np.zeros((len(subject_inds), len(visit_inds)),
                                      dtype=bool)
        for getter_name in pipeline.prerequisites:
            prereq = pipeline.study.pipeline(getter_name)
            if prereq.to_process_array.any():
                final_nodes.append(prereq.node('final'))
                prqs_to_process_array |= prereq.to_process_array
            prqs_to_skip_array |= prereq.to_skip_array
        # Get list of sessions that need to be processed (i.e. if
        # they don't contain the outputs of this pipeline)
        to_process_array, to_protect_array, to_skip_array = self._to_process(
            pipeline, required_outputs, prqs_to_process_array,
            prqs_to_skip_array, filter_array, subject_inds, visit_inds, force)
        # Store the arrays signifying which nodes to process, protect or skip
        # so they can be passed to downstream pipelines
        pipeline.to_process_array = to_process_array
        pipeline.to_protect_array = to_protect_array
        pipeline.to_skip_array = to_skip_array
        # Check to see if there are any sessions to process
        if not to_process_array.any():
            raise ArcanaNoRunRequiredException(
                "No sessions to process for '{}' pipeline".format(
                    pipeline.name))
        # Set up workflow to run the pipeline, loading and saving from the
        # repository
        workflow.add_nodes([pipeline._workflow])
        # If prerequisite pipelines need to be processed, connect their
        # "final" nodes to the initial node of this pipeline to ensure that
        # they are all processed before this pipeline is run.
        if final_nodes:
            prereqs = pipeline.add('prereqs', Merge(len(final_nodes)))
            for i, final_node in enumerate(final_nodes, start=1):
                workflow.connect(final_node, 'out', prereqs, 'in{}'.format(i))
        else:
            prereqs = None
        # Construct iterator structure over subjects and sessions to be
        # processed
        iter_nodes = self._iterate(pipeline, to_process_array, subject_inds,
                                   visit_inds)
        sources = {}
        # Loop through each frequency present in the pipeline inputs and
        # create a corresponding source node
        for freq in pipeline.input_frequencies:
            try:
                inputs = list(pipeline.frequency_inputs(freq))
            except ArcanaMissingDataException as e:
                raise ArcanaMissingDataException(
                    str(e) + ", which is required for pipeline '{}'".format(
                        pipeline.name))
            inputnode = pipeline.inputnode(freq)
            sources[freq] = source = pipeline.add(
                '{}_source'.format(freq),
                RepositorySource(i.collection for i in inputs),
                inputs=({
                    'prereqs': (prereqs, 'out')
                } if prereqs is not None else {}))
            # Connect iter_nodes to source and input nodes
            for iterator in pipeline.iterators(freq):
                pipeline.connect(iter_nodes[iterator], iterator, source,
                                 iterator)
                pipeline.connect(source, iterator, inputnode, iterator)
            for input in inputs:
                pipeline.connect(source, input.suffixed_name, inputnode,
                                 input.name)
        deiter_nodes = {}

        def deiter_node_sort_key(it):
            """
            If there are two iter_nodes (i.e. both subject and visit ID) and
            one depends on the other (i.e. if the visit IDs per subject
            vary and vice-versa) we need to ensure that the dependent
            iterator is deiterated (joined) first.
            """
            return iter_nodes[it].itersource is None

        # Connect all outputs to the repository sink, creating a new sink for
        # each frequency level (i.e. 'per_session', 'per_subject', 'per_visit',
        # or 'per_study')
        for freq in pipeline.output_frequencies:
            outputs = list(pipeline.frequency_outputs(freq))
            if pipeline.iterators(freq) - pipeline.iterators():
                raise ArcanaDesignError(
                    "Doesn't make sense to output '{}', which are of '{}' "
                    "frequency, when the pipeline only iterates over '{}'".
                    format("', '".join(o.name for o in outputs), freq,
                           "', '".join(pipeline.iterators())))
            outputnode = pipeline.outputnode(freq)
            # Connect filesets/fields to the sink node, skipping outputs
            # that are study inputs
            to_connect = {
                o.suffixed_name: (outputnode, o.name)
                for o in outputs if o.is_spec
            }
            # Connect iterators to sink node
            to_connect.update(
                {i: (iter_nodes[i], i)
                 for i in pipeline.iterators()})
            # Connect checksums/values from sources to sink node in order to
            # save in provenance, joining where necessary
            for input_freq in pipeline.input_frequencies:
                checksums_to_connect = [
                    i.checksum_suffixed_name
                    for i in pipeline.frequency_inputs(input_freq)
                ]
                if not checksums_to_connect:
                    # Rare case of a pipeline with no inputs, only iter_nodes,
                    # which in all likelihood will only occur in unit tests
                    continue
                # Loop over iterators that need to be joined, i.e. that are
                # present in the input frequency but not the output frequency
                # and create join nodes
                source = sources[input_freq]
                for iterator in (pipeline.iterators(input_freq) -
                                 pipeline.iterators(freq)):
                    join = pipeline.add(
                        '{}_to_{}_{}_checksum_join'.format(
                            input_freq, freq, iterator),
                        IdentityInterface(checksums_to_connect),
                        inputs={
                            tc: (source, tc)
                            for tc in checksums_to_connect
                        },
                        joinsource=iterator,
                        joinfield=checksums_to_connect)
                    source = join
                to_connect.update(
                    {tc: (source, tc)
                     for tc in checksums_to_connect})
            # Add sink node
            sink = pipeline.add('{}_sink'.format(freq),
                                RepositorySink((o.collection for o in outputs),
                                               pipeline, required_outputs),
                                inputs=to_connect)
            # "De-iterate" (join) over iterators to get back to single child
            # node by the time we connect to the final node of the pipeline Set
            # the sink and subject_id as the default deiterator if there are no
            # deiterates (i.e. per_study) or to use as the upstream node to
            # connect the first deiterator for every frequency
            deiter_nodes[freq] = sink  # for per_study the "deiterator" == sink
            for iterator in sorted(pipeline.iterators(freq),
                                   key=deiter_node_sort_key):
                # Connect to previous deiterator or sink
                # NB: we only need to keep a reference to the last one in the
                # chain in order to connect with the "final" node, so we can
                # overwrite the entry in the 'deiter_nodes' dict
                deiter_nodes[freq] = pipeline.add(
                    '{}_{}_deiter'.format(freq, iterator),
                    IdentityInterface(['checksums']),
                    inputs={'checksums': (deiter_nodes[freq], 'checksums')},
                    joinsource=iterator,
                    joinfield='checksums')
        # Create a final node, which is used to connect with downstream
        # pipelines
        pipeline.add('final',
                     Merge(len(deiter_nodes)),
                     inputs={
                         'in{}'.format(i): (di, 'checksums')
                         for i, di in enumerate(deiter_nodes.values(), start=1)
                     })
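
`_connect_pipeline` raises ArcanaNoRunRequiredException when every requested session already has up-to-date outputs, while ArcanaMissingDataException propagates when a required input is absent. A hypothetical caller loop, with all names other than the exception class illustrative and the import path assumed:

from arcana.exceptions import ArcanaNoRunRequiredException  # path may vary


def connect_pipelines(processor, pipelines, **connect_kwargs):
    # Connect each pipeline, skipping any with nothing left to (re)process;
    # missing-input errors (ArcanaMissingDataException) are left to propagate
    connected = []
    for pipeline in pipelines:
        try:
            processor._connect_pipeline(pipeline, **connect_kwargs)
        except ArcanaNoRunRequiredException:
            continue
        connected.append(pipeline)
    return connected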
Example 6
    def intensity_normalisation_pipeline(self, **name_maps):

        if self.num_sessions < 2:
            raise ArcanaMissingDataException(
                "Cannot normalise intensities of DWI images as the study "
                "only contains a single session")
        elif self.num_sessions < self.RECOMMENDED_NUM_SESSIONS_FOR_INTENS_NORM:
            logger.warning(
                "The number of sessions in the study ({}) is less than the "
                "recommended number for intensity normalisation ({}). The "
                "results may be unreliable".format(
                    self.num_sessions,
                    self.RECOMMENDED_NUM_SESSIONS_FOR_INTENS_NORM))

        pipeline = self.new_pipeline(
            name='intensity_normalization',
            desc="Corrects for B1 field inhomogeneity",
            citations=[mrtrix_req.v('3.0rc3')],
            name_maps=name_maps)

        mrconvert = pipeline.add(
            'mrconvert',
            MRConvert(
                out_ext='.mif'),
            inputs={
                'in_file': (self.series_preproc_spec_name, nifti_gz_format),
                'grad_fsl': self.fsl_grads(pipeline)},
            requirements=[mrtrix_req.v('3.0rc3')])

        # Pair subject and visit ids together, expanding so they can be
        # joined and chained together
        session_ids = pipeline.add(
            'session_ids',
            utility.IdentityInterface(
                ['subject_id', 'visit_id']),
            inputs={
                'subject_id': (Study.SUBJECT_ID, int),
                'visit_id': (Study.VISIT_ID, int)})

        # Set up join nodes
        join_fields = ['dwis', 'masks', 'subject_ids', 'visit_ids']
        join_over_subjects = pipeline.add(
            'join_over_subjects',
            utility.IdentityInterface(
                join_fields),
            inputs={
                'masks': (self.brain_mask_spec_name, nifti_gz_format),
                'dwis': (mrconvert, 'out_file'),
                'subject_ids': (session_ids, 'subject_id'),
                'visit_ids': (session_ids, 'visit_id')},
            joinsource=self.SUBJECT_ID,
            joinfield=join_fields)

        join_over_visits = pipeline.add(
            'join_over_visits',
            Chain(
                join_fields),
            inputs={
                'dwis': (join_over_subjects, 'dwis'),
                'masks': (join_over_subjects, 'masks'),
                'subject_ids': (join_over_subjects, 'subject_ids'),
                'visit_ids': (join_over_subjects, 'visit_ids')},
            joinsource=self.VISIT_ID,
            joinfield=join_fields)

        # Intensity normalization
        intensity_norm = pipeline.add(
            'dwiintensitynorm',
            DWIIntensityNorm(),
            inputs={
                'in_files': (join_over_visits, 'dwis'),
                'masks': (join_over_visits, 'masks')},
            outputs={
                'norm_intens_fa_template': ('fa_template',
                                            mrtrix_image_format),
                'norm_intens_wm_mask': ('wm_mask', mrtrix_image_format)},
            requirements=[mrtrix_req.v('3.0rc3')])

        # Set up expand nodes
        pipeline.add(
            'expand', SelectSession(),
            inputs={
                'subject_ids': (join_over_visits, 'subject_ids'),
                'visit_ids': (join_over_visits, 'visit_ids'),
                'inlist': (intensity_norm, 'out_files'),
                'subject_id': (Study.SUBJECT_ID, int),
                'visit_id': (Study.VISIT_ID, int)},
            outputs={
                'norm_intensity': ('item', mrtrix_image_format)})

        # Connect inputs
        return pipeline
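
Because the pipeline constructor raises ArcanaMissingDataException for single-session studies, a caller can treat intensity normalisation as optional. A hypothetical sketch, with `study` standing in for a DWI study exposing the method above and the import path assumed.

from arcana.exceptions import ArcanaMissingDataException  # path may vary


def maybe_normalise_intensities(study, **name_maps):
    # Build the normalisation pipeline, or return None if the study only
    # contains a single session
    try:
        return study.intensity_normalisation_pipeline(**name_maps)
    except ArcanaMissingDataException:
        return None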