def __init__(self, name, collection, frequency=None, format=None):  # @ReservedAssignment
    collection = list(collection)
    if collection:
        implicit_frequency = self._common_attr(collection, 'frequency')
        if frequency is None:
            frequency = implicit_frequency
        elif frequency != implicit_frequency:
            raise ArcanaUsageError(
                "Implicit frequency '{}' does not match explicit "
                "frequency '{}' for '{}' FilesetCollection".format(
                    implicit_frequency, frequency, name))
        implicit_format = self._common_attr(collection, 'format')
        if format is None:
            format = implicit_format  # @ReservedAssignment
        elif format != implicit_format:
            raise ArcanaUsageError(
                "Implicit format '{}' does not match explicit "
                "format '{}' for '{}' FilesetCollection".format(
                    implicit_format, format, name))
    if frequency is None:
        raise ArcanaUsageError(
            "Need to provide explicit frequency for empty "
            "FilesetCollection")
    if format is None:
        raise ArcanaUsageError(
            "Need to provide explicit format for empty "
            "FilesetCollection")
    BaseFileset.__init__(self, name, format, frequency=frequency)
    BaseCollection.__init__(self, collection, frequency)
def __init__(self, name, format, pattern=None,  # @ReservedAssignment @IgnorePep8
             frequency='per_session', id=None,  # @ReservedAssignment @IgnorePep8
             order=None, dicom_tags=None, is_regex=False, from_study=None,
             repository=None, study_=None, collection_=None):
    if pattern is None and id is None:
        raise ArcanaUsageError(
            "Either 'pattern' or 'id' need to be provided to "
            "FilesetSelector constructor")
    BaseFileset.__init__(self, name, format, frequency)
    BaseMatch.__init__(self, pattern, is_regex, order, from_study,
                       repository, study_, collection_)
    if dicom_tags is not None and format.name != 'dicom':
        raise ArcanaUsageError(
            "Cannot use 'dicom_tags' kwarg with non-DICOM "
            "format ({})".format(format))
    self._dicom_tags = dicom_tags
    if order is not None and id is not None:
        raise ArcanaUsageError(
            "Cannot provide both 'order' and 'id' to a fileset "
            "match")
    self._id = str(id) if id is not None else id
def __init__(self, name, default, choices=None, desc=None, dtype=None):
    super(SwitchSpec, self).__init__(name, default, desc=desc,
                                     dtype=dtype)
    if self.is_boolean:
        if choices is not None:
            raise ArcanaUsageError(
                "Choices ('{}') are only valid for non-boolean "
                "switches ('{}')".format("', '".join(choices), name))
    elif choices is None:
        raise ArcanaUsageError(
            "Choices must be provided for non-boolean "
            "switches ('{}')".format(name))
    self._choices = tuple(choices) if choices is not None else None
    self._desc = desc
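# Usage sketch (illustrative only, not from the source; the switch names are
# hypothetical and it is assumed that is_boolean is determined by the type of
# the default value): boolean switches take no choices, while non-boolean
# switches must declare them.
bias_correct = SwitchSpec('bias_correct', True)
interp = SwitchSpec('interpolation', 'linear', choices=('linear', 'cubic'))
# SwitchSpec('interpolation', 'linear') would raise ArcanaUsageError because
# a non-boolean switch needs an explicit set of choices.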
def check_valid(self, parameter, context=None):
    if parameter.value is not None:
        error_msg = ("Incorrect datatype for '{}' parameter provided "
                     "({}){}, Should be {}".format(
                         parameter.name, type(parameter.value),
                         (' in ' + context) if context is not None else '',
                         self.dtype))
        if self.array:
            if any(not isinstance(v, self.dtype)
                   for v in parameter.value):
                raise ArcanaUsageError(error_msg + ' array')
        elif not isinstance(parameter.value, self.dtype):
            raise ArcanaUsageError(error_msg)
def check_valid(self, switch, context=''):
    super(SwitchSpec, self).check_valid(switch, context=context)
    if self.is_boolean:
        if not isinstance(switch.value, bool):
            raise ArcanaUsageError(
                "Value provided to switch '{}'{} should be a "
                "boolean (not {})".format(
                    self.name, context, switch.value))
    elif switch.value not in self.choices:
        raise ArcanaUsageError(
            "Value provided to switch '{}'{} ({}) is not a valid "
            "choice ('{}')".format(
                self.name, context, switch.value,
                "', '".join(self.choices)))
def _list_outputs(self):
    outputs = self._outputs().get()
    new_files = set(os.listdir(os.getcwd())) - self.listdir_before
    if len(new_files) > 1:
        raise ArcanaUsageError(
            "Zip repositories can only contain a single directory, "
            "found '{}'".format("', '".join(new_files)))
    try:
        unzipped = next(iter(new_files))
    except StopIteration:
        raise ArcanaUsageError(
            "No files or directories found in unzipped directory")
    outputs['gunzipped'] = op.join(os.getcwd(), unzipped)
    return outputs
def segmentation_pipeline(self, img_type=2, **kwargs):
    pipeline = self.create_pipeline(
        name='FAST_segmentation',
        inputs=[DatasetSpec('brain', nifti_gz_format)],
        outputs=[DatasetSpec('wm_seg', nifti_gz_format)],
        desc="White matter segmentation of the reference image",
        version=1,
        citations=[fsl_cite],
        **kwargs)
    fast = pipeline.create_node(fsl.FAST(), name='fast',
                                requirements=[fsl509_req])
    fast.inputs.img_type = img_type
    fast.inputs.segments = True
    fast.inputs.out_basename = 'Reference_segmentation'
    pipeline.connect_input('brain', fast, 'in_files')
    split = pipeline.create_node(Split(), name='split')
    split.inputs.splits = [1, 1, 1]
    split.inputs.squeeze = True
    pipeline.connect(fast, 'tissue_class_files', split, 'inlist')
    if img_type == 1:
        pipeline.connect_output('wm_seg', split, 'out3')
    elif img_type == 2:
        pipeline.connect_output('wm_seg', split, 'out2')
    else:
        raise ArcanaUsageError(
            "'img_type' parameter can either be 1 or 2 (not {})".format(
                img_type))
    return pipeline
def __init__(self, name, primary_match, format, association,  # @ReservedAssignment @IgnorePep8
             fieldmap_type=None, order=0):
    FilesetSelector.__init__(
        self, name, format, pattern=None, frequency='per_session',  # @ReservedAssignment @IgnorePep8
        id=None, order=order, dicom_tags=None, is_regex=False,
        from_study=None)
    self._primary_match = primary_match
    self._association = association
    if fieldmap_type is not None and association != 'fieldmap':
        raise ArcanaUsageError(
            "'fieldmap_type' (provided to '{}' match) is only valid "
            "for 'fieldmap' associations (not '{}')".format(
                name, association))
    self._fieldmap_type = fieldmap_type
def _common_attr(self, collection, attr_name):
    attr_set = set(getattr(c, attr_name) for c in collection)
    if len(attr_set) != 1:
        raise ArcanaUsageError(
            "Heterogeneous attributes for '{}' within {}".format(
                attr_name, self))
    return next(iter(attr_set))
def from_path(cls, path, frequency='per_session', format=None,  # @ReservedAssignment @IgnorePep8
              **kwargs):
    if not os.path.exists(path):
        raise ArcanaUsageError(
            "Attempting to read Fileset from path '{}' but it "
            "does not exist".format(path))
    if os.path.isdir(path):
        within_exts = frozenset(
            split_extension(f)[1] for f in os.listdir(path)
            if not f.startswith('.'))
        if format is None:
            # Try to guess format
            try:
                format = FileFormat.by_within_dir_exts(within_exts)  # @ReservedAssignment @IgnorePep8
            except ArcanaFileFormatNotRegisteredError:
                # Fall back to general directory format
                format = directory_format  # @ReservedAssignment
        name = os.path.basename(path)
    else:
        filename = os.path.basename(path)
        name, ext = split_extension(filename)
        if format is None:
            try:
                format = FileFormat.by_ext(ext)  # @ReservedAssignment @IgnorePep8
            except ArcanaFileFormatNotRegisteredError as e:
                raise ArcanaFileFormatNotRegisteredError(
                    str(e) + ", which is required to identify the "
                    "format of the fileset at '{}'".format(path))
    return cls(name, format, frequency=frequency, path=path, **kwargs)
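# Usage sketch (illustrative only; the path is hypothetical, the enclosing
# class is assumed to be Fileset with this method exposed as a classmethod,
# and nifti_gz_format is the format object used elsewhere in these snippets):
# the format is guessed from the file extension when not given explicitly.
fs = Fileset.from_path('/data/study/sub01/t1.nii.gz')
# Equivalent, with the format stated explicitly:
fs = Fileset.from_path('/data/study/sub01/t1.nii.gz',
                       format=nifti_gz_format)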
def header_info_extraction_pipeline(self, **kwargs):
    if self.input('primary').format != dicom_format:
        raise ArcanaUsageError(
            "Can only extract header info if 'primary' dataset "
            "is provided in DICOM format ({})".format(
                self.input('primary').format))
    return self.header_info_extraction_pipeline_factory(
        'header_info_extraction', 'primary', **kwargs)
def __init__(self, name, collection, frequency=None, dtype=None,
             array=None):
    collection = list(collection)
    if collection:
        implicit_frequency = self._common_attr(collection, 'frequency')
        if frequency is None:
            frequency = implicit_frequency
        elif frequency != implicit_frequency:
            raise ArcanaUsageError(
                "Implicit frequency '{}' does not match explicit "
                "frequency '{}' for '{}' FilesetCollection".format(
                    implicit_frequency, frequency, name))
        implicit_dtype = self._common_attr(collection, 'dtype')
        if dtype is None:
            dtype = implicit_dtype  # @ReservedAssignment
        elif dtype != implicit_dtype:
            raise ArcanaUsageError(
                "Implicit dtype '{}' does not match explicit "
                "dtype '{}' for '{}' FilesetCollection".format(
                    implicit_dtype, dtype, name))
        implicit_array = self._common_attr(collection, 'array')
        if array is None:
            array = implicit_array
        elif array != implicit_array:
            raise ArcanaUsageError(
                "Implicit array '{}' does not match explicit "
                "array '{}' for '{}' FilesetCollection".format(
                    implicit_array, array, name))
    if frequency is None:
        raise ArcanaUsageError(
            "Need to provide explicit frequency for empty "
            "FilesetCollection")
    if dtype is None:
        raise ArcanaUsageError(
            "Need to provide explicit dtype for empty "
            "FilesetCollection")
    BaseField.__init__(self, name, dtype=dtype, frequency=frequency,
                       array=array)
    BaseCollection.__init__(self, collection, frequency)
def __init__(self, name, desc=None, optional=False, default=None):
    if optional and default is not None:
        raise ArcanaUsageError(
            "'optional' doesn't make sense for specs ('{}') with "
            "default values".format(name))
    self._desc = desc
    self._study = None
    self._optional = optional
    # Set the name of the default collection-like object so it matches
    # the name of the spec
    if default is not None:
        if default.frequency != self.frequency:
            raise ArcanaUsageError(
                "Frequency of default collection-like object passed to "
                "'{}' spec ('{}'), does not match spec ('{}')".format(
                    name, default.frequency, self.frequency))
        default = deepcopy(default)
    self._default = default
def __init__(self, name, pipeline_name, desc=None):
    if pipeline_name is not None:
        if not isinstance(pipeline_name, basestring):
            raise ArcanaUsageError(
                "Pipeline name for '{}' ('{}') is not a string".format(
                    name, pipeline_name))
    self._pipeline_name = pipeline_name
    self._desc = desc
    self._study = None
    self._collection = None
def path_depth(self, dpath):
    relpath = op.relpath(dpath, self.root_dir)
    if '..' in relpath:
        raise ArcanaUsageError(
            "Path '{}' is not a sub-directory of '{}'".format(
                dpath, self.root_dir))
    elif relpath == '.':
        depth = 0
    else:
        depth = relpath.count(op.sep) + 1
    return depth
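# Usage sketch (illustrative only; 'repo' and the paths are hypothetical):
# for a repository whose root_dir is '/data/project',
#
#     repo.path_depth('/data/project')               # -> 0
#     repo.path_depth('/data/project/sub01')         # -> 1
#     repo.path_depth('/data/project/sub01/visit1')  # -> 2
#     repo.path_depth('/tmp/elsewhere')              # raises ArcanaUsageError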
def __init__(self, name, value):
    self._name = name
    if value is None:
        self._dtype = None
    else:
        if not isinstance(value, (int, float, str, tuple, list)):
            raise ArcanaUsageError(
                "Invalid type for '{}' parameter default ({}), {}, "
                "can be one of int, float, str, tuple or list".format(
                    name, value, type(value)))
        self._dtype = (str if isinstance(value, str) else type(value))
    self._value = value
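# Usage sketch (illustrative only; the parameter names are hypothetical and
# this constructor is assumed to be the Parameter class used elsewhere in
# these snippets): the dtype is inferred from the supplied value.
p1 = Parameter('threshold', 0.5)       # dtype inferred as float
p2 = Parameter('num_iterations', 10)   # dtype inferred as int
p3 = Parameter('mask_name', 'brain')   # dtype inferred as str
# Parameter('bad', object()) would raise ArcanaUsageError.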
def bind(self, study):
    """
    Used for duck typing Collection objects with Spec and Match
    in source and sink initiation. Checks IDs match sessions in study.
    """
    if self.frequency == 'per_subject':
        tree_subject_ids = list(study.tree.subject_ids)
        subject_ids = list(self._collection.keys())
        if tree_subject_ids != subject_ids:
            raise ArcanaUsageError(
                "Subject IDs in collection provided to '{}' ('{}') "
                "do not match Study tree ('{}')".format(
                    self.name, "', '".join(subject_ids),
                    "', '".join(tree_subject_ids)))
    elif self.frequency == 'per_visit':
        tree_visit_ids = list(study.tree.visit_ids)
        visit_ids = list(self._collection.keys())
        if tree_visit_ids != visit_ids:
            raise ArcanaUsageError(
                "Visit IDs in collection provided to '{}' ('{}') "
                "do not match Study tree ('{}')".format(
                    self.name, "', '".join(visit_ids),
                    "', '".join(tree_visit_ids)))
    elif self.frequency == 'per_session':
        for subject in study.tree.subjects:
            if subject.id not in self._collection:
                raise ArcanaUsageError(
                    "Study subject ID '{}' was not found in collection "
                    "provided to '{}' (found '{}')".format(
                        subject.id, self.name,
                        "', '".join(self._collection.keys())))
            for session in subject.sessions:
                if session.visit_id not in self._collection[subject.id]:
                    raise ArcanaUsageError(
                        "Study visit ID '{}' for subject '{}' was not "
                        "found in collection provided to '{}' "
                        "(found '{}')".format(
                            session.visit_id, subject.id, self.name,
                            "', '".join(
                                self._collection[subject.id].keys())))
def __init__(self, collection, frequency):
    self._frequency = frequency
    if collection:
        self._repository = self._common_attr(collection, 'repository')
        self._from_study = self._common_attr(collection, 'from_study')
    else:
        self._repository = None
        self._from_study = None
    if frequency == 'per_study':
        # If wrapped in an iterable, unwrap the single item
        if not isinstance(collection, self.CollectedClass):
            if len(collection) > 1:
                raise ArcanaUsageError(
                    "More than one {} passed to {}".format(
                        self.CollectedClass.__name__,
                        type(self).__name__))
            collection = list(collection)
        self._collection = collection
    elif frequency == 'per_session':
        self._collection = OrderedDict()
        for subj_id in sorted(set(c.subject_id for c in collection)):
            self._collection[subj_id] = OrderedDict(
                sorted(((c.visit_id, c) for c in collection
                        if c.subject_id == subj_id),
                       key=itemgetter(0)))
    elif frequency == 'per_subject':
        self._collection = OrderedDict(
            sorted(((c.subject_id, c) for c in collection),
                   key=itemgetter(0)))
    elif frequency == 'per_visit':
        self._collection = OrderedDict(
            sorted(((c.visit_id, c) for c in collection),
                   key=itemgetter(0)))
    else:
        assert False
    for datum in self:
        if not isinstance(datum, self.CollectedClass):
            raise ArcanaUsageError(
                "Invalid class {} in {}".format(datum, self))
def assertContentsEqual(self, collection, reference, context=None):
    if isinstance(collection, Fileset):
        collection = [collection]
    if isinstance(reference, (basestring, int, float)):
        if len(collection) != 1:
            raise ArcanaUsageError(
                "Multi-subject/visit collections cannot be compared "
                "against a single contents string (list or dict "
                "should be provided)")
        references = [str(reference)]
        filesets = list(collection)
    elif isinstance(reference, dict):
        references = []
        filesets = []
        for subj_id, subj_dct in reference.items():
            for visit_id, ref_value in subj_dct.items():
                references.append(str(ref_value))
                filesets.append(collection.item(subject_id=subj_id,
                                                visit_id=visit_id))
    elif isinstance(reference, (list, tuple)):
        references = [str(r) for r in reference]
        filesets = list(collection)
        if len(references) != len(filesets):
            raise ArcanaUsageError(
                "Number of provided references ({}) does not match "
                "size of collection ({})".format(len(references),
                                                 len(filesets)))
    else:
        raise ArcanaUsageError(
            "Unrecognised format for reference ({})".format(reference))
    for fileset, ref in zip(filesets, references):
        with open(fileset.path) as f:
            contents = f.read()
        msg = ("Contents of {} ({}) do not match reference ({})"
               .format(fileset, contents, ref))
        if context is not None:
            msg += ' for ' + context
        self.assertEqual(contents, ref, msg)
def __init__(self, name, default, desc=None, dtype=None, array=False):
    super(ParameterSpec, self).__init__(name, default)
    self._desc = desc
    self._array = array
    if dtype is not None:
        if self.default is not None and (
                not array and not isinstance(self.default, dtype) or
                array and any(not isinstance(d, dtype)
                              for d in self.default)):
            raise ArcanaUsageError(
                "Provided default value ({}) does not match explicit "
                "dtype ({})".format(self.default, dtype))
    self._dtype = dtype
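# Usage sketch (illustrative only; the parameter names are hypothetical and
# ParameterSpec is assumed to be the class whose constructor is shown above):
# an explicit dtype is checked against the default value at construction.
spec = ParameterSpec('num_iterations', 10, desc="Iteration count", dtype=int)
arr_spec = ParameterSpec('echo_times', [2.0, 4.0], dtype=float, array=True)
# ParameterSpec('num_iterations', 10, dtype=float) would raise
# ArcanaUsageError because the default does not match the declared dtype.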
def derivable(self):
    """
    Whether the spec (only valid for derived specs) can be derived
    given the inputs and switches provided to the study
    """
    if not self.derived:
        raise ArcanaUsageError(
            "'{}' is not a derived {}".format(self.name, type(self)))
    try:
        for inpt in self.pipeline.study_inputs:
            self.study.spec(inpt.name)
    except (ArcanaOutputNotProducedException,
            ArcanaMissingDataException):
        return False
    return True
def parse_single_value(value):
    """
    Tries to convert to int, then float, and otherwise assumes the value
    is of type string. Useful when accepting values that may be string
    representations of numerical values
    """
    if isinstance(value, (int, float)):
        return value
    try:
        value = int(value)
    except ValueError:
        try:
            value = float(value)
        except ValueError:
            if not isinstance(value, basestring):
                raise ArcanaUsageError(
                    "Unrecognised value type {}".format(value))
    return value
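# Usage sketch (illustrative only, not from the source):
assert parse_single_value(7) == 7                # ints pass straight through
assert parse_single_value('42') == 42            # numeric strings become ints
assert parse_single_value('2.5') == 2.5          # ...or floats
assert parse_single_value('linear') == 'linear'  # other strings are kept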
def pipeline(self):
    if self.pipeline_name is None:
        raise ArcanaUsageError(
            "{} is an acquired data spec so doesn't have a pipeline"
            .format(self))
    try:
        getter = getattr(self.study, self.pipeline_name)
    except AttributeError:
        raise ArcanaError(
            "There is no pipeline method named '{}' present in "
            "'{}' study".format(self.pipeline_name, self.study))
    # Set up study to record which parameters are referenced during the
    # pipeline generation
    self.study._pipeline_to_generate = self.pipeline_name
    self.study._referenced_parameters = set()
    try:
        pipeline = getter()
        if pipeline is None:
            raise ArcanaDesignError(
                "'{}' pipeline constructor in {} is missing a return "
                "statement (should return a Pipeline object)".format(
                    self.pipeline_name, self.study))
        # Copy referenced parameters to pipeline
        pipeline._referenced_parameters = (
            self.study._referenced_parameters)
    except AttributeError as e:
        # Need to capture this as an exception to avoid it getting
        # confused with specs that don't have pipelines
        raise ArcanaError(
            "AttributeError was thrown attempting to construct "
            "'{}': {}".format(self.pipeline_name, e))
    finally:
        # Reset referenced parameters after generating pipeline
        self.study._pipeline_to_generate = None
        self.study._referenced_parameters = None
    if self.name not in pipeline.output_names:
        raise ArcanaOutputNotProducedException(
            "'{}' is not produced by {} pipeline in {} class given the "
            "provided switches ({}) and the missing inputs ('{}')".format(
                self.name, pipeline.name, self.study.__class__.__name__,
                ', '.join('{}={}'.format(s.name, s.value)
                          for s in self.study.switches),
                "', '".join(self.study.missing_inputs)))
    return pipeline
def branch(self, name, values=None):  # @UnusedVariable @IgnorePep8
    """
    Checks whether the given switch matches the value provided

    Parameters
    ----------
    name : str
        The name of the switch to check
    values : str | list(str) | None
        The value(s) of the switch to match if it is a non-boolean switch
    """
    if isinstance(values, basestring):
        values = [values]
    spec = self.parameter_spec(name)
    if not isinstance(spec, SwitchSpec):
        raise ArcanaUsageError(
            "{} is a standard parameter, not a switch".format(spec))
    switch = self._get_parameter(name)
    if spec.is_boolean:
        if values is not None:
            raise ArcanaDesignError(
                "Should not provide values ({}) to boolean switch "
                "'{}' in {}".format(
                    values, name, self._param_error_location))
        in_branch = switch.value
    else:
        if values is None:
            raise ArcanaDesignError(
                "Value(s) need(s) to be provided to non-boolean switch "
                "'{}' in {}".format(name, self._param_error_location))
        unrecognised_values = set(values) - set(spec.choices)
        if unrecognised_values:
            raise ArcanaDesignError(
                "Provided value(s) ('{}') for switch '{}' in {} "
                "is not a valid option ('{}')".format(
                    "', '".join(unrecognised_values), name,
                    self._param_error_location,
                    "', '".join(spec.choices)))
        in_branch = switch.value in values
    # Register parameter as being used by the pipeline
    if self._referenced_parameters is not None:
        self._referenced_parameters.add(name)
    return in_branch
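# Usage sketch (illustrative only; the switch names are hypothetical):
# inside a pipeline constructor, branch() selects between alternative
# processing paths based on the value of a declared switch.
#
#     if self.branch('bias_correct'):             # boolean switch
#         ...  # add bias-correction nodes
#     if self.branch('interpolation', 'cubic'):   # non-boolean switch
#         ...  # use cubic-interpolation nodes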
def parse_value(value):
    # Split strings with commas into lists
    if ',' in value:
        value = value.split(',')
    # Cast all iterables (except strings) into lists
    if not isinstance(value, basestring):
        try:
            value = list(value)
        except TypeError:
            pass
    if isinstance(value, list):
        value = [parse_single_value(v) for v in value]
        # Check to see if datatypes are consistent
        dtypes = set(type(v) for v in value)
        if dtypes == set((float, int)):
            # If both ints and floats are present, cast to floats
            value = [float(v) for v in value]
        elif len(dtypes) > 1:
            raise ArcanaUsageError(
                "Inconsistent datatypes in values array ({})".format(
                    value))
    else:
        value = parse_single_value(value)
    return value
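# Usage sketch (illustrative only, not from the source):
assert parse_value('1,2,3') == [1, 2, 3]   # comma-separated ints
assert parse_value('1,2.5') == [1.0, 2.5]  # mixed ints/floats become floats
assert parse_value('0.5') == 0.5           # single numeric string
assert parse_value('linear') == 'linear'   # plain string passes through
# parse_value('1,a') raises ArcanaUsageError (inconsistent datatypes).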
def run(self, *pipelines, **kwargs):
    """
    Connects all pipelines to the study's repository and runs them
    in the same NiPype workflow

    Parameters
    ----------
    pipeline(s) : Pipeline, ...
        The pipeline(s) to connect to the repository
    subject_ids : list[str]
        The subset of subject IDs to process. If None all available will
        be processed. Note this is not a duplication of the subject and
        visit IDs passed to the Study __init__, as they define the scope
        of the analysis and these simply limit the scope of the current
        run (e.g. to break the analysis into smaller chunks and run
        separately). Therefore, if the analysis joins over subjects, then
        all subjects will be processed and this parameter will be ignored.
    visit_ids : list[str]
        The same as 'subject_ids' but for visit IDs
    session_ids : list[str,str]
        The same as 'subject_ids' and 'visit_ids', except specifies a set
        of specific combinations in tuples of (subject ID, visit ID).
    force : bool | 'all'
        A flag to force the reprocessing of all sessions in the filter
        array, regardless of whether the parameters|pipeline used to
        generate them match the current ones. NB: if True only the final
        pipeline will be reprocessed (prerequisite pipelines won't run
        unless they don't match provenance). To process all prerequisite
        pipelines 'all' should be passed to force.

    Returns
    -------
    report : ReportNode
        The final report node, which can be connected to subsequent
        pipelines
    """
    if not pipelines:
        raise ArcanaUsageError(
            "No pipelines provided to {}.run".format(self))
    # Get filter kwargs (NB: in Python 3 they could be in the arg list)
    subject_ids = kwargs.pop('subject_ids', [])
    visit_ids = kwargs.pop('visit_ids', [])
    session_ids = kwargs.pop('session_ids', [])
    clean_work_dir = kwargs.pop('clean_work_dir',
                                self._clean_work_dir_between_runs)
    # Create name by combining pipelines
    name = '_'.join(p.name for p in pipelines)
    # Clean work dir if required
    if clean_work_dir:
        workflow_work_dir = op.join(self.work_dir, name)
        if op.exists(workflow_work_dir):
            shutil.rmtree(workflow_work_dir)
    # Trim the end of very large names to avoid problems with
    # workflow names exceeding system limits.
    name = name[:WORKFLOW_MAX_NAME_LEN]
    workflow = pe.Workflow(name=name, base_dir=self.work_dir)
    already_connected = {}
    # Generate filter array to optionally restrict the run to certain
    # subject and visit IDs.
    tree = self.study.tree
    # Create maps from the subject|visit IDs to an index used to
    # represent them in the filter array
    subject_inds = {s.id: i for i, s in enumerate(tree.subjects)}
    visit_inds = {v.id: i for i, v in enumerate(tree.visits)}
    if not subject_ids and not visit_ids and not session_ids:
        # No filters applied so create a full filter array
        filter_array = np.ones((len(subject_inds), len(visit_inds)),
                               dtype=bool)
    else:
        # Filters applied so create an empty filter array and populate
        # from filter lists
        filter_array = np.zeros((len(subject_inds), len(visit_inds)),
                                dtype=bool)
        for subj_id in subject_ids:
            filter_array[subject_inds[subj_id], :] = True
        for visit_id in visit_ids:
            filter_array[:, visit_inds[visit_id]] = True
        for subj_id, visit_id in session_ids:
            filter_array[subject_inds[subj_id],
                         visit_inds[visit_id]] = True
        if not filter_array.any():
            raise ArcanaUsageError(
                "Provided filters:\n" +
                ("  subject_ids: {}\n".format(', '.join(subject_ids))
                 if subject_ids is not None else '') +
                ("  visit_ids: {}\n".format(', '.join(visit_ids))
                 if visit_ids is not None else '') +
                ("  session_ids: {}\n".format(', '.join(session_ids))
                 if session_ids is not None else '') +
                "Did not match any sessions in the project:\n" +
                "  subject_ids: {}\n".format(', '.join(subject_inds)) +
                "  visit_ids: {}\n".format(', '.join(visit_inds)))
    for pipeline in pipelines:
        try:
            self._connect_pipeline(
                pipeline, workflow, subject_inds, visit_inds,
                filter_array, already_connected=already_connected,
                **kwargs)
        except ArcanaNoRunRequiredException:
            logger.info("Not running '{}' pipeline as its outputs "
                        "are already present in the repository".format(
                            pipeline.name))
    # Reset the cached tree of filesets in the repository as it will
    # change after the pipeline has run.
    self.study.repository.clear_cache()
    # workflow.write_graph(graph2use='flat', format='svg')
    # print('Graph saved in {} directory'.format(os.getcwd()))
    return workflow.run(plugin=self._plugin)
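# Illustrative sketch of the filter-array logic above (standalone, not from
# the source; the subject and visit IDs are hypothetical): subject and visit
# filters each switch on a full row or column, while session filters switch
# on individual cells.
import numpy as np

subject_inds = {'sub01': 0, 'sub02': 1, 'sub03': 2}
visit_inds = {'visit1': 0, 'visit2': 1}
filter_array = np.zeros((len(subject_inds), len(visit_inds)), dtype=bool)
filter_array[subject_inds['sub02'], :] = True          # all of sub02's visits
filter_array[:, visit_inds['visit1']] = True           # visit1 of everyone
filter_array[subject_inds['sub03'], visit_inds['visit2']] = True  # one session
# filter_array now marks 5 of the 6 subject/visit pairs for processing.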
def _to_process(self, pipeline, filter_array, subject_inds, visit_inds,
                force):
    """
    Check whether the outputs of the pipeline are present in all sessions
    in the project repository and were generated with matching parameters
    and pipelines. Return a 2D boolean array (subjects: rows,
    visits: cols) with the sessions to process marked True.

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to determine the sessions to process
    filter_array : 2-D numpy.array[bool]
        A two-dimensional boolean array, where rows and columns correspond
        to subjects and visits in the repository tree. True values
        represent subject/visit ID pairs to include in the current round
        of processing. Note that if the 'force' flag is not set, sessions
        won't be reprocessed unless the parameters and pipeline version
        saved in the provenance don't match those of the given pipeline.
    subject_inds : dict[str,int]
        Mapping from subject ID to index in filter|to_process arrays
    visit_inds : dict[str,int]
        Mapping from visit ID to index in filter|to_process arrays
    force : bool
        Whether to force reprocessing of all (filtered) sessions or not

    Returns
    -------
    to_process : 2-D numpy.array[bool]
        A two-dimensional boolean array, where rows correspond to subjects
        and columns correspond to visits in the repository. True values
        represent subject/visit ID pairs to run the pipeline for
    """
    # Check to see if the pipeline has any low frequency outputs, because
    # if not then each session can be processed independently. Otherwise,
    # the "session matrix" (as defined by subject_ids and visit_ids
    # passed to the Study class) needs to be complete, i.e. a session
    # exists (with the full complement of required inputs) for each
    # subject/visit ID pair.
    tree = self.study.tree
    low_freq_outputs = [
        o.name for o in pipeline.outputs
        if o.frequency != 'per_session']
    if low_freq_outputs and list(tree.incomplete_subjects):
        raise ArcanaUsageError(
            "Can't process '{}' pipeline as it has low frequency outputs "
            "(i.e. outputs that aren't of 'per_session' frequency) "
            "({}) and subjects ({}) that are missing one "
            "or more visits ({}). Please restrict the subject/visit "
            "IDs in the study __init__ to continue the analysis".format(
                self.name,
                ', '.join(low_freq_outputs),
                ', '.join(s.id for s in tree.incomplete_subjects),
                ', '.join(v.id for v in tree.incomplete_visits)))
    # Initialise an array of sessions to process
    to_process = np.zeros((len(subject_inds), len(visit_inds)),
                          dtype=bool)
    for output in pipeline.frequency_outputs('per_study'):
        collection = self.study.spec(output).collection
        # Include all sessions if a per-study output needs to be
        # reprocessed. Note that this will almost always be the case if
        # any other output needs to be reprocessed.
        #
        # NB: Filter array should always have at least one true value at
        # this point
        if pipeline.metadata_mismatch(collection.item()) or force:
            to_process[:] = True
            # No point continuing since to_process array is already full
            return to_process
    for output in pipeline.frequency_outputs('per_subject'):
        collection = self.study.spec(output).collection
        for item in collection:
            i = subject_inds[item.subject_id]
            # NB: The output will be reprocessed using data from every
            # visit of each subject. However, the visits to include in
            # the analysis can be specified at the initialisation of the
            # Study.
            if ((pipeline.metadata_mismatch(item) or force) and
                    filter_array[i, :].any()):
                to_process[i, :] = True
    for output in pipeline.frequency_outputs('per_visit'):
        collection = self.study.spec(output).collection
        for item in collection:
            j = visit_inds[item.visit_id]
            # NB: The output will be reprocessed using data from every
            # subject of each visit. However, the subjects to include in
            # the analysis can be specified at the initialisation of the
            # Study.
            if ((pipeline.metadata_mismatch(item) or force) and
                    filter_array[:, j].any()):
                to_process[:, j] = True
    for output in pipeline.frequency_outputs('per_session'):
        collection = self.study.spec(output).collection
        for item in collection:
            i = subject_inds[item.subject_id]
            j = visit_inds[item.visit_id]
            if ((pipeline.metadata_mismatch(item) or force) and
                    filter_array[i, j]):
                to_process[i, j] = True
    if not to_process.any():
        raise ArcanaNoRunRequiredException(
            "No sessions to process for '{}' pipeline".format(
                pipeline.name))
    return to_process
def data(self, name, subject_id=None, visit_id=None, **kwargs):
    """
    Returns the Fileset or Field associated with the name, generating
    derived filesets as required. Multiple names in a list can be
    provided, in which case their workflows are joined into a single
    workflow.

    Parameters
    ----------
    name : str | List[str]
        The name of the FilesetSpec|FieldSpec to retrieve the filesets for
    subject_id : int | str | List[int|str] | None
        The subject ID or subject IDs to return. If None all are returned
    visit_id : int | str | List[int|str] | None
        The visit ID or visit IDs to return. If None all are returned

    Returns
    -------
    data : Fileset | Field | List[Fileset | Field] | List[List[Fileset | Field]]
        If a single name is provided then data is either a single Fileset
        or Field if a single subject_id and visit_id are provided,
        otherwise a list of filesets or fields corresponding to the given
        name. If multiple names are provided then a list is returned
        containing the data for each provided name.
    """
    if isinstance(name, basestring):
        single_name = True
        names = [name]
    else:
        names = name
        single_name = False

    def is_single_id(id_):  # @IgnorePep8
        return isinstance(id_, (basestring, int))

    subject_ids = ([subject_id]
                   if is_single_id(subject_id) else subject_id)
    visit_ids = ([visit_id] if is_single_id(visit_id) else visit_id)
    # Work out which pipelines need to be run
    pipelines = []
    for name in names:
        try:
            pipeline = self.spec(name).pipeline
            pipeline.required_outputs.add(name)
            pipelines.append(pipeline)
        except AttributeError:
            pass  # Match objects don't have pipelines
    # Run all pipelines together
    if pipelines:
        self.processor.run(*pipelines, subject_ids=subject_ids,
                           visit_ids=visit_ids, **kwargs)
    all_data = []
    for name in names:
        spec = self.spec(name)
        data = spec.collection
        if subject_ids is not None and spec.frequency in ('per_session',
                                                          'per_subject'):
            data = [d for d in data if d.subject_id in subject_ids]
        if visit_ids is not None and spec.frequency in ('per_session',
                                                        'per_visit'):
            data = [d for d in data if d.visit_id in visit_ids]
        if not data:
            raise ArcanaUsageError(
                "No matching data found (subject_id={}, visit_id={})"
                .format(subject_id, visit_id))
        if is_single_id(subject_id) and is_single_id(visit_id):
            assert len(data) == 1
            data = data[0]
        else:
            data = spec.CollectionClass(spec.name, data)
        if single_name:
            return data
        all_data.append(data)
    return all_data
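# Usage sketch (illustrative only; 'study' is a hypothetical Study instance
# and the spec and ID names are hypothetical):
#
#     wm_segs = study.data('wm_seg')                      # all sessions
#     one_seg = study.data('wm_seg', subject_id='sub01',
#                          visit_id='visit1')             # a single item
#     t1s, segs = study.data(['t1', 'wm_seg'])            # multiple names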
def __init__(self, name, repository, processor, inputs, environment=None,
             parameters=None, subject_ids=None, visit_ids=None,
             enforce_inputs=True, reprocess=False, fill_tree=False):
    try:
        # This works for PY3 as the metaclass inserts it itself if
        # it isn't provided
        metaclass = type(self).__dict__['__metaclass__']
        if not issubclass(metaclass, StudyMetaClass):
            raise KeyError
    except KeyError:
        raise ArcanaUsageError(
            "Need to have StudyMetaClass (or a sub-class) as "
            "the metaclass of all classes derived from Study")
    if isinstance(repository, basestring):
        repository = DirectoryRepository(repository, depth=None)
    if isinstance(processor, basestring):
        processor = LinearProcessor(processor)
    if environment is None:
        environment = StaticEnvironment()
    self._name = name
    self._repository = repository
    self._processor = processor.bind(self)
    self._environment = environment
    self._inputs = {}
    self._subject_ids = subject_ids
    self._visit_ids = visit_ids
    self._tree = self.repository.cached_tree(
        subject_ids=subject_ids, visit_ids=visit_ids, fill=fill_tree)
    if not self.subject_ids:
        raise ArcanaUsageError(
            "No subject IDs provided and destination repository "
            "is empty")
    if not self.visit_ids:
        raise ArcanaUsageError(
            "No visit IDs provided and destination repository "
            "is empty")
    self._reprocess = reprocess
    # For recording which parameters are accessed during pipeline
    # generation so they can be attributed to the pipeline after it is
    # generated (and then saved in the provenance)
    self._pipeline_to_generate = None
    self._referenced_parameters = None
    # Set parameters
    if parameters is None:
        parameters = {}
    elif not isinstance(parameters, dict):
        # Convert list of parameters into dictionary
        parameters = {o.name: o for o in parameters}
    self._parameters = {}
    for param_name, param in list(parameters.items()):
        if not isinstance(param, Parameter):
            param = Parameter(param_name, param)
        try:
            param_spec = self._param_specs[param_name]
        except KeyError:
            raise ArcanaNameError(
                param_name,
                "Provided parameter '{}' is not present in the "
                "allowable parameters for {} classes ('{}')".format(
                    param_name, type(self).__name__,
                    "', '".join(self.parameter_spec_names())))
        param_spec.check_valid(
            param, context='{}(name={})'.format(type(self).__name__,
                                                name))
        self._parameters[param_name] = param
    # Convert inputs to a dictionary if passed in as a list/tuple
    if not isinstance(inputs, dict):
        inputs = {i.name: i for i in inputs}
    # Add each "input fileset", checking to see whether the given
    # fileset_spec name is valid for the study type
    for inpt_name, inpt in list(inputs.items()):
        try:
            spec = self.data_spec(inpt_name)
        except ArcanaNameError:
            raise ArcanaNameError(
                inpt.name,
                "Input name '{}' isn't in data specs of {} "
                "('{}')".format(
                    inpt.name, self.__class__.__name__,
                    "', '".join(self._data_specs)))
        else:
            if isinstance(spec, BaseFileset):
                if isinstance(inpt, BaseField):
                    raise ArcanaUsageError(
                        "Passed field ({}) as input to fileset spec "
                        "{}".format(inpt, spec))
                if spec.derived:
                    try:
                        # FIXME: should provide requirement manager to
                        # converter_from but it hasn't been implemented
                        # yet
                        spec.format.converter_from(inpt.format)
                    except ArcanaNoConverterError as e:
                        raise ArcanaNoConverterError(
                            "{}, which is required to convert:\n{} "
                            "to\n{}.".format(e, inpt, spec))
                else:
                    if inpt.format not in spec.valid_formats:
                        raise ArcanaUsageError(
                            "Cannot pass {} as an input to {} as it is "
                            "not in one of the valid formats "
                            "('{}')".format(
                                inpt, spec,
                                "', '".join(
                                    f.name for f in spec.valid_formats)))
            elif not isinstance(inpt, BaseField):
                raise ArcanaUsageError(
                    "Passed fileset ({}) as input to field spec "
                    "{}".format(inpt, spec))
        self._inputs[inpt_name] = inpt.bind(self)
    # "Bind" data specs in the class to the current study object;
    # this will allow them to prepend the study name to the name
    # of the fileset
    self._bound_specs = {}
    for spec in self.data_specs():
        if spec.name not in self.input_names:
            if not spec.derived and spec.default is None:
                # Emit a warning if an input has not been provided for
                # an "acquired fileset" spec
                msg = (" acquired fileset '{}' was not given as"
                       " an input of {}.".format(spec.name, self))
                if spec.optional:
                    logger.info('Optional' + msg)
                else:
                    if enforce_inputs:
                        raise ArcanaMissingInputError(
                            'Non-optional' + msg + " Pipelines "
                            "depending on this fileset will not "
                            "run")
            else:
                self._bound_specs[spec.name] = spec.bind(self)
def __new__(metacls, name, bases, dct):  # @NoSelf @UnusedVariable
    if not any(issubclass(b, Study) for b in bases):
        raise ArcanaUsageError(
            "StudyMetaClass can only be used for classes that "
            "have Study as a base class")
    try:
        add_data_specs = dct['add_data_specs']
    except KeyError:
        add_data_specs = []
    try:
        add_param_specs = dct['add_param_specs']
    except KeyError:
        add_param_specs = []
    combined_attrs = set()
    combined_data_specs = {}
    combined_param_specs = {}
    for base in reversed(bases):
        # Get the combined class dictionary including base dicts,
        # excluding auto-added properties for data and parameter specs
        combined_attrs.update(
            a for a in dir(base) if (not issubclass(base, Study) or
                                     a not in base.spec_names()))
        try:
            combined_data_specs.update(
                (d.name, d) for d in base.data_specs())
        except AttributeError:
            pass
        try:
            combined_param_specs.update(
                (p.name, p) for p in base.parameter_specs())
        except AttributeError:
            pass
    combined_attrs.update(list(dct.keys()))
    combined_data_specs.update((d.name, d) for d in add_data_specs)
    combined_param_specs.update((p.name, p) for p in add_param_specs)
    # Check that the pipeline names in data specs correspond to a
    # pipeline method in the class
    for spec in add_data_specs:
        if spec.derived:
            if spec.pipeline_name == 'pipeline':
                raise ArcanaDesignError(
                    "Cannot use the name 'pipeline' for the name of a "
                    "pipeline constructor in class {} as it clashes "
                    "with base method to create pipelines".format(name))
            if spec.pipeline_name not in combined_attrs:
                raise ArcanaDesignError(
                    "Pipeline to generate '{}', '{}', is not present "
                    "in '{}' class".format(spec.name, spec.pipeline_name,
                                           name))
    # Check for name clashes between data and parameter specs
    spec_name_clashes = (set(combined_data_specs) &
                         set(combined_param_specs))
    if spec_name_clashes:
        raise ArcanaDesignError(
            "'{}' name(s) are used for both data and parameter specs "
            "in '{}' class".format("', '".join(spec_name_clashes), name))
    reserved_clashes = [
        n for n in combined_data_specs if n in Study.ITERFIELDS]
    if reserved_clashes:
        raise ArcanaDesignError(
            "'{}' data spec names clash with reserved names in '{}' "
            "class".format("', '".join(reserved_clashes), name))
    dct['_data_specs'] = combined_data_specs
    dct['_param_specs'] = combined_param_specs
    if '__metaclass__' not in dct:
        dct['__metaclass__'] = metacls
    return type(name, bases, dct)