def _run_interface(self, runtime):
    """
    Check that both test requirements appear in the loaded modules.

    Parameters
    ----------
    runtime : object
        Runtime object handed in by the caller; returned untouched

    Returns
    -------
    runtime : object
        The object that was passed in

    Raises
    ------
    ArcanaError
        If the name of either requirement is absent from
        ``ModulesEnv.loaded()``
    """
    active = ModulesEnv.loaded()
    checks = (
        (first_req, "First Test module was not loaded in Node"),
        (second_req, "Second Test module was not loaded in Node"),
    )
    # The first requirement is checked (and reported) before the second
    for req, message in checks:
        if req.name not in active:
            raise ArcanaError(message)
    return runtime
def assertField(self, name, ref_value, from_analysis, subject=None,
                visit=None, frequency='per_session', to_places=None):
    """
    Assert that a field written by a pipeline matches a reference value.

    The fields JSON file in the session directory is loaded and the value
    stored under ``'<from_analysis>_<name>'`` is compared with `ref_value`.

    Parameters
    ----------
    name : str
        Name of the field, without the analysis prefix
    ref_value : object
        The value the field is expected to hold
    from_analysis : str
        Name of the analysis that produced the field
    subject : str | None
        Passed through to ``get_session_dir``
    visit : str | None
        Passed through to ``get_session_dir``
    frequency : str
        Passed through to ``get_session_dir``
    to_places : int | None
        If provided, the comparison is made with ``assertAlmostEqual`` to
        this many decimal places, otherwise with ``assertEqual``

    Raises
    ------
    ArcanaError
        If the fields file does not exist, or it does not contain the
        requested field
    IOError
        If the fields file could not be read for any other reason
    """
    esc_name = from_analysis + '_' + name
    output_dir = self.get_session_dir(subject, visit, frequency)
    try:
        with open(op.join(output_dir,
                          LocalFileSystemRepo.FIELDS_FNAME)) as f:
            fields = json.load(f)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise ArcanaError(
                "No fields were created by pipeline in analysis '{}'".
                format(from_analysis))
        # Any other I/O failure (e.g. permissions) must not be swallowed,
        # otherwise 'fields' is undefined below and the real cause is lost
        raise
    try:
        value = fields[esc_name]
    except KeyError:
        raise ArcanaError(
            "Field '{}' was not created by pipeline in analysis '{}'. "
            "Created fields were ('{}')".format(esc_name, from_analysis,
                                                "', '".join(fields)))
    msg = ("Field value '{}' for analysis '{}', {}, does not match "
           "reference value ({})".format(name, from_analysis, value,
                                         ref_value))
    if to_places is not None:
        self.assertAlmostEqual(
            value, ref_value, to_places,
            '{} to {} decimal places'.format(msg, to_places))
    else:
        self.assertEqual(value, ref_value, msg)
def put_record(self, record, dataset):
    """
    Save a provenance record in the local cache and upload it.

    The record is written to ``<pipeline_name>.json`` inside the cache
    directory returned by ``self._cache_path`` and then uploaded as a
    resource (named after the pipeline) of an ``MrScanData`` object
    created under the session returned by ``self.get_xsession``. Any
    existing resource of the same name is deleted first.

    Parameters
    ----------
    record : object
        Provenance record; must provide ``pipeline_name`` and
        ``save(path)``
    dataset : object
        Dataset forwarded to ``_cache_path`` and ``get_xsession``

    Raises
    ------
    ArcanaError
        If the provenance cache path exists but is not a directory
    """
    prov_dir = self._cache_path(record, name=self.PROV_SCAN,
                                dataset=dataset)
    if not op.exists(prov_dir):
        os.mkdir(prov_dir)
    elif not op.isdir(prov_dir):
        raise ArcanaError(
            "Base provenance cache path ('{}') should be a directory"
            .format(prov_dir))
    json_path = op.join(prov_dir, record.pipeline_name + '.json')
    record.save(json_path)
    # TODO: Should also save digest of prov.json to check to see if it
    #       has been altered remotely
    xsession = self.get_xsession(record, dataset=dataset)
    xprov = self._login.classes.MrScanData(
        id=self.PROV_SCAN, type=self.PROV_SCAN, parent=xsession)
    # Delete existing provenance if present
    try:
        stale = xprov.resources[record.pipeline_name]
    except KeyError:
        pass
    else:
        stale.delete()
    # FIXME: should reuse the same resource for all provenance jsons
    xresource = xprov.create_resource(record.pipeline_name)
    xresource.upload(json_path, op.basename(json_path))
def save(self, path):
    """
    Saves the provenance dictionary (``self.prov``) to a JSON file

    Parameters
    ----------
    path : str
        Path to save the generated JSON file

    Raises
    ------
    ArcanaError
        If the provenance dictionary cannot be serialised to JSON. In
        this case nothing is written to `path`.
    """
    # Serialise before opening the file so that a failure doesn't leave a
    # truncated/partial JSON file behind at 'path'
    try:
        serialised = json.dumps(self.prov, indent=2)
    except TypeError:
        raise ArcanaError(
            "Could not serialise provenance record dictionary:\n{}"
            .format(pformat(self.prov)))
    with open(path, 'w') as f:
        f.write(serialised)
def __init__(self, name, dtype, frequency, array=False):
    """
    Initialise the field with its name, data type and frequency.

    Parameters
    ----------
    name : str
        Forwarded to the parent initialiser
    dtype : type | None
        Must be a member of ``self.dtypes``, ``newstr`` or None
    frequency : str
        Forwarded to the parent initialiser
    array : bool
        Stored as ``self._array``

    Raises
    ------
    ArcanaError
        If `dtype` is not one of the accepted values
    """
    super(BaseField, self).__init__(name, frequency)
    accepted = self.dtypes + (newstr, None)
    if dtype not in accepted:
        raise ArcanaError("Invalid dtype {}, can be one of {}".format(
            dtype, ', '.join(self._dtype_names())))
    self._array = array
    self._dtype = dtype
def download_fileset(self, tmp_dir, xresource, xscan, fileset,
                     session_label, cache_path):
    """
    Download a resource as a zip archive and move its files into the cache.

    Parameters
    ----------
    tmp_dir : str
        Scratch directory that receives 'download.zip' and the
        'expanded' directory it is unpacked into
    xresource : object
        Resource to download; its ``uri``, ``id``, ``label`` and
        ``xnat_session`` attributes are used
    xscan : object
        Scan the resource belongs to; its ``id`` and ``type`` are used to
        locate the files within the expanded archive
    fileset : object
        Passed to ``self.get_checksums``
    session_label : str
        Name of the top-level directory within the expanded archive
    cache_path : str
        Destination of the downloaded files; replaced if it exists

    Raises
    ------
    ArcanaError
        If the downloaded archive cannot be unzipped
    """
    # Download resource to zip file
    archive = op.join(tmp_dir, 'download.zip')
    with open(archive, 'wb') as f:
        xresource.xnat_session.download_stream(
            xresource.uri + '/files', f, format='zip', verbose=True)
    checksums = self.get_checksums(fileset)
    # Extract downloaded zip file
    unpacked = op.join(tmp_dir, 'expanded')
    try:
        with ZipFile(archive) as zip_file:
            zip_file.extractall(unpacked)
    except BadZipfile as e:
        raise ArcanaError(
            "Could not unzip file '{}' ({})"
            .format(xresource.id, e))
    scan_dir = xscan.id + '-' + special_char_re.sub('_', xscan.type)
    data_path = op.join(unpacked, session_label, 'scans', scan_dir,
                        'resources', xresource.label, 'files')
    # Remove existing cache if present (a missing cache is not an error)
    try:
        shutil.rmtree(cache_path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise
    shutil.move(data_path, cache_path)
    # Store the checksums alongside the cached files
    with open(cache_path + XnatRepo.MD5_SUFFIX, 'w',
              **JSON_ENCODING) as f:
        json.dump(checksums, f, indent=2)
def get_session_dir(self, subject=None, visit=None,
                    frequency='per_session'):
    """
    Return the path of the cached session directory for a frequency.

    The directory name is ``<project>_<subject>_<visit>`` where the
    subject and/or visit part is replaced by ``XnatRepo.SUMMARY_NAME``
    for frequencies that do not vary over that axis.

    Parameters
    ----------
    subject : str | None
        Subject ID; defaults to ``self.SUBJECT`` for 'per_session' and
        'per_subject' frequencies
    visit : str | None
        Visit ID; defaults to ``self.VISIT`` for 'per_session' and
        'per_visit' frequencies
    frequency : str
        One of 'per_session', 'per_subject', 'per_visit' or 'per_dataset'

    Returns
    -------
    session_path : str
        Path of the session directory within ``self.output_cache_dir``

    Raises
    ------
    ArcanaError
        If the session directory does not exist
    """
    uses_subject = frequency in ('per_session', 'per_subject')
    uses_visit = frequency in ('per_session', 'per_visit')
    if subject is None and uses_subject:
        subject = self.SUBJECT
    if visit is None and uses_visit:
        visit = self.VISIT
    if frequency == 'per_session':
        assert subject is not None
        assert visit is not None
        subject_part, visit_part = subject, visit
    elif frequency == 'per_subject':
        assert subject is not None
        assert visit is None
        subject_part, visit_part = subject, XnatRepo.SUMMARY_NAME
    elif frequency == 'per_visit':
        assert visit is not None
        assert subject is None
        subject_part, visit_part = XnatRepo.SUMMARY_NAME, visit
    elif frequency == 'per_dataset':
        assert subject is None
        assert visit is None
        subject_part = visit_part = XnatRepo.SUMMARY_NAME
    else:
        assert False
    session_id = '_'.join([self.project, subject_part, visit_part])
    session_path = op.join(self.output_cache_dir, session_id)
    if op.exists(session_path):
        return session_path
    raise ArcanaError(
        "Session path '{}' does not exist".format(session_path))
def detect_version_str(self):
    """
    Read the version string from the '<NAME>_VERSION' environment variable.

    Returns
    -------
    version : str
        Value of the environment variable named after the upper-cased
        ``self.name`` with a '_VERSION' suffix

    Raises
    ------
    ArcanaError
        If the environment variable is not set; the message lists the
        environment variables found and the loaded modules
    """
    env_var = self.name.upper() + '_VERSION'
    try:
        return os.environ[env_var]
    except KeyError:
        loaded_modules = ModulesEnv.loaded()
        found_vars = "', '".join(os.environ.keys())
        raise ArcanaError(
            "Did not find {} in environment variables, found '{}'. "
            "The loaded modules are {}".format(
                env_var, found_vars, ', '.join(loaded_modules)))
def _gen_outfilename(self):
    """
    Return the output file name, generating one if none was provided.

    Returns
    -------
    out_name : str
        ``inputs.out_file`` if it is defined, otherwise
        '<base of in_file>_ROI.mat' in the current working directory

    Raises
    ------
    ArcanaError
        If ``inputs.out_file`` is defined but doesn't end with '.mat'
    """
    if not isdefined(self.inputs.out_file):
        # Derive the name from the input file, dropping its extension
        base, _ = split_extension(os.path.basename(self.inputs.in_file))
        return os.path.join(os.getcwd(), "{}_ROI.mat".format(base))
    if not self.inputs.out_file.endswith('.mat'):
        raise ArcanaError(
            "Output NODDI ROI should be saved with '.mat' extension "
            "(provided '{}')".format(self.inputs.out_file))
    return self.inputs.out_file
def add_session(self, filesets=None, fields=None, project_dir=None,
                subject=None, visit=None):
    """
    Create a session directory and populate it with filesets and fields.

    Parameters
    ----------
    filesets : dict[str, Fileset | basestring] | None
        Filesets to add, keyed by name. Fileset objects are copied to
        '<name><ext>', strings are written to '<name>.txt'
    fields : dict | None
        If provided, dumped as JSON to the repository's fields file in
        the session directory
    project_dir : str | None
        Root directory; defaults to ``self.project_dir``
    subject : str | None
        Subject ID; defaults to ``self.SUBJECT``
    visit : str | None
        Visit ID; defaults to ``self.VISIT``

    Raises
    ------
    ArcanaError
        If a Fileset has no format or an entry is of an unrecognised type
    """
    project_dir = self.project_dir if project_dir is None else project_dir
    filesets = {} if filesets is None else filesets
    subject = self.SUBJECT if subject is None else subject
    visit = self.VISIT if visit is None else visit
    session_dir = op.join(project_dir, subject, visit)
    os.makedirs(session_dir)
    for name, fileset in list(filesets.items()):
        if isinstance(fileset, Fileset):
            if fileset.format is None:
                raise ArcanaError(
                    "Need to provide format for fileset to add to test "
                    "dataset ({}) in {}".format(fileset, self))
            dst_path = op.join(session_dir,
                               name + fileset.format.ext_str)
            # Directory formats need a recursive copy
            copy_fn = (shutil.copytree if fileset.format.directory
                       else shutil.copy)
            copy_fn(fileset.path, dst_path)
        elif isinstance(fileset, basestring):
            # Write string as text file
            with open(op.join(session_dir, name + '.txt'), 'w') as f:
                f.write(fileset)
        else:
            raise ArcanaError(
                "Unrecognised fileset ({}) in {} test setup. Can "
                "be either a Fileset or basestring object".format(
                    fileset, self))
    if fields is not None:
        fields_path = op.join(session_dir,
                              LocalFileSystemRepo.FIELDS_FNAME)
        with open(fields_path, 'w', **JSON_ENCODING) as f:
            json.dump(fields, f, indent=2)
def _make_outputnode(self, frequency):
    """
    Generates an output node for the given frequency. It also adds implicit
    file format conversion nodes to the pipeline.

    Parameters
    ----------
    frequency : str
        The frequency (i.e. 'per_session', 'per_visit', 'per_subject' or
        'per_dataset') of the output node to retrieve

    Returns
    -------
    outputnode : object
        The node returned by ``self.add`` for the identity interface that
        collects the outputs of the requested frequency

    Raises
    ------
    ArcanaError
        If the pipeline has no outputs of the requested frequency
    ArcanaNoConverterError
        Re-raised, with extra context appended to its message, if a
        required format converter cannot be found
    """
    # Bound specs of the pipeline outputs that match the frequency
    bound_specs = ((n, self.analysis.bound_spec(n))
                   for n in self.output_names)
    outputs = {n: spec for n, spec in bound_specs
               if spec.frequency == frequency}
    if not outputs:
        raise ArcanaError(
            "No outputs to '{}' pipeline for requested freqency '{}'".
            format(self.name, frequency))
    # Generate output node with one field per matching output
    outputnode = self.add('{}_outputnode'.format(frequency),
                          IdentityInterface(fields=list(outputs.keys())))
    # Connect each node that produces an output to the new output node
    for output_name, spec in outputs.items():
        (node, node_out, fmt,
         conv_kwargs) = self._output_conns[output_name]
        # If fileset formats differ between analysis and pipeline
        # outputs create converter node and route the output through it
        if self.requires_conversion(spec, fmt):
            try:
                conv = spec.format.converter_from(fmt, **conv_kwargs)
            except ArcanaNoConverterError as e:
                e.msg += (", which is required to convert '{}' output of "
                          "'{}' node in '{}' pipeline".format(
                              spec.name, node.name, self.name))
                raise e
            node = self.add(
                'conv_{}_from_{}_format'.format(spec.name, fmt.name),
                conv.interface,
                inputs={conv.input: (node, node_out)},
                requirements=conv.requirements,
                mem_gb=conv.mem_gb,
                wall_time=conv.wall_time)
            node_out = conv.output
        self.connect(node, node_out, outputnode, spec.name)
    return outputnode
def _create_project(self, project_name=None):
    """
    Create a test project on the server via a PUT request.

    Parameters
    ----------
    project_name : str | None
        Name of the project to create; defaults to ``self.project``

    Raises
    ------
    ArcanaError
        If SERVER is the production server address
    """
    name = self.project if project_name is None else project_name
    if SERVER == 'https://mbi-xnat.erc.monash.edu.au':
        raise ArcanaError(
            "Shouldn't be creating projects on the production "
            "server")
    with xnat.connect(SERVER) as login:
        response = login.put(
            '/data/archive/projects/{}'.format(name),
            query={'xsiType': 'xnat:projectData', 'req_format': 'qa'})
        # Only a successful response is logged; failures pass silently
        if response.ok:
            logger.info("Created test project '{}'".format(name))
def value(self):
    """
    Return the value of the field, fetching it from the dataset if needed.

    Returns
    -------
    value : object
        The cached ``self._value``, or the result of
        ``self.dataset.get_field(self)`` (which is then cached)

    Raises
    ------
    ArcanaDataNotDerivedYetError
        If ``self.exists`` is false
    ArcanaError
        If neither a value nor a dataset has been set
    """
    if not self.exists:
        raise ArcanaDataNotDerivedYetError(
            self.name,
            "Cannot access value of {} as it hasn't been "
            "derived yet".format(repr(self)))
    if self._value is not None:
        return self._value
    if self.dataset is None:
        raise ArcanaError(
            "Neither value nor dataset has been set for Field("
            "'{}')".format(self.name))
    # Cache the retrieved value for subsequent accesses
    self._value = self.dataset.get_field(self)
    return self._value
def dicom_values(self, fileset, tags):
    """
    Returns the values of the DICOM header fields corresponding to the
    given tags

    If the fileset has no local path but has a repository that provides a
    ``dicom_header`` method, the header is read via the repository.
    Otherwise the first (in sorted order) '.dcm' file in the fileset's
    directory is read with pydicom.

    Parameters
    ----------
    fileset : Fileset
        The fileset to read the DICOM header values from
    tags : List[Tuple[str, str]]
        List of DICOM tag values as 2-tuple of strings, e.g.
        [('0080', '0020')]

    Returns
    -------
    values : List[str|int|float]
        The header values, in the same order as `tags`

    Raises
    ------
    ArcanaError
        If no header could be retrieved from the repository, no '.dcm'
        files are present in the fileset directory, or one of the tags is
        missing from the header
    """
    try:
        if (fileset._path is None and fileset._repository is not None
                and hasattr(fileset.repository, 'dicom_header')):
            hdr = fileset.repository.dicom_header(fileset)
            if not hdr:
                raise ArcanaError(
                    "No DICOM tags retrieved from {} by {}".format(
                        fileset.repository, fileset))
            values = [hdr[t] for t in tags]
        else:
            # Get the DICOM object for the first file in the fileset.
            # Sorted so that the chosen file doesn't depend on the
            # arbitrary ordering of os.listdir
            dcm_files = sorted(
                f for f in os.listdir(fileset.path) if f.endswith('.dcm'))
            if not dcm_files:
                raise ArcanaError(
                    "No '.dcm' files found in '{}'".format(fileset.path))
            dcm = pydicom.dcmread(op.join(fileset.path, dcm_files[0]))
            values = [dcm[t].value for t in tags]
    except KeyError as e:
        # NB: no further repository access here, a failure while building
        # the error would otherwise mask the missing-tag problem
        raise ArcanaError("{} does not have dicom tag {}".format(
            self, str(e)))
    return values
def path(self):
    """
    Return the local path of the fileset, retrieving it if necessary.

    Returns
    -------
    path : str
        ``self._path``, after calling ``self.get()`` if it was not yet
        set and a dataset is available

    Raises
    ------
    ArcanaDataNotDerivedYetError
        If ``self.exists`` is false
    ArcanaError
        If neither a path nor a dataset has been set
    """
    if not self.exists:
        raise ArcanaDataNotDerivedYetError(
            self.name,
            "Cannot access path of {} as it hasn't been derived yet".
            format(self))
    if self._path is None:
        if self.dataset is None:
            raise ArcanaError(
                "Neither path nor dataset has been set for Fileset("
                "'{}')".format(self.name))
        self.get()  # Retrieve from dataset
    return self._path
def __init__(self, work_dir, partition=None, account=None, email=None,
             mail_on=('FAIL',), generic_resources=None,
             ntasks_per_node=None, cpus_per_task=None, **kwargs):
    """
    Store the job submission settings and initialise the parent class.

    Parameters
    ----------
    work_dir : str
        Forwarded to the parent initialiser
    partition : str | None
        Stored as ``self._partition``
    account : str | None
        Stored as ``self._account``
    email : str | None
        Stored as ``self._email``; read from the 'EMAIL' environment
        variable if not provided
    mail_on : tuple[str]
        Stored as ``self._mail_on``
    generic_resources : object | None
        Stored as ``self._generic_resources``
    ntasks_per_node : int | None
        Stored as ``self._ntasks_per_node``
    cpus_per_task : int | None
        Stored as ``self._cpus_per_task``
    **kwargs
        Forwarded to the parent initialiser

    Raises
    ------
    ArcanaError
        If `email` is None and the 'EMAIL' environment variable is unset
    """
    if email is None:
        # Fall back to the address configured in the environment
        try:
            email = os.environ['EMAIL']
        except KeyError:
            raise ArcanaError(
                "'email' kwarg needs to be provided for SlurmProc"
                " if 'EMAIL' environment variable not set")
    self._partition = partition
    self._account = account
    self._email = email
    self._mail_on = mail_on
    self._generic_resources = generic_resources
    self._ntasks_per_node = ntasks_per_node
    self._cpus_per_task = cpus_per_task
    super(SlurmProc, self).__init__(work_dir, **kwargs)
def cap(self):
    """
    "Caps" the construction of the pipeline, signifying that no more inputs
    and outputs are expected to be added and therefore the input and output
    nodes can be created along with the provenance.

    Does nothing if the pipeline has already been capped.

    Raises
    ------
    ArcanaError
        If only some of ``_inputnodes``, ``_outputnodes`` and ``_prov``
        have been set
    """
    state = (self._inputnodes, self._outputnodes, self._prov)
    if state == (None, None, None):
        # Nothing generated yet: build one node per frequency, then the
        # provenance
        inputnodes = {}
        for freq in self.input_frequencies:
            inputnodes[freq] = self._make_inputnode(freq)
        self._inputnodes = inputnodes
        outputnodes = {}
        for freq in self.output_frequencies:
            outputnodes[freq] = self._make_outputnode(freq)
        self._outputnodes = outputnodes
        self._prov = self._gen_prov()
        return
    if None in state:
        raise ArcanaError(
            "If one of _inputnodes, _outputnodes or _prov is not None then"
            " they all should be in {}".format(self))
def _list_outputs(self):
    """
    Locate the converted image(s) in the output directory.

    Files in the output directory whose names start with the generated
    file name (optionally wrapped in an echo/coil prefix or suffix) are
    treated as products of the conversion. If there are several and
    ``inputs.multifile_concat`` is set they are stacked along a 4th
    dimension and saved to a new file, otherwise the last one (in sorted
    order) is used.

    Returns
    -------
    outputs : dict
        The interface outputs with 'converted' set to the path of the
        selected or merged image

    Raises
    ------
    ArcanaError
        If no matching files are found in the output directory
    """
    compression = self.inputs.compression
    if not isdefined(compression) or compression in ('y', 'i'):
        im_ext = '.nii.gz'
    else:
        im_ext = '.nii'
    outputs = self._outputs().get()
    # As Dcm2niix sometimes prepends a prefix onto the filenames to avoid
    # name clashes with multiple echos, we need to check the output folder
    # for all filenames that end with the "generated filename".
    out_dir = self._gen_filename('out_dir')
    fname = self._gen_filename('filename') + im_ext
    base, ext = split_extension(fname)
    # Escape the filename parts so that characters such as '.' or '+' are
    # matched literally instead of being interpreted as regex syntax
    match_re = re.compile(r'(_e\d+)?{}(_(?:e|c)\d+)?{}'.format(
        re.escape(base), re.escape(ext) if ext is not None else ''))
    # Sorted so that the merge order and the file selected below don't
    # depend on the arbitrary ordering of os.listdir
    products = [
        os.path.join(out_dir, f) for f in sorted(os.listdir(out_dir))
        if match_re.match(f) is not None
    ]
    if len(products) == 1:
        converted = products[0]
    elif len(products) > 1 and self.inputs.multifile_concat:
        ex_file = nib.load(products[0])
        data = ex_file.get_data()
        merged_file = np.zeros(
            (data.shape[0], data.shape[1], data.shape[2], len(products)))
        for i, el in enumerate(products):
            f = nib.load(el)
            merged_file[:, :, :, i] = f.get_data()
        im2save = nib.Nifti1Image(merged_file, ex_file.affine)
        # Join with a path separator so the merged file is saved inside
        # the output directory rather than next to it
        merged_path = os.path.join(out_dir, fname)
        nib.save(im2save, merged_path)
        converted = merged_path
    elif len(products) > 1 and not self.inputs.multifile_concat:
        converted = products[-1]
    else:
        raise ArcanaError("No products produced by dcm2niix ({})".format(
            ', '.join(os.listdir(out_dir))))
    outputs['converted'] = converted
    return outputs
def __init__(self, collections):
    """
    Initialise the interface from the collections it sources/sinks.

    Parameters
    ----------
    collections : Iterable
        Collections of filesets or fields; each must provide
        ``frequency``, ``is_fileset`` and ``is_field`` and iterate over
        items that have a ``dataset`` attribute

    Raises
    ------
    ArcanaError
        If the collections do not all share the same frequency
    """
    super(RepositoryInterface, self).__init__()
    # Protect against iterators
    collections = list(collections)
    # Check for consistent frequencies in collections
    frequencies = {c.frequency for c in collections}
    if len(frequencies) > 1:
        raise ArcanaError(
            "Attempting to sink multiple frequencies across collections {}"
            .format(', '.join(str(c) for c in collections)))
    if frequencies:
        # NB: _frequency is left unset in the very rare case where the
        #     pipeline doesn't have inputs (would only really happen in
        #     unittests)
        self._frequency = next(iter(frequencies))
    # Extract set of datasets/repositories used to source/sink from/to
    self.datasets = {
        item.dataset
        for coll in collections
        for item in coll
        if item.dataset is not None
    }
    self.repositories = {d.repository for d in self.datasets}
    # Segregate into fileset and field collections
    self.fileset_collections = []
    self.field_collections = []
    for coll in collections:
        if coll.is_fileset:
            self.fileset_collections.append(coll)
    for coll in collections:
        if coll.is_field:
            self.field_collections.append(coll)
def _list_outputs(self):
    """
    Copy (or symlink) the input files into a newly created directory.

    If ``inputs.file_names`` is defined, the base names of its entries
    are used for the copied files. Otherwise the files are named by their
    zero-padded position in ``inputs.in_files`` (plus their original
    extension) so that they sort in input order.

    Returns
    -------
    outputs : dict
        The interface outputs with 'out_dir' set to the created directory
        and 'file_names' to the names of the files within it

    Raises
    ------
    ArcanaError
        If the number of provided file names doesn't match the number of
        input files
    """
    outputs = self._outputs().get()
    dirname = self.out_dir
    os.makedirs(dirname)
    num_files = len(self.inputs.in_files)
    if isdefined(self.inputs.file_names):
        if len(self.inputs.file_names) != num_files:
            raise ArcanaError(
                "Number of provided filenames ({}) does not match number "
                "of provided files ({})".format(
                    len(self.inputs.file_names), num_files))
        out_files = [op.basename(f) for f in self.inputs.file_names]
    else:
        # Create filenames that will sort ascendingly with the order the
        # file is inputed to the interface. The width is the number of
        # digits in the largest index, which (unlike log10) is also
        # defined when there are no input files
        ndigits = len(str(max(num_files - 1, 0)))
        out_files = []
        for i, fname in enumerate(self.inputs.in_files):
            ext = split_extension(fname)[1]
            ext_str = ext if ext is not None else ''
            out_files.append(str(i).zfill(ndigits) + ext_str)
    file_names = []
    for in_file, out_file in zip(self.inputs.in_files, out_files):
        out_path = op.join(self.out_dir, out_file)
        if self.inputs.use_symlinks:
            os.symlink(in_file, out_path)
        elif op.isdir(in_file):
            shutil.copytree(in_file, out_path)
        else:
            shutil.copy(in_file, out_path)
        file_names.append(op.basename(out_path))
    outputs['out_dir'] = dirname
    outputs['file_names'] = file_names
    return outputs
def __init__(self, name, valid_formats, frequency='per_session',
             desc=None, optional=False, default=None):
    """
    Initialise the input spec with the file formats it accepts.

    Parameters
    ----------
    name : str
        Name of the spec
    valid_formats : object | Iterable
        A single format or an iterable of formats; stored as a tuple in
        ``self._valid_formats``
    frequency : str
        Forwarded to ``BaseFileset.__init__``
    desc : str | None
        Forwarded to ``BaseInputSpecMixin.__init__``
    optional : bool
        Forwarded to ``BaseInputSpecMixin.__init__``
    default : object | None
        Forwarded to ``BaseInputSpecMixin.__init__``

    Raises
    ------
    ArcanaError
        If `valid_formats` is an empty iterable
    """
    # Ensure allowed formats is a tuple
    try:
        formats = tuple(valid_formats)
    except TypeError:
        # Not iterable, so treat it as a single format
        formats = (valid_formats, )
    if not formats:
        raise ArcanaError(
            "'{}' spec doesn't have any allowed formats".format(name))
    self._valid_formats = formats
    BaseFileset.__init__(self, name, None, frequency)
    BaseInputSpecMixin.__init__(self, name, desc, optional=optional,
                                default=default)
def bind(self, analysis, **kwargs):
    """
    Returns a copy of the Spec bound to the given analysis

    Parameters
    ----------
    analysis : Analysis
        A analysis to bind the fileset spec to (should happen in the
        analysis __init__)

    Returns
    -------
    bound : Spec
        A bound copy of the spec, or the spec itself if it was already
        bound

    Raises
    ------
    ArcanaError
        If the analysis has no attribute named ``self.pipeline_getter``
    """
    if self._analysis is not None:
        # Avoid rebinding specs in sub-studies that have already
        # been bound to MultiAnalysis
        return self
    bound = copy(self)
    bound._analysis = analysis
    if not hasattr(analysis, self.pipeline_getter):
        raise ArcanaError(
            "{} does not have a method named '{}' required to "
            "derive {}".format(analysis, self.pipeline_getter, self))
    bound._bind_tree(analysis.dataset.tree)
    return bound
def bind(self, study, **kwargs):
    """
    Returns a copy of the Spec bound to the given study

    Parameters
    ----------
    study : Study
        A study to bind the fileset spec to (should happen in the
        study __init__)

    Returns
    -------
    bound : Spec
        A bound copy of the spec, or the spec itself if it was already
        bound

    Raises
    ------
    ArcanaError
        If the study has no attribute named ``self.pipeline_getter``
    """
    if self._study is not None:
        # Avoid rebinding specs in sub-studies that have already
        # been bound to MultiStudy
        return self
    bound = copy(self)
    bound._study = study
    if not hasattr(study, self.pipeline_getter):
        raise ArcanaError(
            "{} does not have a method named '{}' required to "
            "derive {}".format(study, self.pipeline_getter, self))
    bound._bind_tree(study.tree)
    return bound
def bind(self, analysis, **kwargs):
    """
    Returns a copy of the InputSpec bound to the given analysis

    Parameters
    ----------
    analysis : Analysis
        A analysis to bind the fileset spec to (should happen in the
        analysis __init__)

    Returns
    -------
    bound : InputSpec
        A copy of the spec with its default bound to the analysis, or the
        spec itself if it was already bound

    Raises
    ------
    ArcanaError
        If the spec doesn't have a default value
    """
    if self.default is None:
        raise ArcanaError(
            ("Attempted to bind '{}' to {} but only acquired specs with "
             "a default value should be bound to studies").format(
                 self.name, analysis))
    if self._analysis is not None:
        # This avoids rebinding specs to sub-studies that have already
        # been bound to the multi-analysis
        return self
    bound = copy(self)
    bound._analysis = analysis
    bound._default = bound.default.bind(analysis)
    return bound
def output_file_path(self, fname, from_analysis, subject=None, visit=None,
                     frequency='per_session'):
    """
    Return the path of an output file, whether acquired or derived.

    The base class is queried for both the acquired (``derived=False``)
    and derived (``derived=True``) locations. The acquired path is
    returned if it exists, otherwise the derived path is returned.

    Parameters
    ----------
    fname : str
        Name of the file
    from_analysis : str
        Name of the analysis that produced the file
    subject : str | None
        Forwarded to the base class
    visit : str | None
        Forwarded to the base class
    frequency : str
        Forwarded to the base class

    Returns
    -------
    path : str | None
        The acquired path if it exists, otherwise the derived path (None
        if the base class raised a KeyError for it)

    Raises
    ------
    ArcanaError
        If both the acquired and derived paths exist
    """
    try:
        acq_path = self.BASE_CLASS.output_file_path(
            self, fname, from_analysis, subject=subject, visit=visit,
            frequency=frequency, derived=False)
    except KeyError:
        acq_path = None
    try:
        proc_path = self.BASE_CLASS.output_file_path(
            self, fname, from_analysis, subject=subject, visit=visit,
            frequency=frequency, derived=True)
    except KeyError:
        proc_path = None
    if acq_path is not None and op.exists(acq_path):
        # proc_path is None when the derived lookup failed, in which case
        # it can't clash (and op.exists(None) would raise a TypeError)
        if proc_path is not None and op.exists(proc_path):
            raise ArcanaError(
                "Both acquired and derived paths were found for "
                "'{}_{}' ({} and {})".format(from_analysis, fname,
                                             acq_path, proc_path))
        path = acq_path
    else:
        path = proc_path
    return path
def name(self):
    """
    Return the name of the atlas.

    Raises
    ------
    ArcanaError
        If the name has not been set
    """
    if self._name is not None:
        return self._name
    raise ArcanaError(
        "Name for atlas hasn't been set, it should be set in when "
        "it is passed as a default")
def analysis(self):
    """
    Return the analysis this object has been bound to.

    Raises
    ------
    ArcanaError
        If the object has not been bound to an analysis
    """
    if self._analysis is not None:
        return self._analysis
    raise ArcanaError(
        "Can't access analysis property as {} has not been bound"
        .format(self))
def _make_inputnode(self, frequency):
    """
    Generates an input node for the given frequency. It also adds implicit
    file format conversion nodes to the pipeline.

    Parameters
    ----------
    frequency : str
        The frequency (i.e. 'per_session', 'per_visit', 'per_subject' or
        'per_dataset') of the input node to retrieve

    Returns
    -------
    inputnode : object
        The node returned by ``self.add`` for the identity interface that
        supplies the inputs and iterator IDs of the requested frequency

    Raises
    ------
    ArcanaError
        If there are neither inputs nor iterator fields for the frequency
    ArcanaNoConverterError
        Re-raised, with extra context appended to its message, if a
        required format converter cannot be found
    """
    # Collect the bound specs of the inputs matching the given frequency
    inputs = {}
    for input_name in self.input_names:
        spec = self.analysis.bound_spec(input_name)
        if spec.frequency == frequency:
            inputs[input_name] = spec
    # Field names of the input node: the matching inputs plus fields to
    # hold the iterator IDs
    input_names = (list(inputs.keys())
                   + list(self.analysis.FREQUENCIES[frequency]))
    if not input_names:
        raise ArcanaError(
            "No inputs to '{}' pipeline for requested freqency '{}'".
            format(self.name, frequency))
    # Generate input node and connect it to appropriate nodes
    inputnode = self.add('{}_inputnode'.format(frequency),
                         IdentityInterface(fields=input_names))
    # Loop through list of nodes connected to analysis data specs and
    # connect them to the newly created input node
    for input_name, spec in inputs.items():
        # Keep track of previous conversion nodes to avoid replicating the
        # conversion for inputs that are used in multiple places
        prev_conv_nodes = {}
        for (node, node_in, fmt,
             conv_kwargs) in self._input_conns[input_name]:
            if not self.requires_conversion(spec, fmt):
                # Formats match, connect straight to the input node
                self.connect(inputnode, spec.name, node, node_in)
                continue
            # Fileset formats differ between analysis and pipeline
            # inputs, so route the input through a converter node
            # (created only once per target format)
            try:
                conv = fmt.converter_from(spec.format, **conv_kwargs)
            except ArcanaNoConverterError as e:
                e.msg += (
                    "which is required to convert '{}' from {} to {} "
                    "for '{}' input of '{}' node in '{}' pipeline".
                    format(spec.name, spec.format, fmt, node_in,
                           node.name, self.name))
                raise e
            if fmt.name not in prev_conv_nodes:
                prev_conv_nodes[fmt.name] = self.add(
                    'conv_{}_to_{}_format'.format(spec.name, fmt.name),
                    conv.interface,
                    inputs={conv.input: (inputnode, spec.name)},
                    requirements=conv.requirements,
                    mem_gb=conv.mem_gb,
                    wall_time=conv.wall_time)
            conv_node = prev_conv_nodes[fmt.name]
            try:
                conv_out = conv.output_aux(fmt.aux_name)
            except AttributeError:
                # Not an auxiliary pointer
                conv_out = conv.output
            self.connect(conv_node, conv_out, node, node_in)
    # Connect iterator inputs
    for iterator, conns in self._iterator_conns.items():
        # Check to see if this is the right frequency for the iterator
        # input, i.e. if it is the only iterator for this frequency
        if self.analysis.FREQUENCIES[frequency] == (iterator, ):
            for (node, node_in, _) in conns:
                self.connect(inputnode, iterator, node, node_in)
    return inputnode
def study(self):
    """
    Return the study this object has been bound to.

    Raises
    ------
    ArcanaError
        If the object has not been bound to a study
    """
    if self._study is not None:
        return self._study
    raise ArcanaError(
        "Can't access study property as {} has not been bound".format(
            self))
def analysis(self):
    """
    Return the analysis this object is bound to.

    Raises
    ------
    ArcanaError
        If the object is not bound to an analysis
    """
    if self._analysis is not None:
        return self._analysis
    raise ArcanaError("{} is not bound to a analysis".format(self))