def cache_inputs(self):
    """
    Runs the Study's repository source node for each of the inputs of the
    study, thereby caching any data required from remote repositories.
    Useful when launching many parallel jobs that would otherwise all try
    to access the remote repository concurrently, which would probably
    lead to timeout errors.
    """
    workflow = pe.Workflow(name='cache_download',
                           base_dir=self.processor.work_dir)
    subjects = pe.Node(IdentityInterface(['subject_id']), name='subjects',
                       environment=self.environment)
    sessions = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                       name='sessions', environment=self.environment)
    subjects.iterables = ('subject_id', tuple(self.subject_ids))
    sessions.iterables = ('visit_id', tuple(self.visit_ids))
    source = pe.Node(RepositorySource(
        self.bound_spec(i).collection for i in self.inputs), name='source')
    workflow.connect(subjects, 'subject_id', sessions, 'subject_id')
    workflow.connect(sessions, 'subject_id', source, 'subject_id')
    workflow.connect(sessions, 'visit_id', source, 'visit_id')
    workflow.run()

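# Hedged usage sketch (not from this codebase): how `cache_inputs` might be
# called before submitting parallel jobs, so that every job reads from the
# local cache instead of hitting the remote repository at the same time.
# `MyStudy`, `repository`, `processor` and `my_inputs` are illustrative
# placeholders only.
#
#     study = MyStudy('example_study', repository, processor,
#                     inputs=my_inputs)
#     study.cache_inputs()  # download all required data once, up front
#     # ...now submit the parallel jobs, which reuse the cached files
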
def test_fields_roundtrip(self):
    STUDY_NAME = 'fields_roundtrip'
    study = DummyStudy(STUDY_NAME,
                       self.repository,
                       processor=SingleProc('a_dir'),
                       inputs=[])
    dummy_pipeline = study.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(RepositorySink(
        (study.bound_spec(f).collection
         for f in ['field1', 'field2', 'field3']),
        dummy_pipeline),
        name='fields_sink')
    sink.inputs.field1_field = field1 = 1
    sink.inputs.field2_field = field2 = 2.0
    sink.inputs.field3_field = field3 = '3'
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.desc = "Test sink of fields"
    sink.inputs.name = 'test_sink'
    sink.run()
    source = pe.Node(RepositorySource(
        study.bound_spec(f).collection
        for f in ['field1', 'field2', 'field3']),
        name='fields_source')
    source.inputs.visit_id = self.VISIT
    source.inputs.subject_id = self.SUBJECT
    source.inputs.desc = "Test source of fields"
    source.inputs.name = 'test_source'
    results = source.run()
    self.assertEqual(results.outputs.field1_field, field1)
    self.assertEqual(results.outputs.field2_field, field2)
    self.assertEqual(results.outputs.field3_field, field3)

def test_fields_roundtrip(self):
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset=dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format)])
    fields = ['field{}'.format(i) for i in range(1, 4)]
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(RepositorySink(
        (analysis.bound_spec(f).slice for f in fields),
        dummy_pipeline),
        name='fields_sink')
    sink.inputs.field1_field = field1 = 1
    sink.inputs.field2_field = field2 = 2.0
    sink.inputs.field3_field = field3 = '3'
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.desc = "Test sink of fields"
    sink.inputs.name = 'test_sink'
    sink.run()
    source = pe.Node(RepositorySource(
        analysis.bound_spec(f).slice for f in fields),
        name='fields_source')
    source.inputs.visit_id = self.VISIT
    source.inputs.subject_id = self.SUBJECT
    source.inputs.desc = "Test source of fields"
    source.inputs.name = 'test_source'
    results = source.run()
    self.assertEqual(results.outputs.field1_field, field1)
    self.assertEqual(results.outputs.field2_field, field2)
    self.assertEqual(results.outputs.field3_field, field3)

def test_repository_roundtrip(self):
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        self.dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format),
                FilesetFilter('source2', 'source2', text_format),
                FilesetFilter('source3', 'source3', text_format),
                FilesetFilter('source4', 'source4', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ('source1', 'source2', 'source3', 'source4')
    sink_files = ('sink1', 'sink3', 'sink4')
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(RepositorySource(
        analysis.bound_spec(f).slice for f in source_files), name='source')
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(RepositorySink(
        (analysis.bound_spec(f).slice for f in sink_files),
        dummy_pipeline),
        name='sink')
    sink.inputs.name = 'repository_sink'
    sink.inputs.desc = (
        "A test session created by repository roundtrip unittest")
    # Create workflow connecting them together
    workflow = pe.Workflow('source_sink_unit_test', base_dir=self.work_dir)
    workflow.add_nodes((source, sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
    for source_name in source_files:
        if not source_name.endswith('2'):
            sink_name = source_name.replace('source', 'sink')
            workflow.connect(source, source_name + PATH_SUFFIX,
                             sink, sink_name + PATH_SUFFIX)
    workflow.run()
    # Check local directory was created properly
    outputs = [
        f for f in sorted(os.listdir(
            self.get_session_dir(from_analysis=self.STUDY_NAME)))
        if f not in (LocalFileSystemRepo.FIELDS_FNAME,
                     LocalFileSystemRepo.PROV_DIR)]
    self.assertEqual(outputs, ['sink1.txt', 'sink3.txt', 'sink4.txt'])

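# Note (an assumption inferred from usage in these tests, e.g. the
# `sink1_path` and `field1_field` inputs set elsewhere in this section, not
# a confirmed constant value): PATH_SUFFIX appears to expand a fileset name
# such as 'source1' into its path port name ('source1_path'), so the loop
# above wires each source fileset's path output to the matching sink input,
# roughly equivalent to:
#
#     workflow.connect(source, 'source1_path', sink, 'sink1_path')
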
def test_summary(self):
    study = DummyStudy(
        self.SUMMARY_STUDY_NAME,
        self.repository,
        SingleProc('ad'),
        inputs=[InputFilesets('source1', 'source1', text_format),
                InputFilesets('source2', 'source2', text_format),
                InputFilesets('source3', 'source3', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3']
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(RepositorySource(
        study.bound_spec(f).collection for f in source_files),
        name='source')
    # Test subject sink
    subject_sink_files = ['subject_sink']
    dummy_pipeline = study.dummy_pipeline()
    dummy_pipeline.cap()
    subject_sink = pe.Node(RepositorySink(
        (study.bound_spec(f).collection for f in subject_sink_files),
        dummy_pipeline),
        name='subject_sink')
    subject_sink.inputs.name = 'subject_summary'
    subject_sink.inputs.desc = (
        "Tests the sinking of subject-wide filesets")
    # Test visit sink
    visit_sink_files = ['visit_sink']
    visit_sink = pe.Node(RepositorySink(
        (study.bound_spec(f).collection for f in visit_sink_files),
        dummy_pipeline),
        name='visit_sink')
    visit_sink.inputs.name = 'visit_summary'
    visit_sink.inputs.desc = ("Tests the sinking of visit-wide filesets")
    # Test project sink
    study_sink_files = ['study_sink']
    study_sink = pe.Node(RepositorySink(
        (study.bound_spec(f).collection for f in study_sink_files),
        dummy_pipeline),
        name='study_sink')
    study_sink.inputs.name = 'project_summary'
    study_sink.inputs.desc = ("Tests the sinking of project-wide filesets")
    # Create workflow connecting them together
    workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
    workflow.add_nodes((source, subject_sink, visit_sink, study_sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
    workflow.connect(source, 'source1' + PATH_SUFFIX,
                     subject_sink, 'subject_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source2' + PATH_SUFFIX,
                     visit_sink, 'visit_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source3' + PATH_SUFFIX,
                     study_sink, 'study_sink' + PATH_SUFFIX)
    workflow.run()
    # Check local summary directories were created properly
    subject_dir = self.get_session_dir(
        frequency='per_subject',
        from_study=self.SUMMARY_STUDY_NAME)
    self.assertEqual(sorted(os.listdir(subject_dir)),
                     [BasicRepo.PROV_DIR, 'subject_sink.txt'])
    visit_dir = self.get_session_dir(
        frequency='per_visit',
        from_study=self.SUMMARY_STUDY_NAME)
    self.assertEqual(sorted(os.listdir(visit_dir)),
                     [BasicRepo.PROV_DIR, 'visit_sink.txt'])
    project_dir = self.get_session_dir(
        frequency='per_study',
        from_study=self.SUMMARY_STUDY_NAME)
    self.assertEqual(sorted(os.listdir(project_dir)),
                     [BasicRepo.PROV_DIR, 'study_sink.txt'])
    # Reload the data from the summary directories
    reloadinputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']),
        name='reload_inputnode')
    reloadinputnode.inputs.subject_id = self.SUBJECT
    reloadinputnode.inputs.visit_id = self.VISIT
    reloadsource_per_subject = pe.Node(RepositorySource(
        study.bound_spec(f).collection for f in subject_sink_files),
        name='reload_source_per_subject')
    reloadsource_per_visit = pe.Node(RepositorySource(
        study.bound_spec(f).collection for f in visit_sink_files),
        name='reload_source_per_visit')
    reloadsource_per_study = pe.Node(RepositorySource(
        study.bound_spec(f).collection for f in study_sink_files),
        name='reload_source_per_study')
    reloadsink = pe.Node(RepositorySink(
        (study.bound_spec(f).collection
         for f in ['resink1', 'resink2', 'resink3']),
        dummy_pipeline),
        name='reload_sink')
    reloadsink.inputs.name = 'reload_summary'
    reloadsink.inputs.desc = (
        "Tests the reloading of subject and project summary filesets")
    reloadworkflow = pe.Workflow('reload_summary_unittest',
                                 base_dir=self.work_dir)
    for node in (reloadsource_per_subject, reloadsource_per_visit,
                 reloadsource_per_study, reloadsink):
        for iterator in ('subject_id', 'visit_id'):
            reloadworkflow.connect(reloadinputnode, iterator,
                                   node, iterator)
    reloadworkflow.connect(reloadsource_per_subject,
                           'subject_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink1' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_visit,
                           'visit_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink2' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_study,
                           'study_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink3' + PATH_SUFFIX)
    reloadworkflow.run()
    outputs = [
        f for f in sorted(os.listdir(
            self.get_session_dir(from_study=self.SUMMARY_STUDY_NAME)))
        if f not in (BasicRepo.FIELDS_FNAME, BasicRepo.PROV_DIR)]
    self.assertEqual(outputs,
                     ['resink1.txt', 'resink2.txt', 'resink3.txt'])

def test_repository_roundtrip(self):
    # Create working dirs and the XNAT source node
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset=dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format),
                FilesetFilter('source2', 'source2', text_format),
                FilesetFilter('source3', 'source3', text_format),
                FilesetFilter('source4', 'source4', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3', 'source4']
    sink_files = ['sink1', 'sink3', 'sink4']
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = str(self.SUBJECT)
    inputnode.inputs.visit_id = str(self.VISIT)
    source = pe.Node(RepositorySource(
        analysis.bound_spec(f).slice for f in source_files), name='source')
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(RepositorySink(
        (analysis.bound_spec(f).slice for f in sink_files),
        dummy_pipeline),
        name='sink')
    sink.inputs.name = 'repository-roundtrip-unittest'
    sink.inputs.desc = (
        "A test session created by repository roundtrip unittest")
    # Create workflow connecting them together
    workflow = pe.Workflow('source-sink-unit-test', base_dir=self.work_dir)
    workflow.add_nodes((source, sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
    for source_name in source_files:
        if source_name != 'source2':
            sink_name = source_name.replace('source', 'sink')
            workflow.connect(source, source_name + PATH_SUFFIX,
                             sink, sink_name + PATH_SUFFIX)
    workflow.run()
    # Check cache was created properly
    self.assertEqual(
        filter_scans(os.listdir(self.session_cache())),
        ['source1-source1', 'source2-source2',
         'source3-source3', 'source4-source4'])
    expected_sink_filesets = ['sink1', 'sink3', 'sink4']
    self.assertEqual(
        filter_scans(os.listdir(
            self.session_cache(from_analysis=self.STUDY_NAME))),
        [(e + '-' + e) for e in expected_sink_filesets])
    with self._connect() as login:
        fileset_names = filter_scans(login.experiments[self.session_label(
            from_analysis=self.STUDY_NAME)].scans.keys())
        self.assertEqual(fileset_names, expected_sink_filesets)

def test_summary(self):
    # Create working dirs and the XNAT source node
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    analysis = DummyAnalysis(
        self.SUMMARY_STUDY_NAME,
        repository.dataset(self.project),
        SingleProc('ad'),
        inputs=[FilesetFilter('source1', 'source1', text_format),
                FilesetFilter('source2', 'source2', text_format),
                FilesetFilter('source3', 'source3', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3']
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(RepositorySource(
        [analysis.bound_spec(f).slice for f in source_files]),
        name='source')
    # Test subject sink
    subject_sink_files = ['subject_sink']
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    subject_sink = pe.Node(RepositorySink(
        [analysis.bound_spec(f).slice for f in subject_sink_files],
        dummy_pipeline),
        name='subject_sink')
    subject_sink.inputs.name = 'subject_summary'
    subject_sink.inputs.desc = (
        "Tests the sinking of subject-wide filesets")
    # Test visit sink
    visit_sink_files = ['visit_sink']
    visit_sink = pe.Node(RepositorySink(
        [analysis.bound_spec(f).slice for f in visit_sink_files],
        dummy_pipeline),
        name='visit_sink')
    visit_sink.inputs.name = 'visit_summary'
    visit_sink.inputs.desc = ("Tests the sinking of visit-wide filesets")
    # Test project sink
    analysis_sink_files = ['analysis_sink']
    analysis_sink = pe.Node(RepositorySink(
        [analysis.bound_spec(f).slice for f in analysis_sink_files],
        dummy_pipeline),
        name='analysis_sink')
    analysis_sink.inputs.name = 'project_summary'
    analysis_sink.inputs.desc = (
        "Tests the sinking of project-wide filesets")
    # Create workflow connecting them together
    workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
    workflow.add_nodes((source, subject_sink, visit_sink, analysis_sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
    workflow.connect(source, 'source1' + PATH_SUFFIX,
                     subject_sink, 'subject_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source2' + PATH_SUFFIX,
                     visit_sink, 'visit_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source3' + PATH_SUFFIX,
                     analysis_sink, 'analysis_sink' + PATH_SUFFIX)
    workflow.run()
    analysis.clear_caches()  # Refresh the cached repository tree object
    with self._connect() as login:
        # Check subject summary directories were created properly in cache
        expected_subj_filesets = ['subject_sink']
        subject_dir = self.session_cache(
            visit=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(filter_scans(os.listdir(subject_dir)),
                         [(e + '-' + e) for e in expected_subj_filesets])
        # ... and on XNAT
        subject_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                visit=XnatRepo.SUMMARY_NAME,
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_subj_filesets, subject_fileset_names)
        # Check visit summary directories were created properly in cache
        expected_visit_filesets = ['visit_sink']
        visit_dir = self.session_cache(
            subject=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(filter_scans(os.listdir(visit_dir)),
                         [(e + '-' + e) for e in expected_visit_filesets])
        # ... and on XNAT
        visit_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                subject=XnatRepo.SUMMARY_NAME,
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_visit_filesets, visit_fileset_names)
        # Check project summary directories were created properly in cache
        expected_proj_filesets = ['analysis_sink']
        project_dir = self.session_cache(
            subject=XnatRepo.SUMMARY_NAME,
            visit=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(filter_scans(os.listdir(project_dir)),
                         [(e + '-' + e) for e in expected_proj_filesets])
        # ... and on XNAT
        project_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                subject=XnatRepo.SUMMARY_NAME,
                visit=XnatRepo.SUMMARY_NAME,
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_proj_filesets, project_fileset_names)
    # Reload the data from the summary directories
    reloadinputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']),
        'reload_inputnode')
    reloadinputnode.inputs.subject_id = self.SUBJECT
    reloadinputnode.inputs.visit_id = self.VISIT
    reloadsource_per_subject = pe.Node(RepositorySource(
        analysis.bound_spec(f).slice for f in subject_sink_files),
        name='reload_source_per_subject')
    reloadsource_per_visit = pe.Node(RepositorySource(
        analysis.bound_spec(f).slice for f in visit_sink_files),
        name='reload_source_per_visit')
    reloadsource_per_dataset = pe.Node(RepositorySource(
        analysis.bound_spec(f).slice for f in analysis_sink_files),
        name='reload_source_per_dataset')
    reloadsink = pe.Node(RepositorySink(
        (analysis.bound_spec(f).slice
         for f in ['resink1', 'resink2', 'resink3']),
        dummy_pipeline),
        name='reload_sink')
    reloadsink.inputs.name = 'reload_summary'
    reloadsink.inputs.desc = (
        "Tests the reloading of subject and project summary filesets")
    reloadworkflow = pe.Workflow('reload_summary_unittest',
                                 base_dir=self.work_dir)
    for node in (reloadsource_per_subject, reloadsource_per_visit,
                 reloadsource_per_dataset, reloadsink):
        for iterator in ('subject_id', 'visit_id'):
            reloadworkflow.connect(reloadinputnode, iterator,
                                   node, iterator)
    reloadworkflow.connect(reloadsource_per_subject,
                           'subject_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink1' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_visit,
                           'visit_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink2' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_dataset,
                           'analysis_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink3' + PATH_SUFFIX)
    reloadworkflow.run()
    # Check that the resinked filesets are present in the cache
    self.assertEqual(
        filter_scans(os.listdir(self.session_cache(
            from_analysis=self.SUMMARY_STUDY_NAME))),
        ['resink1-resink1', 'resink2-resink2', 'resink3-resink3'])
    # ... and on XNAT
    with self._connect() as login:
        resinked_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(sorted(resinked_fileset_names),
                         ['resink1', 'resink2', 'resink3'])

def test_checksums(self):
    """
    Tests the check of downloaded checksums to determine whether a file
    needs to be re-downloaded
    """
    cache_dir = op.join(self.work_dir, 'cache-checksum-check')
    DATASET_NAME = 'source1'
    STUDY_NAME = 'checksum_check_analysis'
    fileset_fname = DATASET_NAME + text_format.extension
    source_target_path = op.join(self.session_cache(cache_dir),
                                 DATASET_NAME + '-' + DATASET_NAME)
    md5_path = source_target_path + XnatRepo.MD5_SUFFIX
    source_target_fpath = op.join(source_target_path, fileset_fname)
    shutil.rmtree(cache_dir, ignore_errors=True)
    os.makedirs(cache_dir)
    source_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    source_dataset = source_repository.dataset(self.project)
    sink_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    sink_dataset = sink_repository.dataset(self.checksum_sink_project,
                                           subject_ids=['SUBJECT'],
                                           visit_ids=['VISIT'],
                                           fill_tree=True)
    analysis = DummyAnalysis(
        STUDY_NAME,
        dataset=sink_dataset,
        processor=SingleProc('ad'),
        inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format,
                              dataset=source_dataset)])
    source = pe.Node(RepositorySource(
        [analysis.bound_spec(DATASET_NAME).slice]),
        name='checksum_check_source')
    source.inputs.subject_id = self.SUBJECT
    source.inputs.visit_id = self.VISIT
    source.run()
    self.assertTrue(op.exists(md5_path))
    self.assertTrue(op.exists(source_target_fpath))
    with open(md5_path) as f:
        checksums = json.load(f)
    # Stash the downloaded file in a new location and create a dummy
    # file instead
    stash_path = source_target_path + '.stash'
    shutil.move(source_target_path, stash_path)
    os.mkdir(source_target_path)
    with open(source_target_fpath, 'w') as f:
        f.write('dummy')
    # Run the download, which shouldn't re-download anything as the
    # checksums are the same
    source.run()
    with open(source_target_fpath) as f:
        d = f.read()
    self.assertEqual(d, 'dummy')
    # Replace the checksum with a dummy
    os.remove(md5_path)
    checksums['.'] = 'dummy_checksum'
    with open(md5_path, 'w', **JSON_ENCODING) as f:
        json.dump(checksums, f, indent=2)
    # Retry the download, which should now re-download since the checksums
    # differ
    source.run()
    with open(source_target_fpath) as f:
        d = f.read()
    with open(op.join(stash_path, fileset_fname)) as f:
        e = f.read()
    self.assertEqual(d, e)
    # Resink the source file and check that the generated MD5 checksum is
    # stored in identical format
    DATASET_NAME = 'sink1'
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(RepositorySink(
        [analysis.bound_spec(DATASET_NAME).slice], dummy_pipeline),
        name='checksum_check_sink')
    sink.inputs.name = 'checksum_check_sink'
    sink.inputs.desc = "Tests the generation of MD5 checksums"
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.sink1_path = source_target_fpath
    sink_target_path = op.join(
        self.session_cache(cache_dir,
                           project=self.checksum_sink_project,
                           subject=self.SUBJECT,
                           from_analysis=STUDY_NAME),
        DATASET_NAME + '-' + DATASET_NAME)
    sink_md5_path = sink_target_path + XnatRepo.MD5_SUFFIX
    sink.run()
    with open(md5_path) as f:
        source_checksums = json.load(f)
    with open(sink_md5_path) as f:
        sink_checksums = json.load(f)
    self.assertEqual(
        source_checksums, sink_checksums,
        ("Source checksum ({}) did not equal sink checksum ({})".format(
            source_checksums, sink_checksums)))

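# Illustrative sketch (an assumption, not the XnatRepo API): the checksum
# test above hinges on comparing a cached file's MD5 digest against the one
# recorded alongside the download, and only re-downloading on a mismatch.
# The helper names below are hypothetical.
import hashlib


def _file_md5(path):
    """Return the MD5 hex digest of the file at `path`."""
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


def _needs_redownload(cached_fpath, recorded_checksum):
    """True if the cached file no longer matches the recorded checksum."""
    return _file_md5(cached_fpath) != recorded_checksum
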
def test_delayed_download(self):
    """
    Tests the handling of race conditions where separate processes attempt
    to cache the same fileset
    """
    cache_dir = op.join(self.work_dir, 'cache-delayed-download')
    DATASET_NAME = 'source1'
    target_path = op.join(self.session_cache(cache_dir), DATASET_NAME,
                          DATASET_NAME + text_format.extension)
    tmp_dir = target_path + '.download'
    shutil.rmtree(cache_dir, ignore_errors=True)
    os.makedirs(cache_dir)
    repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset,
        SingleProc('ad'),
        inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format)])
    source = pe.Node(RepositorySource(
        [analysis.bound_spec(DATASET_NAME).slice]),
        name='delayed_source')
    source.inputs.subject_id = self.SUBJECT
    source.inputs.visit_id = self.VISIT
    result1 = source.run()
    source1_path = result1.outputs.source1_path
    self.assertTrue(op.exists(source1_path))
    self.assertEqual(
        source1_path, target_path,
        "Output file path '{}' not equal to target path '{}'".format(
            source1_path, target_path))
    # Clear the cache to start again
    shutil.rmtree(cache_dir, ignore_errors=True)
    # Create tmp_dir before running the interface; this time it should wait
    # for 1 second, check that the session hasn't been created, then clear
    # the directory and re-download the fileset.
    os.makedirs(tmp_dir)
    source.inputs.race_cond_delay = 1
    result2 = source.run()
    source1_path = result2.outputs.source1_path
    # Clear the cache to start again
    shutil.rmtree(cache_dir, ignore_errors=True)
    # This time a simulated download in a separate process keeps updating
    # tmp_dir, so the source should wait until that download is finalised
    # instead of re-downloading the fileset itself.
    internal_dir = op.join(tmp_dir, 'internal')
    deleted_tmp_dir = tmp_dir + '.deleted'

    def simulate_download():
        "Simulates a download in a separate process"
        os.makedirs(internal_dir)
        time.sleep(5)
        # Modify a file in the temp dir to make the source download keep
        # waiting
        logger.info('Updating simulated download directory')
        with open(op.join(internal_dir, 'download'), 'a') as f:
            f.write('downloading')
        time.sleep(10)
        # Simulate the finalising of the download by copying the
        # previously downloaded file into place and deleting the temp dir.
        logger.info('Finalising simulated download')
        with open(target_path, 'a') as f:
            f.write('simulated')
        shutil.move(tmp_dir, deleted_tmp_dir)

    source.inputs.race_cond_delay = 10
    p = Process(target=simulate_download)
    p.start()  # Start the simulated download in a separate process
    time.sleep(1)
    source.run()  # Run the local download
    p.join()
    with open(op.join(deleted_tmp_dir, 'internal', 'download')) as f:
        d = f.read()
    self.assertEqual(d, 'downloading')
    with open(target_path) as f:
        d = f.read()
    self.assertEqual(d, 'simulated')

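# Illustrative sketch (an assumption, not the repository's actual API): the
# race-condition handling exercised above amounts to watching the
# '<target>.download' temp directory that another process is writing to and
# only taking over the download if that directory stops changing for longer
# than the race-condition delay. The helper and argument names below are
# hypothetical.
import os
import os.path as op
import time


def _wait_for_concurrent_download(tmp_dir, target_path, delay, poll=1.0):
    """Wait while another process appears to be downloading `target_path`.

    Returns True if the other process finished (the temp dir is gone and
    the target exists); returns False if the temp dir went stale for
    `delay` seconds, in which case the caller should remove it and perform
    the download itself.
    """
    last_mtimes = None
    stale_for = 0.0
    while op.exists(tmp_dir):
        # Snapshot modification times of everything under the temp dir
        mtimes = tuple(
            op.getmtime(op.join(root, name))
            for root, dirs, files in os.walk(tmp_dir)
            for name in dirs + files)
        if mtimes == last_mtimes:
            stale_for += poll
            if stale_for >= delay:
                return False
        else:
            stale_for = 0.0
            last_mtimes = mtimes
        time.sleep(poll)
    return op.exists(target_path)
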
def _connect_pipeline(self, pipeline, required_outputs, workflow,
                      subject_inds, visit_inds, filter_array, force=False):
    """
    Connects a pipeline to an overarching workflow that sets up iterators
    over subjects|visits present in the repository (if required) and
    repository source and sink nodes

    Parameters
    ----------
    pipeline : Pipeline
        The pipeline to connect
    required_outputs : set[str] | None
        The outputs required to be produced by this pipeline. If None all
        are deemed to be required
    workflow : nipype.pipeline.engine.Workflow
        The overarching workflow to connect the pipeline to
    subject_inds : dict[str, int]
        A mapping of subject ID to row index in the filter array
    visit_inds : dict[str, int]
        A mapping of visit ID to column index in the filter array
    filter_array : 2-D numpy.array[bool]
        A two-dimensional boolean array, where rows correspond to subjects
        and columns correspond to visits in the repository. True values
        represent a combination of subject & visit ID to include in the
        current round of processing. Note that if the 'force' flag is not
        set, sessions won't be reprocessed unless the saved provenance
        doesn't match that of the given pipeline.
    force : bool | 'all'
        A flag to force the processing of all sessions in the filter
        array, regardless of whether the parameters|pipeline used to
        generate existing data matches the given pipeline
    """
    if self.reprocess == 'force':
        force = True
    # Close off construction of the pipeline and create its input and
    # output nodes and provenance dictionary
    pipeline.cap()
    # Prepend prerequisite pipelines to complete workflow if they need
    # to be (re)processed
    final_nodes = []
    # The array that represents the subject/visit pairs for which any
    # prerequisite pipeline will be (re)processed, and which therefore
    # need to be included in the processing of the current pipeline. Row
    # indices correspond to subjects and column indices to visits
    prqs_to_process_array = np.zeros((len(subject_inds), len(visit_inds)),
                                     dtype=bool)
    # The array that represents the subject/visit pairs for which any
    # prerequisite pipeline will be skipped due to missing inputs. Row
    # indices correspond to subjects and column indices to visits
    prqs_to_skip_array = np.zeros((len(subject_inds), len(visit_inds)),
                                  dtype=bool)
    for getter_name in pipeline.prerequisites:
        prereq = pipeline.study.pipeline(getter_name)
        if prereq.to_process_array.any():
            final_nodes.append(prereq.node('final'))
            prqs_to_process_array |= prereq.to_process_array
        prqs_to_skip_array |= prereq.to_skip_array
    # Get list of sessions that need to be processed (i.e. if
    # they don't contain the outputs of this pipeline)
    to_process_array, to_protect_array, to_skip_array = self._to_process(
        pipeline, required_outputs, prqs_to_process_array,
        prqs_to_skip_array, filter_array, subject_inds, visit_inds, force)
    # Store the arrays signifying which nodes to process, protect or skip
    # so they can be passed to downstream pipelines
    pipeline.to_process_array = to_process_array
    pipeline.to_protect_array = to_protect_array
    pipeline.to_skip_array = to_skip_array
    # Check to see if there are any sessions to process
    if not to_process_array.any():
        raise ArcanaNoRunRequiredException(
            "No sessions to process for '{}' pipeline".format(
                pipeline.name))
    # Set up workflow to run the pipeline, loading and saving from the
    # repository
    workflow.add_nodes([pipeline._workflow])
    # If prerequisite pipelines need to be processed, connect their
    # "final" nodes to the initial node of this pipeline to ensure that
    # they are all processed before this pipeline is run.
    if final_nodes:
        prereqs = pipeline.add('prereqs', Merge(len(final_nodes)))
        for i, final_node in enumerate(final_nodes, start=1):
            workflow.connect(final_node, 'out', prereqs, 'in{}'.format(i))
    else:
        prereqs = None
    # Construct iterator structure over subjects and sessions to be
    # processed
    iter_nodes = self._iterate(pipeline, to_process_array, subject_inds,
                               visit_inds)
    sources = {}
    # Loop through each frequency present in the pipeline inputs and
    # create a corresponding source node
    for freq in pipeline.input_frequencies:
        try:
            inputs = list(pipeline.frequency_inputs(freq))
        except ArcanaMissingDataException as e:
            raise ArcanaMissingDataException(
                str(e) + ", which is required for pipeline '{}'".format(
                    pipeline.name))
        inputnode = pipeline.inputnode(freq)
        sources[freq] = source = pipeline.add(
            '{}_source'.format(freq),
            RepositorySource(i.collection for i in inputs),
            inputs=({'prereqs': (prereqs, 'out')}
                    if prereqs is not None else {}))
        # Connect iter_nodes to source and input nodes
        for iterator in pipeline.iterators(freq):
            pipeline.connect(iter_nodes[iterator], iterator, source,
                             iterator)
            pipeline.connect(source, iterator, inputnode, iterator)
        for input in inputs:
            pipeline.connect(source, input.suffixed_name, inputnode,
                             input.name)
    deiter_nodes = {}

    def deiter_node_sort_key(it):
        """
        If there are two iter_nodes (i.e. both subject and visit ID) and
        one depends on the other (i.e. if the visit IDs per subject vary
        and vice-versa) we need to ensure that the dependent iterator is
        deiterated (joined) first.
        """
        return iter_nodes[it].itersource is None

    # Connect all outputs to the repository sink, creating a new sink for
    # each frequency level (i.e. 'per_session', 'per_subject', 'per_visit'
    # or 'per_study')
    for freq in pipeline.output_frequencies:
        outputs = list(pipeline.frequency_outputs(freq))
        if pipeline.iterators(freq) - pipeline.iterators():
            raise ArcanaDesignError(
                "Doesn't make sense to output '{}', which are of '{}' "
                "frequency, when the pipeline only iterates over '{}'"
                .format("', '".join(o.name for o in outputs), freq,
                        "', '".join(pipeline.iterators())))
        outputnode = pipeline.outputnode(freq)
        # Connect filesets/fields to the sink node, skipping outputs
        # that are study inputs
        to_connect = {o.suffixed_name: (outputnode, o.name)
                      for o in outputs if o.is_spec}
        # Connect iterators to the sink node
        to_connect.update(
            {i: (iter_nodes[i], i) for i in pipeline.iterators()})
        # Connect checksums/values from sources to the sink node in order
        # to save in provenance, joining where necessary
        for input_freq in pipeline.input_frequencies:
            checksums_to_connect = [
                i.checksum_suffixed_name
                for i in pipeline.frequency_inputs(input_freq)]
            if not checksums_to_connect:
                # Rare case of a pipeline with no inputs, only iter_nodes,
                # which will in all likelihood only occur in unittests
                continue
            # Loop over iterators that need to be joined, i.e. that are
            # present in the input frequency but not the output frequency,
            # and create join nodes
            source = sources[input_freq]
            for iterator in (pipeline.iterators(input_freq) -
                             pipeline.iterators(freq)):
                join = pipeline.add(
                    '{}_to_{}_{}_checksum_join'.format(
                        input_freq, freq, iterator),
                    IdentityInterface(checksums_to_connect),
                    inputs={tc: (source, tc)
                            for tc in checksums_to_connect},
                    joinsource=iterator,
                    joinfield=checksums_to_connect)
                source = join
            to_connect.update(
                {tc: (source, tc) for tc in checksums_to_connect})
        # Add the sink node
        sink = pipeline.add(
            '{}_sink'.format(freq),
            RepositorySink((o.collection for o in outputs), pipeline,
                           required_outputs),
            inputs=to_connect)
        # "De-iterate" (join) over iterators to get back to a single child
        # node by the time we connect to the final node of the pipeline.
        # Set the sink as the default deiterator if there are no iterators
        # to join over (i.e. per_study), or use it as the upstream node to
        # connect the first deiterator for each frequency
        deiter_nodes[freq] = sink  # for per_study the "deiterator" == sink
        for iterator in sorted(pipeline.iterators(freq),
                               key=deiter_node_sort_key):
            # Connect to the previous deiterator or the sink
            # NB: we only need to keep a reference to the last one in the
            # chain in order to connect with the "final" node, so we can
            # overwrite the entry in the 'deiter_nodes' dict
            deiter_nodes[freq] = pipeline.add(
                '{}_{}_deiter'.format(freq, iterator),
                IdentityInterface(['checksums']),
                inputs={'checksums': (deiter_nodes[freq], 'checksums')},
                joinsource=iterator,
                joinfield='checksums')
    # Create a final node, which is used to connect with downstream
    # pipelines
    pipeline.add('final', Merge(len(deiter_nodes)),
                 inputs={'in{}'.format(i): (di, 'checksums')
                         for i, di in enumerate(deiter_nodes.values(),
                                                start=1)})

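# Hedged illustration of the `filter_array` argument documented above (the
# helper and its arguments are assumptions, not part of this module): rows
# index subjects, columns index visits, and True marks a subject/visit pair
# to include in the current round of processing.
import numpy as np


def _make_filter_array(subject_inds, visit_inds, pairs_to_process):
    """Build a boolean subjects x visits inclusion matrix."""
    filter_array = np.zeros((len(subject_inds), len(visit_inds)),
                            dtype=bool)
    for subject_id, visit_id in pairs_to_process:
        filter_array[subject_inds[subject_id], visit_inds[visit_id]] = True
    return filter_array
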