def test_fields_roundtrip(self):
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset=dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format)])
    fields = ['field{}'.format(i) for i in range(1, 4)]
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice for f in fields),
            dummy_pipeline),
        name='fields_sink')
    sink.inputs.field1_field = field1 = 1
    sink.inputs.field2_field = field2 = 2.0
    sink.inputs.field3_field = field3 = '3'
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.desc = "Test sink of fields"
    sink.inputs.name = 'test_sink'
    sink.run()
    source = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in fields),
        name='fields_source')
    source.inputs.visit_id = self.VISIT
    source.inputs.subject_id = self.SUBJECT
    source.inputs.desc = "Test source of fields"
    source.inputs.name = 'test_source'
    results = source.run()
    self.assertEqual(results.outputs.field1_field, field1)
    self.assertEqual(results.outputs.field2_field, field2)
    self.assertEqual(results.outputs.field3_field, field3)

def test_cache_download(self):
    repository = XnatRepo(server=SERVER, cache_dir=tempfile.mkdtemp())
    dataset = repository.dataset(self.project)
    analysis = self.create_analysis(
        TestAnalysis,
        'cache_download',
        inputs=[
            FilesetFilter('fileset1', 'fileset1', text_format),
            FilesetFilter('fileset3', 'fileset3', text_format)],
        dataset=dataset)
    analysis.cache_inputs()
    for subject_id, visits in list(self.STRUCTURE.items()):
        subj_dir = op.join(
            repository.cache_dir, self.project,
            '{}_{}'.format(self.project, subject_id))
        for visit_id in visits:
            sess_dir = op.join(
                subj_dir,
                '{}_{}_{}'.format(self.project, subject_id, visit_id))
            for inpt in analysis.inputs:
                self.assertTrue(
                    op.exists(
                        op.join(sess_dir, inpt.name + '-' + inpt.name)))

def test_repository_roundtrip(self):
    # Create working dirs
    # Create XnatSource node
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset=dataset,
        processor=SingleProc('a_dir'),
        inputs=[
            FilesetFilter('source1', 'source1', text_format),
            FilesetFilter('source2', 'source2', text_format),
            FilesetFilter('source3', 'source3', text_format),
            FilesetFilter('source4', 'source4', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3', 'source4']
    sink_files = ['sink1', 'sink3', 'sink4']
    inputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']), 'inputnode')
    inputnode.inputs.subject_id = str(self.SUBJECT)
    inputnode.inputs.visit_id = str(self.VISIT)
    source = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in source_files),
        name='source')
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice for f in sink_files),
            dummy_pipeline),
        name='sink')
    sink.inputs.name = 'repository-roundtrip-unittest'
    sink.inputs.desc = (
        "A test session created by repository roundtrip unittest")
    # Create workflow connecting them together
    workflow = pe.Workflow('source-sink-unit-test', base_dir=self.work_dir)
    workflow.add_nodes((source, sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
    for source_name in source_files:
        if source_name != 'source2':
            sink_name = source_name.replace('source', 'sink')
            workflow.connect(source, source_name + PATH_SUFFIX,
                             sink, sink_name + PATH_SUFFIX)
    workflow.run()
    # Check that the cache was created properly
    self.assertEqual(
        filter_scans(os.listdir(self.session_cache())),
        ['source1-source1', 'source2-source2',
         'source3-source3', 'source4-source4'])
    expected_sink_filesets = ['sink1', 'sink3', 'sink4']
    self.assertEqual(
        filter_scans(
            os.listdir(self.session_cache(from_analysis=self.STUDY_NAME))),
        [(e + '-' + e) for e in expected_sink_filesets])
    with self._connect() as login:
        fileset_names = filter_scans(
            login.experiments[self.session_label(
                from_analysis=self.STUDY_NAME)].scans.keys())
        self.assertEqual(fileset_names, expected_sink_filesets)

def test_summary(self):
    # Create working dirs
    # Create XnatSource node
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    analysis = DummyAnalysis(
        self.SUMMARY_STUDY_NAME,
        repository.dataset(self.project),
        SingleProc('ad'),
        inputs=[
            FilesetFilter('source1', 'source1', text_format),
            FilesetFilter('source2', 'source2', text_format),
            FilesetFilter('source3', 'source3', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3']
    inputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']), 'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(
        RepositorySource(
            [analysis.bound_spec(f).slice for f in source_files]),
        name='source')
    subject_sink_files = ['subject_sink']
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    subject_sink = pe.Node(
        RepositorySink(
            [analysis.bound_spec(f).slice for f in subject_sink_files],
            dummy_pipeline),
        name='subject_sink')
    subject_sink.inputs.name = 'subject_summary'
    subject_sink.inputs.desc = (
        "Tests the sinking of subject-wide filesets")
    # Test visit sink
    visit_sink_files = ['visit_sink']
    visit_sink = pe.Node(
        RepositorySink(
            [analysis.bound_spec(f).slice for f in visit_sink_files],
            dummy_pipeline),
        name='visit_sink')
    visit_sink.inputs.name = 'visit_summary'
    visit_sink.inputs.desc = "Tests the sinking of visit-wide filesets"
    # Test project sink
    analysis_sink_files = ['analysis_sink']
    analysis_sink = pe.Node(
        RepositorySink(
            [analysis.bound_spec(f).slice for f in analysis_sink_files],
            dummy_pipeline),
        name='analysis_sink')
    analysis_sink.inputs.name = 'project_summary'
    analysis_sink.inputs.desc = (
        "Tests the sinking of project-wide filesets")
    # Create workflow connecting them together
    workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
    workflow.add_nodes((source, subject_sink, visit_sink, analysis_sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
    workflow.connect(source, 'source1' + PATH_SUFFIX,
                     subject_sink, 'subject_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source2' + PATH_SUFFIX,
                     visit_sink, 'visit_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source3' + PATH_SUFFIX,
                     analysis_sink, 'analysis_sink' + PATH_SUFFIX)
    workflow.run()
    analysis.clear_caches()  # Refresh the cached repository tree object
    with self._connect() as login:
        # Check subject summary directories were created properly in cache
        expected_subj_filesets = ['subject_sink']
        subject_dir = self.session_cache(
            visit=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(
            filter_scans(os.listdir(subject_dir)),
            [(e + '-' + e) for e in expected_subj_filesets])
        # and on XNAT
        subject_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                visit=XnatRepo.SUMMARY_NAME,
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_subj_filesets, subject_fileset_names)
        # Check visit summary directories were created properly in cache
        expected_visit_filesets = ['visit_sink']
        visit_dir = self.session_cache(
            subject=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(
            filter_scans(os.listdir(visit_dir)),
            [(e + '-' + e) for e in expected_visit_filesets])
        # and on XNAT
        visit_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                subject=XnatRepo.SUMMARY_NAME,
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_visit_filesets, visit_fileset_names)
        # Check project summary directories were created properly in cache
        expected_proj_filesets = ['analysis_sink']
        project_dir = self.session_cache(
            subject=XnatRepo.SUMMARY_NAME,
            visit=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(
            filter_scans(os.listdir(project_dir)),
            [(e + '-' + e) for e in expected_proj_filesets])
        # and on XNAT
        project_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                subject=XnatRepo.SUMMARY_NAME,
                visit=XnatRepo.SUMMARY_NAME,
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_proj_filesets, project_fileset_names)
    # Reload the data from the summary directories
    reloadinputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']), 'reload_inputnode')
    reloadinputnode.inputs.subject_id = self.SUBJECT
    reloadinputnode.inputs.visit_id = self.VISIT
    reloadsource_per_subject = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in subject_sink_files),
        name='reload_source_per_subject')
    reloadsource_per_visit = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in visit_sink_files),
        name='reload_source_per_visit')
    reloadsource_per_dataset = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in analysis_sink_files),
        name='reload_source_per_dataset')
    reloadsink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice
             for f in ['resink1', 'resink2', 'resink3']),
            dummy_pipeline),
        name='reload_sink')
    reloadsink.inputs.name = 'reload_summary'
    reloadsink.inputs.desc = (
        "Tests the reloading of subject and project summary filesets")
    reloadworkflow = pe.Workflow('reload_summary_unittest',
                                 base_dir=self.work_dir)
    for node in (reloadsource_per_subject, reloadsource_per_visit,
                 reloadsource_per_dataset, reloadsink):
        for iterator in ('subject_id', 'visit_id'):
            reloadworkflow.connect(reloadinputnode, iterator,
                                   node, iterator)
    reloadworkflow.connect(reloadsource_per_subject,
                           'subject_sink' + PATH_SUFFIX,
                           reloadsink, 'resink1' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_visit,
                           'visit_sink' + PATH_SUFFIX,
                           reloadsink, 'resink2' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_dataset,
                           'analysis_sink' + PATH_SUFFIX,
                           reloadsink, 'resink3' + PATH_SUFFIX)
    reloadworkflow.run()
    # Check that the resunk filesets appear in the cache
    self.assertEqual(
        filter_scans(
            os.listdir(self.session_cache(
                from_analysis=self.SUMMARY_STUDY_NAME))),
        ['resink1-resink1', 'resink2-resink2', 'resink3-resink3'])
    # and on XNAT
    with self._connect() as login:
        resinked_fileset_names = filter_scans(
            login.projects[self.project].experiments[self.session_label(
                from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(sorted(resinked_fileset_names),
                         ['resink1', 'resink2', 'resink3'])

def test_checksums(self):
    """
    Tests the checking of downloaded checksums to see whether a file
    needs to be redownloaded
    """
    cache_dir = op.join(self.work_dir, 'cache-checksum-check')
    DATASET_NAME = 'source1'
    STUDY_NAME = 'checksum_check_analysis'
    fileset_fname = DATASET_NAME + text_format.extension
    source_target_path = op.join(self.session_cache(cache_dir),
                                 DATASET_NAME + '-' + DATASET_NAME)
    md5_path = source_target_path + XnatRepo.MD5_SUFFIX
    source_target_fpath = op.join(source_target_path, fileset_fname)
    shutil.rmtree(cache_dir, ignore_errors=True)
    os.makedirs(cache_dir)
    source_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    source_dataset = source_repository.dataset(self.project)
    sink_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    sink_dataset = sink_repository.dataset(
        self.checksum_sink_project, subject_ids=['SUBJECT'],
        visit_ids=['VISIT'], fill_tree=True)
    analysis = DummyAnalysis(
        STUDY_NAME,
        dataset=sink_dataset,
        processor=SingleProc('ad'),
        inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format,
                              dataset=source_dataset)])
    source = pe.Node(
        RepositorySource([analysis.bound_spec(DATASET_NAME).slice]),
        name='checksum_check_source')
    source.inputs.subject_id = self.SUBJECT
    source.inputs.visit_id = self.VISIT
    source.run()
    self.assertTrue(op.exists(md5_path))
    self.assertTrue(op.exists(source_target_fpath))
    with open(md5_path) as f:
        checksums = json.load(f)
    # Stash the downloaded file in a new location and create a dummy
    # file instead
    stash_path = source_target_path + '.stash'
    shutil.move(source_target_path, stash_path)
    os.mkdir(source_target_path)
    with open(source_target_fpath, 'w') as f:
        f.write('dummy')
    # Run the download, which shouldn't download as the checksums are the
    # same
    source.run()
    with open(source_target_fpath) as f:
        d = f.read()
    self.assertEqual(d, 'dummy')
    # Replace the checksum with a dummy
    os.remove(md5_path)
    checksums['.'] = 'dummy_checksum'
    with open(md5_path, 'w', **JSON_ENCODING) as f:
        json.dump(checksums, f, indent=2)
    # Retry the download, which should now download since the checksums
    # differ
    source.run()
    with open(source_target_fpath) as f:
        d = f.read()
    with open(op.join(stash_path, fileset_fname)) as f:
        e = f.read()
    self.assertEqual(d, e)
    # Resink the source file and check that the generated MD5 checksum is
    # stored in identical format
    DATASET_NAME = 'sink1'
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink([analysis.bound_spec(DATASET_NAME).slice],
                       dummy_pipeline),
        name='checksum_check_sink')
    sink.inputs.name = 'checksum_check_sink'
    sink.inputs.desc = "Tests the generation of MD5 checksums"
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.sink1_path = source_target_fpath
    sink_target_path = op.join(
        self.session_cache(cache_dir,
                           project=self.checksum_sink_project,
                           subject=self.SUBJECT,
                           from_analysis=STUDY_NAME),
        DATASET_NAME + '-' + DATASET_NAME)
    sink_md5_path = sink_target_path + XnatRepo.MD5_SUFFIX
    sink.run()
    with open(md5_path) as f:
        source_checksums = json.load(f)
    with open(sink_md5_path) as f:
        sink_checksums = json.load(f)
    self.assertEqual(
        source_checksums, sink_checksums,
        "Source checksum ({}) did not equal sink checksum ({})".format(
            source_checksums, sink_checksums))

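# The behaviour exercised by test_checksums can be summarised with a small
# standalone sketch (illustrative only, not the actual XnatRepo code): the
# repository stores a JSON sidecar of MD5 checksums next to each cached
# fileset and only redownloads when the stored checksums no longer match
# those reported by the server. The helper name `cached_copy_is_valid` is
# hypothetical; it relies on the module's `os.path as op` and `json` imports.
def cached_copy_is_valid(md5_path, remote_checksums):
    """Return True if the cached fileset's checksum sidecar matches the
    checksums reported by the server, i.e. no redownload is required."""
    if not op.exists(md5_path):
        return False  # Never downloaded (or sidecar deleted): download
    with open(md5_path) as f:
        local_checksums = json.load(f)
    # Any mismatch (as created in the test by inserting 'dummy_checksum')
    # marks the cached copy as stale and triggers a fresh download
    return local_checksums == remote_checksums
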
def test_delayed_download(self):
    """
    Tests handling of race conditions where separate processes attempt
    to cache the same fileset
    """
    cache_dir = op.join(self.work_dir, 'cache-delayed-download')
    DATASET_NAME = 'source1'
    target_path = op.join(self.session_cache(cache_dir), DATASET_NAME,
                          DATASET_NAME + text_format.extension)
    tmp_dir = target_path + '.download'
    shutil.rmtree(cache_dir, ignore_errors=True)
    os.makedirs(cache_dir)
    repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset,
        SingleProc('ad'),
        inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format)])
    source = pe.Node(
        RepositorySource([analysis.bound_spec(DATASET_NAME).slice]),
        name='delayed_source')
    source.inputs.subject_id = self.SUBJECT
    source.inputs.visit_id = self.VISIT
    result1 = source.run()
    source1_path = result1.outputs.source1_path
    self.assertTrue(op.exists(source1_path))
    self.assertEqual(
        source1_path, target_path,
        "Output file path '{}' not equal to target path '{}'".format(
            source1_path, target_path))
    # Clear cache to start again
    shutil.rmtree(cache_dir, ignore_errors=True)
    # Create tmp_dir before running the interface; this time the source
    # should wait for 1 second, see that the session hasn't been created,
    # then clear the temp dir and redownload the fileset.
    os.makedirs(tmp_dir)
    source.inputs.race_cond_delay = 1
    result2 = source.run()
    source1_path = result2.outputs.source1_path
    # Clear cache to start again
    shutil.rmtree(cache_dir, ignore_errors=True)
    # This time simulate a download in progress in a separate process:
    # the source should keep waiting while the temp dir is being updated
    # and then use the file produced by the "other" process.
    internal_dir = op.join(tmp_dir, 'internal')
    deleted_tmp_dir = tmp_dir + '.deleted'

    def simulate_download():
        "Simulates a download in a separate process"
        os.makedirs(internal_dir)
        time.sleep(5)
        # Modify a file in the temp dir to make the source download keep
        # waiting
        logger.info('Updating simulated download directory')
        with open(op.join(internal_dir, 'download'), 'a') as f:
            f.write('downloading')
        time.sleep(10)
        # Simulate the finalising of the download by writing the
        # "downloaded" file into place and deleting the temp dir.
        logger.info('Finalising simulated download')
        with open(target_path, 'a') as f:
            f.write('simulated')
        shutil.move(tmp_dir, deleted_tmp_dir)

    source.inputs.race_cond_delay = 10
    p = Process(target=simulate_download)
    p.start()  # Start the simulated download in a separate process
    time.sleep(1)
    source.run()  # Run the local download
    p.join()
    with open(op.join(deleted_tmp_dir, 'internal', 'download')) as f:
        d = f.read()
    self.assertEqual(d, 'downloading')
    with open(target_path) as f:
        d = f.read()
    self.assertEqual(d, 'simulated')
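
# The race-condition handling exercised by test_delayed_download can be
# illustrated with a standalone sketch (an assumption about the mechanism,
# not the actual XnatRepo implementation): a downloader that finds an
# existing `.download` temp dir waits while that dir keeps changing, takes
# over the download once it has been idle for longer than `race_cond_delay`
# seconds, and simply uses the cached file if the other process finishes
# (removes the temp dir) first. The helper name `wait_for_concurrent_download`
# is hypothetical; it relies on the module's `os`, `os.path as op`, `shutil`
# and `time` imports.
def wait_for_concurrent_download(tmp_dir, race_cond_delay, poll_interval=1):
    """Return True if a concurrent download finished (temp dir removed),
    False if it appeared stale and was cleared, in which case the caller
    should download the fileset itself."""
    last_mtime = None
    idle_since = time.time()
    while op.exists(tmp_dir):
        try:
            # Most recent modification time anywhere inside the temp dir;
            # any change (e.g. the 'download' file written by the other
            # process in the test above) resets the idle timer
            mtime = max(
                [op.getmtime(op.join(root, fname))
                 for root, _, fnames in os.walk(tmp_dir)
                 for fname in fnames] + [op.getmtime(tmp_dir)])
        except OSError:
            continue  # Temp dir removed mid-scan; re-check the loop condition
        if mtime != last_mtime:
            last_mtime, idle_since = mtime, time.time()
        elif time.time() - idle_since > race_cond_delay:
            # The other download looks dead: clear its temp dir and tell
            # the caller to redownload
            shutil.rmtree(tmp_dir, ignore_errors=True)
            return False
        time.sleep(poll_interval)
    return True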