Example 1
 def cache_inputs(self):
     """
     Runs the Study's repository source node for each of the inputs
     of the study, thereby caching any data required from remote
     repositories. Useful when launching many parallel jobs that would
     otherwise all try to access the remote repository concurrently,
     which would probably lead to timeout errors.
     """
     workflow = pe.Workflow(name='cache_download',
                            base_dir=self.processor.work_dir)
     subjects = pe.Node(IdentityInterface(['subject_id']),
                        name='subjects',
                        environment=self.environment)
     sessions = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        name='sessions',
                        environment=self.environment)
     subjects.iterables = ('subject_id', tuple(self.subject_ids))
     sessions.iterables = ('visit_id', tuple(self.visit_ids))
     source = pe.Node(RepositorySource(
         self.bound_spec(i).collection for i in self.inputs),
                      name='source')
     workflow.connect(subjects, 'subject_id', sessions, 'subject_id')
     workflow.connect(sessions, 'subject_id', source, 'subject_id')
     workflow.connect(sessions, 'visit_id', source, 'visit_id')
     workflow.run()
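The node names and iterables in cache_inputs mirror a standard nipype pattern. As a point of reference, here is a minimal, self-contained sketch (plain nipype only, made-up subject/visit IDs, no remote repository) of how that iterable expansion produces one downstream run per subject/visit combination:

import nipype.pipeline.engine as pe
from nipype.interfaces.utility import IdentityInterface

# Build a tiny workflow whose 'sessions' node is expanded once per
# subject/visit combination, just as cache_inputs does for its source node
wf = pe.Workflow(name='iterable_demo', base_dir='/tmp/iterable_demo')
subjects = pe.Node(IdentityInterface(['subject_id']), name='subjects')
sessions = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                   name='sessions')
subjects.iterables = ('subject_id', ['SUBJ01', 'SUBJ02'])
sessions.iterables = ('visit_id', ['VISIT1', 'VISIT2'])
wf.connect(subjects, 'subject_id', sessions, 'subject_id')
wf.run()  # expands into 2 x 2 = 4 'sessions' runs, one per subject/visit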
Example 2
 def test_fields_roundtrip(self):
     STUDY_NAME = 'fields_roundtrip'
     study = DummyStudy(STUDY_NAME,
                        self.repository,
                        processor=SingleProc('a_dir'),
                        inputs=[])
     dummy_pipeline = study.dummy_pipeline()
     dummy_pipeline.cap()
     sink = pe.Node(RepositorySink(
         (study.bound_spec(f).collection
          for f in ['field1', 'field2', 'field3']), dummy_pipeline),
                    name='fields_sink')
     sink.inputs.field1_field = field1 = 1
     sink.inputs.field2_field = field2 = 2.0
     sink.inputs.field3_field = field3 = '3'
     sink.inputs.subject_id = self.SUBJECT
     sink.inputs.visit_id = self.VISIT
     sink.inputs.desc = "Test sink of fields"
     sink.inputs.name = 'test_sink'
     sink.run()
     source = pe.Node(RepositorySource(
         study.bound_spec(f).collection
         for f in ['field1', 'field2', 'field3']),
                      name='fields_source')
     source.inputs.visit_id = self.VISIT
     source.inputs.subject_id = self.SUBJECT
     source.inputs.desc = "Test source of fields"
     source.inputs.name = 'test_source'
     results = source.run()
     self.assertEqual(results.outputs.field1_field, field1)
     self.assertEqual(results.outputs.field2_field, field2)
     self.assertEqual(results.outputs.field3_field, field3)
Example 3
 def test_fields_roundtrip(self):
     repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
     dataset = repository.dataset(self.project)
     analysis = DummyAnalysis(
         self.STUDY_NAME,
         dataset=dataset,
         processor=SingleProc('a_dir'),
         inputs=[FilesetFilter('source1', 'source1', text_format)])
     fields = ['field{}'.format(i) for i in range(1, 4)]
     dummy_pipeline = analysis.dummy_pipeline()
     dummy_pipeline.cap()
     sink = pe.Node(RepositorySink(
         (analysis.bound_spec(f).slice for f in fields), dummy_pipeline),
                    name='fields_sink')
     sink.inputs.field1_field = field1 = 1
     sink.inputs.field2_field = field2 = 2.0
      sink.inputs.field3_field = field3 = '3'
     sink.inputs.subject_id = self.SUBJECT
     sink.inputs.visit_id = self.VISIT
     sink.inputs.desc = "Test sink of fields"
     sink.inputs.name = 'test_sink'
     sink.run()
     source = pe.Node(RepositorySource(
         analysis.bound_spec(f).slice for f in fields),
                      name='fields_source')
     source.inputs.visit_id = self.VISIT
     source.inputs.subject_id = self.SUBJECT
     source.inputs.desc = "Test source of fields"
     source.inputs.name = 'test_source'
     results = source.run()
     self.assertEqual(results.outputs.field1_field, field1)
     self.assertEqual(results.outputs.field2_field, field2)
     self.assertEqual(results.outputs.field3_field, field3)
Example 4
 def test_repository_roundtrip(self):
     analysis = DummyAnalysis(self.STUDY_NAME,
                              self.dataset,
                              processor=SingleProc('a_dir'),
                              inputs=[
                                  FilesetFilter('source1', 'source1',
                                                text_format),
                                  FilesetFilter('source2', 'source2',
                                                text_format),
                                  FilesetFilter('source3', 'source3',
                                                text_format),
                                  FilesetFilter('source4', 'source4',
                                                text_format)
                              ])
     # TODO: Should test out other file formats as well.
     source_files = ('source1', 'source2', 'source3', 'source4')
     sink_files = ('sink1', 'sink3', 'sink4')
     inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                         'inputnode')
     inputnode.inputs.subject_id = self.SUBJECT
     inputnode.inputs.visit_id = self.VISIT
     source = pe.Node(RepositorySource(
         analysis.bound_spec(f).slice for f in source_files),
                      name='source')
     dummy_pipeline = analysis.dummy_pipeline()
     dummy_pipeline.cap()
     sink = pe.Node(RepositorySink((analysis.bound_spec(f).slice
                                    for f in sink_files), dummy_pipeline),
                    name='sink')
     sink.inputs.name = 'repository_sink'
     sink.inputs.desc = (
         "A test session created by repository roundtrip unittest")
     # Create workflow connecting them together
     workflow = pe.Workflow('source_sink_unit_test', base_dir=self.work_dir)
     workflow.add_nodes((source, sink))
     workflow.connect(inputnode, 'subject_id', source, 'subject_id')
     workflow.connect(inputnode, 'visit_id', source, 'visit_id')
     workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
     workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
     for source_name in source_files:
         if not source_name.endswith('2'):
             sink_name = source_name.replace('source', 'sink')
             workflow.connect(source, source_name + PATH_SUFFIX, sink,
                              sink_name + PATH_SUFFIX)
     workflow.run()
     # Check local directory was created properly
     outputs = [
         f for f in sorted(
             os.listdir(self.get_session_dir(
                 from_analysis=self.STUDY_NAME)))
         if f not in (LocalFileSystemRepo.FIELDS_FNAME,
                      LocalFileSystemRepo.PROV_DIR)
     ]
     self.assertEqual(outputs, ['sink1.txt', 'sink3.txt', 'sink4.txt'])
Example 5
    def test_summary(self):
        study = DummyStudy(self.SUMMARY_STUDY_NAME,
                           self.repository,
                           SingleProc('ad'),
                           inputs=[
                               InputFilesets('source1', 'source1',
                                             text_format),
                               InputFilesets('source2', 'source2',
                                             text_format),
                               InputFilesets('source3', 'source3', text_format)
                           ])
        # TODO: Should test out other file formats as well.
        source_files = ['source1', 'source2', 'source3']
        inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                            'inputnode')
        inputnode.inputs.subject_id = self.SUBJECT
        inputnode.inputs.visit_id = self.VISIT
        source = pe.Node(RepositorySource(
            study.bound_spec(f).collection for f in source_files),
                         name='source')
        # Test subject sink
        subject_sink_files = ['subject_sink']
        dummy_pipeline = study.dummy_pipeline()
        dummy_pipeline.cap()
        subject_sink = pe.Node(RepositorySink(
            (study.bound_spec(f).collection for f in subject_sink_files),
            dummy_pipeline),
                               name='subject_sink')
        subject_sink.inputs.name = 'subject_summary'
        subject_sink.inputs.desc = (
            "Tests the sinking of subject-wide filesets")
        # Test visit sink
        visit_sink_files = ['visit_sink']
        visit_sink = pe.Node(RepositorySink(
            (study.bound_spec(f).collection for f in visit_sink_files),
            dummy_pipeline),
                             name='visit_sink')
        visit_sink.inputs.name = 'visit_summary'
        visit_sink.inputs.desc = ("Tests the sinking of visit-wide filesets")
        # Test project sink
        study_sink_files = ['study_sink']
        study_sink = pe.Node(RepositorySink(
            (study.bound_spec(f).collection for f in study_sink_files),
            dummy_pipeline),
                             name='study_sink')

        study_sink.inputs.name = 'project_summary'
        study_sink.inputs.desc = ("Tests the sinking of project-wide filesets")
        # Create workflow connecting them together
        workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
        workflow.add_nodes((source, subject_sink, visit_sink, study_sink))
        workflow.connect(inputnode, 'subject_id', source, 'subject_id')
        workflow.connect(inputnode, 'visit_id', source, 'visit_id')
        workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
        workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
        workflow.connect(source, 'source1' + PATH_SUFFIX, subject_sink,
                         'subject_sink' + PATH_SUFFIX)
        workflow.connect(source, 'source2' + PATH_SUFFIX, visit_sink,
                         'visit_sink' + PATH_SUFFIX)
        workflow.connect(source, 'source3' + PATH_SUFFIX, study_sink,
                         'study_sink' + PATH_SUFFIX)
        workflow.run()
        # Check local summary directories were created properly
        subject_dir = self.get_session_dir(frequency='per_subject',
                                           from_study=self.SUMMARY_STUDY_NAME)
        self.assertEqual(sorted(os.listdir(subject_dir)),
                         [BasicRepo.PROV_DIR, 'subject_sink.txt'])
        visit_dir = self.get_session_dir(frequency='per_visit',
                                         from_study=self.SUMMARY_STUDY_NAME)
        self.assertEqual(sorted(os.listdir(visit_dir)),
                         [BasicRepo.PROV_DIR, 'visit_sink.txt'])
        project_dir = self.get_session_dir(frequency='per_study',
                                           from_study=self.SUMMARY_STUDY_NAME)
        self.assertEqual(sorted(os.listdir(project_dir)),
                         [BasicRepo.PROV_DIR, 'study_sink.txt'])
        # Reload the data from the summary directories
        reloadinputnode = pe.Node(IdentityInterface(['subject_id',
                                                     'visit_id']),
                                  name='reload_inputnode')
        reloadinputnode.inputs.subject_id = self.SUBJECT
        reloadinputnode.inputs.visit_id = self.VISIT
        reloadsource_per_subject = pe.Node(RepositorySource(
            study.bound_spec(f).collection for f in subject_sink_files),
                                           name='reload_source_per_subject')
        reloadsource_per_visit = pe.Node(RepositorySource(
            study.bound_spec(f).collection for f in visit_sink_files),
                                         name='reload_source_per_visit')
        reloadsource_per_study = pe.Node(RepositorySource(
            study.bound_spec(f).collection for f in study_sink_files),
                                         name='reload_source_per_study')
        reloadsink = pe.Node(RepositorySink(
            (study.bound_spec(f).collection
             for f in ['resink1', 'resink2', 'resink3']), dummy_pipeline),
                             name='reload_sink')
        reloadsink.inputs.name = 'reload_summary'
        reloadsink.inputs.desc = (
            "Tests the reloading of subject and project summary filesets")
        reloadworkflow = pe.Workflow('reload_summary_unittest',
                                     base_dir=self.work_dir)
        for node in (reloadsource_per_subject, reloadsource_per_visit,
                     reloadsource_per_study, reloadsink):
            for iterator in ('subject_id', 'visit_id'):
                reloadworkflow.connect(reloadinputnode, iterator, node,
                                       iterator)
        reloadworkflow.connect(reloadsource_per_subject,
                               'subject_sink' + PATH_SUFFIX, reloadsink,
                               'resink1' + PATH_SUFFIX)
        reloadworkflow.connect(reloadsource_per_visit,
                               'visit_sink' + PATH_SUFFIX, reloadsink,
                               'resink2' + PATH_SUFFIX)
        reloadworkflow.connect(reloadsource_per_study,
                               'study_sink' + PATH_SUFFIX, reloadsink,
                               'resink3' + PATH_SUFFIX)
        reloadworkflow.run()
        outputs = [
            f for f in sorted(
                os.listdir(
                    self.get_session_dir(from_study=self.SUMMARY_STUDY_NAME)))
            if f not in (BasicRepo.FIELDS_FNAME, BasicRepo.PROV_DIR)
        ]
        self.assertEqual(outputs,
                         ['resink1.txt', 'resink2.txt', 'resink3.txt'])
Example 6
    def test_repository_roundtrip(self):

        # Create the XNAT repository and dummy analysis
        repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
        dataset = repository.dataset(self.project)
        analysis = DummyAnalysis(self.STUDY_NAME,
                                 dataset=dataset,
                                 processor=SingleProc('a_dir'),
                                 inputs=[
                                     FilesetFilter('source1', 'source1',
                                                   text_format),
                                     FilesetFilter('source2', 'source2',
                                                   text_format),
                                     FilesetFilter('source3', 'source3',
                                                   text_format),
                                     FilesetFilter('source4', 'source4',
                                                   text_format)
                                 ])
        # TODO: Should test out other file formats as well.
        source_files = ['source1', 'source2', 'source3', 'source4']
        sink_files = ['sink1', 'sink3', 'sink4']
        inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                            'inputnode')
        inputnode.inputs.subject_id = str(self.SUBJECT)
        inputnode.inputs.visit_id = str(self.VISIT)
        source = pe.Node(RepositorySource(
            analysis.bound_spec(f).slice for f in source_files),
                         name='source')
        dummy_pipeline = analysis.dummy_pipeline()
        dummy_pipeline.cap()
        sink = pe.Node(RepositorySink((analysis.bound_spec(f).slice
                                       for f in sink_files), dummy_pipeline),
                       name='sink')
        sink.inputs.name = 'repository-roundtrip-unittest'
        sink.inputs.desc = (
            "A test session created by repository roundtrip unittest")
        # Create workflow connecting them together
        workflow = pe.Workflow('source-sink-unit-test', base_dir=self.work_dir)
        workflow.add_nodes((source, sink))
        workflow.connect(inputnode, 'subject_id', source, 'subject_id')
        workflow.connect(inputnode, 'visit_id', source, 'visit_id')
        workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
        workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
        for source_name in source_files:
            if source_name != 'source2':
                sink_name = source_name.replace('source', 'sink')
                workflow.connect(source, source_name + PATH_SUFFIX, sink,
                                 sink_name + PATH_SUFFIX)
        workflow.run()
        # Check cache was created properly
        self.assertEqual(filter_scans(os.listdir(self.session_cache())), [
            'source1-source1', 'source2-source2', 'source3-source3',
            'source4-source4'
        ])
        expected_sink_filesets = ['sink1', 'sink3', 'sink4']
        self.assertEqual(
            filter_scans(
                os.listdir(self.session_cache(from_analysis=self.STUDY_NAME))),
            [(e + '-' + e) for e in expected_sink_filesets])
        with self._connect() as login:
            fileset_names = filter_scans(login.experiments[self.session_label(
                from_analysis=self.STUDY_NAME)].scans.keys())
        self.assertEqual(fileset_names, expected_sink_filesets)
Example 7
 def test_summary(self):
      # Create the XNAT repository and dummy analysis
     repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
     analysis = DummyAnalysis(self.SUMMARY_STUDY_NAME,
                              repository.dataset(self.project),
                              SingleProc('ad'),
                              inputs=[
                                  FilesetFilter('source1', 'source1',
                                                text_format),
                                  FilesetFilter('source2', 'source2',
                                                text_format),
                                  FilesetFilter('source3', 'source3',
                                                text_format)
                              ])
     # TODO: Should test out other file formats as well.
     source_files = ['source1', 'source2', 'source3']
     inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                         'inputnode')
     inputnode.inputs.subject_id = self.SUBJECT
     inputnode.inputs.visit_id = self.VISIT
     source = pe.Node(RepositorySource(
         [analysis.bound_spec(f).slice for f in source_files]),
                      name='source')
     subject_sink_files = ['subject_sink']
     dummy_pipeline = analysis.dummy_pipeline()
     dummy_pipeline.cap()
     subject_sink = pe.Node(RepositorySink(
         [analysis.bound_spec(f).slice for f in subject_sink_files],
         dummy_pipeline),
                            name='subject_sink')
     subject_sink.inputs.name = 'subject_summary'
     subject_sink.inputs.desc = (
         "Tests the sinking of subject-wide filesets")
     # Test visit sink
     visit_sink_files = ['visit_sink']
     visit_sink = pe.Node(RepositorySink(
         [analysis.bound_spec(f).slice for f in visit_sink_files],
         dummy_pipeline),
                          name='visit_sink')
     visit_sink.inputs.name = 'visit_summary'
     visit_sink.inputs.desc = ("Tests the sinking of visit-wide filesets")
     # Test project sink
     analysis_sink_files = ['analysis_sink']
     analysis_sink = pe.Node(RepositorySink(
         [analysis.bound_spec(f).slice for f in analysis_sink_files],
         dummy_pipeline),
                             name='analysis_sink')
     analysis_sink.inputs.name = 'project_summary'
     analysis_sink.inputs.desc = (
         "Tests the sinking of project-wide filesets")
     # Create workflow connecting them together
     workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
     workflow.add_nodes((source, subject_sink, visit_sink, analysis_sink))
     workflow.connect(inputnode, 'subject_id', source, 'subject_id')
     workflow.connect(inputnode, 'visit_id', source, 'visit_id')
     workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
     workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
     workflow.connect(source, 'source1' + PATH_SUFFIX, subject_sink,
                      'subject_sink' + PATH_SUFFIX)
     workflow.connect(source, 'source2' + PATH_SUFFIX, visit_sink,
                      'visit_sink' + PATH_SUFFIX)
     workflow.connect(source, 'source3' + PATH_SUFFIX, analysis_sink,
                      'analysis_sink' + PATH_SUFFIX)
     workflow.run()
      analysis.clear_caches()  # Refresh the cached repository tree object
     with self._connect() as login:
         # Check subject summary directories were created properly in cache
         expected_subj_filesets = ['subject_sink']
         subject_dir = self.session_cache(
             visit=XnatRepo.SUMMARY_NAME,
             from_analysis=self.SUMMARY_STUDY_NAME)
         self.assertEqual(filter_scans(os.listdir(subject_dir)),
                          [(e + '-' + e) for e in expected_subj_filesets])
         # and on XNAT
         subject_fileset_names = filter_scans(
             login.projects[self.project].experiments[self.session_label(
                 visit=XnatRepo.SUMMARY_NAME,
                 from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
         self.assertEqual(expected_subj_filesets, subject_fileset_names)
         # Check visit summary directories were created properly in
         # cache
         expected_visit_filesets = ['visit_sink']
         visit_dir = self.session_cache(
             subject=XnatRepo.SUMMARY_NAME,
             from_analysis=self.SUMMARY_STUDY_NAME)
         self.assertEqual(filter_scans(os.listdir(visit_dir)),
                          [(e + '-' + e) for e in expected_visit_filesets])
         # and on XNAT
         visit_fileset_names = filter_scans(
             login.projects[self.project].experiments[self.session_label(
                 subject=XnatRepo.SUMMARY_NAME,
                 from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
         self.assertEqual(expected_visit_filesets, visit_fileset_names)
         # Check project summary directories were created properly in cache
         expected_proj_filesets = ['analysis_sink']
         project_dir = self.session_cache(
             subject=XnatRepo.SUMMARY_NAME,
             visit=XnatRepo.SUMMARY_NAME,
             from_analysis=self.SUMMARY_STUDY_NAME)
         self.assertEqual(filter_scans(os.listdir(project_dir)),
                          [(e + '-' + e) for e in expected_proj_filesets])
         # and on XNAT
         project_fileset_names = filter_scans(
             login.projects[self.project].experiments[self.session_label(
                 subject=XnatRepo.SUMMARY_NAME,
                 visit=XnatRepo.SUMMARY_NAME,
                 from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
         self.assertEqual(expected_proj_filesets, project_fileset_names)
     # Reload the data from the summary directories
     reloadinputnode = pe.Node(
         IdentityInterface(['subject_id', 'visit_id']), 'reload_inputnode')
     reloadinputnode.inputs.subject_id = self.SUBJECT
     reloadinputnode.inputs.visit_id = self.VISIT
     reloadsource_per_subject = pe.Node(RepositorySource(
         analysis.bound_spec(f).slice for f in subject_sink_files),
                                        name='reload_source_per_subject')
     reloadsource_per_visit = pe.Node(RepositorySource(
         analysis.bound_spec(f).slice for f in visit_sink_files),
                                      name='reload_source_per_visit')
     reloadsource_per_dataset = pe.Node(RepositorySource(
         analysis.bound_spec(f).slice for f in analysis_sink_files),
                                        name='reload_source_per_dataset')
     reloadsink = pe.Node(RepositorySink(
         (analysis.bound_spec(f).slice
          for f in ['resink1', 'resink2', 'resink3']), dummy_pipeline),
                          name='reload_sink')
     reloadsink.inputs.name = 'reload_summary'
     reloadsink.inputs.desc = (
         "Tests the reloading of subject and project summary filesets")
     reloadworkflow = pe.Workflow('reload_summary_unittest',
                                  base_dir=self.work_dir)
     for node in (reloadsource_per_subject, reloadsource_per_visit,
                  reloadsource_per_dataset, reloadsink):
         for iterator in ('subject_id', 'visit_id'):
             reloadworkflow.connect(reloadinputnode, iterator, node,
                                    iterator)
     reloadworkflow.connect(reloadsource_per_subject,
                            'subject_sink' + PATH_SUFFIX, reloadsink,
                            'resink1' + PATH_SUFFIX)
     reloadworkflow.connect(reloadsource_per_visit,
                            'visit_sink' + PATH_SUFFIX, reloadsink,
                            'resink2' + PATH_SUFFIX)
     reloadworkflow.connect(reloadsource_per_dataset,
                            'analysis_sink' + PATH_SUFFIX, reloadsink,
                            'resink3' + PATH_SUFFIX)
     reloadworkflow.run()
      # Check that the filesets were resunk into the analysis session in
      # the cache
     self.assertEqual(
         filter_scans(
             os.listdir(
                 self.session_cache(
                     from_analysis=self.SUMMARY_STUDY_NAME))),
         ['resink1-resink1', 'resink2-resink2', 'resink3-resink3'])
     # and on XNAT
     with self._connect() as login:
         resinked_fileset_names = filter_scans(
             login.projects[self.project].experiments[self.session_label(
                 from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
         self.assertEqual(sorted(resinked_fileset_names),
                          ['resink1', 'resink2', 'resink3'])
Example 8
 def test_checksums(self):
     """
      Tests the check of downloaded checksums to see whether a file needs
      to be re-downloaded
     """
     cache_dir = op.join(self.work_dir, 'cache-checksum-check')
     DATASET_NAME = 'source1'
     STUDY_NAME = 'checksum_check_analysis'
     fileset_fname = DATASET_NAME + text_format.extension
     source_target_path = op.join(self.session_cache(cache_dir),
                                  DATASET_NAME + '-' + DATASET_NAME)
     md5_path = source_target_path + XnatRepo.MD5_SUFFIX
     source_target_fpath = op.join(source_target_path, fileset_fname)
     shutil.rmtree(cache_dir, ignore_errors=True)
     os.makedirs(cache_dir)
     source_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
     source_dataset = source_repository.dataset(self.project)
     sink_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
     sink_dataset = sink_repository.dataset(self.checksum_sink_project,
                                            subject_ids=['SUBJECT'],
                                            visit_ids=['VISIT'],
                                            fill_tree=True)
     analysis = DummyAnalysis(STUDY_NAME,
                              dataset=sink_dataset,
                              processor=SingleProc('ad'),
                              inputs=[
                                  FilesetFilter(DATASET_NAME,
                                                DATASET_NAME,
                                                text_format,
                                                dataset=source_dataset)
                              ])
     source = pe.Node(RepositorySource(
         [analysis.bound_spec(DATASET_NAME).slice]),
                      name='checksum_check_source')
     source.inputs.subject_id = self.SUBJECT
     source.inputs.visit_id = self.VISIT
     source.run()
     self.assertTrue(op.exists(md5_path))
     self.assertTrue(op.exists(source_target_fpath))
     with open(md5_path) as f:
         checksums = json.load(f)
     # Stash the downloaded file in a new location and create a dummy
     # file instead
     stash_path = source_target_path + '.stash'
     shutil.move(source_target_path, stash_path)
     os.mkdir(source_target_path)
     with open(source_target_fpath, 'w') as f:
         f.write('dummy')
     # Run the download, which shouldn't download as the checksums are the
     # same
     source.run()
     with open(source_target_fpath) as f:
         d = f.read()
     self.assertEqual(d, 'dummy')
     # Replace the checksum with a dummy
     os.remove(md5_path)
     checksums['.'] = 'dummy_checksum'
     with open(md5_path, 'w', **JSON_ENCODING) as f:
         json.dump(checksums, f, indent=2)
     # Retry the download, which should now download since the checksums
     # differ
     source.run()
     with open(source_target_fpath) as f:
         d = f.read()
     with open(op.join(stash_path, fileset_fname)) as f:
         e = f.read()
     self.assertEqual(d, e)
     # Resink the source file and check that the generated MD5 checksum is
     # stored in identical format
     DATASET_NAME = 'sink1'
     dummy_pipeline = analysis.dummy_pipeline()
     dummy_pipeline.cap()
     sink = pe.Node(RepositorySink(
         [analysis.bound_spec(DATASET_NAME).slice], dummy_pipeline),
                    name='checksum_check_sink')
     sink.inputs.name = 'checksum_check_sink'
     sink.inputs.desc = "Tests the generation of MD5 checksums"
     sink.inputs.subject_id = self.SUBJECT
     sink.inputs.visit_id = self.VISIT
     sink.inputs.sink1_path = source_target_fpath
     sink_target_path = op.join(
         self.session_cache(cache_dir,
                            project=self.checksum_sink_project,
                            subject=(self.SUBJECT),
                            from_analysis=STUDY_NAME),
         DATASET_NAME + '-' + DATASET_NAME)
     sink_md5_path = sink_target_path + XnatRepo.MD5_SUFFIX
     sink.run()
     with open(md5_path) as f:
         source_checksums = json.load(f)
     with open(sink_md5_path) as f:
         sink_checksums = json.load(f)
     self.assertEqual(
         source_checksums, sink_checksums,
         ("Source checksum ({}) did not equal sink checksum ({})".format(
             source_checksums, sink_checksums)))
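As background to the comparison above, here is a minimal, self-contained sketch of the kind of checksum check this test exercises: hashing the files in a cached fileset directory and comparing them against a stored JSON mapping. The helper names and the exact key layout (e.g. whether a '.' entry is used, as in the test) are assumptions for illustration, not the library's actual implementation.

import hashlib
import json
import os
import os.path as op


def directory_checksums(path):
    """Return {relative file path: MD5 hex digest} for files under *path*."""
    checksums = {}
    for root, _, fnames in os.walk(path):
        for fname in sorted(fnames):
            fpath = op.join(root, fname)
            with open(fpath, 'rb') as f:
                checksums[op.relpath(fpath, path)] = hashlib.md5(
                    f.read()).hexdigest()
    return checksums


def needs_redownload(cached_dir, md5_path):
    """True if the cached files no longer match the stored checksums."""
    if not (op.exists(md5_path) and op.exists(cached_dir)):
        return True
    with open(md5_path) as f:
        stored = json.load(f)
    return directory_checksums(cached_dir) != stored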
Example 9
    def test_delayed_download(self):
        """
        Tests handling of race conditions where separate processes attempt to
        cache the same fileset
        """
        cache_dir = op.join(self.work_dir, 'cache-delayed-download')
        DATASET_NAME = 'source1'
        target_path = op.join(self.session_cache(cache_dir), DATASET_NAME,
                              DATASET_NAME + text_format.extension)
        tmp_dir = target_path + '.download'
        shutil.rmtree(cache_dir, ignore_errors=True)
        os.makedirs(cache_dir)
        repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
        dataset = repository.dataset(self.project)
        analysis = DummyAnalysis(
            self.STUDY_NAME,
            dataset,
            SingleProc('ad'),
            inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format)])
        source = pe.Node(RepositorySource(
            [analysis.bound_spec(DATASET_NAME).slice]),
                         name='delayed_source')
        source.inputs.subject_id = self.SUBJECT
        source.inputs.visit_id = self.VISIT
        result1 = source.run()
        source1_path = result1.outputs.source1_path
        self.assertTrue(op.exists(source1_path))
        self.assertEqual(
            source1_path, target_path,
            "Output file path '{}' not equal to target path '{}'".format(
                source1_path, target_path))
        # Clear cache to start again
        shutil.rmtree(cache_dir, ignore_errors=True)
        # Create tmp_dir before running the interface; this time the source
        # should wait for 1 second, check that the session hasn't been
        # created, and then clear the temp dir and redownload the fileset.
        os.makedirs(tmp_dir)
        source.inputs.race_cond_delay = 1
        result2 = source.run()
        source1_path = result2.outputs.source1_path
        # Clear cache to start again
        shutil.rmtree(cache_dir, ignore_errors=True)
        # This time simulate a concurrent download in a separate process that
        # keeps updating the temp dir, and check that the local download waits
        # for it to finish and then uses the simulated result.
        internal_dir = op.join(tmp_dir, 'internal')
        deleted_tmp_dir = tmp_dir + '.deleted'

        def simulate_download():
            "Simulates a download in a separate process"
            os.makedirs(internal_dir)
            time.sleep(5)
            # Modify a file in the temp dir to make the source download keep
            # waiting
            logger.info('Updating simulated download directory')
            with open(op.join(internal_dir, 'download'), 'a') as f:
                f.write('downloading')
            time.sleep(10)
            # Simulate the finalising of the download by writing the file
            # into place and moving the temp dir aside.
            logger.info('Finalising simulated download')
            with open(target_path, 'a') as f:
                f.write('simulated')
            shutil.move(tmp_dir, deleted_tmp_dir)

        source.inputs.race_cond_delay = 10
        p = Process(target=simulate_download)
        p.start()  # Start the simulated download in separate process
        time.sleep(1)
        source.run()  # Run the local download
        p.join()
        with open(op.join(deleted_tmp_dir, 'internal', 'download')) as f:
            d = f.read()
        self.assertEqual(d, 'downloading')
        with open(target_path) as f:
            d = f.read()
        self.assertEqual(d, 'simulated')
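The waiting behaviour that race_cond_delay appears to control can be pictured with the following simplified, hypothetical sketch (not the library's actual implementation): while another process's temp download directory keeps changing, keep waiting; if it goes quiet for longer than the delay, reclaim it and redownload.

import os.path as op
import os
import shutil
import time


def latest_mtime(path):
    """Most recent modification time of *path* and everything below it."""
    times = [op.getmtime(path)]
    for root, dirs, files in os.walk(path):
        times.extend(op.getmtime(op.join(root, name))
                     for name in dirs + files)
    return max(times)


def wait_for_concurrent_download(tmp_dir, delay):
    """Return True if another process finished the download, False if a
    stale temp dir was reclaimed and the caller should download itself."""
    while op.exists(tmp_dir):
        try:
            before = latest_mtime(tmp_dir)
            time.sleep(delay)
            after = latest_mtime(tmp_dir)
        except OSError:
            continue  # the directory vanished mid-scan; re-check existence
        if after == before:
            # No progress within the delay window: assume a dead download
            shutil.rmtree(tmp_dir, ignore_errors=True)
            return False
    return True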
Example 10
    def _connect_pipeline(self,
                          pipeline,
                          required_outputs,
                          workflow,
                          subject_inds,
                          visit_inds,
                          filter_array,
                          force=False):
        """
        Connects a pipeline to an overarching workflow that sets up iterators
        over subjects|visits present in the repository (if required) and
        repository source and sink nodes

        Parameters
        ----------
        pipeline : Pipeline
            The pipeline to connect
        required_outputs : set[str] | None
            The outputs required to be produced by this pipeline. If None all
            are deemed to be required
        workflow : nipype.pipeline.engine.Workflow
            The overarching workflow to connect the pipeline to
        subject_inds : dict[str, int]
            A mapping of subject ID to row index in the filter array
        visit_inds : dict[str, int]
            A mapping of visit ID to column index in the filter array
        filter_array : 2-D numpy.array[bool]
            A two-dimensional boolean array, where rows correspond to
            subjects and columns correspond to visits in the repository. True
            values represent a combination of subject & visit ID to include
            in the current round of processing. Note that if the 'force'
            flag is not set, sessions won't be reprocessed unless the
            saved provenance doesn't match that of the given pipeline.
        force : bool | 'all'
            A flag to force the processing of all sessions in the filter
            array, regardless of whether the parameters|pipeline used
            to generate existing data matches the given pipeline
        """
        if self.reprocess == 'force':
            force = True
        # Close off construction of the pipeline, creating its input and
        # output nodes and provenance dictionary
        pipeline.cap()
        # Prepend prerequisite pipelines to the workflow if they need to be
        # (re)processed
        final_nodes = []
        # The array that represents the subject/visit pairs for which any
        # prerequisite pipeline will be (re)processed, and which therefore
        # needs to be included in the processing of the current pipeline. Row
        # indices correspond to subjects and column indices visits
        prqs_to_process_array = np.zeros((len(subject_inds), len(visit_inds)),
                                         dtype=bool)
        # The array that represents the subject/visit pairs for which any
        # prerequisite pipeline will be skipped due to missing inputs. Row
        # indices correspond to subjects and column indices visits
        prqs_to_skip_array = np.zeros((len(subject_inds), len(visit_inds)),
                                      dtype=bool)
        for getter_name in pipeline.prerequisites:
            prereq = pipeline.study.pipeline(getter_name)
            if prereq.to_process_array.any():
                final_nodes.append(prereq.node('final'))
                prqs_to_process_array |= prereq.to_process_array
            prqs_to_skip_array |= prereq.to_skip_array
        # Get list of sessions that need to be processed (i.e. if
        # they don't contain the outputs of this pipeline)
        to_process_array, to_protect_array, to_skip_array = self._to_process(
            pipeline, required_outputs, prqs_to_process_array,
            prqs_to_skip_array, filter_array, subject_inds, visit_inds, force)
        # Store the arrays signifying which nodes to process, protect or skip
        # so they can be passed to downstream pipelines
        pipeline.to_process_array = to_process_array
        pipeline.to_protect_array = to_protect_array
        pipeline.to_skip_array = to_skip_array
        # Check to see if there are any sessions to process
        if not to_process_array.any():
            raise ArcanaNoRunRequiredException(
                "No sessions to process for '{}' pipeline".format(
                    pipeline.name))
        # Set up workflow to run the pipeline, loading and saving from the
        # repository
        workflow.add_nodes([pipeline._workflow])
        # If prerequisite pipelines need to be processed, connect their
        # "final" nodes to the initial node of this pipeline to ensure that
        # they are all processed before this pipeline is run.
        if final_nodes:
            prereqs = pipeline.add('prereqs', Merge(len(final_nodes)))
            for i, final_node in enumerate(final_nodes, start=1):
                workflow.connect(final_node, 'out', prereqs, 'in{}'.format(i))
        else:
            prereqs = None
        # Construct iterator structure over subjects and sessions to be
        # processed
        iter_nodes = self._iterate(pipeline, to_process_array, subject_inds,
                                   visit_inds)
        sources = {}
        # Loop through each frequency present in the pipeline inputs and
        # create a corresponding source node
        for freq in pipeline.input_frequencies:
            try:
                inputs = list(pipeline.frequency_inputs(freq))
            except ArcanaMissingDataException as e:
                raise ArcanaMissingDataException(
                    str(e) + ", which is required for pipeline '{}'".format(
                        pipeline.name))
            inputnode = pipeline.inputnode(freq)
            sources[freq] = source = pipeline.add(
                '{}_source'.format(freq),
                RepositorySource(i.collection for i in inputs),
                inputs=({
                    'prereqs': (prereqs, 'out')
                } if prereqs is not None else {}))
            # Connect iter_nodes to source and input nodes
            for iterator in pipeline.iterators(freq):
                pipeline.connect(iter_nodes[iterator], iterator, source,
                                 iterator)
                pipeline.connect(source, iterator, inputnode, iterator)
            for input in inputs:
                pipeline.connect(source, input.suffixed_name, inputnode,
                                 input.name)
        deiter_nodes = {}

        def deiter_node_sort_key(it):
            """
            If there are two iter_nodes (i.e. both subject and visit ID) and
            one depends on the other (i.e. if the visit IDs per subject
            vary and vice-versa) we need to ensure that the dependent
            iterator is deiterated (joined) first.
            """
            return iter_nodes[it].itersource is None

        # Connect all outputs to the repository sink, creating a new sink for
        # each frequency level (i.e. 'per_session', 'per_subject', 'per_visit',
        # or 'per_study')
        for freq in pipeline.output_frequencies:
            outputs = list(pipeline.frequency_outputs(freq))
            if pipeline.iterators(freq) - pipeline.iterators():
                raise ArcanaDesignError(
                    "Doesn't make sense to output '{}', which are of '{}' "
                    "frequency, when the pipeline only iterates over '{}'".
                    format("', '".join(o.name for o in outputs), freq,
                           "', '".join(pipeline.iterators())))
            outputnode = pipeline.outputnode(freq)
            # Connect filesets/fields to the sink node, skipping outputs that
            # are study inputs
            to_connect = {
                o.suffixed_name: (outputnode, o.name)
                for o in outputs if o.is_spec
            }
            # Connect iterators to sink node
            to_connect.update(
                {i: (iter_nodes[i], i)
                 for i in pipeline.iterators()})
            # Connect checksums/values from sources to sink node in order to
            # save in provenance, joining where necessary
            for input_freq in pipeline.input_frequencies:
                checksums_to_connect = [
                    i.checksum_suffixed_name
                    for i in pipeline.frequency_inputs(input_freq)
                ]
                if not checksums_to_connect:
                    # Rare case of a pipeline with no inputs, only iter_nodes,
                    # which in all likelihood will only occur in unittests
                    continue
                # Loop over iterators that need to be joined, i.e. that are
                # present in the input frequency but not the output frequency
                # and create join nodes
                source = sources[input_freq]
                for iterator in (pipeline.iterators(input_freq) -
                                 pipeline.iterators(freq)):
                    join = pipeline.add(
                        '{}_to_{}_{}_checksum_join'.format(
                            input_freq, freq, iterator),
                        IdentityInterface(checksums_to_connect),
                        inputs={
                            tc: (source, tc)
                            for tc in checksums_to_connect
                        },
                        joinsource=iterator,
                        joinfield=checksums_to_connect)
                    source = join
                to_connect.update(
                    {tc: (source, tc)
                     for tc in checksums_to_connect})
            # Add sink node
            sink = pipeline.add('{}_sink'.format(freq),
                                RepositorySink((o.collection for o in outputs),
                                               pipeline, required_outputs),
                                inputs=to_connect)
            # "De-iterate" (join) over iterators to get back to single child
            # node by the time we connect to the final node of the pipeline Set
            # the sink and subject_id as the default deiterator if there are no
            # deiterates (i.e. per_study) or to use as the upstream node to
            # connect the first deiterator for every frequency
            deiter_nodes[freq] = sink  # for per_study the "deiterator" == sink
            for iterator in sorted(pipeline.iterators(freq),
                                   key=deiter_node_sort_key):
                # Connect to previous deiterator or sink
                # NB: we only need to keep a reference to the last one in the
                # chain in order to connect with the "final" node, so we can
                # overwrite the entry in the 'deiter_nodes' dict
                deiter_nodes[freq] = pipeline.add(
                    '{}_{}_deiter'.format(freq, iterator),
                    IdentityInterface(['checksums']),
                    inputs={'checksums': (deiter_nodes[freq], 'checksums')},
                    joinsource=iterator,
                    joinfield='checksums')
        # Create a final node, which is used to connect with downstream
        # pipelines
        pipeline.add('final',
                     Merge(len(deiter_nodes)),
                     inputs={
                         'in{}'.format(i): (di, 'checksums')
                         for i, di in enumerate(deiter_nodes.values(), start=1)
                     })
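To make the index mappings in the docstring above concrete, here is a small, self-contained illustration of subject_inds, visit_inds and filter_array; the subject and visit IDs are made up for the example.

import numpy as np

# Rows of the filter array correspond to subjects, columns to visits
subject_inds = {'SUBJ01': 0, 'SUBJ02': 1, 'SUBJ03': 2}
visit_inds = {'VISIT1': 0, 'VISIT2': 1}

filter_array = np.zeros((len(subject_inds), len(visit_inds)), dtype=bool)
# Include every visit of SUBJ01 plus VISIT2 of SUBJ03 in this round
filter_array[subject_inds['SUBJ01'], :] = True
filter_array[subject_inds['SUBJ03'], visit_inds['VISIT2']] = True

# The subject/visit pairs selected for processing
to_process = [(subj, visit)
              for subj, i in subject_inds.items()
              for visit, j in visit_inds.items()
              if filter_array[i, j]]
print(to_process)  # [('SUBJ01', 'VISIT1'), ('SUBJ01', 'VISIT2'), ('SUBJ03', 'VISIT2')]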