def _run():
    """
    Uses a todo file to identify the work to be done.

    :return 0 if successful, -1 if there's any sort of failure. Return status
        is used by airflow for task instance management and reporting.
    """
    config, clients, name_builder, metadata_reader = _common()
    # Leave the source as None for the local-files-without-cleanup case, so
    # run_by_todo falls back to its own default DataSource construction.
    files_source = None
    if not config.use_local_files:
        files_source = dsc.TodoFileDataSource(config)
    elif config.cleanup_files_when_storing:
        files_source = data_source.DAOLocalFilesDataSource(
            config, clients.data_client, metadata_reader
        )
    return rc.run_by_todo(
        name_builder=name_builder,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        clients=clients,
        config=config,
        source=files_source,
        metadata_reader=metadata_reader,
    )
def test_todo_file():
    """TodoFileDataSource returns one entry per non-blank todo line."""
    todo_fqn = os.path.join(tc.TEST_DATA_DIR, 'todo.txt')
    with open(todo_fqn, 'w') as f:
        # two real entries plus a blank line that must be skipped
        f.writelines(['file1\n', 'file2\n', '\n'])
    try:
        test_config = mc.Config()
        test_config.work_fqn = todo_fqn
        test_subject = dsc.TodoFileDataSource(test_config)
        test_result = test_subject.get_work()
        assert test_result is not None, 'expect result'
        assert len(test_result) == 2, 'wrong number of files'
    finally:
        # clean up the fixture even if the assertions fail
        if os.path.exists(todo_fqn):
            os.unlink(todo_fqn)
def _run():
    """
    Uses a todo file with file names, even though Gemini provides
    information about existing data referenced by observation ID.
    """
    (
        clients,
        config,
        metadata_reader,
        meta_visitors,
        name_builder,
    ) = _common_init()
    # SCRAPE tasks and local files are both handled by listing directories;
    # everything else is driven by the todo file.
    use_listing = (
        config.use_local_files or mc.TaskType.SCRAPE in config.task_types
    )
    if use_listing:
        source = dsc.ListDirSeparateDataSource(config)
    else:
        source = dsc.TodoFileDataSource(config)
    return rc.run_by_todo(
        config=config,
        name_builder=name_builder,
        meta_visitors=meta_visitors,
        source=source,
        metadata_reader=metadata_reader,
        clients=clients,
    )
def run_by_todo(
    config=None,
    name_builder=None,
    chooser=None,
    command_name=None,
    source=None,
    meta_visitors=None,
    data_visitors=None,
    modify_transfer=None,
    store_transfer=None,
    clients=None,
):
    """A default implementation for using the TodoRunner.

    :param config Config instance
    :param name_builder NameBuilder extension that creates an instance of
        a StorageName extension, from an entry from a DataSourceComposable
        listing
    :param command_name string that represents the specific pipeline
        application name
    :param source DataSource implementation, if there's a special data source
    :param meta_visitors list of modules with visit methods, that expect
        the metadata of a work file to exist on disk
    :param data_visitors list of modules with visit methods, that expect the
        work file to exist on disk
    :param chooser OrganizerChooser, if there's strange rules about file
        naming.
    :param modify_transfer Transfer extension that identifies how to retrieve
        data from a source for modification of CAOM2 metadata. By this time,
        files are usually stored at CADC, so it's probably a CadcTransfer
        instance, but this allows for the case that a file is never stored
        at CADC. Try to guess what this one is.
    :param store_transfer Transfer extension that identifies how to retrieve
        data from a source for storage at CADC, probably an HTTP or FTP site.
        Don't try to guess what this one is.
    :param clients: ClientCollection instance
    """
    # None sentinels instead of mutable [] defaults, so the lists are not
    # shared between calls (the original signature used meta_visitors=[],
    # data_visitors=[], which a callee could mutate for every later call).
    if meta_visitors is None:
        meta_visitors = []
    if data_visitors is None:
        data_visitors = []
    if config is None:
        config = mc.Config()
        config.get_executors()
    _set_logging(config)
    if clients is None:
        clients = cc.ClientCollection(config)
    if name_builder is None:
        name_builder = name_builder_composable.StorageNameInstanceBuilder(
            config.collection)
    if source is None:
        # default data source: directory listing for local files, otherwise
        # the todo file named in the config
        if config.use_local_files:
            source = data_source_composable.ListDirSeparateDataSource(
                config, recursive=config.recurse_data_sources)
        else:
            source = data_source_composable.TodoFileDataSource(config)
    modify_transfer = _set_modify_transfer(
        modify_transfer, config, clients.data_client)
    organizer = ec.OrganizeExecutes(
        config,
        command_name,
        meta_visitors,
        data_visitors,
        chooser,
        store_transfer,
        modify_transfer,
        cadc_client=clients.data_client,
        caom_client=clients.metadata_client,
    )
    runner = TodoRunner(config, organizer, name_builder, source)
    result = runner.run()
    # accumulate the retry status into the overall exit status
    result |= runner.run_retry()
    runner.report()
    return result
def _reset_for_retry(self, count):
    """Re-point internal state at the retry artifacts for attempt *count*:
    update the config, move the log location, and rebuild the data source
    from the (now updated) todo file location."""
    cfg = self._config
    cfg.update_for_retry(count)
    # the log location changes for each retry
    self._organizer.set_log_location()
    self._data_source = data_source_composable.TodoFileDataSource(cfg)