def _run_state():
    """Run incremental NEOSSat ingestion, time-boxed by the state file.

    The bookmark stored under NEOS_BOOKMARK in the state file is converted
    to a POSIX timestamp and handed to scrape.build_todo, which returns the
    work list together with the largest timestamp it found; that timestamp
    becomes the end time of the run. Ingestion is based on fully-qualified
    file names from the CSA ftp host, because those are difficult to
    reproduce otherwise.

    :return the value returned by rc.run_by_state.
    """
    name_builder = nbc.FileNameBuilder(NEOSSatName)
    config = mc.Config()
    config.get_executors()

    bookmark = mc.State(config.state_fqn).get_bookmark(NEOS_BOOKMARK)
    # NOTE(review): presumably a zero increment is only used to coerce the
    # bookmark into a datetime - confirm against mc.increment_time.
    start_timestamp = mc.increment_time(bookmark, 0).timestamp()
    todo_list, max_timestamp = scrape.build_todo(
        start_timestamp, config.working_directory, config.state_fqn
    )

    return rc.run_by_state(
        config=config,
        name_builder=name_builder,
        command_name=APPLICATION,
        bookmark_name=NEOS_BOOKMARK,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        end_time=datetime.fromtimestamp(max_timestamp),
        chooser=None,
        source=data_source.IncrementalSource(todo_list),
        store_transfer=tc.FtpTransfer(config.data_source),
    )
def _run_by_public():
    """Process observations that have become public.

    Covers observations that are public, but for which there are no
    artifacts representing the previews in CAOM, or no FITS file in ad.
    Called as gem_run_public. The time-boxing is based on timestamps from a
    state.yml file. Call once/day, since data release timestamps have times
    of 00:00:00.000.

    :return 0 if successful, -1 if there's any sort of failure. Return
        status is used by airflow for task instance management and
        reporting.
    """
    config = mc.Config()
    config.get_executors()
    external_metadata.init_global(config=config)

    builder = nbc.FileNameBuilder(gem_name.GemName)
    public_source = data_source.PublicIncremental(config)
    visitors = _define_meta_visitors(config)

    return rc.run_by_state(
        config=config,
        name_builder=builder,
        command_name=main_app.APPLICATION,
        bookmark_name=data_source.GEM_BOOKMARK,
        meta_visitors=visitors,
        data_visitors=DATA_VISITORS,
        end_time=None,
        source=public_source,
        chooser=None,
    )
def _run_state():
    """Process the entries selected by the timestamp in a state file.

    :return the value returned by rc.run_by_state.
    """
    # The configuration is loaded here but is not handed to run_by_state;
    # the two calls are kept because their side effects are not visible
    # from this function.
    pipeline_config = mc.Config()
    pipeline_config.get_executors()

    builder = nbc.FileNameBuilder(VliteName)
    return rc.run_by_state(
        name_builder=builder,
        command_name=APPLICATION,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
    )
def _run():
    """Process the work listed in a todo file.

    :return 0 if successful, -1 if there's any sort of failure. Return
        status is used by airflow for task instance management and
        reporting.
    """
    return rc.run_by_todo(
        name_builder=nbc.FileNameBuilder(get_storage_name),
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
    )
def _run_state():
    """Process the entries selected by the timestamp in a state file.

    Work is discovered through a dsc.QueryTimeBoxDataSource built from the
    configuration, with 'png' as the preview suffix.

    :return the value returned by rc.run_by_state.
    """
    config = mc.Config()
    config.get_executors()

    time_box_source = dsc.QueryTimeBoxDataSource(config, preview_suffix='png')
    builder = nbc.FileNameBuilder(dao_name.DAOName)

    return rc.run_by_state(
        name_builder=builder,
        command_name=APPLICATION,
        bookmark_name=DAO_BOOKMARK,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        source=time_box_source,
    )
def _run():
    """Process the work listed in a todo file.

    The configuration and chooser are passed explicitly as None.

    :return 0 if successful, -1 if there's any sort of failure. Return
        status is used by airflow for task instance management and
        reporting.
    """
    return rc.run_by_todo(
        config=None,
        name_builder=nbc.FileNameBuilder(PHANGSName),
        command_name=APPLICATION,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        chooser=None,
    )
def _run_state():
    """Process the entries selected by the timestamp in a state file.

    Configuration, bookmark name, end time, source and chooser are all
    passed explicitly as None.

    :return the value returned by rc.run_by_state.
    """
    return rc.run_by_state(
        config=None,
        name_builder=nbc.FileNameBuilder(PHANGSName),
        command_name=APPLICATION,
        bookmark_name=None,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        end_time=None,
        source=None,
        chooser=None,
    )
def _run():
    """Process the work listed in a todo file, storing files via FTP.

    A tc.FtpTransfer built from the configured data source is supplied as
    the store transfer.

    :return 0 if successful, -1 if there's any sort of failure. Return
        status is used by airflow for task instance management and
        reporting.
    """
    name_builder = nbc.FileNameBuilder(NEOSSatName)
    config = mc.Config()
    config.get_executors()

    return rc.run_by_todo(
        name_builder=name_builder,
        config=config,
        command_name=APPLICATION,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        store_transfer=tc.FtpTransfer(config.data_source),
    )
def _run_remote():
    """Process work discovered through a vault directory listing.

    A single Client, created with the configured proxy certificate, is
    shared by the store transfer (tc.VoFitsTransfer) and by the data
    source (dsc.VaultListDirDataSource).

    :return 0 if successful, -1 if there's any sort of failure. Return
        status is used by airflow for task instance management and
        reporting.
    """
    config = mc.Config()
    config.get_executors()

    builder = nbc.FileNameBuilder(GemProcName)
    vault_client = Client(vospace_certfile=config.proxy_fqn)
    fits_transfer = tc.VoFitsTransfer(vault_client)
    vault_source = dsc.VaultListDirDataSource(vault_client, config)

    return rc.run_by_todo(
        config=config,
        name_builder=builder,
        command_name=APPLICATION,
        source=vault_source,
        meta_visitors=META_VISITORS,
        data_visitors=DATA_VISITORS,
        store_transfer=fits_transfer,
    )
def _run_by_incremental():
    """Run incremental processing for observations that are posted on the
    site archive.gemini.edu.

    The end of the run is the 'end_timestamp' value stored under the
    GEM_BOOKMARK bookmark of the state file, or the current time when that
    value is absent.

    TODO in the future this will depend on the incremental query endpoint.

    :return 0 if successful, -1 if there's any sort of failure. Return
        status is used by airflow for task instance management and
        reporting.
    """
    config = mc.Config()
    config.get_executors()

    state = mc.State(config.state_fqn)
    gem_bookmark = state.bookmarks.get(data_source.GEM_BOOKMARK)
    end_timestamp_s = gem_bookmark.get('end_timestamp', datetime.now())
    end_timestamp_dt = mc.make_time_tz(end_timestamp_s)
    logging.info(f'{main_app.APPLICATION} will end at {end_timestamp_s}')

    external_metadata.init_global(config=config)
    builder = nbc.FileNameBuilder(gem_name.GemName)
    incremental_source = data_source.IncrementalSource()
    visitors = _define_meta_visitors(config)

    result = rc.run_by_state(
        config=config,
        name_builder=builder,
        command_name=main_app.APPLICATION,
        bookmark_name=data_source.GEM_BOOKMARK,
        meta_visitors=visitors,
        data_visitors=DATA_VISITORS,
        end_time=end_timestamp_dt,
        source=incremental_source,
        chooser=None,
    )

    if not incremental_source.max_records_encountered:
        return result

    # Hitting the record limit is reported as a failure even when the run
    # itself succeeded: OR-ing with -1 turns the result into -1.
    banner = '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
    logging.warning(banner)
    logging.warning('Encountered maximum records!!')
    logging.warning(banner)
    return result | -1
def test_run_state_v(client_mock, repo_mock):
    """Exercise rc.run_by_state end-to-end with mocked clients.

    A state file bookmarked at ``test_start_time`` is written, the run is
    executed up to ``test_end_time``, and the test then checks the copy
    call made on the mocked client, the updated bookmark, the presence of
    the log, success, progress and xml files, and the counts in the report
    file. Everything under the working directory is cleaned up afterwards.

    :param client_mock: mock whose ``return_value.copy`` is expected to be
        called for the transfer.
    :param repo_mock: mock whose ``return_value.read`` is served by
        tc.mock_read.
    """
    repo_mock.return_value.read.side_effect = tc.mock_read
    client_mock.get_node.side_effect = tc.mock_get_node
    # the test file is length 0
    client_mock.return_value.copy.return_value = 0

    test_wd = '/usr/src/app/caom2pipe/int_test'
    caom2pipe_bookmark = 'caom2_timestamp'
    test_config = mc.Config()
    test_config.working_directory = test_wd
    test_config.collection = 'TEST'
    test_config.interval = 10
    test_config.log_file_directory = f'{test_wd}/logs'
    test_config.failure_fqn = \
        f'{test_config.log_file_directory}/failure_log.txt'
    test_config.log_to_file = True
    test_config.logging_level = 'DEBUG'
    test_config.progress_file_name = 'progress.txt'
    test_config.proxy_file_name = f'{test_wd}/cadcproxy.pem'
    test_config.rejected_file_name = 'rejected.yml'
    test_config.rejected_directory = f'{test_wd}/rejected'
    test_config._report_fqn = f'{test_config.log_file_directory}/app_report.txt'
    test_config.resource_id = 'ivo://cadc.nrc.ca/sc2repo'
    test_config.retry_file_name = 'retries.txt'
    test_config.retry_fqn = \
        f'{test_config.log_file_directory}/{test_config.retry_file_name}'
    test_config.state_file_name = 'state.yml'
    test_config.success_fqn = \
        f'{test_config.log_file_directory}/success_log.txt'
    test_config.tap_id = 'ivo://cadc.nrc.ca/sc2tap'
    test_config.task_types = [
        mc.TaskType.STORE, mc.TaskType.INGEST, mc.TaskType.MODIFY
    ]
    test_config.features.use_file_names = True
    test_config.features.use_urls = False
    test_config.features.supports_latest_client = True
    test_config.use_local_files = False

    if not os.path.exists(test_wd):
        os.mkdir(test_wd)

    # if this test is failing, did the docker container get
    # restarted recently?
    # first create /caom2pipe_test/1000003f.fits.fz,
    # then check that the test_start_time and test_end_time values
    # correspond somewhat to the timestamp on that file
    #
    # this timestamp is 15 minutes earlier than the timestamp of the
    # file in /caom2pipe_test
    #
    test_start_time = '2021-05-08 02:25:09'
    with open(test_config.state_fqn, 'w') as f:
        # YAML nesting is indentation-based: the bookmark name must sit
        # below 'bookmarks', and 'last_record' below the bookmark name,
        # otherwise 'last_record' becomes a sibling of the bookmark
        # instead of one of its fields.
        f.write('bookmarks:\n')
        f.write(f'  {caom2pipe_bookmark}:\n')
        f.write(f'    last_record: {test_start_time}\n')
    test_end_time = datetime(2021, 5, 8, 2, 41, 27, 965132,
                             tzinfo=timezone.utc)
    with open(test_config.proxy_fqn, 'w') as f:
        f.write('test content\n')

    test_data_source = TestListDirTimeBoxDataSource()
    test_builder = nbc.FileNameBuilder(tc.TestStorageName)
    transferrer = TestTransfer()

    try:
        test_result = rc.run_by_state(
            bookmark_name=caom2pipe_bookmark,
            command_name='collection2caom2',
            config=test_config,
            end_time=test_end_time,
            name_builder=test_builder,
            source=test_data_source,
            modify_transfer=None,
            store_transfer=transferrer,
        )
        assert test_result is not None, 'expect a result'
        assert test_result == 0, 'expect success'
        assert client_mock.return_value.copy.called, 'expect put call'
        args, kwargs = client_mock.return_value.copy.call_args
        assert args[0] == 'ad:TEST/test_obs_id.fits.gz', 'wrong args[0]'
        assert (args[1] == '/usr/src/app/caom2pipe/int_test/test_obs_id/'
                           'test_obs_id.fits'), 'wrong args[1]'

        # state file checking
        test_state = mc.State(test_config.state_fqn)
        assert test_state is not None, 'expect state content'
        test_checkpoint = test_state.get_bookmark(caom2pipe_bookmark)
        assert test_checkpoint == test_end_time, 'wrong bookmark'

        # success file testing
        assert os.path.exists(test_config.log_file_directory), 'log directory'
        assert os.path.exists(test_config.success_fqn), 'success fqn'
        assert os.path.exists(test_config.progress_fqn), 'progress fqn'
        log_file = f'{test_config.log_file_directory}/test_obs_id.log'
        actual = glob.glob(f'{test_config.log_file_directory}/**')
        assert os.path.exists(log_file), f'specific log file {actual}'
        xml_file = f'{test_config.log_file_directory}/test_obs_id.fits.xml'
        assert os.path.exists(xml_file), f'xml file {actual}'

        # reporting testing
        report_file = f'{test_config.log_file_directory}/app_report.txt'
        assert os.path.exists(report_file), f'report file {actual}'
        # (label found before the colon, expected count, failure message);
        # the order matters because the first matching label wins
        expected_counts = (
            ('Inputs', '1', 'wrong inputs'),
            ('Successes', '1', 'wrong successes'),
            ('Timeouts', '0', 'wrong timeouts'),
            ('Retries', '0', 'wrong retries'),
            ('Errors', '0', 'wrong errors'),
            ('Rejections', '0', 'wrong rejections'),
        )
        pass_through_test = False
        with open(report_file, 'r') as f:
            for line in f:
                pass_through_test = True
                if 'Number' in line:
                    bits = line.split(':')
                    found = False
                    for label, expected, message in expected_counts:
                        if label in bits[0]:
                            assert bits[1].strip() == expected, message
                            found = True
                            break
                    # every 'Number' line must be one of the known counters
                    assert found, f'{line}'
        assert pass_through_test, 'found a report file and checked it'
    finally:
        # NOTE(review): the recursive glob appears to list a directory
        # before its contents, so os.rmdir on a still-populated directory
        # raises OSError, which is only logged here - confirm whether the
        # directories are meant to survive the cleanup.
        f_list = glob.glob(f'{test_wd}/**', recursive=True)
        for entry in f_list:
            try:
                if os.path.isdir(entry):
                    os.rmdir(entry)
                else:
                    os.unlink(entry)
            except OSError as e:
                logging.error(f'failed to delete {e}')