def _run_by_state():
    """Harvest VLASS quicklook files incrementally, starting from the
    timestamp bookmarked in the state file.

    Work is identified by URL, because a URL that contains the phrase
    'QA_REJECTED' is the only way to tell whether the attribute
    'requirements' should be set to 'fail'.

    :return: the value returned by rc.run_by_state for this run.
    """
    cfg = mc.Config()
    cfg.get_executors()
    bookmark = mc.State(cfg.state_fqn).get_bookmark(VLASS_BOOKMARK)
    # the bookmark may be a string or a datetime, depending on the execution
    # environment - a zero increment is a way to get a datetime either way
    begin = mc.increment_time(bookmark, 0)
    urls, newest = scrape.build_file_url_list(begin)
    if len(urls) > 0:
        # the web log is only needed when there is work to do
        work.init_web_log(mc.State(cfg.state_fqn), cfg)

    # everything below is executed even for an empty list, so that reporting
    # stays consistent
    page_source = data_source.NraoPage(urls)
    instance_builder = builder.VlassInstanceBuilder(cfg)
    run_kwargs = {
        'config': cfg,
        'command_name': sn.APPLICATION,
        'bookmark_name': VLASS_BOOKMARK,
        'meta_visitors': META_VISITORS,
        'data_visitors': DATA_VISITORS,
        'name_builder': instance_builder,
        'source': page_source,
        'end_time': newest,
        'store_transfer': tc.HttpTransfer(),
    }
    return rc.run_by_state(**run_kwargs)
def _setup(test_input, local=False):
    """Populate TEST_EXEC_DIR with the files a test run expects to find.

    :param test_input: object providing the config_file, state_file,
        bookmark, cache_file and test_file attributes read below.
    :param local: when True, and a test file is configured, that file is
        also copied into TEST_DATA_DIR.
    :return: tuple of (start time saved to the bookmark, path of the copied
        state file). Both are None when no state file is configured.
    """
    start_ts = None
    state_dest = None

    if test_input.config_file is not None:
        shutil.copy(test_input.config_file, TEST_EXEC_DIR / 'config.yml')

    if test_input.state_file is not None:
        state_dest = TEST_EXEC_DIR / 'state.yml'
        shutil.copy(test_input.state_file, state_dest)
        # move the bookmark to five minutes ago, so the run won't take
        # decades to execute
        start_ts = datetime.now(tz=dateutil.tz.UTC) - timedelta(minutes=5)
        mc.State(state_dest.as_posix()).save_state(
            test_input.bookmark, start_ts
        )

    if test_input.cache_file is not None:
        shutil.copy(test_input.cache_file, TEST_EXEC_DIR / 'cache.yml')

    with open(TEST_EXEC_DIR / 'cadcproxy.pem', 'w') as proxy:
        proxy.write('test content')

    if local and test_input.test_file is not None:
        local_copy = TEST_DATA_DIR / test_input.test_file.name
        shutil.copy(test_input.test_file, local_copy)

    return start_ts, state_dest
def test_run_single(do_mock, test_config):
    """rc.run_single hands one StorageName to the mocked executor and
    passes the executor's return value back to the caller."""
    _clean_up_log_files(test_config)
    test_config.features.expects_retry = False
    test_config.progress_fqn = os.path.join(tc.TEST_DATA_DIR, 'progress.txt')
    test_config.state_fqn = STATE_FILE
    test_config.interval = 5
    mc.State(test_config.state_fqn).save_state(
        'gemini_timestamp', datetime.utcnow()
    )
    do_mock.return_value = -1

    expected_url = 'http://localhost/test_url.fits'
    outcome = rc.run_single(
        test_config,
        mc.StorageName(url=expected_url),
        'test_command',
        meta_visitors=None,
        data_visitors=None,
    )

    assert outcome is not None, 'expect a result'
    assert outcome == -1, 'wrong result'
    assert do_mock.called, 'do mock not called'
    assert do_mock.call_count == 1, do_mock.call_count
    # the first positional argument of the mocked call is the storage name
    positional, _ = do_mock.call_args
    received = positional[0]
    assert isinstance(received, mc.StorageName), type(received)
    assert received.obs_id is None, 'wrong obs id'
    assert received.url == expected_url, received.url
def _run_state():
    """Harvest VLASS quicklook files incrementally, starting from the
    timestamp bookmarked in the state file.

    Work is identified by URL, because a URL that contains the phrase
    'QA_REJECTED' is the only way to tell whether the attribute
    'requirements' should be set to 'fail'.

    :return: the value returned by rc.run_by_state for this run.
    """
    cfg = mc.Config()
    cfg.get_executors()
    bookmark = mc.State(cfg.state_fqn).get_bookmark(VLASS_BOOKMARK)
    # the bookmark may be a string or a datetime, depending on the execution
    # environment - a zero increment is a way to get a datetime either way
    begin = mc.increment_time(bookmark, 0)
    urls, newest = scrape.build_file_url_list(begin)
    page_source = data_source.NraoPage(urls)
    entry_builder = nbc.EntryBuilder(storage_name.VlassName)
    storage_name.set_use_storage_inventory(
        cfg.features.supports_latest_client
    )
    run_kwargs = {
        'config': cfg,
        'bookmark_name': VLASS_BOOKMARK,
        'meta_visitors': META_VISITORS,
        'data_visitors': DATA_VISITORS,
        'name_builder': entry_builder,
        'source': page_source,
        'end_time': newest,
        'store_transfer': tc.HttpTransfer(),
    }
    return rc.run_by_state(**run_kwargs)
def build_todo(start_date, sidecar_dir, state_fqn):
    """
    Build a list of file names where the modification time for the file
    is >= start_date.

    :param start_date timestamp in seconds since the epoch
    :param sidecar_dir where to cache ftp directory listing progress
    :param state_fqn where to find the configurable list of sub-directories,
        for bookmarked queries
    :return a dict, where keys are file names on the ftp host server, and
        values are timestamps, plus the max timestamp from the ftp host
        server for file addition
    """
    logging.debug(f'Begin build_todo with date {start_date}')
    listing = {}
    # the timestamps do not bubble up to the root directory for
    # modifications, only for additions, so every configured sub-directory
    # is queried individually
    for sub_dir in mc.State(state_fqn).get_context(NEOS_CONTEXT):
        found = _append_todo(
            start_date,
            sidecar_dir,
            ASC_FTP_SITE,
            os.path.join(NEOS_DIR, str(sub_dir)),
            {},
            {},
        )
        listing.update(found)
    files, latest = _remove_dir_names(listing, start_date)
    logging.info(
        f'End build_todo with {len(files)} records, date {latest}.'
    )
    return files, latest
def _run_state():
    """Retrieve files from the CSA ftp host incrementally, starting from the
    timestamp bookmarked in the state file.

    Ingestion is based on fully-qualified file names from the CSA ftp host,
    because those are difficult to reproduce otherwise.

    :return: the value returned by rc.run_by_state for this run.
    """
    name_builder = nbc.FileNameBuilder(NEOSSatName)
    cfg = mc.Config()
    cfg.get_executors()
    bookmark = mc.State(cfg.state_fqn).get_bookmark(NEOS_BOOKMARK)
    # scrape.build_todo is given seconds since the epoch
    begin_ts = mc.increment_time(bookmark, 0).timestamp()
    files, latest_ts = scrape.build_todo(
        begin_ts, cfg.working_directory, cfg.state_fqn
    )
    latest = datetime.fromtimestamp(latest_ts)
    todo_source = data_source.IncrementalSource(files)
    ftp_transfer = tc.FtpTransfer(cfg.data_source)
    run_kwargs = {
        'config': cfg,
        'name_builder': name_builder,
        'command_name': APPLICATION,
        'bookmark_name': NEOS_BOOKMARK,
        'meta_visitors': META_VISITORS,
        'data_visitors': DATA_VISITORS,
        'end_time': latest,
        'chooser': None,
        'source': todo_source,
        'store_transfer': ftp_transfer,
    }
    return rc.run_by_state(**run_kwargs)
def test_aug_visit_works(query_endpoint_mock, get_mock):
    """time_bounds_augmentation.visit adds time bounds and exposure to the
    chunk of the quicklook artifact read from the test observation."""
    get_mock.return_value.__enter__.return_value.raw = test_scrape.WL_INDEX
    query_endpoint_mock.side_effect = test_scrape._query_endpoint
    cfg = mc.Config()
    cfg.get_executors()
    work.init_web_log(mc.State(cfg.state_fqn), cfg)

    quicklook = (
        'VLASS1.2.ql.T07t13.J081828-133000.10.2048.v1.I.iter1.'
        'image.pbcor.tt0.subim.fits'
    )
    name = sn.VlassName(file_name=quicklook, entry=quicklook)
    observation = mc.read_obs_from_file(
        os.path.join(TEST_DATA_DIR, f'{name.obs_id}.xml')
    )
    assert observation is not None, 'unexpected None'

    outcome = time_bounds_augmentation.visit(
        observation,
        working_directory=os.path.join(THIS_DIR, '../../data'),
        cadc_client=Mock(),
    )
    assert observation is not None, 'unexpected modification'
    assert outcome is not None, 'should have a result status'
    assert len(outcome) == 1, 'modified artifacts count'
    assert outcome['artifacts'] == 2, 'artifact count'

    artifact = observation.planes[name.product_id].artifacts[name.file_uri]
    chunk = artifact.parts['0'].chunks[0]
    assert chunk is not None
    assert chunk.time is not None, 'no time information'
    assert chunk.time.axis is not None, 'no axis information'
    assert chunk.time.axis.bounds is not None, 'no bounds information'
    sample_count = len(chunk.time.axis.bounds.samples)
    assert sample_count == 1, 'wrong amount of bounds info'
    assert chunk.time.exposure == 234.0, 'wrong exposure value'
def test_state():
    """Round-trip a bookmark and a context list through mc.State.

    Writes a small state file, checks that a missing file raises
    mc.CadcException, reads the bookmark and context back, saves modified
    values, and checks the file content changed.
    """
    if os.path.exists(TEST_STATE_FILE):
        os.unlink(TEST_STATE_FILE)
    with open(TEST_STATE_FILE, 'w') as f:
        # 'last_record' has to be indented deeper than the bookmark name,
        # otherwise YAML reads it as a sibling key of the bookmark instead
        # of as a value nested under it
        f.write(
            'bookmarks:\n'
            '  gemini_timestamp:\n'
            '    last_record: 2019-07-23 20:52:03.524443\n'
            'context:\n'
            '  neossat_context:\n'
            '    - NEOSS\n'
            '    - 2020\n'
        )

    with pytest.raises(mc.CadcException):
        test_subject = mc.State('nonexistent')

    test_subject = mc.State(TEST_STATE_FILE)
    assert test_subject is not None, 'expect result'
    test_result = test_subject.get_bookmark('gemini_timestamp')
    assert test_result is not None, 'expect content'
    assert isinstance(test_result, datetime)

    test_context = test_subject.get_context('neossat_context')
    assert test_context is not None, 'expect a result'
    assert isinstance(test_context, list), 'wrong return type'
    assert len(test_context) == 2, 'wrong return length'
    assert 'NEOSS' in test_context, 'wrong content'
    test_context.append('2019')

    # timedelta(3) is three days
    test_subject.save_state('gemini_timestamp', test_result + timedelta(3))
    test_subject.save_state('neossat_context', test_context)

    with open(TEST_STATE_FILE, 'r') as f:
        compare = f.read()
    assert '2019-07-23' not in compare, 'content not updated'
    # NOTE(review): the updated bookmark date presumably also contains
    # '2019', so this check may pass without the context having been
    # written - confirm and tighten if so
    assert '2019' in compare, 'context content not updated'
def _run():
    """Process the observations listed in a todo file, with a Builder
    supplying the StorageName instances.

    The Builder matters here, because the instrument name needs to be
    provided to the StorageName constructor.

    :return 0 if successful, -1 if there's any sort of failure. Return status
        is used by airflow for task instance management and reporting.
    """
    cfg = mc.Config()
    cfg.get_executors()
    work.init_web_log(mc.State(cfg.state_fqn), cfg)
    instance_builder = builder.VlassInstanceBuilder(cfg)
    run_kwargs = {
        'config': cfg,
        'name_builder': instance_builder,
        'command_name': sn.APPLICATION,
        'meta_visitors': META_VISITORS,
        'data_visitors': DATA_VISITORS,
        'store_transfer': tc.HttpTransfer(),
    }
    return rc.run_by_todo(**run_kwargs)
def _run_state():
    """Incrementally process the observations posted on the site
    archive.gemini.edu.

    TODO in the future this will depend on the incremental query endpoint.

    :return 0 if successful, -1 if there's any sort of failure. Return status
        is used by airflow for task instance management and reporting.
    """
    clients, cfg, reader, visitors, builder = _common_init()
    bookmark = mc.State(cfg.state_fqn).bookmarks.get(data_source.GEM_BOOKMARK)
    # fall back to "now" when the bookmark carries no explicit end time
    end_s = bookmark.get('end_timestamp', datetime.now())
    end_dt = mc.make_time_tz(end_s)
    logging.info(f'{main_app.APPLICATION} will end at {end_s}')
    source = data_source.IncrementalSource(reader)
    run_kwargs = {
        'config': cfg,
        'name_builder': builder,
        'bookmark_name': data_source.GEM_BOOKMARK,
        'meta_visitors': visitors,
        'data_visitors': DATA_VISITORS,
        'end_time': end_dt,
        'source': source,
        'clients': clients,
        'metadata_reader': reader,
    }
    result = rc.run_by_state(**run_kwargs)
    if not source.max_records_encountered:
        return result

    # hitting the record limit is reported as a failure
    logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    logging.warning('Encountered maximum records!!')
    logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    result |= -1
    return result
def _run_by_incremental():
    """Incrementally process the observations posted on the site
    archive.gemini.edu.

    TODO in the future this will depend on the incremental query endpoint.

    :return 0 if successful, -1 if there's any sort of failure. Return status
        is used by airflow for task instance management and reporting.
    """
    cfg = mc.Config()
    cfg.get_executors()
    bookmark = mc.State(cfg.state_fqn).bookmarks.get(data_source.GEM_BOOKMARK)
    # fall back to "now" when the bookmark carries no explicit end time
    end_s = bookmark.get('end_timestamp', datetime.now())
    end_dt = mc.make_time_tz(end_s)
    logging.info(f'{main_app.APPLICATION} will end at {end_s}')
    external_metadata.init_global(config=cfg)
    builder = nbc.FileNameBuilder(gem_name.GemName)
    source = data_source.IncrementalSource()
    visitors = _define_meta_visitors(cfg)
    run_kwargs = {
        'config': cfg,
        'name_builder': builder,
        'command_name': main_app.APPLICATION,
        'bookmark_name': data_source.GEM_BOOKMARK,
        'meta_visitors': visitors,
        'data_visitors': DATA_VISITORS,
        'end_time': end_dt,
        'source': source,
        'chooser': None,
    }
    result = rc.run_by_state(**run_kwargs)
    if not source.max_records_encountered:
        return result

    # hitting the record limit is reported as a failure
    logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    logging.warning('Encountered maximum records!!')
    logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    result |= -1
    return result
def test_neoss_state(
    data_mock,
    csa_mock,
    caom_mock,
    transfer_mock,
    local_header_mock,
    test_input_name,
):
    """Run the NEOSSat 'composable' module's _run_state against mocks.

    Only inputs whose name contains 'NEOSS' are exercised. The test copies
    the input's config and state files into TEST_EXEC_DIR, mocks the ftp
    listing, the transfer, the CAOM clients and the header read, then checks
    that the run succeeds, the bookmark advances and the file is stored.
    """
    if 'NEOSS' not in test_input_name:
        return

    test_input = INPUTS.get(test_input_name)
    # make sure the working directory TEST_EXEC_DIR has nothing in it
    _cleanup()

    # make sure the working directory TEST_EXEC_DIR has the correct things
    # in it
    config_file_target = TEST_EXEC_DIR / 'config.yml'
    shutil.copy(test_input.config_file, config_file_target)
    state_file_target = TEST_EXEC_DIR / 'state.yml'
    shutil.copy(test_input.state_file, state_file_target)
    with open(TEST_EXEC_DIR / 'cadcproxy.pem', 'w') as f:
        f.write('test content')

    # make sure the state file won't take decades to execute
    test_start_time = datetime.now(tz=dateutil.tz.UTC) - timedelta(minutes=5)
    state = mc.State(state_file_target.as_posix())
    state.save_state(test_input.bookmark, test_start_time)

    ftp_fqn = (
        '/users/OpenData_DonneesOuvertes/pub/NEOSSAT/ASTRO/2019/256/'
        'NEOS_SCI_2019213215700.fits'
    )

    def _csa_mock(start_date, ign1, ign2, ign3, ign4, ign5):
        # one file, time-stamped five minutes after the requested start
        return {
            ftp_fqn: [
                False,
                start_date + timedelta(minutes=5).total_seconds(),
            ],
        }

    csa_mock.side_effect = _csa_mock

    def _transfer_get(src, dst):
        assert src == ftp_fqn, 'wrong source'
        assert (
            dst == '/usr/src/app/integration_test/mock_test/data/execution/'
            '2019213215700/NEOS_SCI_2019213215700.fits'
        ), 'wrong dst'
        with open(dst, 'w') as f2:
            f2.write('test content')

    transfer_mock.return_value.get.side_effect = _transfer_get
    # first read finds no observation, the second read finds one
    caom_mock.return_value.metadata_client.read.side_effect = [
        None,
        SimpleObservation(
            'obs_id',
            'NEOSSAT',
            Algorithm(name='exposure'),
        ),
    ]

    def _local_header(ignore):
        # one card per line: the text is split on '\nEND' and parsed with
        # sep='\n', so the cards must be newline-separated
        x = """SIMPLE  = T / Written by IDL: Fri Oct 6 01:48:35 2017
BITPIX  = -32 / Bits per pixel
NAXIS   = 2 / Number of dimensions
NAXIS1  = 14 /
NAXIS2  = 24 /
RA      = '22:53:27.5'
DEC     = '-30:04:37.6'
MODE    = '14 - FINE_SETTLE'
OBJECT  = '2020-P4-C'
EXPOSURE= 128.0311
DATATYPE= 'REDUC ' /Data type, SCIENCE/CALIB/REJECT/FOCUS/TEST
END
"""
        delim = '\nEND'
        extensions = [e + delim for e in x.split(delim) if e.strip()]
        headers = [fits.Header.fromstring(e, sep='\n') for e in extensions]
        return headers

    local_header_mock.side_effect = _local_header

    def _info(uri):
        return FileInfo(
            id=uri,
            md5sum='abc',
            size=42,
        )

    data_mock.return_value.info.side_effect = _info

    # import the module for execution
    sys.path.append(test_input.test_path)
    test_module = import_module('composable')
    # the module is run as if TEST_EXEC_DIR were the working directory
    getcwd_orig = os.getcwd
    os.getcwd = Mock(return_value=TEST_EXEC_DIR)
    try:
        test_result = test_module._run_state()
        assert test_result is not None, f'expect a result {test_input_name}'
        assert test_result == 0, f'wrong test result {test_input_name}'
        # was state updated?
        post_state = mc.State(state_file_target.as_posix())
        assert (
            post_state.get_bookmark(test_input.bookmark) > test_start_time
        ), f'state not updated {test_input_name}'
        assert (
            caom_mock.return_value.data_client.put.called
        ), f'{test_input_name} put not called'
        # assert_called_with raises its own AssertionError on a mismatch
        caom_mock.return_value.data_client.put.assert_called_with(
            '/usr/src/app/integration_test/mock_test/data/execution/'
            '2019213215700',
            'cadc:NEOSSAT/NEOS_SCI_2019213215700.fits',
            None,
        )
    except Exception:
        logging.error(traceback.format_exc())
        raise
    finally:
        os.getcwd = getcwd_orig
        # drop the cached module, so that another 'composable' can be
        # imported later
        del sys.modules['composable']
def test_run_state_v(client_mock, repo_mock):
    """Integration-style check of rc.run_by_state with STORE, INGEST and
    MODIFY tasks.

    Builds a Config by hand, writes a state file and a proxy file into the
    working directory, runs with a test data source and transfer, then
    checks the copy call, the saved bookmark, the log files and the report
    content. The working directory content is removed afterwards.
    """
    repo_mock.return_value.read.side_effect = tc.mock_read
    client_mock.get_node.side_effect = tc.mock_get_node
    # the test file is length 0
    client_mock.return_value.copy.return_value = 0

    test_wd = '/usr/src/app/caom2pipe/int_test'
    caom2pipe_bookmark = 'caom2_timestamp'
    test_config = mc.Config()
    test_config.working_directory = test_wd
    test_config.collection = 'TEST'
    test_config.interval = 10
    test_config.log_file_directory = f'{test_wd}/logs'
    test_config.failure_fqn = (
        f'{test_config.log_file_directory}/failure_log.txt'
    )
    test_config.log_to_file = True
    test_config.logging_level = 'DEBUG'
    test_config.progress_file_name = 'progress.txt'
    test_config.proxy_file_name = f'{test_wd}/cadcproxy.pem'
    test_config.rejected_file_name = 'rejected.yml'
    test_config.rejected_directory = f'{test_wd}/rejected'
    test_config._report_fqn = (
        f'{test_config.log_file_directory}/app_report.txt'
    )
    test_config.resource_id = 'ivo://cadc.nrc.ca/sc2repo'
    test_config.retry_file_name = 'retries.txt'
    test_config.retry_fqn = (
        f'{test_config.log_file_directory}/{test_config.retry_file_name}'
    )
    test_config.state_file_name = 'state.yml'
    test_config.success_fqn = (
        f'{test_config.log_file_directory}/success_log.txt'
    )
    test_config.tap_id = 'ivo://cadc.nrc.ca/sc2tap'
    test_config.task_types = [
        mc.TaskType.STORE,
        mc.TaskType.INGEST,
        mc.TaskType.MODIFY,
    ]
    test_config.features.use_file_names = True
    test_config.features.use_urls = False
    test_config.features.supports_latest_client = True
    test_config.use_local_files = False

    if not os.path.exists(test_wd):
        os.mkdir(test_wd)

    # if this test is failing, did the docker container get
    # restarted recently?
    # first create /caom2pipe_test/1000003f.fits.fz,
    # then check that the test_start_time and test_end_time values
    # correspond somewhat to the timestamp on that file
    #
    # this timestamp is 15 minutes earlier than the timestamp of the
    # file in /caom2pipe_test
    #
    test_start_time = '2021-05-08 02:25:09'
    with open(test_config.state_fqn, 'w') as f:
        # 'last_record' has to be indented deeper than the bookmark name,
        # otherwise YAML reads it as a sibling key of the bookmark instead
        # of as a value nested under it
        f.write('bookmarks:\n')
        f.write(f'  {caom2pipe_bookmark}:\n')
        f.write(f'    last_record: {test_start_time}\n')
    test_end_time = datetime(
        2021, 5, 8, 2, 41, 27, 965132, tzinfo=timezone.utc
    )

    with open(test_config.proxy_fqn, 'w') as f:
        f.write('test content\n')

    test_data_source = TestListDirTimeBoxDataSource()
    test_builder = nbc.FileNameBuilder(tc.TestStorageName)
    transferrer = TestTransfer()

    try:
        test_result = rc.run_by_state(
            bookmark_name=caom2pipe_bookmark,
            command_name='collection2caom2',
            config=test_config,
            end_time=test_end_time,
            name_builder=test_builder,
            source=test_data_source,
            modify_transfer=None,
            store_transfer=transferrer,
        )

        assert test_result is not None, 'expect a result'
        assert test_result == 0, 'expect success'
        assert client_mock.return_value.copy.called, 'expect put call'
        args, kwargs = client_mock.return_value.copy.call_args
        assert args[0] == 'ad:TEST/test_obs_id.fits.gz', 'wrong args[0]'
        assert (
            args[1] == '/usr/src/app/caom2pipe/int_test/test_obs_id/'
            'test_obs_id.fits'
        ), 'wrong args[1]'

        # state file checking
        test_state = mc.State(test_config.state_fqn)
        assert test_state is not None, 'expect state content'
        test_checkpoint = test_state.get_bookmark(caom2pipe_bookmark)
        assert test_checkpoint == test_end_time, 'wrong bookmark'

        # success file testing
        assert os.path.exists(test_config.log_file_directory), 'log directory'
        assert os.path.exists(test_config.success_fqn), 'success fqn'
        assert os.path.exists(test_config.progress_fqn), 'progress fqn'
        log_file = f'{test_config.log_file_directory}/test_obs_id.log'
        actual = glob.glob(f'{test_config.log_file_directory}/**')
        assert os.path.exists(log_file), f'specific log file {actual}'
        xml_file = f'{test_config.log_file_directory}/test_obs_id.fits.xml'
        assert os.path.exists(xml_file), f'xml file {actual}'

        # reporting testing
        report_file = f'{test_config.log_file_directory}/app_report.txt'
        assert os.path.exists(report_file), f'report file {actual}'
        # expected count for each 'Number of ...' line in the report, keyed
        # by the word that identifies the line
        expected_counts = {
            'Inputs': '1',
            'Successes': '1',
            'Timeouts': '0',
            'Retries': '0',
            'Errors': '0',
            'Rejections': '0',
        }
        pass_through_test = False
        with open(report_file, 'r') as f:
            for line in f:
                pass_through_test = True
                if 'Number' not in line:
                    continue
                bits = line.split(':')
                label = next(
                    (key for key in expected_counts if key in bits[0]), None
                )
                # every 'Number' line must be one of the known counters
                assert label is not None, f'{line}'
                assert (
                    bits[1].strip() == expected_counts[label]
                ), f'wrong {label.lower()}'
        assert pass_through_test, 'found a report file and checked it'
    finally:
        f_list = glob.glob(f'{test_wd}/**', recursive=True)
        # glob order is arbitrary and os.rmdir only removes empty
        # directories, so delete the deepest paths first: a reverse sort
        # puts every child ahead of its parent
        for entry in sorted(f_list, reverse=True):
            try:
                if os.path.isdir(entry):
                    os.rmdir(entry)
                else:
                    os.unlink(entry)
            except OSError as e:
                logging.error(f'failed to delete {e}')
def test_run_state(
    fits2caom2_mock,
    fits2caom2_in_out_mock,
    tap_query_mock,
    tap_mock,
    clients_mock,
    test_config,
):
    """rc.run_by_state processes the interval between the bookmark and the
    end time once, saves the end time as the new bookmark, and does no
    work on a second run where the bookmark already equals the end time."""
    # tap_mock is used by the data_source_composable class
    fits2caom2_mock.side_effect = _mock_write
    clients_mock.return_value.metadata_client.read.side_effect = Mock(
        return_value=None
    )
    tap_query_mock.side_effect = _mock_get_work

    end_time = datetime.fromtimestamp(1579740838, tz=timezone.utc)
    _write_state(end_time - timedelta(seconds=900))

    test_config.task_types = [mc.TaskType.INGEST]
    test_config.state_fqn = STATE_FILE
    test_config.interval = 10
    entry_log = (
        f'{test_config.log_file_directory}/NEOS_SCI_2015347000000_clean.log'
    )
    # start without left-overs from earlier runs
    for stale in (
        test_config.progress_fqn,
        test_config.success_fqn,
        entry_log,
    ):
        if os.path.exists(stale):
            os.unlink(stale)

    chooser = ec.OrganizeChooser()
    # use_local_files set so run_by_state chooses QueryTimeBoxDataSourceTS
    test_config.use_local_files = False

    def _execute():
        return rc.run_by_state(
            config=test_config,
            chooser=chooser,
            command_name=TEST_COMMAND,
            bookmark_name=TEST_BOOKMARK,
            end_time=end_time,
        )

    first = _execute()
    assert first is not None, 'expect a result'
    assert first == 0, 'expect success'
    if fits2caom2_mock.called:
        fits2caom2_mock.assert_called_once_with()
    elif fits2caom2_in_out_mock.called:
        fits2caom2_in_out_mock.assert_called_once_with(ANY)

    saved = mc.State(STATE_FILE).get_bookmark(TEST_BOOKMARK)
    assert saved == end_time, 'wrong time'
    assert os.path.exists(test_config.progress_fqn), 'expect progress file'
    assert os.path.exists(
        test_config.success_fqn
    ), 'log_to_file set to false, no success file'
    assert not os.path.exists(
        entry_log
    ), f'log_to_file is False, no entry log'

    # test that runner does nothing when times haven't changed
    _write_state(end_time)
    fits2caom2_mock.reset_mock()
    fits2caom2_in_out_mock.reset_mock()
    second = _execute()
    assert second is not None, 'expect a result'
    assert second == 0, 'expect success'
    assert not fits2caom2_mock.called, 'expect no fits2caom2 call'
    assert (
        not fits2caom2_in_out_mock.called
    ), 'expect no update fits2caom2 call'
def run(self):
    """
    Uses an iterable with an instance of StateRunnerMeta.

    Walks from the start time to self._end_time in steps of
    self._config.interval. For each step the data source is asked for the
    work in that time box, every entry is processed, progress is recorded
    and the bookmark is saved, so an interrupted run can resume from the
    last completed step.

    :return: 0 for success, -1 for failure
    """
    self._logger.debug(f'Begin run state for {self._bookmark_name}')
    # the progress file's directory has to exist before progress is recorded
    if not os.path.exists(os.path.dirname(self._config.progress_fqn)):
        os.makedirs(os.path.dirname(self._config.progress_fqn))

    state = mc.State(self._config.state_fqn)
    # a start time supplied by the data source takes precedence over the
    # bookmark in the state file
    if self._data_source.start_time_ts is None:
        temp = state.get_bookmark(self._bookmark_name)
        start_time = mc.convert_to_ts(temp)
    else:
        start_time = self._data_source.start_time_ts

    # make sure prev_exec_time is offset-aware type datetime.timestamp
    prev_exec_time = start_time
    incremented_ts = mc.increment_time_tz(
        prev_exec_time, self._config.interval).timestamp()
    # never step past the end time
    exec_time = min(incremented_ts, self._end_time)

    self._logger.debug(
        f'Starting at {datetime.utcfromtimestamp(start_time)}, ending at '
        f'{datetime.utcfromtimestamp(self._end_time)}')
    result = 0
    cumulative = 0
    cumulative_correct = 0
    if prev_exec_time == self._end_time:
        # nothing to do, the bookmark is already at the end time
        self._logger.info(
            f'Start time is the same as end time '
            f'{datetime.utcfromtimestamp(start_time)}, stopping.')
        exec_time = prev_exec_time
    else:
        cumulative = 0
        result = 0
        while exec_time <= self._end_time:
            self._logger.info(
                f'Processing from '
                f'{datetime.utcfromtimestamp(prev_exec_time)} to '
                f'{datetime.utcfromtimestamp(exec_time)}')
            # the time to bookmark for this step, unless an entry's own
            # timestamp is earlier
            save_time = exec_time
            self._organizer.success_count = 0
            entries = self._data_source.get_time_box_work(
                prev_exec_time, exec_time)
            num_entries = len(entries)

            if num_entries > 0:
                self._logger.info(f'Processing {num_entries} entries.')
                self._organizer.complete_record_count = num_entries
                self._organizer.set_log_location()
                # a deque is consumed from the left, anything else from
                # the end
                pop_action = entries.pop
                if isinstance(entries, deque):
                    pop_action = entries.popleft
                while len(entries) > 0:
                    entry = pop_action()
                    # any non-zero entry result is kept in the overall result
                    result |= self._process_entry(entry.entry_name)
                    save_time = min(mc.convert_to_ts(entry.entry_ts),
                                    exec_time)
                self._finish_run()

            cumulative += num_entries
            cumulative_correct += self._organizer.success_count
            self._record_progress(num_entries, cumulative, start_time,
                                  save_time)
            # bookmark after every step
            state.save_state(self._bookmark_name,
                             datetime.utcfromtimestamp(save_time))

            if exec_time == self._end_time:
                # the last interval will always have the exec time
                # equal to the end time, which will fail the while check
                # so leave after the last interval has been processed
                #
                # but the while <= check is required so that an interval
                # smaller than exec_time -> end_time will get executed,
                # so don't get rid of the '=' in the while loop
                # comparison, just because this one exists
                break
            # advance to the next time box, clamped to the end time
            prev_exec_time = exec_time
            new_time = mc.increment_time_tz(
                prev_exec_time, self._config.interval).timestamp()
            exec_time = min(new_time, self._end_time)

    self._reporter.add_entries(cumulative)
    self._reporter.add_successes(cumulative_correct)
    # the final bookmark is the last exec_time, which overrides the
    # per-step save_time written inside the loop
    state.save_state(self._bookmark_name,
                     datetime.utcfromtimestamp(exec_time))
    self._logger.info('==================================================')
    self._logger.info(
        f'Done {self._organizer.command_name}, saved state is '
        f'{datetime.utcfromtimestamp(exec_time)}')
    self._logger.info(
        f'{cumulative_correct} of {cumulative} records processed '
        f'correctly.')
    self._logger.info('==================================================')
    return result
# Reset the bookmark of a collection's state file to a time in the future.
# The collection name is the only command-line argument.
if len(sys.argv) < 2:
    # without this check a missing argument surfaces as a bare IndexError
    sys.exit(f'usage: {sys.argv[0]} <collection>')

collection = sys.argv[1]
# bookmark names that do not follow the '<collection>_bookmark' pattern
COLLECTION_KEYS = {
    'gem': 'gemini_bookmark',
    'dao': 'dao_timestamp',
    'neossat': 'neossat_timestamp',
    'cfht': 'cfht_timestamp',
    'vlass': 'vlass_timestamp',
}
collection_key = COLLECTION_KEYS.get(collection, f'{collection}_bookmark')
config = mc.Config()
config.get_executors()

# gemini counts back 14 days for incremental harvesting because
# that's how their endpoints can work ....
days_ahead = 15 if collection == 'gem' else 1
tomorrow = datetime.utcnow() + timedelta(days=days_ahead)

if not os.path.exists(config.state_fqn):
    with open(config.state_fqn, 'w') as f:
        # 'last_record' has to be indented deeper than the bookmark name,
        # otherwise YAML reads it as a sibling key of the bookmark instead
        # of as a value nested under it
        f.write('bookmarks:\n')
        f.write(f'  {collection_key}:\n')
        f.write(f'    last_record: {tomorrow}\n')

state = mc.State(config.state_fqn)
state.save_state(collection_key, tomorrow)
print(f'::: state saved key {collection_key} value {tomorrow}')
sys.exit(0)
def test_run_state(
    fits2caom2_mock,
    tap_query_mock,
    tap_mock,
    set_clients_mock,
    repo_get_mock,
    test_config,
):
    """rc.run_by_state processes the interval between the bookmark and the
    end time once, saves the end time as the new bookmark, and does no
    work on a second run where the bookmark already equals the end time."""
    # tap mock is used by the data_source_composable class
    set_clients_mock.side_effect = _clients_mock
    fits2caom2_mock.side_effect = _mock_write
    repo_get_mock.side_effect = Mock(return_value=None)
    tap_query_mock.side_effect = _mock_get_work

    end_time = datetime.fromtimestamp(1579740838, tz=timezone.utc)
    _write_state(end_time - timedelta(seconds=900))

    test_config.task_types = [mc.TaskType.INGEST]
    test_config.state_fqn = STATE_FILE
    test_config.interval = 10
    # start without left-overs from earlier runs
    for stale in (test_config.progress_fqn, test_config.success_fqn):
        if os.path.exists(stale):
            os.unlink(stale)

    chooser = ec.OrganizeChooser()

    def _execute():
        return rc.run_by_state(
            config=test_config,
            chooser=chooser,
            command_name=TEST_COMMAND,
            bookmark_name=TEST_BOOKMARK,
            end_time=end_time,
        )

    first = _execute()
    assert first is not None, 'expect a result'
    assert first == 0, 'expect success'
    assert fits2caom2_mock.called, 'expect fits2caom2 call'
    fits2caom2_mock.assert_called_once_with()

    saved = mc.State(STATE_FILE).get_bookmark(TEST_BOOKMARK)
    assert saved == end_time, 'wrong time'
    assert os.path.exists(test_config.progress_fqn), 'expect progress file'
    assert not os.path.exists(
        test_config.success_fqn
    ), 'log_to_file set to false, no success file'

    # test that runner does nothing when times haven't changed
    _write_state(end_time)
    fits2caom2_mock.reset_mock()
    second = _execute()
    assert second is not None, 'expect a result'
    assert second == 0, 'expect success'
    assert not fits2caom2_mock.called, 'expect no fits2caom2 call'
def test_run_state_v(client_mock):
    """Integration-style check of rc.run_by_state with STORE, INGEST and
    MODIFY tasks, using the clients passed in as a mock.

    Builds a Config by hand, writes a proxy file into the working directory,
    runs with a test data source and transfer, then checks the put call, the
    saved bookmark, the log files and the report content. The working
    directory content is removed afterwards.
    """
    client_mock.metadata_client.read.side_effect = tc.mock_read
    client_mock.data_client.info.return_value = FileInfo(
        id='cadc:TEST/anything.fits',
        size=42,
        md5sum='9473fdd0d880a43c21b7778d34872157',
    )

    test_wd = '/usr/src/app/caom2pipe/int_test'
    caom2pipe_bookmark = 'caom2_timestamp'
    test_config = mc.Config()
    test_config.working_directory = test_wd
    test_config.collection = 'TEST'
    test_config.interval = 10
    test_config.log_file_directory = f'{test_wd}/logs'
    test_config.failure_fqn = (
        f'{test_config.log_file_directory}/failure_log.txt'
    )
    test_config.log_to_file = True
    test_config.logging_level = 'INFO'
    test_config.progress_file_name = 'progress.txt'
    test_config.proxy_file_name = f'{test_wd}/cadcproxy.pem'
    test_config.rejected_file_name = 'rejected.yml'
    test_config.rejected_directory = f'{test_wd}/rejected'
    test_config._report_fqn = (
        f'{test_config.log_file_directory}/app_report.txt'
    )
    test_config.resource_id = 'ivo://cadc.nrc.ca/sc2repo'
    test_config.retry_file_name = 'retries.txt'
    test_config.retry_fqn = (
        f'{test_config.log_file_directory}/{test_config.retry_file_name}'
    )
    test_config.state_file_name = 'state.yml'
    test_config.success_fqn = (
        f'{test_config.log_file_directory}/success_log.txt'
    )
    test_config.tap_id = 'ivo://cadc.nrc.ca/sc2tap'
    test_config.task_types = [
        mc.TaskType.STORE,
        mc.TaskType.INGEST,
        mc.TaskType.MODIFY,
    ]
    test_config.features.use_file_names = True
    test_config.features.use_urls = False
    test_config.features.supports_latest_client = True
    test_config.use_local_files = False
    test_config.storage_inventory_resource_id = 'ivo://cadc.nrc.ca/test'

    if not os.path.exists(test_wd):
        os.mkdir(test_wd)

    test_start_time, test_end_time = _get_times(
        test_config, caom2pipe_bookmark
    )

    with open(test_config.proxy_fqn, 'w') as f:
        f.write('test content\n')

    test_data_source = TestListDirTimeBoxDataSource()
    test_builder = nbc.GuessingBuilder(tc.TestStorageName)
    transferrer = TestTransfer()

    try:
        test_result = rc.run_by_state(
            bookmark_name=caom2pipe_bookmark,
            command_name='collection2caom2',
            config=test_config,
            end_time=test_end_time,
            name_builder=test_builder,
            source=test_data_source,
            modify_transfer=transferrer,
            store_transfer=transferrer,
            clients=client_mock,
        )

        assert test_result is not None, 'expect a result'
        assert test_result == 0, 'expect success'
        assert client_mock.data_client.put.called, 'expect put call'
        # assert_called_with raises its own AssertionError on a mismatch
        client_mock.data_client.put.assert_called_with(
            '/usr/src/app/caom2pipe/int_test/test_obs_id',
            'cadc:TEST/test_file.fits.gz',
            None,
        )

        # state file checking
        test_state = mc.State(test_config.state_fqn)
        assert test_state is not None, 'expect state content'
        test_checkpoint = test_state.get_bookmark(caom2pipe_bookmark)
        assert test_checkpoint == test_end_time, 'wrong bookmark'

        # success file testing
        assert os.path.exists(test_config.log_file_directory), 'log directory'
        assert os.path.exists(test_config.success_fqn), 'success fqn'
        assert os.path.exists(test_config.progress_fqn), 'progress fqn'
        log_file = f'{test_config.log_file_directory}/test_obs_id.log'
        actual = glob.glob(f'{test_config.log_file_directory}/**')
        assert os.path.exists(log_file), f'specific log file {actual}'
        xml_file = f'{test_config.log_file_directory}/test_obs_id.xml'
        assert os.path.exists(xml_file), f'xml file {actual}'

        # reporting testing
        report_file = f'{test_config.log_file_directory}/app_report.txt'
        assert os.path.exists(report_file), f'report file {actual}'
        # expected count for each 'Number of ...' line in the report, keyed
        # by the word that identifies the line
        expected_counts = {
            'Inputs': '1',
            'Successes': '1',
            'Timeouts': '0',
            'Retries': '0',
            'Errors': '0',
            'Rejections': '0',
        }
        pass_through_test = False
        with open(report_file, 'r') as f:
            for line in f:
                pass_through_test = True
                if 'Number' not in line:
                    continue
                bits = line.split(':')
                label = next(
                    (key for key in expected_counts if key in bits[0]), None
                )
                # every 'Number' line must be one of the known counters
                assert label is not None, f'{line}'
                assert (
                    bits[1].strip() == expected_counts[label]
                ), f'wrong {label.lower()}'
        assert pass_through_test, 'found a report file and checked it'
    finally:
        f_list = glob.glob(f'{test_wd}/**', recursive=True)
        # glob order is arbitrary and os.rmdir only removes empty
        # directories, so delete the deepest paths first: a reverse sort
        # puts every child ahead of its parent
        for entry in sorted(f_list, reverse=True):
            try:
                if os.path.isdir(entry):
                    os.rmdir(entry)
                else:
                    os.unlink(entry)
            except OSError as e:
                logging.error(f'failed to delete {e}')
def test_state(
    data_mock,
    web_log_mock,
    nrao_mock,
    caom_mock,
    transfer_mock,
    local_header_mock,
    qa_mock,
    test_input_name,
):
    """Run `composable._run_state` for one INPUTS entry with the external
    services mocked, then check that the state bookmark advanced and that
    the expected quicklook file was handed to the data client `put`.

    The *_mock parameters are mock objects whose `side_effect` /
    `return_value` attributes are configured below. `test_input_name` is
    the INPUTS key to exercise; names containing 'TODO' are skipped.
    """
    if 'TODO' in test_input_name:
        return

    test_input = INPUTS.get(test_input_name)

    # make sure the working directory TEST_EXEC_DIR has nothing in it.
    # rmtree copes with any nesting depth; symlinks are unlinked rather
    # than followed.
    for child in TEST_EXEC_DIR.iterdir():
        if child.is_dir() and not child.is_symlink():
            shutil.rmtree(child)
        else:
            child.unlink()

    # make sure the working directory TEST_EXEC_DIR has the correct things
    # in it
    config_file_target = TEST_EXEC_DIR / 'config.yml'
    shutil.copy(test_input.config_file, config_file_target)
    state_file_target = TEST_EXEC_DIR / 'state.yml'
    shutil.copy(test_input.state_file, state_file_target)
    with open(TEST_EXEC_DIR / 'cadcproxy.pem', 'w') as f:
        f.write('test content')

    # make sure the state file won't take decades to execute
    test_start_time = datetime.now(tz=dateutil.tz.UTC) - timedelta(minutes=5)
    state = mc.State(state_file_target.as_posix())
    state.save_state(test_input.bookmark, test_start_time)

    # import the module for execution; the sys.path entry and the cached
    # module are both removed again in the finally clause so that the next
    # test case imports its own 'composable'
    sys.path.append(test_input.test_path)
    getcwd_orig = os.getcwd
    try:
        test_module = import_module('composable')

        nrao_mock.side_effect = _nrao_mock

        def _web_log_init(ignore):
            # NOTE(review): this rebinds `web_log_content` in this test
            # module's globals - confirm that is the name the code under
            # test actually reads.
            global web_log_content
            web_log_content = {
                'VLASS1.1_T07t13.J083838-153000_P68878v1_2020_08_29T21_'
                '48_48.092': '2020-09-09 07:53',
            }

        web_log_mock.side_effect = _web_log_init

        def _transfer_get(src, dst):
            assert (
                src
                == 'https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1/T07t13/'
                'VLASS1.1.ql.T07t13.J083838-153000.10.2048.v1.I.iter1.image.'
                'pbcor.tt0.rms.subim.fits'
            ), 'wrong source'
            assert (
                dst
                == '/usr/src/app/integration_test/mock_test/data/execution/'
                'VLASS1.1.T07t13.J083838-153000/'
                'VLASS1.1.ql.T07t13.J083838-153000.10.2048.v1.I.iter1.image.'
                'pbcor.tt0.rms.subim.fits'
            ), 'wrong dst'
            # leave a file behind so later steps find something at dst
            with open(dst, 'w') as f2:
                f2.write('test content')

        transfer_mock.return_value.get.side_effect = _transfer_get

        # first read returns nothing, the second returns an observation
        caom_mock.return_value.metadata_client.read.side_effect = [
            None,
            SimpleObservation(
                'obs_id',
                'VLASS',
                Algorithm(name='exposure'),
            ),
        ]

        def _local_header(ignore):
            # one header card per line: the '\nEND' delimiter and the
            # sep='\n' parse below both rely on the line breaks
            x = """SIMPLE = T / Written by IDL: Fri Oct 6 01:48:35 2017
BITPIX = -32 / Bits per pixel
NAXIS = 2 / Number of dimensions
NAXIS1 = 2048 /
NAXIS2 = 2048 /
TYPE = 'image'
BMAJ = 1.09
BMIN = 0.19
DATATYPE= 'REDUC ' /Data type, SCIENCE/CALIB/REJECT/FOCUS/TEST
END
"""
            delim = '\nEND'
            extensions = [e + delim for e in x.split(delim) if e.strip()]
            headers = [
                fits.Header.fromstring(e, sep='\n') for e in extensions
            ]
            return headers

        local_header_mock.side_effect = _local_header

        def _info(uri):
            return FileInfo(
                id=uri,
                md5sum='abc',
                size=42,
            )

        data_mock.return_value.info.side_effect = _info
        qa_mock.return_value = False

        os.getcwd = Mock(return_value=TEST_EXEC_DIR)
        logging.getLogger('StorageClientWrapper').setLevel(logging.DEBUG)

        test_result = test_module._run_state()
        assert test_result is not None, f'expect a result {test_input_name}'
        assert test_result == 0, f'wrong test result {test_input_name}'

        # was state updated?
        post_state = mc.State(state_file_target.as_posix())
        assert (
            post_state.get_bookmark(test_input.bookmark) > test_start_time
        ), f'state not updated {test_input_name}'
        assert (
            caom_mock.return_value.data_client.put.called
        ), f'{test_input_name} put not called'
        # assert_called_with raises its own AssertionError when the put
        # arguments are wrong
        caom_mock.return_value.data_client.put.assert_called_with(
            '/usr/src/app/integration_test/mock_test/data/execution/'
            'VLASS1.1.T07t13.J083838-153000',
            'nrao:VLASS/VLASS1.1.ql.T07t13.J083838-153000.10.2048.v1.I.'
            'iter1.image.pbcor.tt0.rms.subim.fits',
            None,
        )
    finally:
        os.getcwd = getcwd_orig
        # pop instead of del: the import itself may have been what failed
        sys.modules.pop('composable', None)
        try:
            sys.path.remove(test_input.test_path)
        except ValueError:
            pass
def test_gem_state(
    data_mock,
    caom_mock,
    local_header_mock,
    json_mock,
    filter_mock,
    http_get_mock,
    endpoint_mock,
    tap_mock,
    external_header_mock,
    test_input_name,
):
    """Run `composable._run_state` for a 'GEM_STATE' entry of INPUTS with
    the external services mocked, then check that the state bookmark
    advanced and that the expected file was fetched over http and handed
    to the data client `put`.

    The *_mock parameters are mock objects whose `side_effect` /
    `return_value` attributes are configured below. `test_input_name` is
    the INPUTS key to exercise; any name without 'GEM_STATE' is skipped.
    """
    if 'GEM_STATE' not in test_input_name:
        return

    test_input = INPUTS.get(test_input_name)
    _cleanup()

    getcwd_orig = os.getcwd
    os.getcwd = Mock(return_value=TEST_EXEC_DIR)
    path_added = False
    # everything after the os.getcwd replacement runs inside the try, so
    # the real os.getcwd is restored even when set-up or the import fails
    try:
        test_start_time, state_file_target = _setup(test_input)

        def _json_mock(url, ignore_session):
            response = Mock()
            response.close = Mock()
            fqn = test_input.input_dir / 'input.json'
            with open(fqn) as f:
                response.text = f.read()
            response.json = lambda: json.loads(response.text)
            return response

        json_mock.side_effect = _json_mock

        def _endpoint_mock(url):
            assert url.startswith(
                'https://archive.gemini.edu/jsonsummary/canonical/NotFail/'
                'notengineering/entrytimedaterange'
            ), 'wrong url for incremental querying'
            return _json_mock(url, None)

        endpoint_mock.side_effect = _endpoint_mock

        def _filter_mock():
            from astropy.table import parse_single_table

            fqn = test_input.input_dir / 'filter.xml'
            content = parse_single_table(fqn)
            return content, None

        filter_mock.side_effect = _filter_mock

        # first read returns nothing, the second returns an observation
        caom_mock.return_value.metadata_client.read.side_effect = [
            None,
            SimpleObservation(
                'obs_id',
                'GEMINI',
                Algorithm(name='exposure'),
            ),
        ]

        def _tap_query(
            ignore_query,
            output_file,
            data_only=True,
            response_format='csv',
        ):
            output_file.write(
                'observationID,instrument_name\n'
                'GS-CAL20191214-1-029,F2\n',
            )

        tap_mock.return_value.query.side_effect = _tap_query

        def _local_header(ignore):
            # one header card per line: the '\nEND' delimiter and the
            # sep='\n' parse below both rely on the line breaks
            x = """SIMPLE = T / Written by IDL: Fri Oct 6 01:48:35 2017
BITPIX = -32 / Bits per pixel
NAXIS = 2 / Number of dimensions
NAXIS1 = 14 /
NAXIS2 = 24 /
INSTRUME= 'F2'
DATALAB = 'GS-CAL20191214-1-029
END
"""
            delim = '\nEND'
            extensions = [e + delim for e in x.split(delim) if e.strip()]
            headers = [
                fits.Header.fromstring(e, sep='\n') for e in extensions
            ]
            return headers

        local_header_mock.side_effect = _local_header
        external_header_mock.side_effect = _local_header

        def _info(uri):
            return FileInfo(
                id=uri,
                md5sum='abc',
                size=42,
            )

        data_mock.return_value.info.side_effect = _info

        # import the module for execution; the sys.path entry and the
        # cached module are both removed again in the finally clause
        sys.path.append(test_input.test_path)
        path_added = True
        test_module = import_module('composable')

        test_result = test_module._run_state()
        assert test_result is not None, f'expect a result {test_input_name}'
        assert test_result == 0, f'wrong test result {test_input_name}'

        # was state updated?
        post_state = mc.State(state_file_target.as_posix())
        assert (
            post_state.get_bookmark(test_input.bookmark) > test_start_time
        ), f'state not updated {test_input_name}'
        assert (
            caom_mock.return_value.data_client.put.called
        ), f'{test_input_name} put not called'
        # assert_called_with raises its own AssertionError when the
        # arguments are wrong
        caom_mock.return_value.data_client.put.assert_called_with(
            '/usr/src/app/integration_test/mock_test/data/execution/'
            'GS-CAL20191214-1-029',
            'gemini:GEMINI/S20191214S0301.fits',
        )
        assert http_get_mock.called, 'expect http get call'
        http_get_mock.assert_called_with(
            'https://archive.gemini.edu/file/S20191214S0301.fits',
            '/usr/src/app/integration_test/mock_test/data/execution/'
            'GS-CAL20191214-1-029/S20191214S0301.fits',
        )
    except Exception:
        logging.error(traceback.format_exc())
        raise
    finally:
        os.getcwd = getcwd_orig
        # pop instead of del: 'composable' is absent when the failure
        # happened before or during the import
        sys.modules.pop('composable', None)
        if path_added:
            try:
                sys.path.remove(test_input.test_path)
            except ValueError:
                pass
def retrieve_obs_metadata(obs_id):
    """Look up processing metadata for obs_id by scraping the quicklook
    web log pages.

    The web log listing held in the module-level `web_log_content` is
    searched for entries belonging to obs_id (it is initialised first when
    empty). The most recent matching entry is queried, the 'pipeline-'
    link on that page is followed, and the resulting index page is passed
    to `_parse_single_field`.

    Returns the value produced by `_parse_single_field`, with the pipeline
    page URL added under the 'reference' key, or an empty dict when no
    entry matches, a page cannot be queried, or no pipeline link is found.

    Scraping is used because, per the original author, the VLASS metadata
    isn't in the database that astroquery.Nrao points to.
    """
    global web_log_content

    metadata = {}
    # first two '.' become '_', then the first '_' is turned back into '.'
    key_prefix = obs_id.replace('.', '_', 2).replace('_', '.', 1)

    if len(web_log_content) == 0:
        config = mc.Config()
        config.get_executors()
        logging.warning('Initializing from /weblog. This may take a while.')
        state = mc.State(config.state_fqn)
        init_web_log(state)

    # NOTE(review): 'US/Socorro' does not look like an IANA zone name. If
    # `tz` is dateutil.tz, gettz returns None for names it cannot resolve,
    # which would leave the timestamps below naive - confirm.
    tz_info = tz.gettz('US/Socorro')

    # there may be multiple processing runs for a single obs id, use the
    # most recent (the first one seen wins on equal timestamps)
    latest_key = None
    max_ts = None
    for key in web_log_content.keys():
        if not key.startswith(key_prefix):
            continue
        # everything after the third '_'-separated field is the timestamp
        dt_bits = '_'.join(key.replace('/', '').split('_')[3:])
        dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
        if max_ts is None or max_ts < dt_tz:
            max_ts = dt_tz
            latest_key = key

    session = mc.get_endpoint_session()
    if latest_key is None:
        return metadata

    obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
    logging.debug(f'Querying {obs_url}')
    response = None
    try:
        response = mc.query_endpoint_session(obs_url, session)
        if response is None:
            logging.error(f'Could not query {obs_url}')
            return metadata

        soup = BeautifulSoup(response.text, features='lxml')
        response.close()
        pipeline_bit = soup.find(string=re.compile('pipeline-'))
        if pipeline_bit is None:
            logging.error(f'Did not find pipeline on {obs_url}')
            return metadata

        pipeline_url = f'{obs_url}{pipeline_bit.strip()}html/index.html'
        logging.debug(f'Querying {pipeline_url}')
        response = mc.query_endpoint_session(pipeline_url, session)
        if response is None:
            logging.error(f'Could not query {pipeline_url}')
            return metadata

        metadata = _parse_single_field(response.text)
        metadata['reference'] = pipeline_url
        logging.debug(f'Setting reference to {pipeline_url}')
        response.close()
    finally:
        # runs on every exit path above, including the early returns
        if response is not None:
            response.close()
    return metadata