def build_good_todo(start_date, session):
    """Create the list of work, based on timestamps from the NRAO
    Quicklook page.

    :return a dict, where keys are timestamps, and values are lists of
        URLs.
    """
    temp = {}
    max_date = start_date
    response = None
    try:
        # get the last modified date on the quicklook images listing
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page_no_date(response.text)
            response.close()
            for epoch in epochs:
                epoch_url = f'{QL_URL}{epoch}'
                logging.info(
                    f'Checking epoch {epoch} on date {epochs[epoch]}'
                )
                response = mc.query_endpoint_session(epoch_url, session)
                if response is None:
                    logging.warning(f'Could not query epoch {epoch_url}')
                else:
                    tiles = _parse_tile_page(response.text, start_date)
                    response.close()
                    # get the list of tiles
                    for tile in tiles:
                        logging.info(
                            f'Checking tile {tile} with date {tiles[tile]}'
                        )
                        tile_url = f'{epoch_url}{tile}'
                        response = mc.query_endpoint_session(
                            tile_url, session
                        )
                        if response is None:
                            logging.warning(f'Could not query {tile_url}')
                        else:
                            observations = _parse_id_page(
                                response.text, start_date
                            )
                            response.close()
                            # for each tile, get the list of observations
                            for observation in observations:
                                obs_url = f'{tile_url}{observation}'
                                dt_as_s = observations[
                                    observation
                                ].timestamp()
                                max_date = max(
                                    max_date, observations[observation]
                                )
                                if dt_as_s in temp:
                                    temp[dt_as_s].append(obs_url)
                                else:
                                    temp[dt_as_s] = [obs_url]
    finally:
        if response is not None:
            response.close()
    return temp, max_date
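# Usage sketch (hypothetical): walk the quicklook tree from a fixed start
# date and report the work found. Assumes, as elsewhere in this module,
# that mc.get_endpoint_session() returns a requests-style session and
# that make_date_time() parses the NRAO date format.
def _demo_build_good_todo():
    session = mc.get_endpoint_session()
    start_date = make_date_time('01Jan2018 00:00')
    todo, max_date = build_good_todo(start_date, session)
    logging.info(f'{len(todo)} timestamps found, most recent {max_date}')
    for timestamp in sorted(todo):
        logging.info(f'{timestamp}: {len(todo[timestamp])} URLs')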
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so
    that day is not today."""
    metadata = {}
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        raise mc.CadcException('Must initialize weblog content.')
    latest_key = None
    max_ts = None
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            dt_bits = '_'.join(
                ii for ii in key.replace('/', '').split('_')[3:]
            )
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None:
                max_ts = dt_tz
                latest_key = key
            else:
                if max_ts < dt_tz:
                    max_ts = dt_tz
                    latest_key = key
    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                pipeline_bit = _parse_for_reference(
                    response.text, 'pipeline-'
                )
                response.close()
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = (
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    )
                    logging.debug(f'Querying {pipeline_url}')
                    response = mc.query_endpoint_session(
                        pipeline_url, session
                    )
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(
                            f'Setting reference to {pipeline_url}'
                        )
                        response.close()
        finally:
            if response is not None:
                response.close()
    return metadata
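# Worked example of the key-mangling above, using a hypothetical VLASS
# observation id: the net effect of the two replace() calls is that only
# the second '.' becomes '_', which is how weblog directory names are
# keyed.
def _demo_mod_obs_id():
    obs_id = 'VLASS1.1.T01t01.J000228-363000'
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    assert mod_obs_id == 'VLASS1.1_T01t01.J000228-363000'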
def build_qa_rejected_todo(start_date, session):
    """Create the list of work from the QA_REJECTED listings.

    :return a dict, where keys are timestamps, and values are lists of
        URLs.
    """
    rejected = {}
    max_date = start_date
    response = None
    try:
        # get the last modified date on the quicklook images listing
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page_no_date(response.text)
            response.close()
            for epoch in epochs:
                epoch_name = epoch.split('/')[-2]
                epoch_rejected_url = f'{QL_URL}{epoch}QA_REJECTED/'
                logging.info(
                    f'Checking epoch {epoch_name} on date {epochs[epoch]}'
                )
                try:
                    response = mc.query_endpoint_session(
                        epoch_rejected_url, session
                    )
                    if response is None:
                        logging.warning(
                            f'Could not query epoch {epoch_rejected_url}'
                        )
                    else:
                        temp, rejected_max = _parse_rejected_page(
                            response.text,
                            epoch_name,
                            start_date,
                            epoch_rejected_url,
                        )
                        max_date = max(max_date, rejected_max)
                        response.close()
                        temp_rejected = rejected
                        rejected = {**temp, **temp_rejected}
                except mc.CadcException as e:
                    if 'Not Found for url' in str(e):
                        logging.info(
                            f'No QA_REJECTED directory for {epoch_name}. '
                            f'Continuing.'
                        )
                    else:
                        raise e
    finally:
        if response is not None:
            response.close()
    return rejected, max_date
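# Usage sketch (hypothetical): combine the good and QA_REJECTED work
# lists into a single time-keyed todo, using the same start_date and
# session conventions as the two builders above.
def _demo_build_full_todo():
    session = mc.get_endpoint_session()
    start_date = make_date_time('01Jan2018 00:00')
    good, good_max = build_good_todo(start_date, session)
    rejected, rejected_max = build_qa_rejected_todo(start_date, session)
    todo = {**good, **rejected}
    max_date = max(good_max, rejected_max)
    logging.info(f'{len(todo)} timestamps to process, up to {max_date}')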
def query_top_page():
    """Query the timestamp from the top page, for reporting."""
    start_date = make_date_time('01Jan2017 12:00')
    response = None
    max_date = None
    try:
        # get the last modified date on the quicklook images listing
        session = mc.get_endpoint_session()
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page(response.text, start_date)
            for key, value in epochs.items():
                logging.info(f'{key} {make_date_time(value)}')
                if max_date is None:
                    max_date = value
                else:
                    max_date = max(max_date, value)
    finally:
        if response is not None:
            response.close()
    return max_date
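# Usage sketch: report the most recent epoch modification time; the
# return value is None when the top page could not be queried.
def _demo_query_top_page():
    max_date = query_top_page()
    if max_date is not None:
        logging.info(f'Most recent quicklook change: {max_date}')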
def get_pi_metadata(program_id):
    global pm
    if program_id in pm:
        metadata = pm[program_id]
    else:
        program_url = 'https://archive.gemini.edu/programinfo/' + program_id
        # Open the URL and fetch the HTML page for the program
        response = None
        try:
            response = mc.query_endpoint_session(
                program_url, gofr.query_session
            )
            xml_metadata = response.text
        finally:
            if response is not None:
                response.close()
        metadata = None
        soup = BeautifulSoup(xml_metadata, 'lxml')
        tds = soup.find_all('td')
        if len(tds) > 0:
            # sometimes the program id points to an html page with an
            # empty table, see e.g. N20200210S0077_bias
            title = None
            if len(tds[1].contents) > 0:
                title = tds[1].contents[0].replace('\n', ' ')
            pi_name = None
            if len(tds[3].contents) > 0:
                pi_name = tds[3].contents[0]
            metadata = {'title': title, 'pi_name': pi_name}
            pm[program_id] = metadata
    logging.debug('End get_pi_metadata')
    return metadata
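# Minimal sketch of the table parsing above, against hypothetical
# programinfo HTML: the title lives in the second <td>, the PI name in
# the fourth. Relies on the module's existing BeautifulSoup import.
def _demo_parse_programinfo():
    html = (
        '<table>'
        '<tr><td>Title</td><td>A Survey of\nSomething</td></tr>'
        '<tr><td>PI</td><td>J. Doe</td></tr>'
        '</table>'
    )
    tds = BeautifulSoup(html, 'lxml').find_all('td')
    title = tds[1].contents[0].replace('\n', ' ')
    pi_name = tds[3].contents[0]
    assert title == 'A Survey of Something'
    assert pi_name == 'J. Doe'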
def get_obs_metadata(file_id):
    """
    Download the Gemini observation metadata for the given file_id.

    :param file_id: The file ID
    :return: None; the metadata is cached in the module-level om.
    """
    logging.debug(f'Begin get_obs_metadata for {file_id}')
    global om
    if om.contains(file_id):
        om.reset_index(file_id)
    else:
        gemini_url = f'{GEMINI_METADATA_URL}{file_id}'
        # Open the URL and fetch the JSON document for the observation
        response = None
        try:
            response = mc.query_endpoint_session(
                gemini_url, gofr.query_session
            )
            metadata = response.json()
        finally:
            if response is not None:
                response.close()
        if len(metadata) == 0:
            raise mc.CadcException(
                f'Could not find JSON record for {file_id} at '
                f'archive.gemini.edu.'
            )
        om.add(metadata, file_id)
    logging.debug(f'End get_obs_metadata for {file_id}')
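# Usage sketch: repeated calls for the same (hypothetical) file id hit
# the in-memory cache instead of querying archive.gemini.edu again.
def _demo_get_obs_metadata():
    get_obs_metadata('N20200210S0077')  # queries archive.gemini.edu
    get_obs_metadata('N20200210S0077')  # only resets the cached index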
def test_query_endpoint_session():
    session_mock = Mock()
    test_result = mc.query_endpoint_session(
        'https://localhost', session_mock, timeout=25
    )
    assert test_result is not None, 'expected result'
    assert session_mock.get.called, 'mock not called'
    session_mock.get.assert_called_with('https://localhost', timeout=25)
def list_files_on_page(url, start_time, session):
    """:return a dict, where keys are URLs, and values are timestamps,
    from a specific page listing at NRAO."""
    response = None
    try:
        logging.debug(f'Querying {url}')
        response = mc.query_endpoint_session(url, session)
        if response is None:
            raise mc.CadcException(f'Could not query {url}')
        else:
            result = _parse_specific_file_list_page(
                response.text, start_time
            )
            response.close()
            return result
    finally:
        if response is not None:
            response.close()
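# Usage sketch, with a hypothetical NRAO listing URL built from QL_URL:
# collect the files modified since start_time on a single page.
def _demo_list_files_on_page():
    session = mc.get_endpoint_session()
    start_time = make_date_time('01Jan2020 00:00')
    files = list_files_on_page(
        f'{QL_URL}VLASS1.1/QA_REJECTED/', start_time, session
    )
    for file_url, timestamp in files.items():
        logging.info(f'{file_url} modified at {timestamp}')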
def retrieve_json(source_name, logger, session):
    # source name is a file id, because that's the only part that's
    # required to be unique for a retrieval from archive.gemini.edu
    logger.debug(f'Begin retrieve_json for {source_name}')
    gemini_url = f'{GEMINI_METADATA_URL}{source_name}'
    # Open the URL and fetch the JSON document for the observation
    response = None
    try:
        response = mc.query_endpoint_session(gemini_url, session)
        metadata = response.json()
    finally:
        if response is not None:
            response.close()
    if len(metadata) == 0:
        raise mc.CadcException(
            f'Could not find JSON record for {source_name} at '
            f'{gemini_url}.'
        )
    logger.debug('End retrieve_json')
    return metadata
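# Usage sketch with a hypothetical Gemini file id; assumes the
# GEMINI_METADATA_URL and mc.get_endpoint_session() conventions already
# used in this module.
def _demo_retrieve_json():
    logger = logging.getLogger('retrieve_json_demo')
    session = mc.get_endpoint_session()
    metadata = retrieve_json('N20200210S0077', logger, session)
    logger.info(f'Retrieved {len(metadata)} JSON records')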
def get_time_box_work(self, prev_exec_time, exec_time):
    """
    :param prev_exec_time float timestamp start of the time-boxed chunk
    :param exec_time float timestamp end of the time-boxed chunk
    :return: a deque of StateRunnerMeta entries pairing file names with
        the time their associated JSON (DB) records were modified at
        archive.gemini.edu.
    """
    self._logger.debug(
        f'Begin get_time_box_work from {prev_exec_time} to {exec_time}.'
    )
    # datetime format 2019-12-01T00:00:00.000000
    prev_dt_str = mc.make_time_tz(prev_exec_time).strftime(
        mc.ISO_8601_FORMAT
    )
    exec_dt_str = mc.make_time_tz(exec_time).strftime(mc.ISO_8601_FORMAT)
    url = (
        f'https://archive.gemini.edu/jsonsummary/canonical/'
        f'NotFail/notengineering/'
        f'entrytimedaterange={prev_dt_str}%20{exec_dt_str}/'
        f'?orderby=entrytime'
    )
    # needs to be ordered by timestamps when processed
    self._logger.info(f'Querying {url}')
    entries = deque()
    response = None
    try:
        response = mc.query_endpoint_session(url, self._session)
        if response is None:
            self._logger.warning(f'Could not query {url}.')
        else:
            metadata = response.json()
            response.close()
            if metadata is not None:
                if len(metadata) == 0:
                    self._logger.warning(
                        f'No query results returned for interval from '
                        f'{prev_exec_time} to {exec_time}.'
                    )
                else:
                    for entry in metadata:
                        file_name = entry.get('name')
                        entrytime = mc.make_time_tz(
                            entry.get('entrytime')
                        )
                        entries.append(
                            dsc.StateRunnerMeta(
                                file_name, entrytime.timestamp()
                            )
                        )
                        uri = mc.build_uri(COLLECTION, file_name, SCHEME)
                        # everywhere else add_json_record is called the
                        # input is a list, so conform to that typing here
                        self._metadata_reader.add_json_record(uri, [entry])
                        self._metadata_reader.add_file_info_record(uri)
    finally:
        if response is not None:
            response.close()
    # a result set of exactly 10000 records is treated as having hit the
    # server's response cap, i.e. the time box returned truncated results
    if len(entries) == 10000:
        self._max_records_encountered = True
        self._encounter_start = prev_exec_time
        self._encounter_end = exec_time
    self._logger.debug('End get_time_box_work.')
    return entries
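# Worked example of the query URL construction above, with hypothetical
# timestamps; assumes mc.ISO_8601_FORMAT is '%Y-%m-%dT%H:%M:%S.%f',
# consistent with the sample format in the comment above.
def _demo_time_box_url():
    from datetime import datetime
    prev_dt_str = datetime(2019, 12, 1).strftime('%Y-%m-%dT%H:%M:%S.%f')
    exec_dt_str = datetime(2019, 12, 2).strftime('%Y-%m-%dT%H:%M:%S.%f')
    # prev_dt_str == '2019-12-01T00:00:00.000000'
    return (
        f'https://archive.gemini.edu/jsonsummary/canonical/'
        f'NotFail/notengineering/'
        f'entrytimedaterange={prev_dt_str}%20{exec_dt_str}/'
        f'?orderby=entrytime'
    )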
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so
    that day is not today."""
    metadata = {}
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        config = mc.Config()
        config.get_executors()
        logging.warning('Initializing from /weblog. This may take a while.')
        state = mc.State(config.state_fqn)
        init_web_log(state)
    latest_key = None
    max_ts = None
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            dt_bits = '_'.join(
                ii for ii in key.replace('/', '').split('_')[3:]
            )
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None:
                max_ts = dt_tz
                latest_key = key
            else:
                if max_ts < dt_tz:
                    max_ts = dt_tz
                    latest_key = key
    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                soup = BeautifulSoup(response.text, features='lxml')
                response.close()
                pipeline_bit = soup.find(string=re.compile('pipeline-'))
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = (
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    )
                    logging.debug(f'Querying {pipeline_url}')
                    response = mc.query_endpoint_session(
                        pipeline_url, session
                    )
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(
                            f'Setting reference to {pipeline_url}'
                        )
                        response.close()
        finally:
            if response is not None:
                response.close()
    return metadata