def _common_init():
    """Build the shared runtime objects used by the pipeline entry points.

    :return: a tuple of (clients, config, metadata_reader, meta_visitors,
        name_builder), where the reader and visitor list depend on whether
        the config says to work from local files / SCRAPE, VISIT-only, or
        the default remote case.
    """
    config = mc.Config()
    config.get_executors()
    clients = GemClientCollection(config)

    # two independent HTTP sessions: one for Gemini queries, one for SVO
    gemini_session = mc.get_endpoint_session()
    svofps_session = mc.get_endpoint_session()
    clients.gemini_session = gemini_session
    clients.svo_session = svofps_session

    provenance_finder = gemini_metadata.ProvenanceFinder(
        config, clients.query_client, gemini_session
    )
    filter_cache = svofps.FilterMetadataCache(svofps_session)

    visitors = META_VISITORS
    if config.use_local_files or mc.TaskType.SCRAPE in config.task_types:
        # local/scrape work reads headers from files on disk
        reader = gemini_metadata.GeminiFileMetadataReader(
            gemini_session, provenance_finder, filter_cache
        )
        visitors = [
            fits2caom2_augmentation,
            preview_augmentation,
            cleanup_augmentation,
        ]
    elif config.task_types == [mc.TaskType.VISIT]:
        # VISIT-only work pulls headers through the storage client
        reader = gemini_metadata.GeminiStorageClientReader(
            clients.data_client,
            gemini_session,
            provenance_finder,
            filter_cache,
        )
    else:
        reader = gemini_metadata.GeminiMetadataReader(
            gemini_session, provenance_finder, filter_cache
        )

    lookup = gemini_metadata.GeminiMetadataLookup(reader)
    lookup.reader = reader
    name_builder = builder.GemObsIDBuilder(config, reader, lookup)
    return clients, config, reader, visitors, name_builder
def query_top_page():
    """Query the timestamp from the top page, for reporting.

    :return: the most recent epoch timestamp found on the quicklook
        listing, or None when the page could not be queried.
    """
    start_date = make_date_time('01Jan2017 12:00')
    max_date = None
    response = None
    try:
        # get the last modified date on the quicklook images listing
        session = mc.get_endpoint_session()
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page(response.text, start_date)
            for key, value in epochs.items():
                logging.info(f'{key} {make_date_time(value)}')
                max_date = (
                    value if max_date is None else max(max_date, value)
                )
    finally:
        if response is not None:
            response.close()
    return max_date
def build_todo(start_date):
    """Take the list of good files, and the list of rejected files, and
    make them into one todo list.

    :return a dict, where keys are timestamps, and values are lists of
        URLs, plus the date to use as the next starting point.
    """
    logging.debug(f'Begin build_todo with date {start_date}')
    session = mc.get_endpoint_session()
    good, good_date = build_good_todo(start_date, session)
    logging.info(f'{len(good)} good records to process. Check for rejected.')
    rejected, rejected_date = build_qa_rejected_todo(start_date, session)
    logging.info(f'{len(rejected)} rejected records to process, date will be '
                 f'{rejected_date}')

    # merge the two lists, de-duplicating URLs that share a timestamp
    result = collections.OrderedDict()
    merged = sorted(sorted(good.items()) + sorted(rejected.items()))
    for key, urls in merged:
        existing = result.setdefault(key, [])
        result[key] = existing + list(set(urls))

    both_advanced = good_date != start_date and rejected_date != start_date
    if both_advanced:
        # return the min of the two, because a date from the good list
        # has not necessarily been encountered on the rejected list, and
        # vice-versa
        return_date = min(good_date, rejected_date)
    else:
        return_date = max(good_date, rejected_date)
    logging.debug(
        f'End build_todo with {len(result)} records, date {return_date}')
    return result, return_date
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so that
    day is not today.

    :param obs_id: observation identifier to look up in web_log_content
    :return: dict of metadata parsed from the pipeline page, with a
        'reference' entry naming the page queried; empty dict when nothing
        was found
    :raises mc.CadcException: when web_log_content has not been initialized
    """
    metadata = {}
    # net effect: only the second '.' in the obs id becomes '_' (the first
    # two dots become underscores, then the first underscore is turned
    # back into a dot) - this matches the web_log_content key format
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        raise mc.CadcException('Must initialize weblog content.')
    latest_key = None
    max_ts = None
    # weblog timestamps are interpreted in the observatory's local zone
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            # the timestamp is everything after the third '_' in the key
            dt_bits = '_'.join(ii for ii in key.replace('/', '').split('_')[3:])
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None:
                max_ts = dt_tz
                latest_key = key
            else:
                if max_ts < dt_tz:
                    max_ts = dt_tz
                    latest_key = key
    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                # look for the link to the pipeline-specific sub-page
                pipeline_bit = _parse_for_reference(response.text, 'pipeline-')
                response.close()
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = \
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    logging.debug(f'Querying {pipeline_url}')
                    # response is re-bound here, so the finally block
                    # releases whichever response is current
                    response = mc.query_endpoint_session(pipeline_url, session)
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(f'Setting reference to {pipeline_url}')
                        response.close()
        finally:
            # may close an already-closed response - presumably harmless
            # for the session type in use; TODO confirm
            if response is not None:
                response.close()
    return metadata
def __init__(self, config=None):
    """Initialize defaults, then let an optional config override them.

    :param config: optional configuration carrying use_local_files,
        is_connected, and collection values.
    """
    super(CachingObsFileRelationship, self).__init__()
    if config is None:
        self._use_local_files = False
        self._is_connected = True
        self._collection = 'GEMINI'
    else:
        self._use_local_files = config.use_local_files
        self._is_connected = config.is_connected
        self._collection = config.collection
    # use accessor methods for _tap_client, because of how this class
    # will eventually be used - as a global, accessible by all and
    # everywhere, and initialized before there's a config
    self._tap_client = None
    self._session = mc.get_endpoint_session()
    self._logger = logging.getLogger(__name__)
def _refresh(self):
    """Rebuild the QA REJECTED obs id list from the rejected todo pages,
    starting at the refresh bookmark (or 2017-01-01 when unset), then
    advance the bookmark."""
    if self._refresh_bookmark is None:
        start_date = datetime(
            year=2017, month=1, day=1, hour=0, tzinfo=self._tz
        )
    else:
        start_date = self._refresh_bookmark
    session = mc.get_endpoint_session()
    todo_list, ignore_max_date = scrape.build_qa_rejected_todo(
        start_date, session
    )
    for timestamp, url_list in todo_list.items():
        for url in url_list:
            # there are trailing slashes on the NRAO VLASS QL page
            obs_id = sn.VlassName.get_obs_id_from_file_name(
                url.split('/')[-2]
            )
            self._logger.debug(f'Add QA REJECTED {obs_id}.')
            self._qa_rejected_obs_ids.append(obs_id)
    self._refresh_bookmark = self._new_bookmark
def build_url_list(start_date):
    """
    Differs from build_file_url_list in that it loads the page and checks
    the timestamps for the file individually, as opposed to trusting that
    the source location directory timestamps are representative.

    :return a dict, where keys are URLs, and values are timestamps
    """
    result = {}
    session = mc.get_endpoint_session()
    good_list, _ = build_good_todo(start_date, session)
    rejected_list, _ = build_qa_rejected_todo(start_date, session)
    for todo in (good_list, rejected_list):
        for timestamp, urls in todo.items():
            for url in urls:
                # keys are file names, values are datetimes
                listing = list_files_on_page(url, start_date, session)
                for f_name, dt in listing.items():
                    result[f'{url}{f_name}'] = dt.timestamp()
    return result
response = mc.query_endpoint_session(program_url, gemini_session) xml_metadata = response.text finally: if response: response.close() metadata = None soup = BeautifulSoup(xml_metadata, 'lxml') tds = soup.find_all('td') if len(tds) > 0: # sometimes the program id points to an html page with an # empty table, see e.g. N20200210S0077_bias title = None if len(tds[1].contents) > 0: title = tds[1].contents[0].replace('\n', ' ') pi_name = None if len(tds[3].contents) > 0: pi_name = tds[3].contents[0] metadata = { 'title': title, 'pi_name': pi_name, } pm[program_id] = metadata logging.debug('End get_pi_metadata') return metadata gemini_session = mc.get_endpoint_session() # lazy initialization for program metadata from Gemini pm = {}
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so that
    day is not today.

    :param obs_id: observation identifier to look up in web_log_content
    :return: dict of metadata parsed from the pipeline page, with a
        'reference' entry naming the page queried; empty dict when nothing
        was found
    """
    metadata = {}
    # net effect: only the second '.' in the obs id becomes '_' (the first
    # two dots become underscores, then the first underscore is turned
    # back into a dot) - this matches the web_log_content key format
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        # lazily populate the weblog cache from persisted state - this
        # walks /weblog, hence the warning about the wait
        config = mc.Config()
        config.get_executors()
        logging.warning('Initializing from /weblog. This may take a while.')
        state = mc.State(config.state_fqn)
        init_web_log(state)
    latest_key = None
    max_ts = None
    # weblog timestamps are interpreted in the observatory's local zone
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            # the timestamp is everything after the third '_' in the key
            dt_bits = '_'.join(
                ii for ii in key.replace('/', '').split('_')[3:]
            )
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None:
                max_ts = dt_tz
                latest_key = key
            else:
                if max_ts < dt_tz:
                    max_ts = dt_tz
                    latest_key = key
    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                soup = BeautifulSoup(response.text, features='lxml')
                response.close()
                # look for the link to the pipeline-specific sub-page
                pipeline_bit = soup.find(string=re.compile('pipeline-'))
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = (
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    )
                    logging.debug(f'Querying {pipeline_url}')
                    # response is re-bound here, so the finally block
                    # releases whichever response is current
                    response = mc.query_endpoint_session(pipeline_url, session)
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(f'Setting reference to {pipeline_url}')
                        response.close()
        finally:
            # may close an already-closed response - presumably harmless
            # for the session type in use; TODO confirm
            if response is not None:
                response.close()
    return metadata