예제 #1
0
def _common_init():
    """Assemble the collaborators shared by the pipeline entry points.

    :return: tuple of (clients, config, metadata_reader, meta_visitors,
        name_builder)
    """
    config = mc.Config()
    config.get_executors()
    clients = GemClientCollection(config)
    # one session each for the Gemini archive and the SVO filter service
    gemini_session = mc.get_endpoint_session()
    svofps_session = mc.get_endpoint_session()
    provenance_finder = gemini_metadata.ProvenanceFinder(
        config, clients.query_client, gemini_session)
    filter_cache = svofps.FilterMetadataCache(svofps_session)
    clients.gemini_session = gemini_session
    clients.svo_session = svofps_session
    # pick the metadata source to match how the pipeline is being run
    if config.use_local_files or mc.TaskType.SCRAPE in config.task_types:
        metadata_reader = gemini_metadata.GeminiFileMetadataReader(
            gemini_session, provenance_finder, filter_cache)
        # the local/scrape case uses a reduced visitor list
        meta_visitors = [
            fits2caom2_augmentation,
            preview_augmentation,
            cleanup_augmentation,
        ]
    elif config.task_types == [mc.TaskType.VISIT]:
        metadata_reader = gemini_metadata.GeminiStorageClientReader(
            clients.data_client,
            gemini_session,
            provenance_finder,
            filter_cache,
        )
        meta_visitors = META_VISITORS
    else:
        metadata_reader = gemini_metadata.GeminiMetadataReader(
            gemini_session, provenance_finder, filter_cache)
        meta_visitors = META_VISITORS
    reader_lookup = gemini_metadata.GeminiMetadataLookup(metadata_reader)
    reader_lookup.reader = metadata_reader
    name_builder = builder.GemObsIDBuilder(
        config, metadata_reader, reader_lookup)
    return clients, config, metadata_reader, meta_visitors, name_builder
예제 #2
0
def query_top_page():
    """Query the timestamp from the top page, for reporting.

    :return: the most recent timestamp found on the page, or None when the
        page cannot be retrieved or yields no entries.
    """
    start_date = make_date_time('01Jan2017 12:00')
    response = None
    max_date = None

    try:
        # get the last modified date on the quicklook images listing
        session = mc.get_endpoint_session()
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page(response.text, start_date)
            for name, ts in epochs.items():
                logging.info(f'{name} {make_date_time(ts)}')
                # running maximum, seeded by the first value seen
                max_date = ts if max_date is None else max(max_date, ts)
    finally:
        if response is not None:
            response.close()

    return max_date
예제 #3
0
def build_todo(start_date):
    """Take the list of good files, and the list of rejected files,
    and make them into one todo list.

    :param start_date: only records newer than this are collected
    :return a dict, where keys are timestamps, and values are lists
       of URLs, plus the timestamp to use as the next starting point.
    """
    logging.debug(f'Begin build_todo with date {start_date}')
    session = mc.get_endpoint_session()
    good, good_date = build_good_todo(start_date, session)
    logging.info(f'{len(good)} good records to process. Check for rejected.')
    rejected, rejected_date = build_qa_rejected_todo(start_date, session)
    logging.info(f'{len(rejected)} rejected records to process, date will be '
                 f'{rejected_date}')
    result = collections.OrderedDict()
    # a single sort of the combined items is sufficient; the previous
    # per-dict inner sorts were redundant work discarded by the outer sort
    for k, v in sorted(list(good.items()) + list(rejected.items())):
        # merge URL lists for timestamps that occur in both inputs,
        # de-duplicating each incoming list
        temp = result.setdefault(k, [])
        result[k] = temp + list(set(v))

    if good_date != start_date and rejected_date != start_date:
        # return the min of the two, because a date from the good list
        # has not necessarily been encountered on the rejected list, and
        # vice-versa
        return_date = min(good_date, rejected_date)
    else:
        return_date = max(good_date, rejected_date)
    logging.debug(
        f'End build_todo with {len(result)} records, date {return_date}')
    return result, return_date
예제 #4
0
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so
    that day is not today."""
    result = {}
    # rewrite the id separators to match the weblog key naming
    search_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        raise mc.CadcException('Must initialize weblog content.')
    newest_key = None
    newest_ts = None
    socorro_tz = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for entry in web_log_content.keys():
        if not entry.startswith(search_id):
            continue
        # the run timestamp is everything after the third '_', with the
        # trailing '/' removed
        stamp = '_'.join(entry.replace('/', '').split('_')[3:])
        entry_ts = make_date_time(stamp).replace(tzinfo=socorro_tz)
        if newest_ts is None or newest_ts < entry_ts:
            newest_ts = entry_ts
            newest_key = entry

    session = mc.get_endpoint_session()
    if newest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{newest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                # locate the pipeline sub-directory link on the run page
                pipeline_bit = _parse_for_reference(response.text, 'pipeline-')
                response.close()
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = (
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    )
                    logging.debug(f'Querying {pipeline_url}')
                    response = mc.query_endpoint_session(pipeline_url, session)
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        result = _parse_single_field(response.text)
                        result['reference'] = pipeline_url
                        logging.debug(f'Setting reference to {pipeline_url}')
                    response.close()
        finally:
            # Response.close is idempotent, so a second close is harmless
            if response is not None:
                response.close()
    return result
예제 #5
0
 def __init__(self, config=None):
     """Set up defaults, optionally overriding them from *config*.

     :param config: optional Config; when supplied, the local-files,
         connectivity, and collection settings are read from it.
     """
     super(CachingObsFileRelationship, self).__init__()
     if config is None:
         self._use_local_files = False
         self._is_connected = True
         self._collection = 'GEMINI'
     else:
         self._use_local_files = config.use_local_files
         self._is_connected = config.is_connected
         self._collection = config.collection
     # use accessor methods for _tap_client, because of how this class
     # will eventually be used - as a global, accessible by all and
     # everywhere, and initialized before there's a config
     self._tap_client = None
     self._session = mc.get_endpoint_session()
     self._logger = logging.getLogger(__name__)
예제 #6
0
    def _refresh(self):
        """Re-query the NRAO QL pages for QA REJECTED observation IDs,
        record them, then advance the refresh bookmark."""
        if self._refresh_bookmark is not None:
            start_date = self._refresh_bookmark
        else:
            start_date = datetime(
                year=2017, month=1, day=1, hour=0, tzinfo=self._tz)
        session = mc.get_endpoint_session()
        rejected, _ignore_max_date = scrape.build_qa_rejected_todo(
            start_date, session
        )

        # the todo timestamps are irrelevant here; only the URLs matter
        for urls in rejected.values():
            for url in urls:
                # there are trailing slashes on the NRAO VLASS QL page
                obs_id = sn.VlassName.get_obs_id_from_file_name(
                    url.split('/')[-2])
                self._logger.debug(f'Add QA REJECTED {obs_id}.')
                self._qa_rejected_obs_ids.append(obs_id)
        self._refresh_bookmark = self._new_bookmark
예제 #7
0
def build_url_list(start_date):
    """
    Differs from build_file_url_list in that it loads the page and checks the
    timestamps for the file individually, as opposed to trusting that the
    source location directory timestamps are representative.

    :param start_date: ignore files older than this
    :return a dict, where keys are URLs, and values are timestamps
    """
    result = {}
    session = mc.get_endpoint_session()
    todo_list, _ = build_good_todo(start_date, session)
    rejected, _ = build_qa_rejected_todo(start_date, session)
    # iterating an empty dict is a no-op, so no emptiness guard is needed;
    # the todo keys (timestamps) are unused here, only the URL lists matter
    for coll in (todo_list, rejected):
        for urls in coll.values():
            for url in urls:
                # f_name -> timestamp, for each file listed on the page
                listing = list_files_on_page(url, start_date, session)
                for f_name, f_timestamp in listing.items():
                    result[f'{url}{f_name}'] = f_timestamp.timestamp()
    return result
예제 #8
0
                response = mc.query_endpoint_session(program_url,
                                                     gemini_session)
                xml_metadata = response.text
            finally:
                if response:
                    response.close()
            metadata = None
            soup = BeautifulSoup(xml_metadata, 'lxml')
            tds = soup.find_all('td')
            if len(tds) > 0:
                # sometimes the program id points to an html page with an
                # empty table, see e.g. N20200210S0077_bias
                title = None
                if len(tds[1].contents) > 0:
                    title = tds[1].contents[0].replace('\n', ' ')
                pi_name = None
                if len(tds[3].contents) > 0:
                    pi_name = tds[3].contents[0]
                metadata = {
                    'title': title,
                    'pi_name': pi_name,
                }
                pm[program_id] = metadata
        logging.debug('End get_pi_metadata')
    return metadata


# module-level HTTP session shared by the Gemini queries in this file;
# NOTE: created as an import-time side effect
gemini_session = mc.get_endpoint_session()
# lazy initialization for program metadata from Gemini; keyed by program id
pm = {}
예제 #9
0
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so
    that day is not today.

    Finds the most recent weblog processing run for obs_id, then scrapes
    that run's pipeline page.

    :param obs_id: observation identifier to look up
    :return: dict of scraped metadata, with a 'reference' key holding the
        pipeline page URL on success; empty dict when nothing is found
    """
    metadata = {}
    # rewrite the id separators to match the weblog key naming
    # (NOTE(review): inferred from the startswith match below - confirm)
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        # populate the module-level weblog cache on first use, driven by
        # the persisted pipeline state
        config = mc.Config()
        config.get_executors()
        logging.warning('Initializing from /weblog. This may take a while.')
        state = mc.State(config.state_fqn)
        init_web_log(state)
    latest_key = None
    max_ts = None
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            # the run timestamp is everything after the third '_' in the
            # key, with the trailing '/' removed
            dt_bits = '_'.join(
                ii for ii in key.replace('/', '').split('_')[3:]
            )
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None:
                max_ts = dt_tz
                latest_key = key
            else:
                if max_ts < dt_tz:
                    max_ts = dt_tz
                    latest_key = key

    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                # locate the 'pipeline-*' link text on the run page
                soup = BeautifulSoup(response.text, features='lxml')
                response.close()
                pipeline_bit = soup.find(string=re.compile('pipeline-'))
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = (
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    )
                    logging.debug(f'Querying {pipeline_url}')
                    # the same 'response' name is reused for the second
                    # request so the finally block closes whichever one
                    # is live
                    response = mc.query_endpoint_session(pipeline_url, session)
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(f'Setting reference to {pipeline_url}')
                    response.close()
        finally:
            # Response.close is idempotent, so a second close is harmless
            if response is not None:
                response.close()
    return metadata