コード例 #1
0
def build_good_todo(start_date, session):
    """Create the list of work, based on timestamps from the NRAO
    Quicklook page.

    Walks the three-level page hierarchy under QL_URL (epochs -> tiles ->
    observations), collecting observation URLs and tracking the latest
    observation date seen.

    :param start_date: datetime lower bound, passed to the tile/id page
        parsers and used as the floor for the returned max date.
    :param session: requests-style session handed to
        mc.query_endpoint_session.
    :return: a tuple (dict, datetime); the dict keys are timestamps
        (float seconds), and values are lists of URLs; the datetime is
        the maximum observation date encountered (at least start_date).
    """
    temp = {}
    max_date = start_date

    response = None

    try:
        # get the last modified date on the quicklook images listing
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page_no_date(response.text)
            response.close()

            for epoch in epochs:
                epoch_url = f'{QL_URL}{epoch}'
                logging.info(f'Checking epoch {epoch} on date {epochs[epoch]}')
                response = mc.query_endpoint_session(epoch_url, session)
                if response is None:
                    logging.warning(f'Could not query epoch {epoch_url}')
                else:
                    tiles = _parse_tile_page(response.text, start_date)
                    response.close()

                    # get the list of tiles
                    for tile in tiles:
                        logging.info(
                            f'Checking tile {tile} with date {tiles[tile]}')
                        tile_url = f'{epoch_url}{tile}'
                        response = mc.query_endpoint_session(tile_url, session)
                        if response is None:
                            logging.warning(f'Could not query {tile_url}')
                        else:
                            observations = _parse_id_page(
                                response.text, start_date)
                            response.close()

                            # for each tile, get the list of observations
                            for observation in observations:
                                obs_url = f'{tile_url}{observation}'
                                dt_as_s = observations[observation].timestamp()
                                max_date = max(max_date,
                                               observations[observation])
                                # group observation URLs by timestamp
                                temp.setdefault(dt_as_s, []).append(obs_url)
    finally:
        # a query may raise part-way through the traversal; make sure
        # the most recent response is not leaked
        if response is not None:
            response.close()
    return temp, max_date
コード例 #2
0
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so
    that day is not today.

    :param obs_id: observation identifier; separators are rearranged to
        match the weblog directory naming convention.
    :return: dict of metadata parsed from the pipeline index page, with a
        'reference' key naming that page's URL; empty dict if no weblog
        entry or pipeline page was found.
    :raises mc.CadcException: if the weblog content cache is empty.
    """
    metadata = {}
    # weblog directory names use '_'/'.' separators in a different order
    # than the obs id does
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        raise mc.CadcException('Must initialize weblog content.')
    latest_key = None
    max_ts = None
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            # the trailing '_'-separated fields of the key encode the
            # processing run timestamp
            dt_bits = '_'.join(ii
                               for ii in key.replace('/', '').split('_')[3:])
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None or max_ts < dt_tz:
                max_ts = dt_tz
                latest_key = key

    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                pipeline_bit = _parse_for_reference(response.text, 'pipeline-')
                response.close()
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = \
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    logging.debug(f'Querying {pipeline_url}')
                    response = mc.query_endpoint_session(pipeline_url, session)
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(f'Setting reference to {pipeline_url}')
                    response.close()
        finally:
            if response is not None:
                response.close()
    return metadata
コード例 #3
0
def build_qa_rejected_todo(start_date, session):
    """Collect QA_REJECTED observation URLs for every epoch under QL_URL.

    :param start_date: datetime lower bound, passed to the rejected-page
        parser and used as the floor for the returned max date.
    :param session: requests-style session handed to
        mc.query_endpoint_session.
    :return: a tuple (dict, datetime); the dict keys are timestamps, and
        values are lists of URLs; the datetime is the maximum rejected
        date encountered (at least start_date).
    """
    rejected = {}
    max_date = start_date

    response = None
    try:
        # get the last modified date on the quicklook images listing
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page_no_date(response.text)
            response.close()

            for epoch in epochs:
                epoch_name = epoch.split('/')[-2]
                epoch_rejected_url = f'{QL_URL}{epoch}QA_REJECTED/'
                logging.info(
                    f'Checking epoch {epoch_name} on date {epochs[epoch]}'
                )
                try:
                    response = mc.query_endpoint_session(
                        epoch_rejected_url, session
                    )
                    if response is None:
                        logging.warning(
                            f'Could not query epoch {epoch_rejected_url}'
                        )
                    else:
                        temp, rejected_max = _parse_rejected_page(
                            response.text,
                            epoch_name,
                            start_date,
                            epoch_rejected_url,
                        )
                        max_date = max(max_date, rejected_max)
                        response.close()
                        # merge; entries already in rejected win on key
                        # collision (same precedence as the original
                        # {**temp, **rejected} via the temp alias)
                        rejected = {**temp, **rejected}
                except mc.CadcException as e:
                    # not every epoch has a QA_REJECTED directory; a 404
                    # is expected, not an error
                    if 'Not Found for url' in str(e):
                        logging.info(
                            f'No QA_REJECTED directory for '
                            f'{epoch_name}. Continuing.'
                        )
                    else:
                        # bare raise preserves the original traceback
                        raise
    finally:
        if response is not None:
            response.close()
    return rejected, max_date
コード例 #4
0
def query_top_page():
    """Query the timestamp from the top page, for reporting.

    :return: the most recent epoch date found on the QL_URL top page,
        or None if the page could not be queried or had no entries
        newer than the fixed start date.
    """
    start_date = make_date_time('01Jan2017 12:00')
    response = None
    max_date = None

    try:
        # the top-level quicklook listing carries a modification date
        # per epoch entry
        session = mc.get_endpoint_session()
        response = mc.query_endpoint_session(QL_URL, session)
        if response is None:
            logging.warning(f'Could not query {QL_URL}')
        else:
            epochs = _parse_top_page(response.text, start_date)
            for epoch_name, epoch_date in epochs.items():
                logging.info(f'{epoch_name} {make_date_time(epoch_date)}')
                max_date = (
                    epoch_date if max_date is None
                    else max(max_date, epoch_date)
                )
    finally:
        if response is not None:
            response.close()

    return max_date
コード例 #5
0
def get_pi_metadata(program_id):
    """Look up the title and PI name for a Gemini program id, caching
    the result in the module-level pm dict.

    :param program_id: Gemini program identifier, appended to the
        archive.gemini.edu programinfo URL.
    :return: dict with 'title' and 'pi_name' keys (values may be None),
        or None when the page contains no table cells.
    """
    global pm
    if program_id in pm:
        metadata = pm[program_id]
    else:
        program_url = f'https://archive.gemini.edu/programinfo/{program_id}'
        # Open the URL and fetch the HTML document for the program
        response = None
        try:
            response = mc.query_endpoint_session(
                program_url, gofr.query_session
            )
            html_metadata = response.text
        finally:
            # NOTE: 'if response:' would be False for a 4xx/5xx
            # requests.Response and leak the connection; test identity
            if response is not None:
                response.close()
        metadata = None
        soup = BeautifulSoup(html_metadata, 'lxml')
        tds = soup.find_all('td')
        if len(tds) > 0:
            # sometimes the program id points to an html page with an empty
            # table, see e.g. N20200210S0077_bias
            title = None
            if len(tds[1].contents) > 0:
                title = tds[1].contents[0].replace('\n', ' ')
            pi_name = None
            if len(tds[3].contents) > 0:
                pi_name = tds[3].contents[0]
            metadata = {'title': title,
                        'pi_name': pi_name}
            pm[program_id] = metadata
        # message previously named the wrong function
        logging.debug('End get_pi_metadata')
    return metadata
コード例 #6
0
def get_obs_metadata(file_id):
    """
    Download the Gemini observation metadata for the given file id and
    cache it in the module-level om store.

    The record is stored via om.add (or the existing cache entry's index
    is reset); nothing is returned.

    :param file_id: The file ID
    :raises mc.CadcException: if archive.gemini.edu returns an empty
        JSON record.
    """
    logging.debug(f'Begin get_obs_metadata for {file_id}')
    global om
    if om.contains(file_id):
        # already cached; just point the cache at this entry
        om.reset_index(file_id)
    else:
        gemini_url = f'{GEMINI_METADATA_URL}{file_id}'

        # Open the URL and fetch the JSON document for the observation
        response = None
        try:
            response = mc.query_endpoint_session(
                gemini_url, gofr.query_session
            )
            metadata = response.json()
        finally:
            if response is not None:
                response.close()
        if len(metadata) == 0:
            raise mc.CadcException(f'Could not find JSON record for {file_id} '
                                   f'at archive.gemini.edu.')
        om.add(metadata, file_id)
    logging.debug(f'End get_obs_metadata for {file_id}')
コード例 #7
0
def test_query_endpoint_session():
    """mc.query_endpoint_session must delegate to session.get with the
    URL and timeout it was given, and return the session's result."""
    mock_session = Mock()
    query_result = mc.query_endpoint_session(
        'https://localhost', mock_session, timeout=25
    )
    assert query_result is not None, 'expected result'
    assert mock_session.get.called, 'mock not called'
    mock_session.get.assert_called_with('https://localhost', timeout=25)
コード例 #8
0
def list_files_on_page(url, start_time, session):
    """:return a dict, where keys are URLS, and values are timestamps, from
    a specific page listing at NRAO.

    :param url: the NRAO listing page to query.
    :param start_time: lower bound passed to the page parser.
    :param session: requests-style session handed to
        mc.query_endpoint_session.
    :raises mc.CadcException: if the page could not be queried.
    """
    response = None
    try:
        logging.debug(f'Querying {url}')
        response = mc.query_endpoint_session(url, session)
        # guard clause; the finally block is the single close point (the
        # original also closed inside the else branch, redundantly)
        if response is None:
            raise mc.CadcException(f'Could not query {url}')
        return _parse_specific_file_list_page(response.text, start_time)
    finally:
        if response is not None:
            response.close()
コード例 #9
0
ファイル: gemini_metadata.py プロジェクト: opencadc/gem2caom2
def retrieve_json(source_name, logger, session):
    """Fetch the JSON summary record for a file id from
    archive.gemini.edu.

    :param source_name: a file id, because that's the only part that's
        required to be unique for a retrieval from archive.gemini.edu
    :param logger: logger instance used for debug tracing.
    :param session: requests-style session handed to
        mc.query_endpoint_session.
    :return: the decoded JSON record.
    :raises mc.CadcException: if the record is empty.
    """
    logger.debug(f'Begin retrieve_json for {source_name}')
    gemini_url = f'{GEMINI_METADATA_URL}{source_name}'
    # Open the URL and fetch the JSON document for the observation
    response = None
    try:
        response = mc.query_endpoint_session(gemini_url, session)
        metadata = response.json()
    finally:
        if response is not None:
            response.close()
    if len(metadata) == 0:
        raise mc.CadcException(
            f'Could not find JSON record for {source_name} at '
            f'{gemini_url}.'
        )
    # message previously said '_retrieve_json'; keep it in sync with the
    # actual function name
    logger.debug(f'End retrieve_json for {source_name}')
    return metadata
コード例 #10
0
    def get_time_box_work(self, prev_exec_time, exec_time):
        """
        :param prev_exec_time float timestamp start of the time-boxed chunk
        :param exec_time float timestamp end of the time-boxed chunk
        :return: a deque of file names with time their associated JSON (DB)
            records were modified from archive.gemini.edu.
        """

        self._logger.debug(
            f'Begin get_time_box_work from {prev_exec_time} to {exec_time}.'
        )
        # datetime format 2019-12-01T00:00:00.000000
        prev_dt_str = mc.make_time_tz(prev_exec_time).strftime(
            mc.ISO_8601_FORMAT
        )
        exec_dt_str = mc.make_time_tz(exec_time).strftime(mc.ISO_8601_FORMAT)
        url = (
            f'https://archive.gemini.edu/jsonsummary/canonical/'
            f'NotFail/notengineering/'
            f'entrytimedaterange={prev_dt_str}%20{exec_dt_str}/'
            f'?orderby=entrytime'
        )

        # needs to be ordered by timestamps when processed
        self._logger.info(f'Querying {url}')
        entries = deque()
        response = None
        try:
            response = mc.query_endpoint_session(url, self._session)
            if response is None:
                # use the instance logger, consistent with every other
                # log call in this method (was module-level logging)
                self._logger.warning(f'Could not query {url}.')
            else:
                metadata = response.json()
                response.close()
                if metadata is not None:
                    if len(metadata) == 0:
                        self._logger.warning(
                            f'No query results returned for interval from '
                            f'{prev_exec_time} to {exec_time}.'
                        )
                    else:
                        for entry in metadata:
                            file_name = entry.get('name')
                            entrytime = mc.make_time_tz(entry.get('entrytime'))
                            entries.append(
                                dsc.StateRunnerMeta(
                                    file_name, entrytime.timestamp()
                                )
                            )
                            uri = mc.build_uri(COLLECTION, file_name, SCHEME)
                            # all the other cases where add_json_record is
                            # called, there's a list as input, so conform to
                            # that typing here
                            self._metadata_reader.add_json_record(uri, [entry])
                            self._metadata_reader.add_file_info_record(uri)
        finally:
            if response is not None:
                response.close()
        # presumably 10000 is the server's page cap, so a full page means
        # the interval may be truncated — TODO confirm against the
        # archive.gemini.edu query limits
        if len(entries) == 10000:
            self._max_records_encountered = True
            self._encounter_start = prev_exec_time
            self._encounter_end = exec_time
        self._logger.debug('End get_time_box_work.')
        return entries
コード例 #11
0
def retrieve_obs_metadata(obs_id):
    """Maybe someday this can be done with astroquery, but the VLASS
    metadata isn't in the database that astroquery.Nrao points to, so
    that day is not today.

    Lazily initializes the module-level web_log_content cache from
    /weblog when it is empty, then finds the most recent processing run
    for obs_id and scrapes metadata from its pipeline index page.

    :param obs_id: observation identifier; separators are rearranged to
        match the weblog directory naming convention.
    :return: dict of metadata parsed from the pipeline index page, with a
        'reference' key naming that page's URL; empty dict if no weblog
        entry or pipeline page was found.
    """
    metadata = {}
    # weblog directory names use '_'/'.' separators in a different order
    # than the obs id does
    mod_obs_id = obs_id.replace('.', '_', 2).replace('_', '.', 1)
    global web_log_content
    if len(web_log_content) == 0:
        config = mc.Config()
        config.get_executors()
        logging.warning('Initializing from /weblog. This may take a while.')
        state = mc.State(config.state_fqn)
        init_web_log(state)
    latest_key = None
    max_ts = None
    tz_info = tz.gettz('US/Socorro')
    # there may be multiple processing runs for a single obs id, use the
    # most recent
    for key in web_log_content.keys():
        if key.startswith(mod_obs_id):
            # the trailing '_'-separated fields of the key encode the
            # processing run timestamp
            dt_bits = '_'.join(
                ii for ii in key.replace('/', '').split('_')[3:]
            )
            dt_tz = make_date_time(dt_bits).replace(tzinfo=tz_info)
            if max_ts is None or max_ts < dt_tz:
                max_ts = dt_tz
                latest_key = key

    session = mc.get_endpoint_session()
    if latest_key is not None:
        obs_url = f'{QL_WEB_LOG_URL}{latest_key}'
        logging.debug(f'Querying {obs_url}')
        response = None
        try:
            response = mc.query_endpoint_session(obs_url, session)
            if response is None:
                logging.error(f'Could not query {obs_url}')
            else:
                soup = BeautifulSoup(response.text, features='lxml')
                response.close()
                pipeline_bit = soup.find(string=re.compile('pipeline-'))
                if pipeline_bit is None:
                    logging.error(f'Did not find pipeline on {obs_url}')
                else:
                    pipeline_url = (
                        f'{obs_url}{pipeline_bit.strip()}html/index.html'
                    )
                    logging.debug(f'Querying {pipeline_url}')
                    response = mc.query_endpoint_session(pipeline_url, session)
                    if response is None:
                        logging.error(f'Could not query {pipeline_url}')
                    else:
                        metadata = _parse_single_field(response.text)
                        metadata['reference'] = pipeline_url
                        logging.debug(f'Setting reference to {pipeline_url}')
                    response.close()
        finally:
            if response is not None:
                response.close()
    return metadata