Example 1
    def get_time_box_work(self, prev_exec_time, exec_time):
        """
        :param prev_exec_time: float timestamp, start of the time-boxed chunk
        :param exec_time: float timestamp, end of the time-boxed chunk
        :return: a deque of StateRunnerMeta entries pairing each file name
            with the time its associated JSON (DB) record was modified on
            archive.gemini.edu.
        """

        self._logger.debug(f'Begin get_time_box_work from {prev_exec_time} to '
                           f'{exec_time}.')
        # datetime format 2019-12-01T00:00:00.000000
        prev_dt_str = mc.make_time_tz(prev_exec_time).strftime(
            mc.ISO_8601_FORMAT)
        exec_dt_str = mc.make_time_tz(exec_time).strftime(mc.ISO_8601_FORMAT)
        url = f'https://archive.gemini.edu/jsonsummary/canonical/' \
              f'NotFail/notengineering/' \
              f'entrytimedaterange={prev_dt_str}%20{exec_dt_str}/' \
              f'?orderby=entrytime'

        # needs to be ordered by timestamps when processed
        self._logger.info(f'Querying {url}')
        entries = deque()
        response = None
        try:
            response = mc.query_endpoint(url)
            if response is None:
                self._logger.warning(f'Could not query {url}.')
            else:
                metadata = response.json()
                if metadata is not None:
                    if len(metadata) == 0:
                        self._logger.warning(f'No query results returned for '
                                             f'interval from {prev_exec_time} '
                                             f'to {exec_time}.')
                    else:
                        for entry in metadata:
                            file_name = entry.get('name')
                            entrytime = mc.make_time_tz(entry.get('entrytime'))
                            entries.append(
                                dsc.StateRunnerMeta(file_name,
                                                    entrytime.timestamp()))
        finally:
            if response is not None:
                response.close()
        # the query endpoint appears to cap results at 10000 records; hitting
        # that count means the time box was too wide and records were likely
        # missed
        if len(entries) == 10000:
            self._max_records_encountered = True
            self._encounter_start = prev_exec_time
            self._encounter_end = exec_time
        self._logger.debug('End get_time_box_work.')
        return entries
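
Every example on this page leans on mc.make_time_tz to normalize float timestamps and strings into timezone-aware datetimes. A minimal sketch of that behaviour, assuming a UTC default; make_time_tz here is a hypothetical stand-in, not the actual mc implementation:

from datetime import datetime, timezone

ISO_8601_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'

def make_time_tz(value):
    # hypothetical stand-in: accept a float epoch timestamp or an
    # ISO-8601 string, return a tz-aware UTC datetime
    if isinstance(value, (int, float)):
        return datetime.fromtimestamp(value, tz=timezone.utc)
    # naive strings are assumed to be UTC
    return datetime.fromisoformat(value).replace(tzinfo=timezone.utc)

# the same formatting step get_time_box_work applies to its inputs
print(make_time_tz(1575158400.0).strftime(ISO_8601_FORMAT))
# 2019-12-01T00:00:00.000000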
Example 2
    def add_file_info_record(self, uri):
        """add_json_record has already been successfully called"""
        if uri not in self._file_info:
            record = self._json_metadata[uri]
            self._file_info[uri] = FileInfo(
                id=uri,
                size=record.get('data_size'),
                name=record.get('filename'),
                md5sum=record.get('data_md5'),
                lastmod=mc.make_time_tz(record.get('lastmod')),
                file_type=data_util.get_file_type(record.get('filename')),
                encoding=data_util.get_file_encoding(record.get('filename')),
            )
            self._logger.debug(f'Adding FileInfo for {uri}')

    def _append_work(self, prev_exec_time, exec_time, entry):
        self._logger.info(f'Search for work in {entry}.')
        targets = self._vault_client.glob(f'{entry}/*')
        for target in targets:
            target_node = self._vault_client.get_node(target)
            target_node_mtime = mc.make_time_tz(target_node.props.get('date'))
            if target_node.isdir() and self._recursive:
                # recurse into directories modified within the time box
                if exec_time >= target_node_mtime >= prev_exec_time:
                    self._append_work(
                        prev_exec_time, exec_time, target_node.uri)
            elif self.default_filter(target_node):
                if exec_time >= target_node_mtime >= prev_exec_time:
                    self._temp[target_node_mtime].append(target_node.uri)
                    self._logger.info(f'Add {target_node.uri} to work list.')
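
The self._temp[target_node_mtime].append(...) call implies a mapping from modification time to URIs that is later flattened in time order. A minimal sketch of that pattern, assuming a defaultdict(list); the flattening step and the URIs are illustrative assumptions:

from collections import defaultdict, deque
from datetime import datetime, timezone

# hypothetical stand-in for self._temp: modification time -> URIs
temp = defaultdict(list)
temp[datetime(2021, 3, 1, tzinfo=timezone.utc)].append('vos:archive/b.fits')
temp[datetime(2021, 2, 1, tzinfo=timezone.utc)].append('vos:archive/a.fits')

# flatten to a work deque ordered by modification time, oldest first
work = deque(uri for mtime in sorted(temp) for uri in temp[mtime])
print(list(work))  # ['vos:archive/a.fits', 'vos:archive/b.fits']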
Example 4
def _run_state():
    """Run incremental processing for observations that are posted on the site
    archive.gemini.edu. TODO: in the future this will depend on the
    incremental query endpoint.

    :return 0 if successful, -1 if there's any sort of failure. Return status
        is used by airflow for task instance management and reporting.
    """
    (
        clients,
        config,
        metadata_reader,
        meta_visitors,
        name_builder,
    ) = _common_init()
    state = mc.State(config.state_fqn)
    end_timestamp_s = state.bookmarks.get(data_source.GEM_BOOKMARK).get(
        'end_timestamp', datetime.now())
    end_timestamp_dt = mc.make_time_tz(end_timestamp_s)
    logging.info(f'{main_app.APPLICATION} will end at {end_timestamp_s}')
    incremental_source = data_source.IncrementalSource(metadata_reader)
    result = rc.run_by_state(
        config=config,
        name_builder=name_builder,
        bookmark_name=data_source.GEM_BOOKMARK,
        meta_visitors=meta_visitors,
        data_visitors=DATA_VISITORS,
        end_time=end_timestamp_dt,
        source=incremental_source,
        clients=clients,
        metadata_reader=metadata_reader,
    )
    if incremental_source.max_records_encountered:
        logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        logging.warning('Encountered maximum records!!')
        logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        result |= -1
    return result
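
The state.bookmarks.get(...).get('end_timestamp', ...) lookup implies a nested mapping persisted at config.state_fqn. A plausible in-memory shape is sketched below; the value assigned to GEM_BOOKMARK and any sibling keys are assumptions, not taken from the source:

from datetime import datetime

GEM_BOOKMARK = 'gemini_timestamp'  # assumed value of data_source.GEM_BOOKMARK

# hypothetical contents of the state file at config.state_fqn
bookmarks = {
    GEM_BOOKMARK: {
        'end_timestamp': '2021-03-04T07:28:10.245340',
    },
}

# the same defaulted lookup the runner performs
end_timestamp_s = bookmarks.get(GEM_BOOKMARK).get(
    'end_timestamp', datetime.now())
print(end_timestamp_s)  # 2021-03-04T07:28:10.245340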
Example 5
def _run_by_incremental():
    """Run incremental processing for observations that are posted on the site
    archive.gemini.edu. TODO: in the future this will depend on the
    incremental query endpoint.

    :return 0 if successful, -1 if there's any sort of failure. Return status
        is used by airflow for task instance management and reporting.
    """
    config = mc.Config()
    config.get_executors()
    state = mc.State(config.state_fqn)
    end_timestamp_s = state.bookmarks.get(data_source.GEM_BOOKMARK).get(
        'end_timestamp', datetime.now())
    end_timestamp_dt = mc.make_time_tz(end_timestamp_s)
    logging.info(f'{main_app.APPLICATION} will end at {end_timestamp_s}')
    external_metadata.init_global(config=config)
    name_builder = nbc.FileNameBuilder(gem_name.GemName)
    incremental_source = data_source.IncrementalSource()
    meta_visitors = _define_meta_visitors(config)
    result = rc.run_by_state(
        config=config,
        name_builder=name_builder,
        command_name=main_app.APPLICATION,
        bookmark_name=data_source.GEM_BOOKMARK,
        meta_visitors=meta_visitors,
        data_visitors=DATA_VISITORS,
        end_time=end_timestamp_dt,
        source=incremental_source,
        chooser=None,
    )
    if incremental_source.max_records_encountered:
        logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        logging.warning('Encountered maximum records!!')
        logging.warning('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        result |= -1
    return result
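
Both runners report failure with result |= -1. Since -1 is all ones in two's complement, OR-ing any integer with it yields -1, so the max-records condition forces a failing status even when run_by_state returned 0:

for result in (0, -1):
    # OR with -1 (all bits set) always produces -1
    print(result | -1)  # -1 in both cases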
Example 6
def _update_from_comment(observation, phangs_name, headers):
    # From ER: 04-03-21
    # COMMENT Produced with PHANGS-ALMA pipeline version 4.0 Build 935
    # - Provenance.version
    # COMMENT Galaxy properties from PHANGS sample table version 1.6
    # COMMENT Calibration Level 4 (ANALYSIS_PRODUCT)
    # - Calibration level (either 3 or 4)
    # COMMENT PHANGS-ALMA Public Release 1
    # - Provenance.project = PHANGS-ALMA
    # COMMENT Generated by the Physics at High Angular resolution
    # COMMENT in nearby GalaxieS (PHANGS) collaboration
    # - Provenance.organization = PHANGS
    # COMMENT Canonical Reference: Leroy et al. (2021), ApJ, Submitted
    # - Update to reference when accepted
    # COMMENT Release generated at 2021-03-04T07:28:10.245340
    # - Provenance.lastExecuted
    # COMMENT Data from ALMA Proposal ID: 2017.1.00886.L
    # - Proposal.proposalID
    # COMMENT ALMA Proposal PI: Schinnerer, Eva
    # - Proposal.pi_name
    # COMMENT Observed in MJD interval [58077.386275,58081.464121]
    # COMMENT Observed in MJD interval [58290.770032,58365.629222]
    # COMMENT Observed in MJD interval [58037.515807,58047.541173]
    # COMMENT Observed in MJD interval [58353.589805,58381.654757]
    # COMMENT Observed in MJD interval [58064.3677,58072.458597]
    # COMMENT Observed in MJD interval [58114.347649,58139.301879]
    chunk = None
    for plane in observation.planes.values():
        if plane.product_id != phangs_name.product_id:
            continue
        if plane.provenance is None:
            plane.provenance = Provenance(name='PHANGS-ALMA pipeline')

        for artifact in plane.artifacts.values():
            if artifact.uri != phangs_name.file_uri:
                continue
            # use the first chunk of the first part for the time-axis bounds
            for part in artifact.parts.values():
                chunk = part.chunks[0]
                break

        for entry in headers[0].get('COMMENT', []):
            if 'pipeline version ' in entry:
                plane.provenance.version = entry.split(' version ')[1]
            elif 'Calibration Level' in entry:
                level = entry.split()[2]
                if level == '4':
                    plane.calibration_level = CalibrationLevel.ANALYSIS_PRODUCT
            elif 'PHANGS-ALMA Public Release' in entry:
                plane.provenance.project = 'PHANGS-ALMA'
            elif 'in nearby GalaxieS (PHANGS) collaboration' in entry:
                plane.provenance.organization = 'PHANGS'
            elif 'Release generated at ' in entry:
                plane.provenance.last_executed = mc.make_time_tz(
                    entry.split(' at ')[1])
            elif 'Data from ALMA Proposal ID:' in entry:
                observation.proposal = Proposal(entry.split(':')[1].strip())
            elif 'Canonical Reference: ' in entry:
                plane.provenance.producer = entry.split(': ')[1]
            elif 'ALMA Proposal PI:' in entry:
                observation.proposal.pi_name = entry.split(': ')[1]
            elif 'Observed in MJD interval ' in entry:
                if chunk is not None:
                    bits = entry.split()[4].split(',')
                    start_ref_coord = RefCoord(
                        0.5, mc.to_float(bits[0].replace('[', '')))
                    end_ref_coord = RefCoord(
                        1.5, mc.to_float(bits[1].replace(']', '')))
                    sample = CoordRange1D(start_ref_coord, end_ref_coord)
                    if chunk.time is None:
                        coord_bounds = CoordBounds1D()
                        axis = CoordAxis1D(axis=Axis('TIME', 'd'))
                        chunk.time = TemporalWCS(axis, timesys='UTC')
                        chunk.time.axis.bounds = coord_bounds
                    chunk.time.axis.bounds.samples.append(sample)
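
The MJD-interval branch packs each COMMENT card into a time-axis bounds sample. A standalone sketch of just the string parsing, using plain tuples in place of the caom2 RefCoord/CoordRange1D types:

comment = 'Observed in MJD interval [58077.386275,58081.464121]'

# token 4 holds '[start,end]'; strip the brackets and split on the comma
bits = comment.split()[4].split(',')
start_mjd = float(bits[0].replace('[', ''))
end_mjd = float(bits[1].replace(']', ''))

# stand-ins for RefCoord(0.5, start) and RefCoord(1.5, end)
sample = ((0.5, start_mjd), (1.5, end_mjd))
print(sample)  # ((0.5, 58077.386275), (1.5, 58081.464121))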