Example #1
def test_timing_logger_context_return():
    logger = _Logger()
    timing_logger = TimingLogger("Testing", logger=logger)
    timing_logger._now = _fake_clock([
        datetime(2019, 12, 12, 10, 10, 10, 0),
        datetime(2019, 12, 12, 11, 12, 13, 0)
    ])

    with timing_logger as timer:
        logger("Hello world")

    assert timing_logger.elapsed.total_seconds() == 3723
    assert timer.elapsed.total_seconds() == 3723
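
The tests in this listing rely on two small helpers that are not shown. A minimal sketch of what they could look like, inferred purely from how the tests use them (the names `_Logger` and `_fake_clock` appear in the tests; the bodies below are assumptions):

class _Logger:
    """Callable test logger that records every message passed to it."""

    def __init__(self):
        self.logs = []

    def __call__(self, message):
        self.logs.append(message)


def _fake_clock(times):
    """Return a zero-argument callable yielding the given datetimes in order."""
    iterator = iter(times)
    return lambda: next(iterator)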
Example #2
def test_timing_logger_custom():
    logger = _Logger()
    timing_logger = TimingLogger("Testing", logger=logger)
    timing_logger._now = _fake_clock([
        datetime(2019, 12, 12, 10, 10, 10, 10000),
        datetime(2019, 12, 12, 11, 12, 13, 14141)
    ])

    with timing_logger:
        logger("Hello world")

    assert logger.logs == [
        "Testing: start 2019-12-12 10:10:10.010000", "Hello world",
        "Testing: end 2019-12-12 11:12:13.014141, elapsed 1:02:03.004141"
    ]
Example #3
def main(argv=None):
    logging.basicConfig(level=logging.INFO)
    process_graph, args = handle_cli(argv)
    _log.info(
        "Evaluating process graph: {pg}".format(pg=safe_repr(process_graph)))

    _setup_local_spark(print=_log.info)

    # Local imports to workaround the pyspark import issues.
    from openeo_driver.ProcessGraphDeserializer import evaluate
    from openeogeotrellis.backend import GeoPySparkBackendImplementation

    env = EvalEnv({
        "version": args.api_version,
        "pyramid_levels": "highest",
        "user": None,  # TODO
        "require_bounds": True,
        "correlation_id": f"cli-pid{os.getpid()}",
        "backend_implementation": GeoPySparkBackendImplementation(use_zookeeper=False),
    })

    with TimingLogger(title="Evaluate process graph", logger=_log):
        result = evaluate(process_graph, env=env)

    if isinstance(result, ImageCollectionResult):
        filename = args.output or f"result.{result.format}"
        with TimingLogger(title=f"Saving result to {filename!r}", logger=_log):
            result.save_result(filename)
    elif isinstance(result, JSONResult):
        if args.output:
            with open(args.output, "w") as f:
                json.dump(result.prepare_for_json(), f)
        else:
            print(result.prepare_for_json())
    elif isinstance(result, dict):
        # TODO: support storing JSON result to file
        print(result)
    else:
        # TODO: support more result types
        raise ValueError(result)
Example #4
def test_timing_logger_fail():
    logger = _Logger()
    timing_logger = TimingLogger("Testing", logger=logger)
    timing_logger._now = _fake_clock([
        datetime(2019, 12, 12, 10, 10, 10, 10000),
        datetime(2019, 12, 12, 11, 12, 13, 14141)
    ])

    try:
        with timing_logger:
            raise ValueError("Hello world")
    except ValueError:
        pass

    assert logger.logs == [
        "Testing: start 2019-12-12 10:10:10.010000",
        "Testing: fail 2019-12-12 11:12:13.014141, elapsed 1:02:03.004141"
    ]
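
Examples #2 and #4 pin down the messages TimingLogger emits: a "start" line on entry, and an "end" or "fail" line with the elapsed timedelta on exit. A minimal sketch of a context manager with that behavior, reconstructed only from the asserted log format (the real openeo TimingLogger also accepts logging.Logger instances and differs in details):

import datetime


class TimingLoggerSketch:
    def __init__(self, title="Timing", logger=print):
        self.title = title
        self.logger = logger
        self._now = datetime.datetime.now  # the tests swap this out with _fake_clock

    def __enter__(self):
        self.start_time = self._now()
        self.logger(f"{self.title}: start {self.start_time}")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        end_time = self._now()
        self.elapsed = end_time - self.start_time
        status = "fail" if exc_type else "end"
        self.logger(f"{self.title}: {status} {end_time}, elapsed {self.elapsed}")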
Example #5
    def _debug_show_rdd_info(self, rdd):
        with TimingLogger(title="Collect RDD info", logger=logger):
            record_count = rdd.count()
            key_ranges = {
                k: rdd.map(lambda f: f["key"][k]).distinct().collect()
                for k in ["col", "row", "instant"]
            }
            paths = rdd.map(lambda f: f["feature"]["id"]).distinct().count()
            logger.info(
                f"RDD info: {record_count} records, {paths} creo paths, key_ranges: {key_ranges}"
            )
Example #6
def test_timing_logger_basic(caplog):
    caplog.set_level(logging.INFO)
    with TimingLogger("Testing"):
        logging.info("Hello world")

    pattern = re.compile(r'''
        .*Testing.+start.+\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}.*\n
        .*Hello\ world.*\n
        .*Testing.*end.+\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}.*elapsed.+[0-9.]+\n
        .*
        ''', re.VERBOSE | re.DOTALL)
    assert pattern.match(caplog.text)
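
The re.VERBOSE pattern above deliberately brackets the expected lines with `.*` so it tolerates whatever prefixes pytest's caplog adds. A standalone check against illustrative caplog-style text (the `INFO root:...` prefixes below are made up, not real pytest output):

import re

# Same pattern as in the test above.
pattern = re.compile(r'''
    .*Testing.+start.+\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}.*\n
    .*Hello\ world.*\n
    .*Testing.*end.+\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}.*elapsed.+[0-9.]+\n
    .*
    ''', re.VERBOSE | re.DOTALL)

sample = (
    "INFO     root:demo.py:2 Testing: start 2019-12-12 10:10:10.010000\n"
    "INFO     root:demo.py:3 Hello world\n"
    "INFO     root:demo.py:4 Testing: end 2019-12-12 11:12:13.014141, elapsed 1:02:03.004141\n"
)
assert pattern.match(sample)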
Example #7
    def _creodias_dem_subset_srtm_hgt_unzip(
        bbox: Tuple,
        bbox_epsg: int,
        srtm_root="/eodata/auxdata/SRTMGL1/dem"
    ) -> tempfile.TemporaryDirectory:
        """
        Create subset of Creodias SRTM hgt files covering the given lon-lat bbox to pass to Orfeo
        obtained from unzipping the necessary .SRTMGL1.hgt.zip files at /eodata/auxdata/SRTMGL1/dem/
        (e.g. N50E003.SRTMGL1.hgt.zip)

        :return: tempfile.TemporaryDirectory to be used as context manager (for automatic cleanup)
        """
        # Get range of lon-lat tiles to cover
        to_lonlat = pyproj.Transformer.from_crs(crs_from=bbox_epsg,
                                                crs_to=4326,
                                                always_xy=True)
        bbox_lonlat = shapely.ops.transform(to_lonlat.transform,
                                            shapely.geometry.box(*bbox)).bounds
        lon_min, lat_min, lon_max, lat_max = [int(b) for b in bbox_lonlat]

        # Unzip to temp dir
        temp_dir = tempfile.TemporaryDirectory(suffix="-openeo-dem-srtm")
        msg = f"Unzip SRTM tiles from {srtm_root} in range lon [{lon_min}:{lon_max}] x lat [{lat_min}:{lat_max}] to {temp_dir}"
        with TimingLogger(title=msg, logger=logger):
            for lon in range(lon_min, lon_max + 1):
                for lat in range(lat_min, lat_max + 1):
                    # Something like: N50E003.SRTMGL1.hgt.zip
                    basename = "{ns}{lat:02d}{ew}{lon:03d}.SRTMGL1.hgt".format(
                        ew="E" if lon >= 0 else "W",
                        lon=abs(lon),
                        ns="N" if lat >= 0 else "S",
                        lat=abs(lat))
                    zip_filename = pathlib.Path(srtm_root) / (basename + '.zip')
                    with zipfile.ZipFile(zip_filename, 'r') as z:
                        logger.info(f"{zip_filename}: {z.infolist()}")
                        z.extractall(temp_dir.name)

        return temp_dir
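
The format string in the loop above maps the integer corner of each 1 degree x 1 degree tile to SRTM's naming scheme, with hemisphere letters and zero-padded degrees. A quick worked example, runnable on its own:

for lon, lat in [(3, 50), (-3, 50), (3, -50)]:
    basename = "{ns}{lat:02d}{ew}{lon:03d}.SRTMGL1.hgt".format(
        ew="E" if lon >= 0 else "W", lon=abs(lon),
        ns="N" if lat >= 0 else "S", lat=abs(lat))
    print((lon, lat), "->", basename)
# (3, 50) -> N50E003.SRTMGL1.hgt
# (-3, 50) -> N50W003.SRTMGL1.hgt
# (3, -50) -> S50E003.SRTMGL1.hgt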
Example #8
    def _creo_scan_for_band_tiffs(creo_path: pathlib.Path,
                                  log_prefix: str) -> Dict[str, pathlib.Path]:
        """
        Scan given creodias path for TIFF files
        :param creo_path: path to product root folder
        :param log_prefix: prefix for logging
        :return: dictionary mapping band name (vv, vh, ...) to tiff path
        """
        with TimingLogger(title=f"{log_prefix} Scan {creo_path}",
                          logger=logger):
            # We expect the desired geotiff files under `creo_path` at location like
            #       measurement/s1a-iw-grd-vh-20200606t063717-20200606t063746-032893-03cf5f-002.tiff
            # TODO Get tiff path from manifest instead of assuming this `measurement` file structure?
            band_regex = re.compile(r"^s1[ab]-iw-grd-([hv]{2})-",
                                    flags=re.IGNORECASE)
            band_tiffs = {}
            for tiff in creo_path.glob("measurement/*.tiff"):
                match = band_regex.match(tiff.name)
                if match:
                    band_tiffs[match.group(1).lower()] = tiff
            if not band_tiffs:
                raise OpenEOApiException("No tiffs found")
            logger.info(f"{log_prefix} Detected band tiffs: {band_tiffs}")
        return band_tiffs
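
A standalone demonstration of the band-name regex above, using the sample filename from the comment in the code (group 1 captures the polarization):

import re

band_regex = re.compile(r"^s1[ab]-iw-grd-([hv]{2})-", flags=re.IGNORECASE)
name = "s1a-iw-grd-vh-20200606t063717-20200606t063746-032893-03cf5f-002.tiff"
match = band_regex.match(name)
print(match.group(1).lower())  # -> vh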
Example #9
        def process_product(product: Tuple[str, List[dict]]):
            import faulthandler
            faulthandler.enable()
            creo_path, features = product

            # Short ad-hoc product id for logging purposes.
            prod_id = re.sub(r"[^A-Z0-9]", "", creo_path.upper())[-10:]
            log_prefix = f"p{os.getpid()}-prod{prod_id}"
            logger.info(f"{log_prefix} creo path {creo_path}")
            logger.info(
                f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}"
            )

            creo_path = pathlib.Path(creo_path)
            if not creo_path.exists():
                raise OpenEOApiException("Creo path does not exist")

            # Get whole extent of tile layout
            col_min = min(f["key"]["col"] for f in features)
            col_max = max(f["key"]["col"] for f in features)
            cols = col_max - col_min + 1
            row_min = min(f["key"]["row"] for f in features)
            row_max = max(f["key"]["row"] for f in features)
            rows = row_max - row_min + 1
            instants = set(f["key"]["instant"] for f in features)
            assert len(instants) == 1, f"Not single instant: {instants}"
            instant = instants.pop()
            logger.info(
                f"{log_prefix} Layout key extent: col[{col_min}:{col_max}] row[{row_min}:{row_max}]"
                f" ({cols}x{rows}={cols * rows} tiles) instant[{instant}].")

            layout_extent = get_total_extent(features)

            key_epsgs = set(f["key_epsg"] for f in features)
            assert len(key_epsgs) == 1, f"Multiple key CRSs {key_epsgs}"
            layout_epsg = key_epsgs.pop()
            layout_width_px = tile_size * (col_max - col_min + 1)
            layout_height_px = tile_size * (row_max - row_min + 1)
            logger.info(
                f"{log_prefix} Layout extent {layout_extent} EPSG {layout_epsg}:"
                f" {layout_width_px}x{layout_height_px}px")

            band_tiffs = S1BackscatterOrfeo._creo_scan_for_band_tiffs(
                creo_path, log_prefix)

            dem_dir_context = S1BackscatterOrfeo._get_dem_dir_context(
                sar_backscatter_arguments=sar_backscatter_arguments,
                extent=layout_extent,
                epsg=layout_epsg)

            msg = f"{log_prefix} Process {creo_path} "
            with TimingLogger(title=msg,
                              logger=logger), dem_dir_context as dem_dir:
                # Allocate numpy array tile
                orfeo_bands = numpy.zeros(
                    (len(bands), layout_height_px, layout_width_px),
                    dtype=result_dtype)

                for b, band in enumerate(bands):
                    if band.lower() not in band_tiffs:
                        raise OpenEOApiException(f"No tiff for band {band}")
                    data, nodata = S1BackscatterOrfeoV2._orfeo_pipeline(
                        input_tiff=band_tiffs[band.lower()],
                        extent=layout_extent,
                        extent_epsg=layout_epsg,
                        dem_dir=dem_dir,
                        extent_width_px=layout_width_px,
                        extent_height_px=layout_height_px,
                        sar_calibration_lut=sar_calibration_lut,
                        noise_removal=noise_removal,
                        elev_geoid=elev_geoid,
                        elev_default=elev_default,
                        log_prefix=f"{log_prefix}-{band}",
                        orfeo_memory=orfeo_memory)
                    orfeo_bands[b] = data

                if sar_backscatter_arguments.options.get("to_db", False):
                    # TODO: keep this "to_db" shortcut feature or drop it
                    #       and require user to use standard openEO functionality (`apply` based conversion)?
                    logger.info(
                        f"{log_prefix} Converting backscatter intensity to decibel"
                    )
                    orfeo_bands = 10 * numpy.log10(orfeo_bands)

                # Split orfeo output in tiles
                logger.info(
                    f"{log_prefix} Split {orfeo_bands.shape} in tiles of {tile_size}"
                )
                cell_type = geopyspark.CellType(orfeo_bands.dtype.name)
                tiles = []
                for c in range(col_max - col_min + 1):
                    for r in range(row_max - row_min + 1):
                        col = col_min + c
                        row = row_min + r
                        key = geopyspark.SpaceTimeKey(
                            col=col,
                            row=row,
                            instant=_instant_ms_to_day(instant))
                        tile = orfeo_bands[:,
                                           r * tile_size:(r + 1) * tile_size,
                                           c * tile_size:(c + 1) * tile_size]
                        if not (tile == nodata).all():
                            logger.info(
                                f"{log_prefix} Create Tile for key {key} from {tile.shape}"
                            )
                            tile = geopyspark.Tile(tile,
                                                   cell_type,
                                                   no_data_value=nodata)
                            tiles.append((key, tile))

            logger.info(
                f"{log_prefix} Layout extent split in {len(tiles)} tiles")
            return tiles
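
The nested col/row loops near the end of this example slice the full (bands, height, width) mosaic into tile_size x tile_size chunks with the band dimension intact. A minimal numpy sketch of just that arithmetic (sizes are illustrative):

import numpy

tile_size, cols, rows, n_bands = 4, 3, 2, 2
mosaic = numpy.arange(n_bands * rows * tile_size * cols * tile_size).reshape(
    (n_bands, rows * tile_size, cols * tile_size))

tiles = {}
for c in range(cols):
    for r in range(rows):
        tiles[(c, r)] = mosaic[:,
                               r * tile_size:(r + 1) * tile_size,
                               c * tile_size:(c + 1) * tile_size]

assert all(t.shape == (n_bands, tile_size, tile_size) for t in tiles.values())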
Example #10
        def process_feature(product: Tuple[str, List[dict]]):
            import faulthandler
            faulthandler.enable()
            creo_path, features = product

            prod_id = re.sub(r"[^A-Z0-9]", "", creo_path.upper())[-10:]
            log_prefix = f"p{os.getpid()}-prod{prod_id}"
            print(f"{log_prefix} creo path {creo_path}")
            logger.info(
                f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}"
            )

            layout_extent = get_total_extent(features)
            key_epsgs = set(f["key_epsg"] for f in features)
            assert len(key_epsgs) == 1, f"Multiple key CRSs {key_epsgs}"
            layout_epsg = key_epsgs.pop()

            dem_dir_context = S1BackscatterOrfeo._get_dem_dir_context(
                sar_backscatter_arguments=sar_backscatter_arguments,
                extent=layout_extent,
                epsg=layout_epsg)

            creo_path = pathlib.Path(creo_path)

            band_tiffs = S1BackscatterOrfeo._creo_scan_for_band_tiffs(
                creo_path, log_prefix)

            resultlist = []

            with dem_dir_context as dem_dir:

                for feature in features:
                    col, row, instant = (feature["key"][k]
                                         for k in ["col", "row", "instant"])

                    key_ext = feature["key_extent"]
                    key_epsg = layout_epsg

                    logger.info(
                        f"{log_prefix} Feature creo path: {creo_path}, key {key_ext} (EPSG {key_epsg})"
                    )
                    logger.info(
                        f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}"
                    )
                    if not creo_path.exists():
                        raise OpenEOApiException("Creo path does not exist")

                    msg = f"{log_prefix} Process {creo_path} and load into geopyspark Tile"
                    with TimingLogger(title=msg, logger=logger):
                        # Allocate numpy array tile
                        tile_data = numpy.zeros(
                            (len(bands), tile_size, tile_size),
                            dtype=result_dtype)

                        for b, band in enumerate(bands):
                            if band.lower() not in band_tiffs:
                                raise OpenEOApiException(
                                    f"No tiff for band {band}")
                            data, nodata = S1BackscatterOrfeo._orfeo_pipeline(
                                input_tiff=band_tiffs[band.lower()],
                                extent=key_ext,
                                extent_epsg=key_epsg,
                                dem_dir=dem_dir,
                                extent_width_px=tile_size,
                                extent_height_px=tile_size,
                                sar_calibration_lut=sar_calibration_lut,
                                noise_removal=noise_removal,
                                elev_geoid=elev_geoid,
                                elev_default=elev_default,
                                log_prefix=f"{log_prefix}-{band}")
                            tile_data[b] = data

                        if sar_backscatter_arguments.options.get(
                                "to_db", False):
                            # TODO: keep this "to_db" shortcut feature or drop it
                            #       and require user to use standard openEO functionality (`apply` based conversion)?
                            logger.info(
                                f"{log_prefix} Converting backscatter intensity to decibel"
                            )
                            tile_data = 10 * numpy.log10(tile_data)

                        key = geopyspark.SpaceTimeKey(
                            row=row,
                            col=col,
                            instant=_instant_ms_to_day(instant))
                        cell_type = geopyspark.CellType(tile_data.dtype.name)
                        logger.info(
                            f"{log_prefix} Create Tile for key {key} from {tile_data.shape}"
                        )
                        tile = geopyspark.Tile(tile_data,
                                               cell_type,
                                               no_data_value=nodata)
                        resultlist.append((key, tile))

            return resultlist
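
Both this example and the previous one apply the optional "to_db" conversion as 10 * numpy.log10(x), turning linear backscatter intensity into decibels. A worked example with illustrative values:

import numpy

intensity = numpy.array([1.0, 0.1, 0.01])
print(10 * numpy.log10(intensity))  # -> [  0. -10. -20.]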
Example #11
    def _orfeo_pipeline(input_tiff: pathlib.Path,
                        extent: dict,
                        extent_epsg: int,
                        dem_dir: Union[str, None],
                        extent_width_px: int,
                        extent_height_px: int,
                        sar_calibration_lut: str,
                        noise_removal: bool,
                        elev_geoid: str,
                        elev_default: float = None,
                        log_prefix: str = "",
                        orfeo_memory: int = 512):
        logger.info(f"{log_prefix} Input tiff {input_tiff}")

        utm_zone, utm_northhem = utm_zone_from_epsg(extent_epsg)
        logger.info(
            f"{log_prefix} extent {extent} (UTM {utm_zone}, EPSG {extent_epsg})"
        )

        max_total_memory_in_bytes = os.environ.get('PYTHON_MAX_MEMORY')

        if max_total_memory_in_bytes:
            set_max_memory(int(max_total_memory_in_bytes))

        with TimingLogger(
                title=f"{log_prefix} Orfeo processing pipeline on {input_tiff}",
                logger=logger):

            arr = multiprocessing.Array(ctypes.c_double,
                                        extent_width_px * extent_height_px)

            ortho_rect = S1BackscatterOrfeo.configure_pipeline(
                dem_dir, elev_default, elev_geoid, input_tiff, log_prefix,
                noise_removal, orfeo_memory, sar_calibration_lut, utm_northhem,
                utm_zone)

            def run():

                ortho_rect.SetParameterInt("outputs.sizex", extent_width_px)
                ortho_rect.SetParameterInt("outputs.sizey", extent_height_px)
                ortho_rect.SetParameterInt("outputs.ulx", int(extent["xmin"]))
                ortho_rect.SetParameterInt("outputs.uly", int(extent["ymax"]))

                ortho_rect.Execute()
                # ram = ortho_rect.PropagateRequestedRegion("io.out", myRegion)
                try:
                    localdata = ortho_rect.GetImageAsNumpyArray('io.out')
                    np.copyto(
                        np.frombuffer(arr.get_obj()).reshape(
                            (extent_width_px, extent_height_px)), localdata)
                except RuntimeError:
                    logger.error(
                        f"Error while running Orfeo toolbox. {input_tiff} {extent} EPSG {extent_epsg} {sar_calibration_lut}",
                        exc_info=True)

            p = Process(target=run, args=())
            p.start()
            p.join()
            if p.exitcode == -signal.SIGSEGV:
                logger.error(
                    f"Segmentation fault while running Orfeo toolbox. {input_tiff} {extent} EPSG {extent_epsg} {sar_calibration_lut}"
                )

            data = np.reshape(np.frombuffer(arr.get_obj()),
                              (extent_width_px, extent_height_px))

            logger.info(
                f"{log_prefix} Final orfeo pipeline result: shape {data.shape},"
                f" min {numpy.nanmin(data)}, max {numpy.nanmax(data)}")
            return data, 0
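
The pipeline above runs Orfeo in a child process so that a segfault in the toolbox cannot take down the parent, and hands the raster back through a shared multiprocessing.Array. A minimal, self-contained sketch of that pattern (shapes and the fill value are illustrative; the Orfeo call is replaced by a stand-in):

import ctypes
import multiprocessing

import numpy as np


def run(shared, shape):
    # Stand-in for the Orfeo computation: fill the shared buffer.
    np.copyto(np.frombuffer(shared.get_obj()).reshape(shape), np.full(shape, 42.0))


if __name__ == "__main__":
    height_px, width_px = 3, 4
    arr = multiprocessing.Array(ctypes.c_double, width_px * height_px)
    p = multiprocessing.Process(target=run, args=(arr, (height_px, width_px)))
    p.start()
    p.join()
    data = np.frombuffer(arr.get_obj()).reshape((height_px, width_px))
    print(data)  # a (3, 4) array of 42.0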
Example #12
            elif isinstance(result, ImageCollectionResult):
                result.imagecollection.download(output_file,
                                                bbox="",
                                                time="",
                                                format=result.format,
                                                **result.options)
                logger.info("wrote image collection to %s" % output_file)
            elif isinstance(result, JSONResult):
                with open(output_file, 'w') as f:
                    json.dump(result.prepare_for_json(), f)
                logger.info("wrote JSON result to %s" % output_file)
            elif isinstance(result, MultipleFilesResult):
                result.reduce(output_file, delete_originals=True)
                logger.info("reduced %d files to %s" %
                            (len(result.files), output_file))
            else:
                with open(output_file, 'w') as f:
                    json.dump(result, f)
                logger.info("wrote JSON result to %s" % output_file)
    except Exception as e:
        logger.exception("error processing batch job")
        user_facing_logger.exception("error processing batch job")
        raise e


if __name__ == '__main__':
    _setup_app_logging()

    with TimingLogger("batch_job.py main", logger=logger):
        main(sys.argv)