Example #1
def test_extract_result_metadata_aggregate_spatial_delayed_vector():
    tracer = DryRunDataTracer()
    cube = tracer.load_collection(
        collection_id="Sentinel2",
        arguments={"temporal_extent": ["2020-02-02", "2020-03-03"]},
    )
    cube = cube.filter_bbox(west=4, south=51, east=5, north=52)
    geometries = DelayedVector(
        str(get_test_data_file("multipolygon01.geojson")))
    cube = cube.aggregate_spatial(geometries=geometries, reducer="mean")

    metadata = extract_result_metadata(tracer)
    expected = {
        "bbox": (5.0, 5.0, 45.0, 40.0),
        "geometry": {
            'type':
            'Polygon',
            'coordinates': (((5.0, 5.0), (5.0, 40.0), (45.0, 40.0),
                             (45.0, 5.0), (5.0, 5.0)), ),
        },
        "area": {
            "value": approx(6763173869883.0, 1.0),
            "unit": "square meter"
        },
        "start_datetime": "2020-02-02T00:00:00Z",
        "end_datetime": "2020-03-03T00:00:00Z",
        "links": []
    }
    assert metadata == expected
Example #2
def test_dry_run_data_tracer():
    tracer = DryRunDataTracer()
    source = DataSource.load_collection("S2")
    trace = DataTrace(parent=source, operation="ndvi", arguments={})
    res = tracer.add_trace(trace)
    assert res is trace
    assert tracer.get_trace_leaves() == [trace]
Example #3
def test_extract_result_metadata():
    tracer = DryRunDataTracer()
    cube = tracer.load_collection(
        collection_id="Sentinel2",
        arguments={"temporal_extent": ["2020-02-02", "2020-03-03"]},
    )
    cube = cube.filter_bbox(west=4, south=51, east=5, north=52)

    metadata = extract_result_metadata(tracer)
    expected = {
        "bbox": [4, 51, 5, 52],
        "geometry": {
            "type":
            "Polygon",
            "coordinates": (((4.0, 51.0), (4.0, 52.0), (5.0, 52.0),
                             (5.0, 51.0), (4.0, 51.0)), )
        },
        "area": {
            "value": approx(7725459381.443416, 0.01),
            "unit": "square meter"
        },
        "start_datetime": "2020-02-02T00:00:00Z",
        "end_datetime": "2020-03-03T00:00:00Z",
        "links": []
    }
    assert metadata == expected
Example #4
def test_tracer_load_collection():
    tracer = DryRunDataTracer()
    arguments = {
        "temporal_extent": ("2020-01-01", "2020-02-02"),
        "spatial_extent": {"west": 1, "south": 51, "east": 2, "north": 52},
        "bands": ["red", "blue"],
    }
    cube = tracer.load_collection("S2", arguments)
    traces = tracer.get_trace_leaves()
    assert [t.describe() for t in traces] == [
        "load_collection<-temporal_extent<-spatial_extent<-bands"
    ]
Example #5
def test_dry_run_data_tracer_process_traces():
    tracer = DryRunDataTracer()
    source = DataSource.load_collection("S2")
    trace1 = DataTrace(parent=source, operation="ndvi", arguments={})
    tracer.add_trace(trace1)
    trace2 = DataTrace(parent=source, operation="evi", arguments={})
    tracer.add_trace(trace2)
    assert tracer.get_trace_leaves() == [trace1, trace2]
    traces = tracer.process_traces([trace1, trace2], operation="filter_bbox", arguments={"bbox": "mol"})
    assert tracer.get_trace_leaves() == traces
    assert set(t.describe() for t in traces) == {
        "load_collection<-ndvi<-filter_bbox",
        "load_collection<-evi<-filter_bbox",
    }
Example #6
def dry_run_tracer() -> DryRunDataTracer:
    return DryRunDataTracer()
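In the project's test suite this helper is presumably registered as a pytest fixture (the @pytest.fixture decorator is not shown above). The following is a minimal sketch, under that assumption, of a test consuming the fixture; it reuses the calls from Example #2, and the test name is hypothetical.

import pytest


@pytest.fixture
def dry_run_tracer() -> DryRunDataTracer:
    return DryRunDataTracer()


def test_add_trace_with_fixture(dry_run_tracer):
    # Same flow as Example #2, but the tracer is injected by the fixture.
    source = DataSource.load_collection("S2")
    trace = DataTrace(parent=source, operation="ndvi", arguments={})
    assert dry_run_tracer.add_trace(trace) is trace
    assert dry_run_tracer.get_trace_leaves() == [trace]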
Example #7
def extract_result_metadata(tracer: DryRunDataTracer) -> dict:
    logger.info("Extracting result metadata from {t!r}".format(t=tracer))

    rfc3339 = Rfc3339(propagate_none=True)

    source_constraints = tracer.get_source_constraints()

    # Take union of extents
    temporal_extent = temporal_extent_union(*[
        sc["temporal_extent"] for _, sc in source_constraints
        if "temporal_extent" in sc
    ])
    extents = [
        sc["spatial_extent"] for _, sc in source_constraints
        if "spatial_extent" in sc
    ]
    if extents:
        spatial_extent = spatial_extent_union(*extents)
        bbox = [spatial_extent[b] for b in ["west", "south", "east", "north"]]
        if all(b is not None for b in bbox):
            polygon = Polygon.from_bounds(*bbox)
            geometry = mapping(polygon)
            area = area_in_square_meters(polygon, spatial_extent["crs"])
        else:
            bbox = None
            geometry = None
            area = None
    else:
        bbox = None
        geometry = None
        area = None

    start_date, end_date = [rfc3339.datetime(d) for d in temporal_extent]

    # TODO: consider "filter_spatial" geometries too?
    aggregate_spatial_geometries = tracer.get_geometries()
    if aggregate_spatial_geometries:
        if len(aggregate_spatial_geometries) > 1:
            logger.warning("Multiple aggregate_spatial geometries: {c}".format(
                c=len(aggregate_spatial_geometries)))
        agg_geometry = aggregate_spatial_geometries[0]
        if isinstance(agg_geometry, BaseGeometry):
            bbox = agg_geometry.bounds
            geometry = mapping(agg_geometry)
            area = area_in_square_meters(agg_geometry, "EPSG:4326")
        elif isinstance(agg_geometry, DelayedVector):
            bbox = agg_geometry.bounds
            # Intentionally don't return the complete vector file. https://github.com/Open-EO/openeo-api/issues/339
            geometry = mapping(Polygon.from_bounds(*bbox))
            area = agg_geometry.area
        else:
            logger.warning("Unsupported geometry to calculate area: " +
                           str(agg_geometry))

    links = tracer.get_metadata_links()
    links = [link for k, v in links.items() for link in v]

    # TODO: dedicated type?
    # TODO: match STAC format?
    return {
        'geometry': geometry,
        'bbox': bbox,
        'area': {
            'value': area,
            'unit': 'square meter'
        } if area else None,
        'start_datetime': start_date,
        'end_datetime': end_date,
        'links': links
    }
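Example #3 above shows the exact shape of the dict this function returns. As a minimal sketch of downstream use (an assumption, not taken from the backend code), the dict is plain JSON-serializable data and can be written directly to a metadata file; the file name below is hypothetical.

import json

tracer = DryRunDataTracer()
cube = tracer.load_collection(
    collection_id="Sentinel2",
    arguments={"temporal_extent": ["2020-02-02", "2020-03-03"]},
)
cube = cube.filter_bbox(west=4, south=51, east=5, north=52)

metadata = extract_result_metadata(tracer)
with open("job_metadata.json", "w") as f:
    # Tuples in "geometry"/"bbox" serialize as JSON arrays.
    json.dump(metadata, f, indent=2)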
Example #8
def run_job(job_specification,
            output_file: Path,
            metadata_file: Path,
            api_version,
            job_dir,
            dependencies: dict,
            user_id: str = None):
    logger.info(f"Job spec: {json.dumps(job_specification,indent=1)}")
    process_graph = job_specification['process_graph']

    backend_implementation = GeoPySparkBackendImplementation()
    logger.info(f"Using backend implementation {backend_implementation}")
    correlation_id = str(uuid.uuid4())
    logger.info(f"Correlation id: {correlation_id}")
    env = EvalEnv({
        'version': api_version or "1.0.0",
        'pyramid_levels': 'highest',
        'user': User(user_id=user_id),
        'require_bounds': True,
        'correlation_id': correlation_id,
        'dependencies': dependencies,
        "backend_implementation": backend_implementation,
    })
    tracer = DryRunDataTracer()
    logger.info("Starting process graph evaluation")
    result = ProcessGraphDeserializer.evaluate(process_graph,
                                               env=env,
                                               do_dry_run=tracer)
    logger.info("Evaluated process graph, result (type {t}): {r!r}".format(
        t=type(result), r=result))

    if isinstance(result, DelayedVector):
        geojsons = (mapping(geometry) for geometry in result.geometries)
        result = JSONResult(geojsons)

    if isinstance(result, DriverDataCube):
        format_options = job_specification.get('output', {})
        format_options["batch_mode"] = True
        result = ImageCollectionResult(cube=result,
                                       format='GTiff',
                                       options=format_options)

    if not isinstance(result, SaveResult):  # Assume generic JSON result
        result = JSONResult(result)

    global_metadata_attributes = {
        "title": job_specification.get("title", ""),
        "description": job_specification.get("description", ""),
        "institution": "openEO platform - Geotrellis backend: " + __version__
    }

    assets_metadata = None
    if 'write_assets' in dir(result):
        result.options["batch_mode"] = True
        result.options["file_metadata"] = global_metadata_attributes
        if (result.options.get("sample_by_feature")):
            geoms = tracer.get_geometries("filter_spatial")
            if len(geoms) > 1:
                logger.warning(
                    "Multiple aggregate_spatial geometries: {c}".format(
                        c=len(geoms)))
            elif len(geoms) == 0:
                logger.warning(
                    "sample_by_feature enabled, but no geometries found. They can be specified using filter_spatial."
                )
            else:
                result.options["geometries"] = geoms[0]
            if (result.options["geometries"] == None):
                logger.error(
                    "samply_by_feature was set, but no geometries provided through filter_spatial. Make sure to provide geometries."
                )
        assets_metadata = result.write_assets(str(output_file))
        for name, asset in assets_metadata.items():
            _add_permissions(Path(asset["href"]), stat.S_IWGRP)
        logger.info("wrote image collection to %s" % output_file)

    elif isinstance(result, ImageCollectionResult):
        result.options["batch_mode"] = True
        result.save_result(filename=str(output_file))
        _add_permissions(output_file, stat.S_IWGRP)
        logger.info("wrote image collection to %s" % output_file)
    elif isinstance(result, MultipleFilesResult):
        result.reduce(output_file, delete_originals=True)
        _add_permissions(output_file, stat.S_IWGRP)
        logger.info("reduced %d files to %s" %
                    (len(result.files), output_file))
    elif isinstance(result, NullResult):
        logger.info("skipping output file %s" % output_file)
    else:
        raise NotImplementedError(
            "unsupported result type {r}".format(r=type(result)))

    if any(card4l for _, card4l in dependencies.values()):  # TODO: clean this up
        logger.debug("awaiting Sentinel Hub CARD4L data...")

        s3_service = get_jvm().org.openeo.geotrellissentinelhub.S3Service()

        poll_interval_secs = 10
        max_delay_secs = 600

        card4l_dependencies = [
            (collection_id, source_location)
            for (collection_id, metadata_properties), (source_location, card4l) in dependencies.items()
            if card4l
        ]

        for collection_id, source_location in card4l_dependencies:
            uri_parts = urlparse(source_location)
            bucket_name = uri_parts.hostname
            request_group_id = uri_parts.path[1:]

            try:
                # FIXME: incorporate collection_id and metadata_properties to make sure the files don't clash
                s3_service.download_stac_data(bucket_name, request_group_id,
                                              str(job_dir), poll_interval_secs,
                                              max_delay_secs)
                logger.info("downloaded CARD4L data in {b}/{g} to {d}".format(
                    b=bucket_name, g=request_group_id, d=job_dir))
            except Py4JJavaError as e:
                java_exception = e.java_exception

                if (java_exception.getClass().getName() ==
                        'org.openeo.geotrellissentinelhub.S3Service$StacMetadataUnavailableException'):
                    logger.warning(
                        "could not find CARD4L metadata to download from s3://{b}/{r} after {d}s"
                        .format(b=bucket_name,
                                r=request_group_id,
                                d=max_delay_secs))
                else:
                    raise e

        _transform_stac_metadata(job_dir)

    unique_process_ids = CollectUniqueProcessIdsVisitor().accept_process_graph(
        process_graph).process_ids

    _export_result_metadata(tracer=tracer,
                            result=result,
                            output_file=output_file,
                            metadata_file=metadata_file,
                            unique_process_ids=unique_process_ids,
                            asset_metadata=assets_metadata)

    if ConfigParams().is_kube_deploy:
        import boto3
        from openeogeotrellis.utils import s3_client

        bucket = os.environ.get('SWIFT_BUCKET')
        s3_instance = s3_client()

        logger.info("Writing results to object storage")
        for file_name in os.listdir(job_dir):
            full_path = os.path.join(str(job_dir), file_name)
            s3_instance.upload_file(full_path, bucket, full_path.strip("/"))