Code Example #1
File: utils.py Project: wri/gfw-data-api
async def create_wm_tile_set_job(
    dataset: str,
    version: str,
    creation_options: RasterTileSetSourceCreationOptions,
    job_name: str,
    parents: Optional[List[Job]] = None,
    use_resampler: bool = False,
) -> Tuple[Job, str]:

    asset_uri = get_asset_uri(
        dataset,
        version,
        AssetType.raster_tile_set,
        creation_options.dict(by_alias=True),
        "epsg:3857",
    )

    # Create an asset record
    asset_options = AssetCreateIn(
        asset_type=AssetType.raster_tile_set,
        asset_uri=asset_uri,
        is_managed=True,
        creation_options=creation_options,
        metadata=RasterTileSetMetadata(),
    ).dict(by_alias=True)
    wm_asset_record = await create_asset(dataset, version, **asset_options)

    logger.debug(f"Created asset for {asset_uri}")

    # TODO: Consider removing the use_resampler argument and changing this
    # to "if creation_options.calc is None:"
    # Make sure to test different scenarios when done!
    if use_resampler:
        job = await create_resample_job(
            dataset,
            version,
            creation_options,
            int(creation_options.grid.strip("zoom_")),
            job_name,
            callback_constructor(wm_asset_record.asset_id),
            parents=parents,
        )
    else:
        job = await create_pixetl_job(
            dataset,
            version,
            creation_options,
            job_name,
            callback_constructor(wm_asset_record.asset_id),
            parents=parents,
        )

    zoom_level = int(creation_options.grid.strip("zoom_"))
    job = scale_batch_job(job, zoom_level)

    return job, asset_uri
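
A note on the zoom-level parsing above: str.strip removes a set of characters from both ends rather than a literal prefix. It happens to work for grid names like "zoom_14" because no digit occurs in "zoom_", as this standalone check illustrates (the grid value is invented):

grid = "zoom_14"
# strip("zoom_") removes any of the characters z, o, m, _ from both ends;
# digits survive, so the zoom level is recovered correctly here.
assert int(grid.strip("zoom_")) == 14
# Python 3.9+ removeprefix() states the intent more directly.
assert int(grid.removeprefix("zoom_")) == 14
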
Code Example #2
async def static_vector_1x1_asset(
    dataset: str,
    version: str,
    asset_id: UUID,
    input_data: Dict[str, Any],
) -> ChangeLog:
    """Create Vector tile cache and NDJSON file as intermediate data."""

    #######################
    # Update asset metadata
    #######################

    creation_options = creation_option_factory(AssetType.grid_1x1,
                                               input_data["creation_options"])

    field_attributes: List[Dict[str, Any]] = await get_field_attributes(
        dataset, version, creation_options)

    grid_1x1_uri = get_asset_uri(dataset, version, AssetType.grid_1x1)

    await assets.update_asset(
        asset_id,
        fields=field_attributes,
    )

    ############################
    # Define jobs
    ############################

    # Build the 1x1 grid export command
    command: List[str] = [
        "export_1x1_grid.sh",
        "-d",
        dataset,
        "-v",
        version,
        "-C",
        ",".join([field["field_name"] for field in field_attributes]),
        "-T",
        grid_1x1_uri,
    ]

    export_1x1_grid = PostgresqlClientJob(
        dataset=dataset,
        job_name="export_1x1_grid",
        job_queue=DATA_LAKE_JOB_QUEUE,
        command=command,
        memory=9000,
        environment=reader_secrets,
        callback=callback_constructor(asset_id),
    )

    #######################
    # Execute jobs
    #######################

    log: ChangeLog = await execute([export_1x1_grid])

    return log
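
For reference, a sketch of the command list this produces, with hypothetical dataset, version, field names, and target URI (none of these values come from the project):

field_attributes = [{"field_name": "gfw_fid"}, {"field_name": "iso"}]
command = [
    "export_1x1_grid.sh",
    "-d", "my_dataset",
    "-v", "v1",
    "-C", ",".join(field["field_name"] for field in field_attributes),
    "-T", "s3://my-bucket/my_dataset/v1/vector/grid_1x1.tsv",
]
# The -C argument becomes a comma-separated column list.
assert command[6] == "gfw_fid,iso"
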
Code Example #3
File: test_batch.py Project: wri/gfw-data-api
async def test_batch_failure():
    dataset = "test"
    version = "v1.1.1"
    creation_options = {
        "source_type": "vector",
        "source_uri": [f"s3://{BUCKET}/{GEOJSON_NAME}"],
        "source_driver": "GeoJSON",
        "zipped": False,
    }

    async with ContextEngine("WRITE"):
        await datasets.create_dataset(dataset)
        await versions.create_version(dataset, version)
        new_asset = await assets.create_asset(
            dataset,
            version,
            asset_type="Database table",
            asset_uri="s3://path/to/file",
            creation_options=creation_options,
        )

    job_env = writer_secrets + [
        {"name": "STATUS_URL", "value": f"http://app_test:{PORT}/tasks"}
    ]
    callback = callback_constructor(new_asset.asset_id)

    # Can't have two parents with the same name

    job1 = PostgresqlClientJob(
        dataset=dataset,
        job_name="job1",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
    )
    job2 = PostgresqlClientJob(
        dataset=dataset,
        job_name="job1",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
    )

    job3 = PostgresqlClientJob(
        dataset=dataset,
        job_name="job3",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
        parents=[job1.job_name, job2.job_name],
    )
    message = ""

    try:
        await execute([job1, job2, job3])
    except TooManyRetriesError as e:
        message = str(e)

    assert message == ""
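
The comment about duplicate parent names suggests a uniqueness constraint on job names within one execution. A minimal standalone guard of that kind might look like this (illustrative only, not the project's actual validation logic):

from collections import namedtuple

MockJob = namedtuple("MockJob", ["job_name"])

def assert_unique_job_names(jobs):
    """Raise if any two jobs share a name (hypothetical guard)."""
    names = [job.job_name for job in jobs]
    duplicates = sorted({name for name in names if names.count(name) > 1})
    if duplicates:
        raise ValueError(f"Duplicate job names: {duplicates}")

try:
    assert_unique_job_names([MockJob("job1"), MockJob("job1"), MockJob("job3")])
except ValueError as e:
    assert "job1" in str(e)
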
Code Example #4
async def raster_tile_set_asset(
    dataset: str,
    version: str,
    asset_id: UUID,
    input_data: Dict[str, Any],
) -> ChangeLog:

    # If this asset is being created as a source (default) asset,
    # creation_options["source_uri"] will be a list. When it is being created
    # as an auxiliary asset, it will be None; in that case we generate a
    # source URI for pixETL below, based on the default asset.

    co = deepcopy(input_data["creation_options"])

    source_uris: Optional[List[str]] = co.get("source_uri")
    if source_uris is None:
        default_asset: ORMAsset = await get_default_asset(dataset, version)

        if default_asset.creation_options["source_type"] == RasterSourceType.raster:
            co["source_type"] = RasterSourceType.raster
            co["source_uri"] = [
                tile_uri_to_tiles_geojson(default_asset.asset_uri)
            ]
            co["source_driver"] = RasterDrivers.geotiff
            auxiliary_assets = co.pop("auxiliary_assets", None)
            if auxiliary_assets:
                for aux_asset_id in auxiliary_assets:
                    auxiliary_asset: ORMAsset = await get_asset(aux_asset_id)
                    co["source_uri"].append(
                        tile_uri_to_tiles_geojson(auxiliary_asset.asset_uri))

        elif default_asset.creation_options["source_type"] == VectorSourceType.vector:
            co["source_type"] = VectorSourceType.vector

    creation_options = PixETLCreationOptions(**co)

    callback: Callback = callback_constructor(asset_id)

    create_raster_tile_set_job: Job = await create_pixetl_job(
        dataset, version, creation_options, "create_raster_tile_set", callback)

    log: ChangeLog = await execute([create_raster_tile_set_job])

    return log
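
The behavior of tile_uri_to_tiles_geojson is not shown in this snippet. A plausible reading, and only an assumption here, is that it swaps the per-tile filename at the end of the URI for a tiles.geojson index that pixETL can consume (the URI below is invented for illustration):

# Assumption: tile_uri_to_tiles_geojson points pixETL at a tiles.geojson
# index next to the individual tiles; this stand-in mimics that idea.
tile_uri = "s3://my-bucket/my_dataset/v1/raster/epsg-4326/90/27008/percent/geotiff/{tile_id}.tif"
prefix, _, _ = tile_uri.rpartition("/")
tiles_geojson = f"{prefix}/tiles.geojson"
assert tiles_geojson.endswith("/percent/geotiff/tiles.geojson")
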
Code Example #5
async def test_batch_scheduler(batch_client, httpd):

    _, logs = batch_client
    httpd_port = httpd.server_port

    ############################
    # Setup test
    ############################

    job_env = writer_secrets + [{
        "name": "STATUS_URL",
        "value": f"http://app_test:{httpd_port}/tasks"
    }]

    batch.POLL_WAIT_TIME = 1

    dataset = "test"
    version = "v1.1.1"
    input_data = {
        "source_type": "vector",
        "source_uri": [f"s3://{BUCKET}/{GEOJSON_NAME}"],
        "creation_options": {
            "src_driver": "GeoJSON",
            "zipped": False
        },
        "metadata": {},
    }

    new_asset = await create_asset(dataset, version, "Database table",
                                   "s3://path/to/file", input_data)
    callback = callback_constructor(new_asset.asset_id)
    ############################
    # Test if mocking batch jobs using the different environments works
    ############################

    job1 = PostgresqlClientJob(
        job_name="job1",
        command=[
            "test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"
        ],
        environment=job_env,
        callback=callback,
    )
    job2 = GdalPythonImportJob(
        job_name="job2",
        command=[
            "test_mock_s3_ogr2ogr.sh",
            "-d",
            "test",
            "-v",
            "v1.0.0",
            "-s",
            f"s3://{BUCKET}/{GEOJSON_NAME}",
            "-l",
            "test",
            "-f",
            GEOJSON_NAME,
        ],
        environment=job_env,
        parents=[job1.job_name],
        callback=callback,
    )
    job3 = GdalPythonExportJob(
        job_name="job3",
        command=[
            "test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"
        ],
        environment=job_env,
        parents=[job2.job_name],
        callback=callback,
    )
    job4 = TileCacheJob(
        job_name="job4",
        command=[
            "test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"
        ],
        environment=job_env,
        parents=[job3.job_name],
        callback=callback,
    )

    log = await batch.execute([job1, job2, job3, job4])
    assert log.status == "pending"

    tasks_rows = await tasks.get_tasks(new_asset.asset_id)

    task_ids = [str(task.task_id) for task in tasks_rows]

    # make sure all jobs completed
    status = await poll_jobs(task_ids)
    assert status == "saved"

    check_callbacks(task_ids, httpd.server_port)
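
The parents lists above wire the four jobs into a linear chain (job1 → job2 → job3 → job4). A generic sketch of how such a chain resolves into an execution order (illustrative only, not the project's scheduler):

from collections import deque

def execution_order(jobs):
    """Kahn-style topological sort over (job_name, parents) pairs."""
    children = {name: [] for name, _ in jobs}
    indegree = {name: 0 for name, _ in jobs}
    for name, parents in jobs:
        for parent in parents:
            children[parent].append(name)
            indegree[name] += 1
    ready = deque(name for name, degree in indegree.items() if degree == 0)
    order = []
    while ready:
        name = ready.popleft()
        order.append(name)
        for child in children[name]:
            indegree[child] -= 1
            if indegree[child] == 0:
                ready.append(child)
    return order

assert execution_order([
    ("job1", []),
    ("job2", ["job1"]),
    ("job3", ["job2"]),
    ("job4", ["job3"]),
]) == ["job1", "job2", "job3", "job4"]
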
Code Example #6
File: test_jobs.py Project: wri/gfw-data-api
def test_jobs_model():

    callback = callback_constructor(uuid4())

    job = Job(
        dataset="test",
        job_name="test",
        job_queue="test",
        job_definition="test",
        command=["1"],
        environment=[{
            "name": "TEST",
            "value": "TEST"
        }],
        vcpus=1,
        memory=2,
        attempts=1,
        attempt_duration_seconds=1,
        parents=None,
        callback=callback,
    )

    assert job.environment == [
        {
            "name": "TEST",
            "value": "TEST"
        },
        {
            "name": "CORES",
            "value": "1"
        },
        {
            "name": "MAX_MEM",
            "value": "2"
        },
    ]

    job.vcpus = 45
    assert job.environment == [
        {
            "name": "TEST",
            "value": "TEST"
        },
        {
            "name": "CORES",
            "value": "45"
        },
        {
            "name": "MAX_MEM",
            "value": "2"
        },
    ]

    job.memory = 100
    assert job.environment == [
        {
            "name": "TEST",
            "value": "TEST"
        },
        {
            "name": "CORES",
            "value": "45"
        },
        {
            "name": "MAX_MEM",
            "value": "100"
        },
    ]
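
The assertions show that CORES and MAX_MEM track vcpus and memory on every read. A minimal standalone model reproducing that observable behavior (the project's Job is a Pydantic model; this property-based version is only an illustration):

class MiniJob:
    """Toy stand-in: derives CORES/MAX_MEM from vcpus/memory at access time."""

    def __init__(self, environment, vcpus, memory):
        self._environment = environment
        self.vcpus = vcpus
        self.memory = memory

    @property
    def environment(self):
        return self._environment + [
            {"name": "CORES", "value": str(self.vcpus)},
            {"name": "MAX_MEM", "value": str(self.memory)},
        ]

job = MiniJob([{"name": "TEST", "value": "TEST"}], vcpus=1, memory=2)
job.vcpus = 45
job.memory = 100
assert job.environment[-2:] == [
    {"name": "CORES", "value": "45"},
    {"name": "MAX_MEM", "value": "100"},
]
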
Code Example #7
async def raster_tile_cache_asset(
    dataset: str,
    version: str,
    asset_id: UUID,
    input_data: Dict[str, Any],
) -> ChangeLog:
    """Generate Raster Tile Cache Assets."""

    # TODO: Refactor to be easier to test

    min_zoom = input_data["creation_options"]["min_zoom"]
    max_zoom = input_data["creation_options"]["max_zoom"]
    max_static_zoom = input_data["creation_options"]["max_static_zoom"]
    implementation = input_data["creation_options"]["implementation"]
    symbology = input_data["creation_options"]["symbology"]
    resampling = input_data["creation_options"]["resampling"]

    # source_asset_id is currently required. Could perhaps make it optional
    # in the case that the default asset is the only one.
    source_asset: ORMAsset = await get_asset(
        input_data["creation_options"]["source_asset_id"]
    )

    # Get the creation options from the original raster tile set asset and
    # overwrite settings. Make sure source_type and source_driver are set in
    # case it is an auxiliary asset

    new_source_uri = [
        tile_uri_to_tiles_geojson(
            get_asset_uri(
                dataset,
                version,
                AssetType.raster_tile_set,
                source_asset.creation_options,
            )
        ).replace("/geotiff", "/gdal-geotiff")
    ]

    # The first thing we do for each zoom level is reproject the source asset
    # to web-mercator. We don't want the calc string (if any) used to
    # create the source asset to be applied again to the already transformed
    # data, so set it to None.
    source_asset_co = RasterTileSetSourceCreationOptions(
        # TODO: With python 3.9, we can use the `|` operator here
        #  waiting for https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker/pull/67
        **{
            **source_asset.creation_options,
            **{
                "source_type": RasterSourceType.raster,
                "source_driver": RasterDrivers.geotiff,
                "source_uri": new_source_uri,
                "calc": None,
                "resampling": resampling,
                "compute_stats": False,
                "compute_histogram": False,
                "symbology": Symbology(**symbology),
                "subset": None,
            },
        }
    )

    # If float data type, convert to int in derivative assets for performance
    # FIXME: Make this work for multi-band inputs
    max_zoom_calc = None
    if source_asset_co.data_type == DataType.boolean:
        pass  # np.dtype() would raise on "boolean", so skip the float check below
    elif np.issubdtype(np.dtype(source_asset_co.data_type), np.floating):
        logger.info("Source datatype is float subtype, converting to int")
        source_asset_co, max_zoom_calc = convert_float_to_int(
            source_asset.stats, source_asset_co
        )

    assert source_asset_co.symbology is not None
    symbology_function = symbology_constructor[source_asset_co.symbology.type].function

    # We want to make sure that the final RGB asset is named after the
    # implementation of the tile cache and that the source_asset name is not
    # already used by another intermediate asset.
    # TODO: Actually make sure the intermediate assets aren't going to
    # overwrite any existing assets
    if symbology_function == no_symbology:
        source_asset_co.pixel_meaning = implementation
    else:
        source_asset_co.pixel_meaning = (
            f"{source_asset_co.pixel_meaning}_{implementation}"
        )

    job_list: List[Job] = []
    jobs_dict: Dict[int, Dict[str, Job]] = dict()

    for zoom_level in range(max_zoom, min_zoom - 1, -1):
        jobs_dict[zoom_level] = dict()

        if zoom_level == max_zoom:
            source_reprojection_parent_jobs: List[Job] = []
        else:
            source_reprojection_parent_jobs = [
                jobs_dict[zoom_level + 1]["source_reprojection_job"]
            ]

        (
            source_reprojection_job,
            source_reprojection_uri,
        ) = await reproject_to_web_mercator(
            dataset,
            version,
            source_asset_co,
            zoom_level,
            max_zoom,
            source_reprojection_parent_jobs,
            max_zoom_resampling=PIXETL_DEFAULT_RESAMPLING,
            max_zoom_calc=max_zoom_calc,
            use_resampler=max_zoom_calc is None,
        )
        jobs_dict[zoom_level]["source_reprojection_job"] = source_reprojection_job
        job_list.append(source_reprojection_job)

        symbology_jobs: List[Job]
        symbology_uri: str

        symbology_co = source_asset_co.copy(deep=True)
        symbology_jobs, symbology_uri = await symbology_function(
            dataset,
            version,
            implementation,
            symbology_co,
            zoom_level,
            max_zoom,
            jobs_dict,
        )
        job_list += symbology_jobs

        bit_depth: int = symbology_constructor[source_asset_co.symbology.type].bit_depth

        if zoom_level <= max_static_zoom:
            tile_cache_job: Job = await create_tile_cache(
                dataset,
                version,
                symbology_uri,
                zoom_level,
                implementation,
                callback_constructor(asset_id),
                [*symbology_jobs, source_reprojection_job],
                bit_depth,
            )
            job_list.append(tile_cache_job)

    log: ChangeLog = await execute(job_list)
    return log
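
Two small mechanics in the loop above, checked in isolation: the descending range is inclusive of min_zoom, and np.issubdtype catches every float width (the "boolean" guard exists because np.dtype would reject that name):

import numpy as np

# range(max_zoom, min_zoom - 1, -1): deepest zoom first, min_zoom included.
assert list(range(3, 0 - 1, -1)) == [3, 2, 1, 0]

# Any float dtype takes the float-to-int conversion branch.
assert np.issubdtype(np.dtype("float32"), np.floating)
assert np.issubdtype(np.dtype("float64"), np.floating)
assert not np.issubdtype(np.dtype("uint8"), np.floating)
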
Code Example #8
async def _merge_assets(
    dataset: str,
    version: str,
    pixel_meaning: str,
    asset1_uri: str,
    asset2_uri: str,
    zoom_level: int,
    parents: List[Job],
    calc_str: str = "np.ma.array([A, B, C, D])",
    band_count: int = 4,
) -> Tuple[List[Job], str]:
    """Create RGBA-encoded raster tile set from two source assets, potentially
    using a custom merge function (the default works for 3+1 band sources, such
    as RGB + Intensity as Alpha)"""

    encoded_co = RasterTileSetSourceCreationOptions(
        pixel_meaning=pixel_meaning,
        data_type=DataType.uint8,  # FIXME: Revisit for 16-bit assets
        band_count=band_count,
        no_data=None,
        resampling=ResamplingMethod.nearest,
        grid=Grid(f"zoom_{zoom_level}"),
        compute_stats=False,
        compute_histogram=False,
        source_type=RasterSourceType.raster,
        source_driver=RasterDrivers.geotiff,
        source_uri=[asset1_uri, asset2_uri],
        calc=calc_str,
        photometric=PhotometricType.rgb,
    )

    asset_uri = get_asset_uri(
        dataset,
        version,
        AssetType.raster_tile_set,
        encoded_co.dict(by_alias=True),
        "epsg:3857",
    )

    logger.debug(
        f"ATTEMPTING TO CREATE MERGED ASSET WITH THESE CREATION OPTIONS: {encoded_co}"
    )

    # Create an asset record
    asset_options = AssetCreateIn(
        asset_type=AssetType.raster_tile_set,
        asset_uri=asset_uri,
        is_managed=True,
        creation_options=encoded_co,
        metadata=RasterTileSetMetadata(),
    ).dict(by_alias=True)

    asset = await create_asset(dataset, version, **asset_options)
    logger.debug(
        f"ZOOM LEVEL {zoom_level} MERGED ASSET CREATED WITH ASSET_ID {asset.asset_id}"
    )

    callback = callback_constructor(asset.asset_id)
    pixetl_job = await create_pixetl_job(
        dataset,
        version,
        encoded_co,
        job_name=f"merge_assets_zoom_{zoom_level}",
        callback=callback,
        parents=parents,
    )

    pixetl_job = scale_batch_job(pixetl_job, zoom_level)

    return (
        [pixetl_job],
        tile_uri_to_tiles_geojson(asset_uri),
    )
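
The default calc string, evaluated over four single-band arrays, simply stacks them into one masked 4-band array. Outside pixETL the same expression behaves like this (toy 2x2 bands):

import numpy as np

A = np.ma.masked_array([[1, 2], [3, 4]])
B, C, D = A + 10, A + 20, A + 30
rgba = np.ma.array([A, B, C, D])
# One 4-band (band, row, col) array: e.g. RGB plus intensity-as-alpha.
assert rgba.shape == (4, 2, 2)
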
Code Example #9
async def _create_colormapped_asset(
    dataset: str,
    version: str,
    pixel_meaning: str,
    source_asset_co: RasterTileSetSourceCreationOptions,
    zoom_level: int,
    jobs_dict: Dict,
) -> Tuple[List[Job], str]:
    wm_source_co = source_asset_co.copy(deep=True,
                                        update={"grid": f"zoom_{zoom_level}"})

    wm_source_uri: str = tile_uri_to_tiles_geojson(
        get_asset_uri(
            dataset,
            version,
            AssetType.raster_tile_set,
            wm_source_co.dict(by_alias=True),
            "epsg:3857",
        ))

    colormap_co = wm_source_co.copy(
        deep=True,
        update={
            "source_uri": [wm_source_uri],
            "calc": None,
            "resampling": PIXETL_DEFAULT_RESAMPLING,
            "pixel_meaning": pixel_meaning,
        },
    )

    colormap_asset_uri = get_asset_uri(
        dataset,
        version,
        AssetType.raster_tile_set,
        colormap_co.dict(by_alias=True),
        "epsg:3857",
    )

    # Create an asset record
    colormap_asset_model = AssetCreateIn(
        asset_type=AssetType.raster_tile_set,
        asset_uri=colormap_asset_uri,
        is_managed=True,
        creation_options=colormap_co,
    ).dict(by_alias=True)
    colormap_asset_record = await create_asset(dataset, version,
                                               **colormap_asset_model)

    logger.debug(f"Created asset record for {colormap_asset_uri} "
                 f"with creation options: {colormap_co}")

    parents = [jobs_dict[zoom_level]["source_reprojection_job"]]
    job_name = sanitize_batch_job_name(
        f"{dataset}_{version}_{pixel_meaning}_{zoom_level}")

    # Apply the colormap
    gdaldem_job = await create_gdaldem_job(
        dataset,
        version,
        colormap_co,
        job_name,
        callback_constructor(colormap_asset_record.asset_id),
        parents=parents,
    )
    gdaldem_job = scale_batch_job(gdaldem_job, zoom_level)

    return [gdaldem_job], colormap_asset_uri
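
The .copy(deep=True, update={...}) calls used throughout these helpers follow Pydantic v1's model-copy signature; a self-contained illustration (model and field names invented):

from pydantic import BaseModel

class Options(BaseModel):
    grid: str
    pixel_meaning: str

base = Options(grid="zoom_0", pixel_meaning="percent")
derived = base.copy(deep=True, update={"grid": "zoom_12"})
# The original is untouched; only the updated field differs in the copy.
assert (base.grid, derived.grid) == ("zoom_0", "zoom_12")
assert derived.pixel_meaning == "percent"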