Example #1
def write_measurement_h5(
    p: DatasetAssembler,
    name: str,
    g: h5py.Dataset,
    overviews=images.DEFAULT_OVERVIEWS,
    overview_resampling=Resampling.nearest,
    expand_valid_data=True,
    file_id: str = None,
):
    """
    Write a measurement by copying it from a hdf5 dataset.
    """
    if hasattr(g, "chunks"):
        data = g[:]
    else:
        data = g

    p.write_measurement_numpy(
        name=name,
        array=data,
        grid_spec=images.GridSpec(
            shape=g.shape,
            transform=Affine.from_gdal(*g.attrs["geotransform"]),
            crs=CRS.from_wkt(g.attrs["crs_wkt"]),
        ),
        nodata=(g.attrs.get("no_data_value")),
        overviews=overviews,
        overview_resampling=overview_resampling,
        expand_valid_data=expand_valid_data,
        file_id=file_id,
    )
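A minimal usage sketch for the function above, assuming a WAGL-style HDF5 file whose image datasets carry "geotransform", "crs_wkt" and "no_data_value" attributes (the file and dataset paths are hypothetical):

from datetime import datetime
from pathlib import Path

import h5py
from eodatasets3 import DatasetAssembler

with h5py.File("granule.wagl.h5", "r") as fid:  # hypothetical input file
    with DatasetAssembler(Path("./out")) as p:
        # Minimum fields needed for the default naming conventions.
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "ard"
        p.processed_now()

        # Copy one HDF5 image dataset into the package as a measurement.
        write_measurement_h5(p, "nbar:blue", fid["GRANULE/NBAR/BAND-2"])  # hypothetical dataset path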
Example #2
def _unpack_products(p: DatasetAssembler, product_list: Iterable[str],
                     h5group: h5py.Group) -> None:
    """
    Unpack and package the NBAR and NBART products.
    """
    # listing of all datasets of IMAGE CLASS type
    img_paths = _find_h5_paths(h5group, "IMAGE")

    for product in product_list:
        with do(f"Starting {product}", heading=True):
            for pathname in [
                path for path in img_paths if f"/{product.upper()}/" in path
            ]:
                with do(f"Path {pathname!r}"):
                    dataset = h5group[pathname]
                    band_name = utils.normalise_band_name(
                        dataset.attrs["alias"])
                    write_measurement_h5(
                        p,
                        f"{product}:{band_name}",
                        dataset,
                        overview_resampling=Resampling.average,
                        file_id=_file_id(dataset),
                    )

            if (p.platform, product) in _THUMBNAILS:
                red, green, blue = _THUMBNAILS[(p.platform, product)]
                with do(f"Thumbnailing {product}"):
                    p.write_thumbnail(red,
                                      green,
                                      blue,
                                      kind=product,
                                      static_stretch=(1, 3000))
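The _THUMBNAILS lookup isn't shown in this example; a hypothetical sketch of the shape it is used with above, with (platform, product) keys mapping to red/green/blue measurement names (keys and band names are illustrative only):

_THUMBNAILS = {
    ("landsat-8", "nbar"): ("nbar:red", "nbar:green", "nbar:blue"),
    ("sentinel-2a", "nbar"): ("nbar:red", "nbar:green", "nbar:blue"),
}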
Example #3
def _read_wagl_metadata(p: DatasetAssembler, granule_group: h5py.Group):
    try:
        wagl_path, *ancil_paths = [
            pth for pth in (_find_h5_paths(granule_group, "SCALAR"))
            if "METADATA" in pth
        ]
    except ValueError:
        raise ValueError("No nbar metadata found in granule")

    [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise ValueError(
            f"WAGL dataset contains no time processed. Path {wagl_path}")

    for i, path in enumerate(ancil_paths, start=2):
        wagl_doc.setdefault(f"wagl_{i}", {}).update(
            list(loads_yaml(granule_group[path][()]))[0]["ancillary"])

    p.properties["dea:dataset_maturity"] = _determine_maturity(
        p.datetime, p.processed, wagl_doc)

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
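For orientation, a hypothetical sketch of the SCALAR metadata documents this function reads (the keys follow the paths accessed above; all values are made up):

# Main WAGL metadata document (wagl_doc):
wagl_doc = {
    "system_information": {"time_processed": "2019-11-20T00:00:53Z"},
    "software_versions": {"wagl": {"version": "5.4.1", "repo_url": "https://example.com/wagl.git"}},
}

# Each ancillary document is merged in under a "wagl_2", "wagl_3", ... key:
ancillary_doc = {"ancillary": {"aerosol": {"value": 0.05}}}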
Example #4
def _write_thumbnail(task: AlchemistTask, dataset_assembler: DatasetAssembler):
    if task.settings.output.preview_image is not None:
        dataset_assembler.write_thumbnail(*task.settings.output.preview_image)
    elif task.settings.output.preview_image_singleband is not None:
        dataset_assembler.write_thumbnail_singleband(
            **task.settings.output.preview_image_singleband
        )
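A sketch of the two settings shapes this dispatches on. The namespace objects and the single-band keyword name are stand-ins, not the real Alchemist configuration classes:

from types import SimpleNamespace

# Three band names, unpacked positionally as (red, green, blue):
rgb_settings = SimpleNamespace(
    output=SimpleNamespace(
        preview_image=("nbart_red", "nbart_green", "nbart_blue"),
        preview_image_singleband=None,
    )
)

# A mapping unpacked as keyword arguments to write_thumbnail_singleband()
# ("measurement" is an assumed key, for illustration only):
singleband_settings = SimpleNamespace(
    output=SimpleNamespace(
        preview_image=None,
        preview_image_singleband={"measurement": "water"},
    )
)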
Example #5
def _read_gqa_doc(p: DatasetAssembler, doc: Dict):
    _take_software_versions(p, doc)
    p.extend_user_metadata("gqa", doc)

    # TODO: more of the GQA fields?
    for k, v in _flatten_dict(doc["residual"], separator="_"):
        p.properties[f"gqa:{k}"] = v
Example #6
def _read_fmask_doc(p: DatasetAssembler, doc: Dict):
    for name, value in doc["percent_class_distribution"].items():
        # From Josh: fmask cloud cover trumps the L1 cloud cover.
        if name == "cloud":
            del p.properties["eo:cloud_cover"]
            p.properties["eo:cloud_cover"] = value

        p.properties[f"fmask:{name}"] = value

    _take_software_versions(p, doc)
    p.extend_user_metadata("fmask", doc)
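An illustrative (made-up) fmask document and the properties it would produce:

doc = {
    "percent_class_distribution": {
        "clear": 60.0, "cloud": 30.0, "cloud_shadow": 5.0, "snow": 0.0, "water": 5.0,
    },
    "software_versions": {"fmask": {"version": "0.5.4", "repo_url": "https://example.com/fmask.git"}},
}
# After _read_fmask_doc(p, doc):
#   p.properties["fmask:cloud"] == 30.0      (plus fmask:clear, fmask:water, ...)
#   p.properties["eo:cloud_cover"] == 30.0   (replacing the Level-1 value)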
Example #7
def test_complain_about_missing_fields(tmp_path: Path, l1_ls8_folder: Path):
    """
    It should complain immediately if I add a file without enough metadata to write the filename.

    (and with a friendly error message)
    """

    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    # Default simple naming conventions need at least a date and family...
    with pytest.raises(
        ValueError, match="Need more properties to fulfill naming conventions."
    ):
        with DatasetAssembler(out) as p:
            p.write_measurement("blue", blue_geotiff_path)

    # It should mention the field that's missing (we added a date, so product_family is needed)
    with DatasetAssembler(out) as p:
        with pytest.raises(ValueError, match="odc:product_family"):
            p.datetime = datetime(2019, 7, 4, 13, 7, 5)
            p.write_measurement("blue", blue_geotiff_path)

    # DEA naming conventions have stricter standards, and will tell you which fields you need to add.
    with DatasetAssembler(out, naming_conventions="dea") as p:
        # We set all the fields that work in default naming conventions.
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "quaternarius"
        p.processed_now()

        # These fields are mandatory for DEA, and so should be complained about.
        expected_extra_fields_needed = (
            "eo:platform",
            "eo:instrument",
            "odc:dataset_version",
            "odc:producer",
            "odc:region_code",
        )
        with pytest.raises(ValueError) as got_error:
            p.write_measurement("blue", blue_geotiff_path)

        # All needed fields should have been in the error message.
        for needed_field_name in expected_extra_fields_needed:
            assert needed_field_name in got_error.value.args[0], (
                f"Expected field {needed_field_name} to "
                f"be listed as mandatory in the error message"
            )
Example #8
def assert_names_match(
    tmp_path: Path,
    # Given:
    conventions,
    properties: Mapping,
    # Then expect:
    expect_metadata_path: str = None,
    expect_label: str = None,
):
    """
    Easily test a set of naming conventions: do certain properties lead to the expected file names?
    """
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)

    with DatasetAssembler(tmp_path, naming_conventions=conventions) as p:
        p.properties.update(properties)

        dataset_id, metadata_path = p.done()

    if expect_metadata_path:
        metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
        assert metadata_path_offset == expect_metadata_path

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    if expect_label:
        assert doc["label"] == expect_label, "Unexpected dataset label"
Example #9
def test_dea_interim_folder_calculation(tmp_path: Path):
    """
    DEA Naming conventions should include maturity in the folder name
    when it's not a 'final' dataset.
    """
    with DatasetAssembler(tmp_path, naming_conventions="dea") as p:
        p.platform = "landsat-7"
        # Should not end up in the path, as it's the default:
        p.product_maturity = "stable"
        p.instrument = "ETM+"
        p.datetime = datetime(1998, 7, 30)
        p.product_family = "frogs"
        p.processed = "1999-11-20 00:00:53.152462Z"
        p.maturity = "interim"
        p.producer = "ga.gov.au"
        p.properties["landsat:landsat_scene_id"] = "LE70930821999324EDC00"
        p.dataset_version = "1.2.3"
        p.region_code = "093082"

        p.done()

    [metadata_path] = tmp_path.rglob("*.odc-metadata.yaml")
    calculated_path: Path = metadata_path.relative_to(tmp_path)
    assert calculated_path == Path(
        #                                  ⇩⇩⇩⇩⇩⇩⇩⇩ Adds interim flag
        "ga_ls7e_frogs_1/093/082/1998/07/30_interim/ga_ls7e_frogs_1-2-3_093082_1998-07-30_interim.odc-metadata.yaml"
    )
Example #10
def _extract_sinergise_fields(path: Path,
                              p: DatasetAssembler) -> Iterable[Path]:
    """Extract Sinergise metadata and return list of image offsets"""
    product_info_path = path / "productInfo.json"
    metadata_xml_path = path / "metadata.xml"

    if not product_info_path.exists():
        raise ValueError(
            "No productInfo.json file found. "
            "Are you sure the input is a sinergise dataset folder?")

    p.properties.update(process_sinergise_product_info(product_info_path))
    p.add_accessory_file("metadata:sinergise_product_info", product_info_path)

    p.properties.update(process_tile_metadata(metadata_xml_path.read_text()))
    p.add_accessory_file("metadata:s2_tile", metadata_xml_path)

    # TODO: sinergise folders could `process_datastrip_metadata()` in an outer directory?

    return path.glob("*.jp2")
Example #11
def test_africa_naming_conventions(tmp_path: Path):
    """
    Minimal fields needed for DEAfrica naming conventions
    """
    with DatasetAssembler(tmp_path, naming_conventions="deafrica") as p:

        # Just the fields listed in required_fields.
        p.producer = "digitalearthafrica.org"
        p.datetime = datetime(1998, 7, 30)
        p.region_code = "090081"
        p.product_family = "wofs"
        p.platform = "LANDSAT_8"
        p.processed_now()
        p.dataset_version = "0.1.2"

        dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset ==
        "wofs_ls/0-1-2/090/081/1998/07/30/wofs_ls_090081_1998-07-30.odc-metadata.yaml"
    )

    with DatasetAssembler(tmp_path, naming_conventions="deafrica") as p:

        # Just the fields listed in required_fields.
        p.producer = "digitalearthafrica.org"
        p.datetime = datetime(1998, 7, 30)
        p.region_code = "090081"
        p.product_family = "fc"
        p.platform = "LANDSAT_8"
        p.processed_now()
        p.dataset_version = "0.1.2"

        dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset ==
        "fc_ls/0-1-2/090/081/1998/07/30/fc_ls_090081_1998-07-30.odc-metadata.yaml"
    )
Example #12
def _unpack_products(p: DatasetAssembler, product_list: Iterable[str],
                     h5group: h5py.Group) -> None:
    """
    Unpack and package the NBAR and NBART products.
    """
    # listing of all datasets of IMAGE CLASS type
    img_paths = _find_h5_paths(h5group, "IMAGE")

    for product in product_list:
        with sub_product(product, p):
            for pathname in [
                path for path in img_paths if f"/{product.upper()}/" in path
            ]:
                with do(f"Path {pathname!r}"):
                    dataset = h5group[pathname]
                    band_name = utils.normalise_band_name(
                        dataset.attrs["alias"])
                    write_measurement_h5(
                        p,
                        f"{product}:{band_name}",
                        dataset,
                        overview_resampling=Resampling.average,
                        file_id=_file_id(dataset),
                    )

            if product in _THUMBNAILS:
                red, green, blue = _THUMBNAILS[product]
                with do(f"Thumbnailing {product}"):
                    p.write_thumbnail(
                        red,
                        green,
                        blue,
                        static_stretch=(1, 3000),
                        # Because of our strange sub-products and filename standards, we want the
                        # 'kind' to be included in the recorded thumbnail accessory metadata,
                        # but not in the filename.
                        # So we manually calculate a filename without the 'kind' field included.
                        kind=product,
                        path=p.names.thumbnail_filename(),
                    )
Example #13
def test_dataset_multi_platform(tmp_path: Path):
    """Can we make a dataset derived from multiple platforms?"""

    # No platform is included in names when there's a mix.
    with DatasetAssembler(tmp_path) as p:
        p.platforms = ["Sentinel_2a", "landsat_7"]
        assert p.platform == "landsat-7,sentinel-2a"

        p.datetime = datetime(2019, 1, 1)
        p.product_family = "peanuts"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "peanuts_2019-01-01"
    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (metadata_path_offset ==
            "peanuts/2019/01/01/peanuts_2019-01-01.odc-metadata.yaml")

    # ... but show the platform abbreviation when there's a known group.
    with DatasetAssembler(tmp_path) as p:
        p.platforms = ["Sentinel_2a", "sentinel_2b"]
        assert p.platform == "sentinel-2a,sentinel-2b"

        p.datetime = datetime(2019, 1, 1)
        p.product_family = "peanuts"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "s2_peanuts_2019-01-01"
    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (metadata_path_offset ==
            "s2_peanuts/2019/01/01/s2_peanuts_2019-01-01.odc-metadata.yaml")
Example #14
def test_minimal_generated_naming_package(tmp_path: Path, l1_ls8_folder: Path):
    """
    What's the minimum number of fields we can set and still generate file/product
    names to produce a package?
    """

    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    with DatasetAssembler(out) as p:
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "quaternarius"
        p.processed_now()

        p.write_measurement("blue", blue_geotiff_path)

        # A friendly __str__ for notebook/terminal users:
        assert str(p) == dedent(
            f"""
            Assembling quaternarius (unfinished)
            - 1 measurements: blue
            - 4 properties: datetime, odc:file_format, odc:processing_datetime, odc:prod...
            Writing to location: {out}/quaternarius/2019/07/04/quaternarius_2019-07-04.odc-metadata.yaml
        """
        )

        # p.done() will validate the dataset and write it to the destination atomically.
        dataset_id, metadata_path = p.done()

    assert dataset_id is not None
    assert_file_structure(
        out,
        {
            "quaternarius": {
                "2019": {
                    "07": {
                        "04": {
                            # Set a dataset version to get rid of 'beta' label.
                            "quaternarius_2019-07-04.odc-metadata.yaml": "",
                            "quaternarius_2019-07-04.proc-info.yaml": "",
                            "quaternarius_2019-07-04_blue.tif": "",
                            "quaternarius_2019-07-04.sha1": "",
                        }
                    }
                }
            }
        },
    )
Example #15
def _write_stac(
    metadata_path: Path,
    task: AlchemistTask,
    dataset_assembler: DatasetAssembler,
):
    out_dataset = serialise.from_path(metadata_path)
    stac_path = Path(str(metadata_path).replace("odc-metadata.yaml", "stac-item.json"))
    # Madness in deferred destination logic
    uri_base = dataset_assembler.names.destination_folder(
        Path(task.settings.output.location)
    )
    uri_base = str(uri_base) + "/"

    stac = dc_to_stac(
        out_dataset,
        metadata_path,
        stac_path,
        uri_base.replace("s3:/", "s3://"),
        task.settings.output.explorer_url,
        False,
    )

    with stac_path.open("w") as f:
        json.dump(stac, f, default=json_fallback)
    dataset_assembler.add_accessory_file("metadata:stac", stac_path)

    # dataset_assembler._checksum.write(dataset_assembler._accessories["checksum:sha1"])
    # The assembler has already written its checksum file, so re-read it,
    # add the new STAC item, and write it back out.
    checksummer = PackageChecksum()
    checksum_file = (
        dataset_assembler._dataset_location
        / dataset_assembler._accessories["checksum:sha1"].name
    )
    checksummer.read(checksum_file)
    checksummer.add_file(stac_path)
    checksummer.write(checksum_file)
    return stac
Example #16
def write_measurement_h5(
    p: DatasetAssembler,
    full_name: str,
    g: h5py.Dataset,
    overviews=images.DEFAULT_OVERVIEWS,
    overview_resampling=Resampling.nearest,
    expand_valid_data=True,
    file_id: str = None,
):
    """
    Write a measurement by copying it from a hdf5 dataset.
    """
    if hasattr(g, "chunks"):
        data = g[:]
    else:
        data = g

    product_name, band_name = full_name.split(":")
    p.write_measurement_numpy(
        array=data,
        grid_spec=images.GridSpec(
            shape=g.shape,
            transform=Affine.from_gdal(*g.attrs["geotransform"]),
            crs=CRS.from_wkt(g.attrs["crs_wkt"]),
        ),
        nodata=g.attrs.get("no_data_value"),
        overviews=overviews,
        overview_resampling=overview_resampling,
        expand_valid_data=expand_valid_data,
        file_id=file_id,
        # Because of our strange sub-products and filename standards, we want the
        # product_name to be included in the recorded band metadata,
        # but not in its filename.
        # So we manually calculate a filename without the extra product name prefix.
        name=full_name,
        path=p.names.measurement_filename(band_name, "tif", file_id=file_id),
    )
Example #17
def test_dataset_no_measurements(tmp_path: Path):
    """Can we make a dataset with no measurements? (eg. telemetry data)"""
    with DatasetAssembler(tmp_path) as p:
        # A custom label too.
        p.label = "chipmonk_sightings_2019"
        p.datetime = datetime(2019, 1, 1)
        p.product_family = "chipmonk_sightings"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "chipmonk_sightings_2019", "Couldn't override label field"
Example #18
def test_minimal_s2_dataset(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        # A custom label too.
        p.platform = "sentinel-2a"
        p.instrument = "msi"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "blueberries"
        p.processed = "2018-11-05T12:23:23"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "s2am_blueberries_2018-11-04", "Unexpected dataset label"
Example #19
def test_custom_naming(tmp_path: Path):
    """
    We can create naming conventions separately, and later give it to assembler.
    """
    p = _basic_properties_set()
    convention = namer(properties=p)
    convention.dataset_folder = Path("my/custom/folder/")

    with DatasetAssembler(tmp_path, names=convention) as a:
        dataset_id, metadata_path = a.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset ==
        "my/custom/folder/ga_s2am_tester_1-2-3_023543_2013-02-03.odc-metadata.yaml"
    )
Example #20
def test_minimal_s1_dataset(tmp_path: Path):
    """A minimal dataset with sentinel-1a/b platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        # A custom label too.
        p.platform = "sentinel-1a"
        p.instrument = "c-sar"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "bck"
        p.processed = "2018-11-05T12:23:23"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.safe_load(f)

    assert doc["label"] == "s1ac_bck_2018-11-04", "Unexpected dataset label"
Example #21
def test_dataset_given_properties(tmp_path: Path):
    """Can we give existing properties to the assembler?"""

    properties = {
        "datetime": datetime(2019, 1, 1),
        "odc:product_family": "chipmonk_sightings",
        "odc:processing_datetime": "2021-06-15T01:33:43.378850",
    }
    names = namer(properties=properties)
    with DatasetAssembler(tmp_path, names=names) as p:
        # It should have normalised properties!
        assert p.processed == datetime(2021, 6, 15, 1, 33, 43, 378850, timezone.utc)

        dataset_id, metadata_path = p.done()

    relative_path = metadata_path.relative_to(tmp_path)
    assert relative_path == Path(
        "chipmonk_sightings/2019/01/01/chipmonk_sightings_2019-01-01.odc-metadata.yaml"
    )
Example #22
def prepare_and_write(
    ds_path: Path,
    output_yaml: Path,
    producer: str,
) -> Tuple[uuid.UUID, Path]:
    with DatasetAssembler(
            metadata_path=output_yaml,
            dataset_location=ds_path,
    ) as p:
        p.properties["odc:producer"] = producer

        if producer == "esa.int":
            jp2_offsets = _extract_esa_fields(ds_path, p)
        elif producer == "sinergise.com":
            jp2_offsets = _extract_sinergise_fields(ds_path, p)
        else:
            raise NotImplementedError(
                f"Unknown s2 producer {producer}. Expected 'sinergise.com' or 'esa.int'"
            )

        p.dataset_id = _get_stable_id(p)
        p.properties["eo:platform"] = _get_platform_name(p.properties)
        p.properties["eo:instrument"] = "MSI"
        p.properties["odc:dataset_version"] = f"1.0.{p.processed:%Y%m%d}"

        p.properties["odc:file_format"] = "JPEG2000"
        p.properties["odc:product_family"] = "level1"

        for path in jp2_offsets:
            band_number = _extract_band_number(path.stem)
            if band_number.lower() in ("tci", "pvi", "preview"):
                continue
            if band_number not in SENTINEL_MSI_BAND_ALIASES:
                raise RuntimeError(
                    f"Unknown band number {band_number!r} in image {path}")

            p.note_measurement(
                path=path,
                name=SENTINEL_MSI_BAND_ALIASES[band_number],
                relative_to_dataset_location=True,
            )

        return p.done()
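A usage sketch for the Sentinel-2 Level-1 preparer above (paths are hypothetical; producer must be 'esa.int' or 'sinergise.com'):

from pathlib import Path

dataset_uuid, metadata_path = prepare_and_write(
    ds_path=Path("S2B_MSIL1C_20201011T000249_N0209_R030_T55HFA_20201011T011446.SAFE"),
    output_yaml=Path("out/s2b_level1.odc-metadata.yaml"),
    producer="esa.int",
)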
Example #23
def _apply_wagl_metadata(p: DatasetAssembler, wagl_doc: Dict):
    source = wagl_doc["source_datasets"]
    p.datetime = source["acquisition_datetime"]
    p.platform = source["platform_id"]
    p.instrument = source["sensor_id"]

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise RuntimeError("WAGL dataset contains no processed time.")

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
Example #24
def test_minimal_s2_dataset_normal(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-2a"
        p.instrument = "msi"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "blueberries"
        p.processed = "2018-11-05T12:23:23"
        p.properties[
            "sentinel:sentinel_tile_id"] = "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert metadata_path_offset == (
        "s2am_blueberries/2018/11/04/s2am_blueberries_2018-11-04.odc-metadata.yaml"
    )

    assert doc[
        "label"] == "s2am_blueberries_2018-11-04", "Unexpected dataset label"
Example #25
def test_minimal_package_with_product_name(tmp_path: Path, l1_ls8_folder: Path):
    """
    You can specify an ODC product name manually to avoid most of the name generation.
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    with DatasetAssembler(out) as p:
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_name = "loch_ness_sightings"
        p.processed = datetime(2019, 7, 4, 13, 8, 7)

        p.write_measurement("blue", blue_geotiff_path)

        dataset_id, metadata_path = p.done()

    assert dataset_id is not None
    assert_file_structure(
        out,
        {
            "loch_ness_sightings": {
                "2019": {
                    "07": {
                        "04": {
                            # Set a dataset version to get rid of 'beta' label.
                            "loch_ness_sightings_2019-07-04.odc-metadata.yaml": "",
                            "loch_ness_sightings_2019-07-04.proc-info.yaml": "",
                            "loch_ness_sightings_2019-07-04_blue.tif": "",
                            "loch_ness_sightings_2019-07-04.sha1": "",
                        }
                    }
                }
            }
        },
    )
Example #26
def prepare_and_write(
    ds_path: Path,
    collection_location: Path,
    # TODO: Can we infer producer automatically? This is bound to cause mistakes otherwise
    producer="usgs.gov",
) -> Tuple[uuid.UUID, Path]:
    """
    Prepare an eo3 metadata file for a Level2 dataset.

    Input dataset path can be a folder or a tar file.
    """
    mtl_doc, mtl_filename = get_mtl_content(
        ds_path, root_element="landsat_metadata_file")
    if not mtl_doc:
        raise ValueError(f"No MTL file found for {ds_path}")

    usgs_collection_number = mtl_doc["product_contents"].get(
        "collection_number")
    if usgs_collection_number is None:
        raise NotImplementedError(
            "Dataset has no collection number: pre-collection data is not supported."
        )

    data_format = mtl_doc["product_contents"]["output_format"]
    if data_format.upper() != "GEOTIFF":
        raise NotImplementedError(
            f"Only GTiff currently supported, got {data_format}")
    file_format = FileFormat.GeoTIFF

    # Assumed below.
    if (mtl_doc["projection_attributes"]["grid_cell_size_reflective"] !=
            mtl_doc["projection_attributes"]["grid_cell_size_thermal"]):
        raise NotImplementedError(
            "reflective and thermal have different cell sizes")
    ground_sample_distance = min(
        value for name, value in mtl_doc["projection_attributes"].items()
        if name.startswith("grid_cell_size_"))

    with DatasetAssembler(
            collection_location=collection_location,
            # Deterministic ID based on USGS's product id (which changes when the scene is reprocessed by them)
            dataset_id=uuid.uuid5(
                USGS_UUID_NAMESPACE,
                mtl_doc["product_contents"]["landsat_product_id"]),
            naming_conventions="dea",
            if_exists=IfExists.Overwrite,
    ) as p:
        p.platform = mtl_doc["image_attributes"]["spacecraft_id"]
        p.instrument = mtl_doc["image_attributes"]["sensor_id"]
        p.product_family = "level2"
        p.producer = producer
        p.datetime = "{}T{}".format(
            mtl_doc["image_attributes"]["date_acquired"],
            mtl_doc["image_attributes"]["scene_center_time"],
        )
        # p.processed = mtl_doc["metadata_file_info"]["file_date"]
        p.processed = mtl_doc["level2_processing_record"][
            "date_product_generated"]
        p.properties["odc:file_format"] = file_format
        p.properties["eo:gsd"] = ground_sample_distance
        p.properties["eo:cloud_cover"] = mtl_doc["image_attributes"][
            "cloud_cover"]
        p.properties["eo:sun_azimuth"] = mtl_doc["image_attributes"][
            "sun_azimuth"]
        p.properties["eo:sun_elevation"] = mtl_doc["image_attributes"][
            "sun_elevation"]
        p.properties["landsat:collection_number"] = usgs_collection_number
        for section, fields in _COPYABLE_MTL_FIELDS:
            for field in fields:
                value = mtl_doc[section].get(field)
                if value is not None:
                    p.properties[f"landsat:{field}"] = value

        p.region_code = f"{p.properties['landsat:wrs_path']:03d}{p.properties['landsat:wrs_row']:03d}"
        org_collection_number = utils.get_collection_number(
            p.producer, p.properties["landsat:collection_number"])
        p.dataset_version = f"{org_collection_number}.0.{p.processed:%Y%m%d}"

        band_aliases = get_band_alias_mappings(p.platform, p.instrument)

        bands = list(_iter_bands_paths(mtl_doc))
        # Uncomment to process only the first band (useful when testing);
        # leave commented to process all bands.
        # bands = bands[0:1]
        for usgs_band_id, file_location in bands:
            # p.note_measurement(
            #     band_aliases[usgs_band_id],
            #     file_location,
            #     relative_to_dataset_location=True,
            # )
            path_file = os.path.join(ds_path, file_location)
            p.write_measurement(band_aliases[usgs_band_id], path_file)

        p.add_accessory_file("metadata:landsat_mtl", Path(mtl_filename))

        return p.done()
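A usage sketch for the Level-2 preparer above (the tar name is hypothetical; collection_location is the root under which the DEA folder hierarchy is created):

from pathlib import Path

dataset_uuid, metadata_path = prepare_and_write(
    ds_path=Path("LC08_L2SP_090084_20160121_20200907_02_T1.tar"),
    collection_location=Path("./collection"),
    producer="usgs.gov",
)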
Example #27
def prepare_and_write(
    ds_path: Path,
    output_yaml_path: Path,
    source_telemetry: Path = None,
    # TODO: Can we infer producer automatically? This is bound to cause mistakes otherwise
    producer="usgs.gov",
) -> Tuple[uuid.UUID, Path]:
    """
    Prepare an eo3 metadata file for a Level1 dataset.

    Input dataset path can be a folder or a tar file.
    """
    mtl_doc, mtl_filename = get_mtl_content(ds_path)
    if not mtl_doc:
        raise ValueError(f"No MTL file found for {ds_path}")

    usgs_collection_number = mtl_doc["metadata_file_info"].get(
        "collection_number")
    if usgs_collection_number is None:
        raise NotImplementedError(
            "Dataset has no collection number: pre-collection data is not supported."
        )

    data_format = mtl_doc["product_metadata"]["output_format"]
    if data_format.upper() != "GEOTIFF":
        raise NotImplementedError(
            f"Only GTiff currently supported, got {data_format}")
    file_format = FileFormat.GeoTIFF

    # Assumed below.
    projection_params = mtl_doc["projection_parameters"]
    if ("grid_cell_size_thermal" in projection_params
            and "grid_cell_size_reflective" in projection_params
            and (projection_params["grid_cell_size_reflective"] !=
                 projection_params["grid_cell_size_thermal"])):
        raise NotImplementedError(
            "reflective and thermal have different cell sizes")
    ground_sample_distance = min(value
                                 for name, value in projection_params.items()
                                 if name.startswith("grid_cell_size_"))

    with DatasetAssembler(
            metadata_path=output_yaml_path,
            dataset_location=ds_path,
            # Deterministic ID based on USGS's product id (which changes when the scene is reprocessed by them)
            dataset_id=uuid.uuid5(
                USGS_UUID_NAMESPACE,
                mtl_doc["metadata_file_info"]["landsat_product_id"]),
            naming_conventions="dea",
            if_exists=IfExists.Overwrite,
    ) as p:
        if source_telemetry:
            # Only GA's data has source telemetry...
            assert producer == "ga.gov.au"
            p.add_source_path(source_telemetry)

        p.platform = mtl_doc["product_metadata"]["spacecraft_id"]
        p.instrument = mtl_doc["product_metadata"]["sensor_id"]
        p.product_family = "level1"
        p.producer = producer
        p.datetime = "{}T{}".format(
            mtl_doc["product_metadata"]["date_acquired"],
            mtl_doc["product_metadata"]["scene_center_time"],
        )
        p.processed = mtl_doc["metadata_file_info"]["file_date"]
        p.properties["odc:file_format"] = file_format
        p.properties["eo:gsd"] = ground_sample_distance
        cloud_cover = mtl_doc["image_attributes"]["cloud_cover"]
        # Cloud cover is -1 when missing (such as TIRS-only data)
        if cloud_cover != -1:
            p.properties["eo:cloud_cover"] = cloud_cover
        p.properties["eo:sun_azimuth"] = mtl_doc["image_attributes"][
            "sun_azimuth"]
        p.properties["eo:sun_elevation"] = mtl_doc["image_attributes"][
            "sun_elevation"]
        p.properties["landsat:collection_number"] = usgs_collection_number
        for section, fields in _COPYABLE_MTL_FIELDS:
            for field in fields:
                value = mtl_doc[section].get(field)
                if value is not None:
                    p.properties[f"landsat:{field}"] = value

        p.region_code = f"{p.properties['landsat:wrs_path']:03d}{p.properties['landsat:wrs_row']:03d}"
        org_collection_number = utils.get_collection_number(
            p.producer, p.properties["landsat:collection_number"])
        p.dataset_version = f"{org_collection_number}.0.{p.processed:%Y%m%d}"

        # NRT product?
        # Category is one of: T1, T2 or RT ('real time')
        if p.properties["landsat:collection_category"] == "RT":
            p.properties["odc:dataset_maturity"] = "nrt"

        band_aliases = get_band_alias_mappings(p.platform, p.instrument)
        for usgs_band_id, file_location in _iter_bands_paths(mtl_doc):
            p.note_measurement(
                band_aliases[usgs_band_id],
                file_location,
                relative_to_dataset_location=True,
            )

        p.add_accessory_file("metadata:landsat_mtl", Path(mtl_filename))

        return p.done()
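A usage sketch for the Level-1 preparer above (the tar and output paths are hypothetical):

from pathlib import Path

dataset_uuid, metadata_path = prepare_and_write(
    ds_path=Path("LC08_L1TP_090084_20160121_20170405_01_T1.tar"),
    output_yaml_path=Path("out/LC08_L1TP_090084_20160121.odc-metadata.yaml"),
    producer="usgs.gov",
)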
Example #28
def package(
    out_directory: Path,
    granule: Granule,
    included_products: Iterable[str] = DEFAULT_PRODUCTS,
    include_oa: bool = True,
) -> Tuple[UUID, Path]:
    """
    Package an L2 product.

    :param include_oa:

    :param out_directory:
        The base directory for output datasets. A DEA-naming-conventions folder hierarchy
        will be created inside this folder.

    :param granule:
        Granule information. You probably want to make one with Granule.from_path()

    :param included_products:
        A list of imagery products to include in the package.
        Defaults to all products.

    :return:
        The dataset UUID and output metadata path
    """
    included_products = tuple(s.lower() for s in included_products)

    with h5py.File(granule.wagl_hdf5, "r") as fid:
        granule_group = fid[granule.name]

        with DatasetAssembler(
                out_directory,
                # WAGL stamps a good, random ID already.
                dataset_id=granule.wagl_metadata.get("id"),
                naming_conventions="dea",
        ) as p:
            level1 = granule.source_level1_metadata
            p.add_source_dataset(level1, auto_inherit_properties=True)

            # It's a GA ARD product.
            p.producer = "ga.gov.au"
            p.product_family = "ard"

            org_collection_number = utils.get_collection_number(
                p.producer, p.properties["landsat:collection_number"])
            # TODO: wagl's algorithm version should determine our dataset version number, right?
            p.dataset_version = f"{org_collection_number}.0.0"
            p.region_code = _extract_reference_code(p, granule.name)

            _read_wagl_metadata(p, granule_group)
            _read_gqa_doc(p, granule.gqa_doc)
            _read_fmask_doc(p, granule.fmask_doc)

            _unpack_products(p, included_products, granule_group)

            if include_oa:
                with do(f"Starting OA", heading=True):
                    _unpack_observation_attributes(
                        p,
                        included_products,
                        granule_group,
                        infer_datetime_range=level1.platform.startswith(
                            "landsat"),
                    )
                if granule.fmask_image:
                    with do(f"Writing fmask from {granule.fmask_image} "):
                        p.write_measurement(
                            "oa:fmask",
                            granule.fmask_image,
                            expand_valid_data=False,
                            overview_resampling=Resampling.mode,
                        )

            with do("Finishing package"):
                return p.done()
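A usage sketch for package(). The docstring above says a Granule is normally built with Granule.from_path(); the wagl_hdf5 keyword below is assumed from the attribute used earlier, not a confirmed signature:

from pathlib import Path

granule = Granule.from_path(wagl_hdf5=Path("LC80920842016180LGN01.wagl.h5"))  # assumed kwarg
dataset_uuid, metadata_path = package(
    out_directory=Path("./packaged"),
    granule=granule,
    included_products=("nbar", "nbart"),
    include_oa=True,
)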
Example #29
def _create_contiguity(
    p: DatasetAssembler,
    product_list: Iterable[str],
    resolution_yx: Tuple[float, float],
    timedelta_product: str = "nbar",
    timedelta_data: numpy.ndarray = None,
):
    """
    Create the contiguity (all pixels valid) dataset.

    Write a contiguity mask file based on the intersection of valid data pixels across all
    bands from the input files.
    """
    for product in product_list:
        contiguity = None
        for grid, band_name, path in p.iter_measurement_paths():
            if not band_name.startswith(f"{product.lower()}:"):
                continue
            # Only our given res group (no pan band in Landsat)
            if grid.resolution_yx != resolution_yx:
                continue

            with rasterio.open(path) as ds:
                ds: DatasetReader
                if contiguity is None:
                    contiguity = numpy.ones((ds.height, ds.width),
                                            dtype="uint8")
                    geobox = GridSpec.from_rio(ds)
                elif ds.shape != contiguity.shape:
                    raise NotImplementedError(
                        "Contiguity from measurements of different shape")

                for band in ds.indexes:
                    contiguity &= ds.read(band) > 0

        if contiguity is None:
            secho(f"No images found for requested product {product}", fg="red")
            continue

        p.write_measurement_numpy(
            f"oa:{product.lower()}_contiguity",
            contiguity,
            geobox,
            nodata=255,
            overviews=None,
            expand_valid_data=False,
        )

        # Mask the timedelta data with the contiguity mask to get the min/max timedelta
        # within the NBAR product footprint (Landsat). For Sentinel, the datetime range
        # is inherited from the Level-1 YAML file.
        if timedelta_data is not None and product.lower() == timedelta_product:
            valid_timedelta_data = numpy.ma.masked_where(
                contiguity == 0, timedelta_data)

            def offset_from_center(v: numpy.datetime64):
                return p.datetime + timedelta(microseconds=v.astype(float) *
                                              1_000_000.0)

            p.datetime_range = (
                offset_from_center(numpy.ma.min(valid_timedelta_data)),
                offset_from_center(numpy.ma.max(valid_timedelta_data)),
            )
Example #30
def _take_software_versions(p: DatasetAssembler, doc: Dict):
    versions = doc.pop("software_versions", {})

    for name, o in versions.items():
        p.note_software_version(name, o.get("repo_url"), o.get("version"))
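A hypothetical fragment of the document consumed above:

doc = {
    "software_versions": {
        "wagl": {"version": "5.4.1", "repo_url": "https://example.com/wagl.git"},
        "fmask": {"version": "0.5.4", "repo_url": "https://example.com/fmask.git"},
    }
}
# Each entry becomes p.note_software_version(name, repo_url, version); the
# "software_versions" key is popped so it isn't duplicated in the user metadata.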