Example 1
def _munge_dataset_to_eo3(ds: Dataset) -> DatasetDoc:
    """
    Convert to the DatasetDoc format that eodatasets expects.
    """
    if ds.metadata_type.name in {"eo_plus", "eo_s2_nrt", "gqa_eo"}:
        # Handle S2 NRT metadata identically to eo_plus files.
        # gqa_eo is the S2 ARD with extra quality check fields.
        return _convert_eo_plus(ds)

    if ds.metadata_type.name == "eo":
        return _convert_eo(ds)

    # Else we have an already mostly eo3 style dataset
    product = ProductDoc(name=ds.type.name)
    # Wrap properties to avoid typos and the like
    properties = StacPropertyView(ds.metadata_doc.get("properties", {}))
    if properties.get("eo:gsd"):
        del properties["eo:gsd"]
    return DatasetDoc(
        id=ds.id,
        product=product,
        crs=str(ds.crs),
        properties=properties,
        geometry=ds.extent,
    )
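A minimal usage sketch, assuming the dataset is looked up from a Datacube index by id (the index call and the UUID below are illustrative, not part of the example above):

import datacube

dc = datacube.Datacube(app="eo3-conversion")
# Hypothetical dataset id; any indexed dataset would do.
ds = dc.index.datasets.get("5b2f2c50-e618-4bef-ba1f-3d436d9aed14")
eo3_doc = _munge_dataset_to_eo3(ds)
print(eo3_doc.product.name, eo3_doc.crs)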
Example 2
def test_unknown_abbreviations():
    d = DatasetDoc()
    names = NamingConventions(d.properties)

    with ignore_property_overrides():
        # Unknown platforms are abbreviated by just removing dashes.
        d.platform = "grover-1"
        assert names.platform_abbreviated == "grover1"

        # Constellation can be used as a fallback grouping.
        d.platforms = ["clippings-1a", "clippings-2b"]
        d.properties["constellation"] = "clippings"
        assert names.platform_abbreviated == "clippings"

        # Unless unknown platforms aren't allowed
        # (DEA wants to be stricter and add real abbreviations for everything.)
        names = namer(d.properties, conventions="dea")
        with pytest.raises(
            ValueError, match="don't know the DEA abbreviation for platform"
        ):
            print(names.platform_abbreviated)
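The fallback behaviour exercised above amounts to stripping dashes from the platform name. A hypothetical sketch (not the library's internals):

def _abbreviate_unknown_platform(platform: str) -> str:
    # Unknown platforms are abbreviated by simply removing dashes,
    # e.g. "grover-1" -> "grover1".
    return platform.replace("-", "")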
Example 3
def _convert_eo(ds) -> DatasetDoc:
    # Definitely need:
    #  - 'datetime'
    #  - 'eo:instrument'
    #  - 'eo:platform'
    #  - 'odc:region_code'
    region_code = _guess_region_code(ds)
    properties = StacPropertyView(
        {
            "odc:region_code": region_code,
            "datetime": ds.center_time,
            "eo:instrument": ds.metadata.instrument,
            "eo:platform": ds.metadata.platform,
            "landsat:landsat_scene_id": ds.metadata.instrument,  # Used to find abbreviated instrument id
        }
    )
    product = ProductDoc(name=ds.type.name)
    return DatasetDoc(id=ds.id, product=product, crs=str(ds.crs), properties=properties)
Example 4
def convert_eo(ds) -> DatasetDoc:
    # Definitely need:
    #  - 'datetime'
    #  - 'eo:instrument'
    #  - 'eo:platform'
    #  - 'odc:region_code'
    properties = StacPropertyView({
        'odc:region_code': ds.metadata_doc['region_code'],
        'datetime': ds.center_time,
        'eo:instrument': ds.metadata.instrument,
        'eo:platform': ds.metadata.platform,
        'landsat:landsat_scene_id': ds.metadata.instrument,  # Used to find abbreviated instrument id
    })
    product = ProductDoc(name=ds.type.name)
    return DatasetDoc(
        id=ds.id,
        product=product,
        crs=ds.crs.crs_str,
        properties=properties
    )
Example 5
def test_multi_platform_fields():
    """
    Multiple platforms can be specified.

    (they are normalised in eo3 as a sorted, comma-separated list)
    """
    d = DatasetDoc()
    assert d.platform is None
    assert d.platforms == set()

    d.platforms = {"LANDSAT_5", "LANDSAT_4"}
    assert d.platform == "landsat-4,landsat-5"
    assert d.platforms == {"landsat-4", "landsat-5"}

    d = DatasetDoc()
    d.platform = "sentinel-2a, landsat_5, LANDSAT_5"
    assert d.platform == "landsat-5,sentinel-2a"
    assert d.platforms == {"landsat-5", "sentinel-2a"}

    d = DatasetDoc()
    d.platform = ""
    assert d.platform is None
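A minimal sketch of the normalisation this test relies on, inferred from the assertions (not eodatasets3's actual implementation) and handling only the string form:

from typing import Optional

def _normalise_platforms(value: str) -> Optional[str]:
    # Split on commas, lowercase, swap underscores for dashes,
    # de-duplicate, then emit a sorted comma-separated list (or None).
    parts = {
        p.strip().lower().replace("_", "-")
        for p in value.split(",")
        if p.strip()
    }
    return ",".join(sorted(parts)) or None

assert _normalise_platforms("sentinel-2a, landsat_5, LANDSAT_5") == "landsat-5,sentinel-2a"
assert _normalise_platforms("") is None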
Example 6
def _munge_dataset_to_eo3(ds: Dataset) -> DatasetDoc:
    """
    Convert to the DatasetDoc format that eodatasets expects.
    """
    if ds.metadata_type.name == 'eo_plus':
        return convert_eo_plus(ds)

    if ds.metadata_type.name == 'eo':
        return convert_eo(ds)

    # Else we have an already mostly eo3 style dataset
    product = ProductDoc(name=ds.type.name)
    # Wrap properties to avoid typos and the like
    properties = StacPropertyView(ds.metadata_doc.get('properties', {}))
    return DatasetDoc(
        id=ds.id,
        product=product,
        crs=ds.crs.crs_str,
        properties=properties
    )
Example 7
def test_naming_abbreviations():
    d = DatasetDoc()
    names = NamingConventions(d.properties)

    with ignore_property_overrides():
        assert names.platform_abbreviated is None

        # A single platform uses its known abbreviation.
        d.platforms = ["landsat-5"]
        assert names.platform_abbreviated == "ls5"

        # Multiple platforms from a known group use the group name.
        d.platforms = ["landsat-5", "landsat_7"]
        assert names.platform_abbreviated == "ls"
        d.platforms = ["sentinel-2a", "sentinel-2b"]
        assert names.platform_abbreviated == "s2"

        # Platforms from different groups have no abbreviation.
        d.platforms = ["landsat-5", "sentinel-2a"]
        assert names.platform_abbreviated is None
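A hypothetical lookup that would satisfy the assertions above; the tables and the prefix-based grouping are illustrative only, not eodatasets3's real conventions:

from typing import Iterable, Optional

_PLATFORM_ABBREVIATIONS = {"landsat-5": "ls5", "landsat-7": "ls7", "sentinel-2a": "s2a"}
_GROUP_ABBREVIATIONS = {"landsat": "ls", "sentinel-2": "s2"}

def abbreviate_platforms(platforms: Iterable[str]) -> Optional[str]:
    platforms = sorted(p.lower().replace("_", "-") for p in platforms)
    if not platforms:
        return None
    if len(platforms) == 1:
        return _PLATFORM_ABBREVIATIONS.get(platforms[0])
    # Multiple platforms: abbreviate only if they all share a known group prefix.
    for group, abbreviation in _GROUP_ABBREVIATIONS.items():
        if all(p.startswith(group) for p in platforms):
            return abbreviation
    return None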
Example 8
    def done(
        self, validate_correctness: bool = True, sort_measurements: bool = True
    ) -> Tuple[uuid.UUID, Path]:
        """
        Write the dataset and move it into place.

        It will be validated, metadata will be written, and if all is correct, it will be
        moved to the output location.

        The final move is done atomically, so the dataset will only exist in the output
        location if it is complete.

        :param validate_correctness: Run the eo3-validator on the resulting metadata.
        :param sort_measurements: Order measurements alphabetically (instead of insertion order).
        :raises: :class:`IncompleteDatasetError` If any critical metadata is incomplete.

        :returns: The id and final path to the dataset metadata file.
        """
        self.note_software_version(
            "eodatasets3",
            "https://github.com/GeoscienceAustralia/eo-datasets",
            eodatasets3.__version__,
        )

        crs, grid_docs, measurement_docs = self._measurements.as_geo_docs()

        if measurement_docs and sort_measurements:
            measurement_docs = dict(sorted(measurement_docs.items()))

        valid_data = self._measurements.consume_and_get_valid_data()
        # Avoid the messiness of different empty collection types.
        # (to have a non-null geometry we'd also need non-null grids and crses)
        if valid_data.is_empty:
            valid_data = None

        if self._is_writing_files():
            # (the checksum isn't written yet -- it'll be the last file)
            self.add_accessory_file(
                "checksum:sha1", self.names.checksum_path(self._work_path)
            )

            processing_metadata = self.names.metadata_path(
                self._work_path, suffix="proc-info.yaml"
            )
            self._write_yaml(
                {**self._user_metadata, "software_versions": self._software_versions},
                processing_metadata,
                allow_external_paths=True,
            )
            self.add_accessory_file("metadata:processor", processing_metadata)

        dataset = DatasetDoc(
            id=self.dataset_id,
            label=self.label,
            product=ProductDoc(
                name=self.names.product_name, href=self.names.product_uri
            ),
            crs=self._crs_str(crs) if crs is not None else None,
            geometry=valid_data,
            grids=grid_docs,
            properties=self.properties,
            accessories={
                name: AccessoryDoc(path, name=name)
                for name, path in self._accessories.items()
            },
            measurements=measurement_docs,
            lineage=self._lineage,
        )

        doc = serialise.to_formatted_doc(dataset)
        self._write_yaml(
            doc,
            self._metadata_path
            or self.names.metadata_path(self._work_path, suffix="odc-metadata.yaml"),
        )

        if validate_correctness:
            for m in validate.validate_dataset(doc):
                if m.level in (Level.info, Level.warning):
                    warnings.warn(DatasetCompletenessWarning(m))
                elif m.level == Level.error:
                    raise IncompleteDatasetError(m)
                else:
                    raise RuntimeError(
                        f"Internal error: Unhandled type of message level: {m.level}"
                    )

        # If we're writing data, not just a metadata file, finish the package and move it into place.
        if self._is_writing_files():
            self._checksum.write(self._accessories["checksum:sha1"])

            # Match the lower r/w permission bits to the output folder.
            # (Temp directories default to 700 otherwise.)
            self._work_path.chmod(self.collection_location.stat().st_mode & 0o777)

            # GDAL writes extra metadata in aux files,
            # but we consider it a mistake if you're using those extensions.
            for aux_file in self._work_path.rglob("*.aux.xml"):
                warnings.warn(
                    f"Cleaning unexpected gdal aux file {aux_file.as_posix()!r}"
                )
                aux_file.unlink()

            if not self._dataset_location:
                self._dataset_location = self.names.destination_folder(
                    self.collection_location
                )
            # Now atomically move to final location.
            # Someone else may have created the output while we were working.
            # Try, and then decide how to handle it if so.
            try:
                self._dataset_location.parent.mkdir(parents=True, exist_ok=True)
                self._work_path.rename(self._dataset_location)
            except OSError:
                if not self._dataset_location.exists():
                    # Some other error?
                    raise

                if self._exists_behaviour == IfExists.Skip:
                    # Something else created it while we were busy.
                    warnings.warn(
                        f"Skipping -- exists: {self.names.destination_folder}"
                    )
                elif self._exists_behaviour == IfExists.ThrowError:
                    raise
                elif self._exists_behaviour == IfExists.Overwrite:
                    raise NotImplementedError("overwriting outputs not yet implemented")
                else:
                    raise RuntimeError(
                        f"Unexpected exists behaviour: {self._exists_behaviour}"
                    )

        target_metadata_path = self._metadata_path or self.names.metadata_path(
            self._dataset_location, suffix="odc-metadata.yaml"
        )
        assert target_metadata_path.exists()
        self._is_completed = True
        return dataset.id, target_metadata_path
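A minimal sketch of the assembler lifecycle that ends in done(), using the standard eodatasets3 entry point; the paths and property values are placeholders:

from datetime import datetime, timezone
from pathlib import Path

from eodatasets3 import DatasetAssembler

with DatasetAssembler(collection_location=Path("/data/collections/example")) as p:
    p.product_family = "level1"
    p.datetime = datetime(2020, 1, 1, tzinfo=timezone.utc)
    p.processed_now()
    p.properties["eo:platform"] = "landsat-8"
    p.properties["eo:instrument"] = "OLI_TIRS"
    p.properties["odc:region_code"] = "090084"

    # Measurements are written into the work folder before packaging.
    p.write_measurement("blue", Path("/data/input/blue.tif"))

    dataset_id, metadata_path = p.done()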
Example 9
def as_stac_item(dataset: DatasetItem):
    """
    Get a dict corresponding to a stac item
    """
    ds: Dataset = dataset.odc_dataset

    if ds is not None and is_doc_eo3(ds.metadata_doc):
        dataset_doc = serialise.from_doc(ds.metadata_doc, skip_validation=True)
        dataset_doc.locations = ds.uris

        # Geometry is optional in eo3, and needs to be calculated from grids if missing.
        # We can use ODC's own calculation that happens on index.
        if dataset_doc.geometry is None:
            fallback_extent = ds.extent
            if fallback_extent is not None:
                dataset_doc.geometry = fallback_extent.geom
                dataset_doc.crs = str(ds.crs)

        if ds.sources:
            dataset_doc.lineage = {
                classifier: [d.id]
                for classifier, d in ds.sources.items()
            }
        # Does ODC still put legacy lineage into indexed documents?
        elif ("source_datasets" in dataset_doc.lineage) and len(
                dataset_doc.lineage) == 1:
            # From old to new lineage type.
            dataset_doc.lineage = {
                classifier: [dataset["id"]]
                for classifier, dataset in
                dataset_doc.lineage["source_datasets"]
            }

    else:
        # eo1 to eo3

        dataset_doc = DatasetDoc(
            id=dataset.dataset_id,
            # Filled-in below.
            label=None,
            product=ProductDoc(dataset.product_name),
            locations=ds.uris if ds is not None else None,
            crs=str(dataset.geometry.crs),
            geometry=dataset.geometry.geom,
            grids=None,
            # TODO: Convert these from stac to eo3
            properties=Eo3Dict({
                "datetime": utc(dataset.center_time),
                **(dict(_build_properties(ds.metadata)) if ds else {}),
                "odc:processing_datetime": utc(dataset.creation_time),
            }),
            measurements={
                name: _band_to_measurement(
                    b,
                    dataset_location=ds.uris[0]
                    if ds is not None and ds.uris else None,
                )
                for name, b in ds.measurements.items()
            } if ds is not None else {},
            accessories=_accessories_from_eo1(ds.metadata_doc)
            if ds is not None else {},
            # TODO: Fill in lineage. The datacube API only gives us full datasets, which is
            #       expensive. We only need a list of IDs here.
            lineage={},
        )

    if dataset_doc.label is None and ds is not None:
        dataset_doc.label = _utils.dataset_label(ds)

    item_doc = eo3stac.to_stac_item(
        dataset=dataset_doc,
        stac_item_destination_url=url_for(
            ".item",
            collection=dataset.product_name,
            dataset_id=dataset.dataset_id,
        ),
        odc_dataset_metadata_url=url_for("dataset.raw_doc",
                                         id_=dataset.dataset_id),
        explorer_base_url=url_for("default_redirect"),
    )
    # Add the region code that Explorer inferred.
    # (Explorer's region codes predate ODC's and support
    #  many more products.)
    item_doc["properties"]["cubedash:region_code"] = dataset.region_code

    return item_doc
def create_eo3(granule_dir, granule_id):
    """
    Creates an eo3 document.

    :param granule_dir (Path): the granule directory
    :param granule_id (str): the granule id
    :return: DatasetDoc of eo3 metadata
    """

    with open(granule_dir / "ARD-METADATA.yaml") as fin:
        metadata = yaml.safe_load(fin)

    try:
        coords = metadata['grid_spatial']['projection']['valid_data']['coordinates']
        expand_valid_data = False
    except KeyError:
        expand_valid_data = True

    assembler = DatasetAssembler(
            dataset_location=granule_dir,
            metadata_path=granule_dir / "dummy",
    )

    if "S2A" in str(granule_dir):
        assembler.product_family = "s2a_ard_granule"
        platform = "SENTINEL_2A"        
    else:
        assembler.product_family = "s2b_ard_granule"
        platform = "SENTINEL_2B"

    assembler.processed_now()

    add_datetime(assembler, granule_dir)
    add_to_eo3(assembler, granule_dir, "NBART", lambda x: code_to_band[x.split('_')[-1]], expand_valid_data)
    add_to_eo3(assembler, granule_dir, "SUPPLEMENTARY", lambda x: x[3:].lower(), expand_valid_data)
    add_to_eo3(assembler, granule_dir, "QA", lambda x: x[3:].lower().replace('combined_', ''), expand_valid_data)

    crs, grid_docs, measurement_docs = assembler._measurements.as_geo_docs()
    valid_data = assembler._measurements.consume_and_get_valid_data()

    assembler.properties["odc:region_code"] = metadata["provider"]["reference_code"]
    assembler.properties["gqa:cep90"] = metadata["gqa"]["residual"]["cep90"]
    assembler.properties["gqa:error_message"] = metadata["gqa"]["error_message"]
    assembler.properties["gqa:final_gcp_count"] =metadata["gqa"]["final_gcp_count"]
    assembler.properties["gqa:ref_source"] = metadata["gqa"]["ref_source"]
    assembler.properties["sentinel:datatake_start_datetime"] = granule_id.split("_")[-4]
    assembler.properties["eo:platform"] = platform
    assembler.properties["eo:instrument"] = "MSI"

    for key in ["abs_iterative_mean", "abs", "iterative_mean", "iterative_stddev", "mean", "stddev"]:
        assembler.properties[f"gqa:{key}_xy"] = metadata["gqa"]["residual"][key]["xy"]

    eo3 = DatasetDoc(
        id=assembler.dataset_id,
        label=assembler.label,
        product=ProductDoc(
            name=assembler.names.product_name, href=assembler.names.product_uri
        ),
        crs=assembler._crs_str(crs) if crs is not None else None,
        geometry=valid_data,
        grids=grid_docs,
        properties=assembler.properties,
        accessories={
            name: AccessoryDoc(path, name=name)
            for name, path in assembler._accessories.items()
        },
        measurements=measurement_docs,
        lineage=assembler._lineage,
    )

    if not expand_valid_data:
        eo3.geometry = Polygon(coords[0])

    for measurement in eo3.measurements.values():
        if measurement.grid is None:
            measurement.grid = 'default'

    return eo3
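A hypothetical call site, assuming granule directories are named after their granule ids:

from pathlib import Path

packaged_root = Path("/data/s2_ard")  # Illustrative input location.
for granule_dir in packaged_root.iterdir():
    if granule_dir.is_dir():
        eo3_doc = create_eo3(granule_dir, granule_dir.name)
        print(eo3_doc.id, eo3_doc.product.name)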