Example #1
def test_valid_document_works(tmp_path: Path, example_metadata: Dict):
    generated_doc = dump_roundtrip(example_metadata)

    # Do a serialisation roundtrip and check that it's still identical.
    reserialised_doc = dump_roundtrip(
        serialise.to_doc(serialise.from_doc(generated_doc)))

    assert_same(generated_doc, reserialised_doc)

    assert serialise.from_doc(generated_doc) == serialise.from_doc(
        reserialised_doc)
Example #2
def assert_unchanged_after_roundstrip(doc: Dict):
    generated_doc = dump_roundtrip(doc)

    # Do a serialisation roundtrip and check that it's still identical.
    reserialised_doc = dump_roundtrip(
        serialise.to_doc(serialise.from_doc(generated_doc)))

    # One allowed difference: input dates can be in many string formats,
    # but we normalise them to include a timezone (UTC by default).
    _normalise_datetime_props(generated_doc)

    assert serialise.from_doc(generated_doc) == serialise.from_doc(
        reserialised_doc)
Example #3
def assert_expected_eo3_path(
        expected_doc: Dict,
        expected_path: Path,
        ignore_fields=(),
):
    """
    Check an output path of an EO3 dataset matches an expected document.

    This is slightly smarter about comparisons (eg. geometry equality) within the document.
    """
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)
    assert expected_path.exists(), (
        f"Expected output EO3 path doesn't exist: {expected_path}"
    )
    assert_same_as_file(
        expected_doc,
        expected_path,
        # We check the geometry below
        ignore_fields=("geometry", ) + tuple(ignore_fields),
    )

    if "geometry" not in ignore_fields:
        # Compare geometry after parsing, rather than comparing the raw dict values.
        produced_dataset = serialise.from_path(expected_path)
        expected_dataset = serialise.from_doc(expected_doc,
                                              skip_validation=True)
        if expected_dataset.geometry is None:
            assert produced_dataset.geometry is None
        else:
            assert_shapes_mostly_equal(produced_dataset.geometry,
                                       expected_dataset.geometry, 0.00000001)
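
A brief usage sketch (not from the source above): the expected document, the output path and the ignored field are illustrative placeholders, and the code under test is elided.

def test_packaged_dataset_matches_expected(tmp_path: Path):
    expected = {
        "id": "00000000-0000-0000-0000-000000000000",  # placeholder id
        "product": {"name": "example_product"},
        "geometry": None,
    }
    output_path = tmp_path / "example.odc-metadata.yaml"
    # ... the packaging code under test writes `output_path` here ...
    assert_expected_eo3_path(expected, output_path, ignore_fields=("label",))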
Example #4
    def add_source_path(
        self,
        *paths: Path,
        classifier: str = None,
        auto_inherit_properties: bool = False,
    ):
        """
        Record a source dataset using the path to its metadata document.

        :param paths: Filesystem paths to the source datasets' metadata documents.

        See other parameters in :func:`DatasetAssembler.add_source_dataset`
        """
        for _, doc in find_and_read_documents(*paths):
            # Newer documents declare a schema.
            if "$schema" in doc:
                self.add_source_dataset(
                    serialise.from_doc(doc),
                    classifier=classifier,
                    auto_inherit_properties=auto_inherit_properties,
                )
            else:
                if auto_inherit_properties:
                    raise NotImplementedError(
                        "Can't (yet) inherit properties from old-style metadata"
                    )
                classifier = classifier or doc.get("product_type")
                if not classifier:
                    # TODO: This rule is a little obscure to force people to know.
                    #       We could somehow figure out from the product?
                    raise ValueError(
                        "Source dataset (of old-style eo) doesn't have a 'product_type' property (eg. 'level1', 'fc'), "
                        "you must specify a classifier for the kind of source dataset."
                    )
                self._lineage[classifier].append(doc["id"])
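
A brief usage sketch (not from the source above): `assembler` is assumed to be an already-constructed DatasetAssembler, and the path and classifier values are illustrative only.

# Old-style metadata without a 'product_type' needs an explicit classifier.
assembler.add_source_path(
    Path("/data/level1/usgs_ls8_level1.odc-metadata.yaml"),
    classifier="level1",
)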
Example #5
def process_dataset(s3_obj):
    # Derive the STAC output path and dataset folder from the eo3 metadata URL,
    # and the path of the original metadata document on the NCI filesystem.
    s3_eo3_path = s3_obj.url
    s3_stac_path = s3_eo3_path.replace("eo3", "stac").replace("yaml", "json")
    s3_path = s3_eo3_path.replace("eo3-ARD-METADATA.yaml", "")
    granule = os.path.join(*s3_eo3_path.split('/')[5:-1])
    nci_path = os.path.join(NCI_DIR, granule, "ARD-METADATA.yaml")
    
    if "S2A_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2A"
    elif "S2B_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2B"
    else:
        raise ValueError(
            f"Expected granule id to contain either 'S2A_OPER_MSI_ARD' or 'S2B_OPER_MSI_ARD', found '{granule}'"
        )
    
    # Read the original (eo) metadata from NCI and the eo3 metadata from S3.
    with open(nci_path) as fin:
        eo_metadata = yaml.safe_load(fin)
    eo3_metadata = yaml.safe_load(s3_obj.data)

    # Copy selected properties from the original metadata into the eo3 document.
    eo3_metadata["properties"]["odc:region_code"] = eo_metadata["provider"]["reference_code"]
    eo3_metadata["properties"]["gqa:cep90"] = eo_metadata["gqa"]["residual"]["cep90"]
    eo3_metadata["properties"]["gqa:error_message"] = eo_metadata["gqa"]["error_message"]
    eo3_metadata["properties"]["gqa:final_gcp_count"] = eo_metadata["gqa"]["final_gcp_count"]
    eo3_metadata["properties"]["gqa:ref_source"] = eo_metadata["gqa"]["ref_source"]
    eo3_metadata["properties"]["sentinel:datatake_start_datetime"] = granule.split("_")[-4]
    eo3_metadata["properties"]["eo:platform"] = platform
    eo3_metadata["properties"]["eo:instrument"] = "MSI"
    
    for key in ["abs_iterative_mean", "abs", "iterative_mean", "iterative_stddev", "mean", "stddev"]:
        eo3_metadata["properties"][f"gqa:{key}_xy"] = eo_metadata["gqa"]["residual"][key]["xy"]

    # Convert to a DatasetDoc and build the corresponding STAC item.
    eo3 = serialise.from_doc(eo3_metadata)
    stac = to_stac_item(
        eo3,
        stac_item_destination_url=s3_stac_path,
        odc_dataset_metadata_url=s3_eo3_path,
        dataset_location=s3_path,
    )
    stac_dump = json.dumps(stac, default=json_fallback, indent=4)
    eo3_dump = yaml.safe_dump(eo3_metadata, default_flow_style=False)

    # Upload the updated eo3 yaml and the STAC json back to S3.
    s3_dump(
        eo3_dump,
        s3_eo3_path,
        ACL="bucket-owner-full-control",
        ContentType="text/vnd.yaml",
    )

    s3_dump(
        stac_dump, 
        s3_stac_path, 
        ACL="bucket-owner-full-control",
        ContentType="application/json"
    )
Example #6
def test_location_single_serialisation(tmp_path: Path,
                                       l1_ls8_folder_md_expected: Dict):

    # Always serialises a single location as 'location'
    location = "https://some/test/path"

    # Given the plural 'locations' field (with a single entry)
    l1_ls8_folder_md_expected["locations"] = [location]

    reserialised_doc = dump_roundtrip(
        serialise.to_doc(serialise.from_doc(l1_ls8_folder_md_expected)))

    # We get the singular 'location' field back
    assert reserialised_doc["location"] == location
    assert "locations" not in reserialised_doc
Example #7
def get_dataset_file_offsets(dataset: Dataset) -> Dict[str, str]:
    """
    Get (usually relative) paths for all known files of a dataset.

    Returns a dict of {name: url}.
    """

    # Get paths to measurements (usually relative, but may not be)
    uri_list = {
        name: m["path"]
        for name, m in dataset.measurements.items() if m.get("path")
    }

    # Add accessories too, if possible
    if is_doc_eo3(dataset.metadata_doc):
        dataset_doc = serialise.from_doc(dataset.metadata_doc,
                                         skip_validation=True)
        uri_list.update(
            {name: a.path
             for name, a in dataset_doc.accessories.items()})

    return uri_list
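
A small usage sketch, assuming an open datacube index; the dataset id is a placeholder and the offsets shown in the comment are purely illustrative.

import datacube

dc = datacube.Datacube()
dataset = dc.index.datasets.get("00000000-0000-0000-0000-000000000000")
offsets = get_dataset_file_offsets(dataset)
# eg. {"red": "band04.tif", "checksum:sha1": "package.sha1"}
for name, url in offsets.items():
    print(name, url)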
Example #8
def check_prepare_outputs(
    invoke_script,
    run_args,
    expected_doc: Dict,
    expected_metadata_path: Path,
    ignore_fields=(),
):
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)
    run_prepare_cli(invoke_script, *run_args)

    assert expected_metadata_path.exists()
    assert_same_as_file(
        expected_doc,
        expected_metadata_path,
        # We check the geometry below
        ignore_fields=("geometry",) + tuple(ignore_fields),
    )

    # Compare geometry after parsing, rather than comparing the raw dict values.
    produced_dataset = serialise.from_path(expected_metadata_path)
    expected_dataset = serialise.from_doc(expected_doc, skip_validation=True)
    assert_shapes_mostly_equal(
        produced_dataset.geometry, expected_dataset.geometry, 0.00000001
    )
Example #9
def l1_ls8_dataset(l1_ls8_folder_md_expected: Dict) -> DatasetDoc:
    return serialise.from_doc(l1_ls8_folder_md_expected)
Example #10
def as_stac_item(dataset: DatasetItem):
    """
    Get a dict corresponding to a stac item
    """
    ds: Dataset = dataset.odc_dataset

    if ds is not None and is_doc_eo3(ds.metadata_doc):
        dataset_doc = serialise.from_doc(ds.metadata_doc, skip_validation=True)
        dataset_doc.locations = ds.uris

        # Geometry is optional in eo3, and needs to be calculated from grids if missing.
        # We can use ODC's own calculation that happens on index.
        if dataset_doc.geometry is None:
            fallback_extent = ds.extent
            if fallback_extent is not None:
                dataset_doc.geometry = fallback_extent.geom
                dataset_doc.crs = str(ds.crs)

        if ds.sources:
            dataset_doc.lineage = {
                classifier: [d.id]
                for classifier, d in ds.sources.items()
            }
        # Does ODC still put legacy lineage into indexed documents?
        elif ("source_datasets" in dataset_doc.lineage) and len(
                dataset_doc.lineage) == 1:
            # From old to new lineage type.
            dataset_doc.lineage = {
                classifier: [dataset["id"]]
                for classifier, dataset in
                dataset_doc.lineage["source_datasets"].items()
            }

    else:
        # eo1 to eo3

        dataset_doc = DatasetDoc(
            id=dataset.dataset_id,
            # Filled-in below.
            label=None,
            product=ProductDoc(dataset.product_name),
            locations=ds.uris if ds is not None else None,
            crs=str(dataset.geometry.crs),
            geometry=dataset.geometry.geom,
            grids=None,
            # TODO: Convert these from stac to eo3
            properties=Eo3Dict({
                "datetime":
                utc(dataset.center_time),
                **(dict(_build_properties(ds.metadata)) if ds else {}),
                "odc:processing_datetime":
                utc(dataset.creation_time),
            }),
            measurements={
                name: _band_to_measurement(
                    b,
                    dataset_location=ds.uris[0]
                    if ds is not None and ds.uris else None,
                )
                for name, b in ds.measurements.items()
            } if ds is not None else {},
            accessories=_accessories_from_eo1(ds.metadata_doc)
            if ds is not None else {},
            # TODO: Fill in lineage. The datacube API only gives us full datasets, which is
            #       expensive. We only need a list of IDs here.
            lineage={},
        )

    if dataset_doc.label is None and ds is not None:
        dataset_doc.label = _utils.dataset_label(ds)

    item_doc = eo3stac.to_stac_item(
        dataset=dataset_doc,
        stac_item_destination_url=url_for(
            ".item",
            collection=dataset.product_name,
            dataset_id=dataset.dataset_id,
        ),
        odc_dataset_metadata_url=url_for("dataset.raw_doc",
                                         id_=dataset.dataset_id),
        explorer_base_url=url_for("default_redirect"),
    )
    # Add the region code that Explorer inferred.
    # (Explorer's region codes predate ODC's and support
    #  many more products.)
    item_doc["properties"]["cubedash:region_code"] = dataset.region_code

    return item_doc
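
For reference, a minimal sketch of the underlying eo3-to-STAC conversion this view relies on; `eo3_doc` is an assumed eo3 metadata dict loaded elsewhere, and the URLs are placeholders (the real code builds them with url_for, as above).

dataset_doc = serialise.from_doc(eo3_doc, skip_validation=True)
item = eo3stac.to_stac_item(
    dataset=dataset_doc,
    stac_item_destination_url="https://example.com/stac/item.json",
    odc_dataset_metadata_url="https://example.com/dataset/raw",
    explorer_base_url="https://example.com/",
)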
Example #11
def validate_dataset(
    doc: Dict,
    product_definition: Optional[Dict] = None,
    thorough: bool = False,
    readable_location: Union[str, Path] = None,
    expect_extra_measurements: bool = False,
) -> ValidationMessages:
    """
    Validate a dataset document, optionally against the given product.

    By default this will only look at the metadata, run with thorough=True to
    open the data files too.

    :param product_definition: Optionally check that the dataset matches this product definition.
    :param thorough: Open the imagery too, to check that data types etc match.
    :param readable_location: Dataset location to use, if not the metadata path.
    :param expect_extra_measurements:
            Allow some dataset measurements to be missing from the product definition.
            This is (deliberately) allowed by ODC, but often a mistake.
            This flag disables the warning.
    """
    schema = doc.get("$schema")
    if schema is None:
        yield _error(
            "no_schema",
            f"No $schema field. "
            f"You probably want an ODC dataset schema {model.ODC_DATASET_SCHEMA_URL!r}",
        )
        return
    if schema != model.ODC_DATASET_SCHEMA_URL:
        yield _error(
            "unknown_doc_type",
            f"Unknown doc schema {schema!r}. Only ODC datasets are supported ({model.ODC_DATASET_SCHEMA_URL!r})",
        )
        return

    has_doc_errors = False
    for error in serialise.DATASET_SCHEMA.iter_errors(doc):
        has_doc_errors = True
        displayable_path = ".".join(error.absolute_path)

        hint = None
        if displayable_path == "crs" and "not of type" in error.message:
            hint = "epsg codes should be prefixed with 'epsg:1234'"

        context = f"({displayable_path}) " if displayable_path else ""
        yield _error("structure", f"{context}{error.message} ", hint=hint)

    if has_doc_errors:
        return

    dataset = serialise.from_doc(doc, skip_validation=True)

    if not dataset.product.href:
        yield _info("product_href", "A url (href) is recommended for products")

    yield from _validate_geo(dataset)

    # Note that a dataset may have no measurements (eg. telemetry data).
    # (TODO: a stricter mode for when we know we should have geo and measurement info)
    if dataset.measurements:
        for name, measurement in dataset.measurements.items():
            grid_name = measurement.grid
            if grid_name != "default" or dataset.grids:
                if grid_name not in dataset.grids:
                    yield _error(
                        "invalid_grid_ref",
                        f"Measurement {name!r} refers to unknown grid {grid_name!r}",
                    )

            if is_absolute(measurement.path):
                yield _warning(
                    "absolute_path",
                    f"measurement {name!r} has an absolute path: {measurement.path!r}",
                )

    yield from _validate_stac_properties(dataset)

    required_measurements: Dict[str, ExpectedMeasurement] = {}
    if product_definition is not None:
        required_measurements.update({
            m.name: m
            for m in map(
                ExpectedMeasurement.from_definition,
                product_definition.get("measurements") or (),
            )
        })

        product_name = product_definition.get("name")
        if product_name != dataset.product.name:
            # This is only informational as it's possible products may be indexed with finer-grained
            # categories than the original datasets: eg. a separate "nrt" product, or test product.
            yield _info(
                "product_mismatch",
                f"Dataset product name {dataset.product.name!r} "
                f"does not match the given product ({product_name!r}",
            )

        for name in required_measurements:
            if name not in dataset.measurements.keys():
                yield _error(
                    "missing_measurement",
                    f"Product {product_name} expects a measurement {name!r})",
                )
        measurements_not_in_product = set(
            dataset.measurements.keys()).difference(
                set(m["name"]
                    for m in product_definition.get("measurements") or ()))
        if (not expect_extra_measurements) and measurements_not_in_product:
            things = ", ".join(sorted(measurements_not_in_product))
            yield _warning(
                "extra_measurements",
                f"Dataset has measurements not present in product definition for {product_name!r}: {things}",
                hint=
                "This may be valid, as it's allowed by ODC. Set `expect_extra_measurements` to mute this.",
            )

    # If we have a location:
    # For each measurement, try to load it.
    # If loadable:
    if thorough:
        for name, measurement in dataset.measurements.items():
            full_path = uri_resolve(readable_location, measurement.path)
            expected_measurement = required_measurements.get(name)

            band = measurement.band or 1
            with rasterio.open(full_path) as ds:
                ds: DatasetReader

                if band not in ds.indexes:
                    yield _error(
                        "incorrect_band",
                        f"Measurement {name!r} file contains no rio index {band!r}.",
                        hint=f"contains indexes {ds.indexes!r}",
                    )
                    continue

                if not expected_measurement:
                    # The measurement is not in the product definition
                    #
                    # This is only informational because a product doesn't have to define all
                    # measurements that the datasets contain.
                    #
                    # This is historically because dataset documents reflect the measurements that
                    # are stored on disk, which can differ. But products define the set of measurements
                    # that are mandatory in every dataset.
                    #
                    # (datasets differ when, for example, sensors go offline, or when there's on-disk
                    #  measurements like panchromatic that GA doesn't want in their product definitions)
                    if required_measurements:
                        yield _info(
                            "unspecified_measurement",
                            f"Measurement {name} is not in the product",
                        )
                else:
                    expected_dtype = expected_measurement.dtype
                    band_dtype = ds.dtypes[band - 1]
                    # TODO: NaN handling
                    if expected_dtype != band_dtype:
                        yield _error(
                            "different_dtype",
                            f"{name} dtype: "
                            f"product {expected_dtype!r} != dataset {band_dtype!r}",
                        )

                    # TODO: the nodata can also be a fill value, as mentioned by Kirill.
                    expected_nodata = expected_measurement.nodata
                    ds_nodata = ds.nodatavals[band - 1]
                    if expected_nodata != ds_nodata and not (
                            _is_nan(expected_nodata) and _is_nan(ds_nodata)):
                        yield _info(
                            "different_nodata",
                            f"{name} nodata: "
                            f"product {expected_nodata !r} != dataset {ds_nodata !r}",
                        )
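
A minimal consumption sketch, assuming `doc` is an eo3 document loaded elsewhere (eg. with yaml.safe_load) and that ValidationMessage exposes level, code, reason and hint attributes matching its constructor arguments.

for message in validate_dataset(doc, product_definition=None):
    print(f"{message.level.name}: {message.code}: {message.reason}")
    if message.hint:
        print(f"  hint: {message.hint}")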
Example #12
def prepare_and_write(
    dataset_location: Path,
    output_yaml: Path,
    producer: str,
    granule_id: str = None,
    embed_location: bool = None,
) -> Tuple[DatasetDoc, Path]:
    if embed_location is None:
        # Default to embedding the location if they're not in the same folder.
        embed_location = output_yaml.parent not in dataset_location.parents
        _LOG.debug(
            "Auto-embed location?",
            auto_embed=bool(embed_location),
            data_location=dataset_location.parent,
            yaml_location=output_yaml.parent,
        )

    with DatasetPrepare(
            metadata_path=output_yaml,
            dataset_location=dataset_location,
    ) as p:
        p.properties["odc:producer"] = producer

        if producer == "esa.int":
            jp2_offsets = _extract_esa_fields(dataset_location,
                                              p,
                                              granule_id=granule_id)
        elif producer == "sinergise.com":
            jp2_offsets = _extract_sinergise_fields(dataset_location.parent, p)
        else:
            raise NotImplementedError(
                f"Unknown s2 producer {producer}. Expected 'sinergise.com' or 'esa.int'"
            )

        p.dataset_id = _get_stable_id(p)

        p.platform = _get_platform_name(p.properties)
        p.instrument = "MSI"
        p.constellation = "sentinel-2"

        # TODO: How to read collection number from metadata? (once ESA etc add one)
        collection_number = 0
        p.dataset_version = f"{collection_number}.0.{p.processed:%Y%m%d}"

        p.properties["odc:file_format"] = "JPEG2000"
        p.product_family = "level1"

        for path in jp2_offsets:
            band_number = _extract_band_number(path.stem)
            if band_number.lower() in ("tci", "pvi", "preview"):
                continue
            if band_number not in SENTINEL_MSI_BAND_ALIASES:
                raise RuntimeError(
                    f"Unknown band number {band_number!r} in image {path}")

            p.note_measurement(
                path=path,
                name=SENTINEL_MSI_BAND_ALIASES[band_number],
                relative_to_dataset_location=True,
            )

        dataset_id, metadata_path = p.done(embed_location=embed_location)
        doc = serialise.from_doc(p.written_dataset_doc,
                                 skip_validation=True,
                                 normalise_properties=False)
        if not doc.locations:
            doc.locations = [names.resolve_location(dataset_location)]
        return doc, metadata_path
Example #13
def validate_dataset(
    doc: Dict,
    product_definition: Optional[Dict] = None,
    metadata_type_definition: Optional[Dict] = None,
    thorough: bool = False,
    readable_location: Union[str, Path] = None,
    expect: ValidationExpectations = None,
) -> ValidationMessages:
    """
    Validate a dataset document, optionally against the given product.

    By default this will only look at the metadata, run with thorough=True to
    open the data files too.

    :param product_definition: Optionally check that the dataset matches this product definition.
    :param thorough: Open the imagery too, to check that data types etc match.
    :param readable_location: Dataset location to use, if not the metadata path.
    :param expect: Where can we be lenient in validation?
    """
    validation_context = {}
    expect = expect or ValidationExpectations()
    if metadata_type_definition is not None:
        expect = expect.with_document_overrides(metadata_type_definition)
        validation_context["type"] = metadata_type_definition["name"]
    if product_definition is not None:
        expect = expect.with_document_overrides(product_definition)
        validation_context["product"] = product_definition["name"]

    # noinspection PyShadowingNames
    def _info(code: str, reason: str, hint: str = None):
        return ValidationMessage(Level.info,
                                 code,
                                 reason,
                                 hint=hint,
                                 context=validation_context)

    # noinspection PyShadowingNames
    def _warning(code: str, reason: str, hint: str = None):
        return ValidationMessage(Level.warning,
                                 code,
                                 reason,
                                 hint=hint,
                                 context=validation_context)

    # noinspection PyShadowingNames
    def _error(code: str, reason: str, hint: str = None):
        return ValidationMessage(Level.error,
                                 code,
                                 reason,
                                 hint=hint,
                                 context=validation_context)

    schema = doc.get("$schema")
    if schema is None:
        yield _error(
            "no_schema",
            f"No $schema field. "
            f"You probably want an ODC dataset schema {model.ODC_DATASET_SCHEMA_URL!r}",
        )
        return
    if schema != model.ODC_DATASET_SCHEMA_URL:
        yield _error(
            "unknown_doc_type",
            f"Unknown doc schema {schema!r}. Only ODC datasets are supported ({model.ODC_DATASET_SCHEMA_URL!r})",
        )
        return

    has_doc_errors = False
    for error in serialise.DATASET_SCHEMA.iter_errors(doc):
        has_doc_errors = True
        displayable_path = ".".join(error.absolute_path)

        hint = None
        if displayable_path == "crs" and "not of type" in error.message:
            hint = "epsg codes should be prefixed with 'epsg:1234'"

        context = f"({displayable_path}) " if displayable_path else ""
        yield _error("structure", f"{context}{error.message} ", hint=hint)

    if has_doc_errors:
        return

    dataset = serialise.from_doc(doc, skip_validation=True)

    if not dataset.product.href:
        yield _info("product_href", "A url (href) is recommended for products")

    yield from _validate_geo(dataset, expect_geometry=expect.require_geometry)

    # Note that a dataset may have no measurements (eg. telemetry data).
    # (TODO: a stricter mode for when we know we should have geo and measurement info)
    if dataset.measurements:
        for name, measurement in dataset.measurements.items():
            grid_name = measurement.grid
            if grid_name != "default" or dataset.grids:
                if grid_name not in dataset.grids:
                    yield _error(
                        "invalid_grid_ref",
                        f"Measurement {name!r} refers to unknown grid {grid_name!r}",
                    )

            if is_absolute(measurement.path):
                yield _warning(
                    "absolute_path",
                    f"measurement {name!r} has an absolute path: {measurement.path!r}",
                )

    yield from _validate_stac_properties(dataset)

    required_measurements: Dict[str, ExpectedMeasurement] = {}
    if product_definition is not None:
        required_measurements.update({
            m.name: m
            for m in map(
                ExpectedMeasurement.from_definition,
                product_definition.get("measurements") or (),
            )
        })

        product_name = product_definition.get("name")
        if product_name != dataset.product.name:
            # This is only informational as it's possible products may be indexed with finer-grained
            # categories than the original datasets: eg. a separate "nrt" product, or test product.
            yield _info(
                "product_mismatch",
                f"Dataset product name {dataset.product.name!r} "
                f"does not match the given product ({product_name!r}",
            )

        for name in required_measurements:
            if name not in dataset.measurements.keys():
                yield _error(
                    "missing_measurement",
                    f"Product {product_name} expects a measurement {name!r})",
                )
        measurements_not_in_product = set(
            dataset.measurements.keys()).difference({
                m["name"]
                for m in product_definition.get("measurements") or ()
            })
        # Remove the measurements that are allowed to be extra.
        measurements_not_in_product.difference_update(
            expect.allow_extra_measurements or set())

        if measurements_not_in_product:
            things = ", ".join(sorted(measurements_not_in_product))
            yield _warning(
                "extra_measurements",
                f"Dataset has measurements not present in product definition for {product_name!r}: {things}",
                hint="This may be valid, as it's allowed by ODC. "
                "Set `allow_extra_measurements` in the validation expectations to mute this.",
            )

    if metadata_type_definition:
        # Datacube does certain transforms on an eo3 doc before storage.
        # We need to do the same, as the fields will be read from the storage.
        prepared_doc = prep_eo3(doc)

        all_nullable_fields = tuple(expect.allow_nullable_fields) + tuple(
            expect.allow_missing_fields)
        for field_name, offsets in _get_field_offsets(
                metadata_type=metadata_type_definition):
            if (
                    # If a field is required...
                (field_name not in expect.allow_missing_fields) and
                    # ... and none of its offsets are in the document
                    not any(
                        _has_offset(prepared_doc, offset)
                        for offset in offsets)):
                # ... warn them.
                product_name = (product_definition.get("name") if
                                product_definition else dataset.product.name)
                readable_offsets = " or ".join("->".join(offset)
                                               for offset in offsets)
                yield _warning(
                    "missing_field",
                    f"Dataset is missing field {field_name!r} "
                    f"for type {metadata_type_definition['name']!r}",
                    hint=f"Expected at {readable_offsets}",
                )
                continue

            if field_name not in all_nullable_fields:
                value = None
                for offset in offsets:
                    value = toolz.get_in(offset, prepared_doc)
                    if value is not None:
                        break
                if value is None:
                    yield _info(
                        "null_field",
                        f"Value is null for configured field {field_name!r}",
                    )

    dataset_location = (
        dataset.locations[0] if dataset.locations else readable_location
    )

    # If we have a location:
    # For each measurement, try to load it.
    # If loadable:
    if thorough:
        for name, measurement in dataset.measurements.items():
            full_path = uri_resolve(dataset_location, measurement.path)
            expected_measurement = required_measurements.get(name)

            band = measurement.band or 1
            with rasterio.open(full_path) as ds:
                ds: DatasetReader

                if band not in ds.indexes:
                    yield _error(
                        "incorrect_band",
                        f"Measurement {name!r} file contains no rio index {band!r}.",
                        hint=f"contains indexes {ds.indexes!r}",
                    )
                    continue

                if not expected_measurement:
                    # The measurement is not in the product definition
                    #
                    # This is only informational because a product doesn't have to define all
                    # measurements that the datasets contain.
                    #
                    # This is historically because dataset documents reflect the measurements that
                    # are stored on disk, which can differ. But products define the set of measurements
                    # that are mandatory in every dataset.
                    #
                    # (datasets differ when, for example, sensors go offline, or when there's on-disk
                    #  measurements like panchromatic that GA doesn't want in their product definitions)
                    if required_measurements:
                        yield _info(
                            "unspecified_measurement",
                            f"Measurement {name} is not in the product",
                        )
                else:
                    expected_dtype = expected_measurement.dtype
                    band_dtype = ds.dtypes[band - 1]
                    # TODO: NaN handling
                    if expected_dtype != band_dtype:
                        yield _error(
                            "different_dtype",
                            f"{name} dtype: "
                            f"product {expected_dtype!r} != dataset {band_dtype!r}",
                        )

                    ds_nodata = ds.nodatavals[band - 1]

                    # If the dataset is missing 'nodata', we can allow anything in product 'nodata'.
                    # (In ODC, nodata might be a fill value for loading data.)
                    if ds_nodata is None:
                        continue

                    # Otherwise check that nodata matches.
                    expected_nodata = expected_measurement.nodata
                    if expected_nodata != ds_nodata and not (
                            _is_nan(expected_nodata) and _is_nan(ds_nodata)):
                        yield _error(
                            "different_nodata",
                            f"{name} nodata: "
                            f"product {expected_nodata !r} != dataset {ds_nodata !r}",
                        )
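
A sketch of a more lenient validation call, assuming ValidationExpectations accepts as keyword arguments the attributes used above (eg. allow_extra_measurements); the measurement name and input documents are illustrative.

expect = ValidationExpectations(allow_extra_measurements=["panchromatic"])
messages = list(
    validate_dataset(
        doc,
        product_definition=product_definition,
        metadata_type_definition=metadata_type_definition,
        expect=expect,
    )
)
errors = [m for m in messages if m.level == Level.error]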
Example #14
def update_metadata(
        nci_metadata_file, s3_bucket, s3_base_url, explorer_base_url, sns_topic, s3_path
):
    """
    Upload the updated metadata (with the NBAR measurements removed) and an updated
    checksum file, create a STAC document, and publish an SNS message.

    :param nci_metadata_file: Path of metadata file in NCI
    :param s3_bucket: Name of S3 bucket
    :param s3_base_url: Base URL of the S3 bucket
    :param explorer_base_url: Base URL of the explorer
    :param sns_topic: ARN of the SNS topic
    :param s3_path: Path in S3
    :return: List of errors
    """
    # Initialise error list
    metadata_error_list = []
    # Initialise checksum list
    new_checksum_list = {}

    nci_metadata_file_path = Path(nci_metadata_file)
    temp_metadata = serialise.load_yaml(nci_metadata_file_path)

    # Delete the NBAR-related measurements and accessories.
    # Because Landsat 8 is different, we need to check whether each field exists
    # before removing it.
    for measurement in (
        "nbar_blue",
        "nbar_green",
        "nbar_nir",
        "nbar_red",
        "nbar_swir_1",
        "nbar_swir_2",
        "nbar_coastal_aerosol",
        "nbar_panchromatic",
        "oa_nbar_contiguity",
    ):
        temp_metadata["measurements"].pop(measurement, None)
    temp_metadata["accessories"].pop("thumbnail:nbar", None)

    # Format an eo3 dataset dict for human-readable yaml serialisation.
    temp_metadata = serialise.prepare_formatting(temp_metadata)

    # Dump metadata yaml into buffer
    with io.BytesIO() as temp_yaml:
        serialise.dumps_yaml(temp_yaml, temp_metadata)
        temp_yaml.seek(
            0
        )  # Seek back to the beginning of the file before next read/write
        new_checksum_list[nci_metadata_file_path.name] = verify.calculate_hash(
            temp_yaml
        )

        # Write odc metadata yaml object into S3
        s3_metadata_file = f"{s3_path}/{nci_metadata_file_path.name}"
        try:
            upload_s3_resource(s3_bucket, s3_metadata_file, temp_yaml.getvalue())
            LOG.info(f"Finished uploading metadata to {s3_metadata_file}")
        except S3SyncException as exp:
            LOG.error(f"Failed uploading metadata to {s3_metadata_file} - {exp}")
            metadata_error_list.append(
                f"Failed uploading metadata to {s3_metadata_file} - {exp}"
            )

    # Create stac metadata
    name = nci_metadata_file_path.stem.replace(".odc-metadata", "")
    stac_output_file_path = nci_metadata_file_path.with_name(f"{name}.stac-item.json")
    stac_url_path = f"{s3_base_url if s3_base_url else boto3.client('s3').meta.endpoint_url}/{s3_path}/"
    item_doc = dc_to_stac(
        serialise.from_doc(temp_metadata),
        nci_metadata_file_path,
        stac_output_file_path,
        stac_url_path,
        explorer_base_url,
        True,
    )
    stac_dump = json.dumps(item_doc, indent=4, default=json_fallback)

    # Write stac json to buffer
    with io.BytesIO() as temp_stac:
        temp_stac.write(stac_dump.encode())
        temp_stac.seek(
            0
        )  # Seek back to the beginning of the file before next read/write
        new_checksum_list[stac_output_file_path.name] = verify.calculate_hash(temp_stac)

        # Write stac metadata json object into S3
        s3_stac_file = f"{s3_path}/{stac_output_file_path.name}"
        try:
            upload_s3_resource(s3_bucket, s3_stac_file, temp_stac.getvalue())
            LOG.info(f"Finished uploading STAC metadata to {s3_stac_file}")
        except S3SyncException as exp:
            LOG.error(f"Failed uploading STAC metadata to {s3_stac_file} - {exp}")
            metadata_error_list.append(
                f"Failed uploading STAC metadata to {s3_stac_file} - {exp}"
            )

    # Publish message containing STAC metadata to SNS Topic
    message_attributes = get_common_message_attributes(json.loads(stac_dump))
    message_attributes.update(
        {"action": {"DataType": "String", "StringValue": "ADDED"}}
    )
    try:
        publish_sns(sns_topic, stac_dump, message_attributes)
        LOG.info(f"Finished publishing SNS Message to SNS Topic {sns_topic}")
    except S3SyncException as exp:
        LOG.error(f"Failed publishing SNS Message to SNS Topic {sns_topic} - {exp}")
        metadata_error_list.append(
            f"Failed publishing SNS Message to SNS Topic {sns_topic} - {exp}"
        )

    # Update checksum file
    checksum_filename = nci_metadata_file_path.stem.replace(".odc-metadata", "")
    checksum_file_path = nci_metadata_file_path.with_name(f"{checksum_filename}.sha1")
    try:
        upload_checksum(
            nci_metadata_file_path,
            checksum_file_path,
            new_checksum_list,
            s3_bucket,
            s3_path,
        )
        LOG.info(
            f"Finished uploading checksum file " f"{s3_path}/{checksum_file_path.name}"
        )
    except S3SyncException as exp:
        LOG.error(
            f"Failed uploading checksum file "
            f"{s3_path}/{checksum_file_path.name} - {exp}"
        )
        metadata_error_list.append(
            f"Failed uploading checksum file "
            f"{s3_path}/{checksum_file_path.name} - {exp}"
        )

    return metadata_error_list
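
A call sketch: every argument below is an illustrative placeholder rather than a real NCI path, bucket, URL or ARN.

errors = update_metadata(
    nci_metadata_file="/g/data/example/scene/ga_ls8c_ard_3.odc-metadata.yaml",
    s3_bucket="example-bucket",
    s3_base_url="https://example-bucket.s3.ap-southeast-2.amazonaws.com",
    explorer_base_url="https://explorer.example.com",
    sns_topic="arn:aws:sns:ap-southeast-2:000000000000:example-topic",
    s3_path="example/path/in/bucket",
)
if errors:
    LOG.error(f"{len(errors)} upload/publish step(s) failed")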