def write_measurement_h5(
    p: DatasetAssembler,
    name: str,
    g: h5py.Dataset,
    overviews=images.DEFAULT_OVERVIEWS,
    overview_resampling=Resampling.nearest,
    expand_valid_data=True,
    file_id: str = None,
):
    """
    Write a measurement by copying it from an HDF5 dataset.
    """
    if hasattr(g, "chunks"):
        data = g[:]
    else:
        data = g

    p.write_measurement_numpy(
        name=name,
        array=data,
        grid_spec=images.GridSpec(
            shape=g.shape,
            transform=Affine.from_gdal(*g.attrs["geotransform"]),
            crs=CRS.from_wkt(g.attrs["crs_wkt"]),
        ),
        nodata=g.attrs.get("no_data_value"),
        overviews=overviews,
        overview_resampling=overview_resampling,
        expand_valid_data=expand_valid_data,
        file_id=file_id,
    )
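# Hypothetical usage sketch (not from the original source). It assumes an already-open
# DatasetAssembler `p`, and an HDF5 dataset carrying the attributes read above
# ("geotransform", "crs_wkt" and optionally "no_data_value"). The file and group
# paths are illustrative only.
def _example_copy_blue_band(p: DatasetAssembler, wagl_hdf5: Path):
    with h5py.File(wagl_hdf5, "r") as fid:
        band = fid["GRANULE/NBAR/BAND-2"]  # illustrative dataset path
        write_measurement_h5(
            p,
            "nbar:blue",
            band,
            overview_resampling=Resampling.average,
        )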
def _unpack_products(
    p: DatasetAssembler, product_list: Iterable[str], h5group: h5py.Group
) -> None:
    """
    Unpack and package the NBAR and NBART products.
    """
    # listing of all datasets of IMAGE CLASS type
    img_paths = _find_h5_paths(h5group, "IMAGE")

    for product in product_list:
        with do(f"Starting {product}", heading=True):
            for pathname in [
                pth for pth in img_paths if "/{}/".format(product.upper()) in pth
            ]:
                with do(f"Path {pathname!r}"):
                    dataset = h5group[pathname]
                    band_name = utils.normalise_band_name(dataset.attrs["alias"])
                    write_measurement_h5(
                        p,
                        f"{product}:{band_name}",
                        dataset,
                        overview_resampling=Resampling.average,
                        file_id=_file_id(dataset),
                    )

            if (p.platform, product) in _THUMBNAILS:
                red, green, blue = _THUMBNAILS[(p.platform, product)]
                with do(f"Thumbnailing {product}"):
                    p.write_thumbnail(
                        red, green, blue, kind=product, static_stretch=(1, 3000)
                    )
def _read_wagl_metadata(p: DatasetAssembler, granule_group: h5py.Group):
    try:
        wagl_path, *ancil_paths = [
            pth for pth in _find_h5_paths(granule_group, "SCALAR") if "METADATA" in pth
        ]
    except ValueError:
        raise ValueError("No nbar metadata found in granule")

    [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

    try:
        p.processed = get_path(wagl_doc, ("system_information", "time_processed"))
    except PathAccessError:
        raise ValueError(f"WAGL dataset contains no processed time. Path {wagl_path}")

    for i, path in enumerate(ancil_paths, start=2):
        wagl_doc.setdefault(f"wagl_{i}", {}).update(
            list(loads_yaml(granule_group[path][()]))[0]["ancillary"]
        )

    p.properties["dea:dataset_maturity"] = _determine_maturity(
        p.datetime, p.processed, wagl_doc
    )

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
def _write_thumbnail(task: AlchemistTask, dataset_assembler: DatasetAssembler):
    if task.settings.output.preview_image is not None:
        dataset_assembler.write_thumbnail(*task.settings.output.preview_image)
    elif task.settings.output.preview_image_singleband is not None:
        dataset_assembler.write_thumbnail_singleband(
            **task.settings.output.preview_image_singleband
        )
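# Illustrative settings shapes only (values are invented, and the singleband keyword
# names are assumptions, not taken from the original source): `preview_image` is
# unpacked as positional band names for write_thumbnail(), while
# `preview_image_singleband` is unpacked as keyword arguments for
# write_thumbnail_singleband().
_EXAMPLE_PREVIEW_IMAGE = ("red", "green", "blue")
_EXAMPLE_PREVIEW_IMAGE_SINGLEBAND = {"measurement": "water", "bit": 128}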
def _read_gqa_doc(p: DatasetAssembler, doc: Dict):
    _take_software_versions(p, doc)
    p.extend_user_metadata("gqa", doc)

    # TODO: more of the GQA fields?
    for k, v in _flatten_dict(doc["residual"], separator="_"):
        p.properties[f"gqa:{k}"] = v
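# Illustrative sketch of the flattening above (values invented), assuming
# _flatten_dict() yields (key, value) pairs whose nested keys are joined with the
# given separator:
#
#   _flatten_dict({"abs": {"x": 0.1, "y": 0.2}}, separator="_")
#   -> [("abs_x", 0.1), ("abs_y", 0.2)]
#
# which the loop above records as p.properties["gqa:abs_x"] = 0.1, and so on.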
def _read_fmask_doc(p: DatasetAssembler, doc: Dict):
    for name, value in doc["percent_class_distribution"].items():
        # From Josh: fmask cloud cover trumps the L1 cloud cover.
        if name == "cloud":
            del p.properties["eo:cloud_cover"]
            p.properties["eo:cloud_cover"] = value

        p.properties[f"fmask:{name}"] = value

    _take_software_versions(p, doc)
    p.extend_user_metadata("fmask", doc)
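# Illustrative fmask document shape only (class names, percentages, URL and version
# are invented, chosen to match the keys the function above reads):
_EXAMPLE_FMASK_DOC = {
    "percent_class_distribution": {
        "clear": 62.5,
        "cloud": 3.5,
        "cloud_shadow": 1.2,
        "snow": 0.0,
        "water": 32.8,
    },
    "software_versions": {
        "fmask": {"repo_url": "https://github.com/ubarsc/python-fmask.git", "version": "0.5.0"},
    },
}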
def test_complain_about_missing_fields(tmp_path: Path, l1_ls8_folder: Path):
    """
    It should complain immediately if I add a file without enough metadata to write
    the filename.

    (and with a friendly error message)
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    # Default simple naming conventions need at least a date and family...
    with pytest.raises(
        ValueError, match="Need more properties to fulfill naming conventions."
    ):
        with DatasetAssembler(out) as p:
            p.write_measurement("blue", blue_geotiff_path)

    # It should mention the field that's missing (we added a date, so product_family is needed)
    with DatasetAssembler(out) as p:
        with pytest.raises(ValueError, match="odc:product_family"):
            p.datetime = datetime(2019, 7, 4, 13, 7, 5)
            p.write_measurement("blue", blue_geotiff_path)

    # DEA naming conventions have stricter standards, and will tell you which fields you need to add.
    with DatasetAssembler(out, naming_conventions="dea") as p:
        # We set all the fields that work in default naming conventions.
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "quaternarius"
        p.processed_now()

        # These fields are mandatory for DEA, and so should be complained about.
        expected_extra_fields_needed = (
            "eo:platform",
            "eo:instrument",
            "odc:dataset_version",
            "odc:producer",
            "odc:region_code",
        )
        with pytest.raises(ValueError) as got_error:
            p.write_measurement("blue", blue_geotiff_path)

        # All needed fields should have been in the error message.
        for needed_field_name in expected_extra_fields_needed:
            assert needed_field_name in got_error.value.args[0], (
                f"Expected field {needed_field_name} to "
                f"be listed as mandatory in the error message"
            )
def assert_names_match(
    tmp_path: Path,
    # Given:
    conventions,
    properties: Mapping,
    # Then expect:
    expect_metadata_path: str = None,
    expect_label: str = None,
):
    """
    Easily test a set of naming conventions: do certain properties lead to the
    expected file names?
    """
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)

    with DatasetAssembler(tmp_path, naming_conventions=conventions) as p:
        p.properties.update(properties)

        dataset_id, metadata_path = p.done()

    if expect_metadata_path:
        metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
        assert metadata_path_offset == expect_metadata_path

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    if expect_label:
        assert doc["label"] == expect_label, "Unexpected dataset label"
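# Hypothetical usage sketch only (the property values are invented; the expected path
# and label follow the default-convention examples used elsewhere in these tests):
def test_names_match_example(tmp_path: Path):
    assert_names_match(
        tmp_path,
        conventions="default",
        properties={
            "odc:product_family": "quaternarius",
            "datetime": datetime(2019, 7, 4, 13, 7, 5),
            "odc:processing_datetime": datetime(2019, 7, 4, 13, 8, 7),
        },
        expect_metadata_path="quaternarius/2019/07/04/quaternarius_2019-07-04.odc-metadata.yaml",
        expect_label="quaternarius_2019-07-04",
    )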
def test_dea_interim_folder_calculation(tmp_path: Path):
    """
    DEA naming conventions should include maturity in the folder name
    when it's not a 'final' dataset.
    """
    with DatasetAssembler(tmp_path, naming_conventions="dea") as p:
        p.platform = "landsat-7"
        # Should not end up in the path, as it's the default:
        p.product_maturity = "stable"
        p.instrument = "ETM+"
        p.datetime = datetime(1998, 7, 30)
        p.product_family = "frogs"
        p.processed = "1999-11-20 00:00:53.152462Z"
        p.maturity = "interim"
        p.producer = "ga.gov.au"
        p.properties["landsat:landsat_scene_id"] = "LE70930821999324EDC00"
        p.dataset_version = "1.2.3"
        p.region_code = "093082"

        p.done()

    [metadata_path] = tmp_path.rglob("*.odc-metadata.yaml")
    calculated_path: Path = metadata_path.relative_to(tmp_path)

    assert calculated_path == Path(
        #                                 ⇩⇩⇩⇩⇩⇩⇩⇩ Adds interim flag
        "ga_ls7e_frogs_1/093/082/1998/07/30_interim/ga_ls7e_frogs_1-2-3_093082_1998-07-30_interim.odc-metadata.yaml"
    )
def _extract_sinergise_fields(path: Path, p: DatasetAssembler) -> Iterable[Path]:
    """Extract Sinergise metadata and return list of image offsets"""
    product_info_path = path / "productInfo.json"
    metadata_xml_path = path / "metadata.xml"

    if not product_info_path.exists():
        raise ValueError(
            "No productInfo.json file found. "
            "Are you sure the input is a sinergise dataset folder?"
        )

    p.properties.update(process_sinergise_product_info(product_info_path))
    p.add_accessory_file("metadata:sinergise_product_info", product_info_path)

    p.properties.update(process_tile_metadata(metadata_xml_path.read_text()))
    p.add_accessory_file("metadata:s2_tile", metadata_xml_path)

    # TODO: sinergise folders could `process_datastrip_metadata()` in an outer directory?

    return path.glob("*.jp2")
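# Illustrative folder layout only (band file names are invented): the function above
# expects a Sinergise/AWS-style tile folder shaped roughly like
#
#   <tile>/
#       productInfo.json        # read by process_sinergise_product_info()
#       metadata.xml            # read by process_tile_metadata()
#       B01.jp2, B02.jp2, ...   # per-band imagery returned by the glob above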
def test_africa_naming_conventions(tmp_path: Path):
    """
    Minimal fields needed for DEAfrica naming conventions
    """
    with DatasetAssembler(tmp_path, naming_conventions="deafrica") as p:
        # Just the fields listed in required_fields.
        p.producer = "digitalearthafrica.org"
        p.datetime = datetime(1998, 7, 30)
        p.region_code = "090081"
        p.product_family = "wofs"
        p.platform = "LANDSAT_8"
        p.processed_now()
        p.dataset_version = "0.1.2"

        dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "wofs_ls/0-1-2/090/081/1998/07/30/wofs_ls_090081_1998-07-30.odc-metadata.yaml"
    )

    with DatasetAssembler(tmp_path, naming_conventions="deafrica") as p:
        # Just the fields listed in required_fields.
        p.producer = "digitalearthafrica.org"
        p.datetime = datetime(1998, 7, 30)
        p.region_code = "090081"
        p.product_family = "fc"
        p.platform = "LANDSAT_8"
        p.processed_now()
        p.dataset_version = "0.1.2"

        dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "fc_ls/0-1-2/090/081/1998/07/30/fc_ls_090081_1998-07-30.odc-metadata.yaml"
    )
def _unpack_products(
    p: DatasetAssembler, product_list: Iterable[str], h5group: h5py.Group
) -> None:
    """
    Unpack and package the NBAR and NBART products.
    """
    # listing of all datasets of IMAGE CLASS type
    img_paths = _find_h5_paths(h5group, "IMAGE")

    for product in product_list:
        with sub_product(product, p):
            for pathname in [pth for pth in img_paths if f"/{product.upper()}/" in pth]:
                with do(f"Path {pathname!r}"):
                    dataset = h5group[pathname]
                    band_name = utils.normalise_band_name(dataset.attrs["alias"])
                    write_measurement_h5(
                        p,
                        f"{product}:{band_name}",
                        dataset,
                        overview_resampling=Resampling.average,
                        file_id=_file_id(dataset),
                    )

            if product in _THUMBNAILS:
                red, green, blue = _THUMBNAILS[product]
                with do(f"Thumbnailing {product}"):
                    p.write_thumbnail(
                        red,
                        green,
                        blue,
                        static_stretch=(1, 3000),
                        # Because of our strange sub-products and filename standards, we want the
                        # 'kind' to be included in the recorded thumbnail accessory metadata,
                        # but not in the filename.
                        # So we manually calculate a filename without the 'kind' field included.
                        kind=product,
                        path=p.names.thumbnail_filename(),
                    )
def test_dataset_multi_platform(tmp_path: Path):
    """Can we make a dataset derived from multiple platforms?"""

    # No platform is included in names when there's a mix.
    with DatasetAssembler(tmp_path) as p:
        p.platforms = ["Sentinel_2a", "landsat_7"]
        assert p.platform == "landsat-7,sentinel-2a"

        p.datetime = datetime(2019, 1, 1)
        p.product_family = "peanuts"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)
    assert doc["label"] == "peanuts_2019-01-01"

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert metadata_path_offset == "peanuts/2019/01/01/peanuts_2019-01-01.odc-metadata.yaml"

    # ... but show the platform abbreviation when there's a known group.
    with DatasetAssembler(tmp_path) as p:
        p.platforms = ["Sentinel_2a", "sentinel_2b"]
        assert p.platform == "sentinel-2a,sentinel-2b"

        p.datetime = datetime(2019, 1, 1)
        p.product_family = "peanuts"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)
    assert doc["label"] == "s2_peanuts_2019-01-01"

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "s2_peanuts/2019/01/01/s2_peanuts_2019-01-01.odc-metadata.yaml"
    )
def test_minimal_generated_naming_package(tmp_path: Path, l1_ls8_folder: Path):
    """
    What's the minimum number of fields we can set and still generate
    file/product names to produce a package?
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    with DatasetAssembler(out) as p:
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "quaternarius"
        p.processed_now()

        p.write_measurement("blue", blue_geotiff_path)

        # A friendly __str__ for notebook/terminal users:
        assert str(p) == dedent(
            f"""
            Assembling quaternarius (unfinished)
            - 1 measurements: blue
            - 4 properties: datetime, odc:file_format, odc:processing_datetime, odc:prod...
            Writing to location: {out}/quaternarius/2019/07/04/quaternarius_2019-07-04.odc-metadata.yaml
            """
        )

        # p.done() will validate the dataset and write it to the destination atomically.
        dataset_id, metadata_path = p.done()

    assert dataset_id is not None

    assert_file_structure(
        out,
        {
            "quaternarius": {
                "2019": {
                    "07": {
                        "04": {
                            # Set a dataset version to get rid of 'beta' label.
                            "quaternarius_2019-07-04.odc-metadata.yaml": "",
                            "quaternarius_2019-07-04.proc-info.yaml": "",
                            "quaternarius_2019-07-04_blue.tif": "",
                            "quaternarius_2019-07-04.sha1": "",
                        }
                    }
                }
            }
        },
    )
def _write_stac(
    metadata_path: Path,
    task: AlchemistTask,
    dataset_assembler: DatasetAssembler,
):
    out_dataset = serialise.from_path(metadata_path)
    stac_path = Path(str(metadata_path).replace("odc-metadata.yaml", "stac-item.json"))

    # Madness in deferred destination logic
    uri_base = dataset_assembler.names.destination_folder(
        Path(task.settings.output.location)
    )
    uri_base = str(uri_base) + "/"

    stac = dc_to_stac(
        out_dataset,
        metadata_path,
        stac_path,
        uri_base.replace("s3:/", "s3://"),
        task.settings.output.explorer_url,
        False,
    )

    with stac_path.open("w") as f:
        json.dump(stac, f, default=json_fallback)

    dataset_assembler.add_accessory_file("metadata:stac", stac_path)

    # dataset_assembler._checksum.write(dataset_assembler._accessories["checksum:sha1"])
    # Need a new checksummer because EODatasets is insane
    checksummer = PackageChecksum()
    checksum_file = (
        dataset_assembler._dataset_location
        / dataset_assembler._accessories["checksum:sha1"].name
    )
    checksummer.read(checksum_file)
    checksummer.add_file(stac_path)
    checksummer.write(checksum_file)

    return stac
def write_measurement_h5(
    p: DatasetAssembler,
    full_name: str,
    g: h5py.Dataset,
    overviews=images.DEFAULT_OVERVIEWS,
    overview_resampling=Resampling.nearest,
    expand_valid_data=True,
    file_id: str = None,
):
    """
    Write a measurement by copying it from an HDF5 dataset.
    """
    if hasattr(g, "chunks"):
        data = g[:]
    else:
        data = g

    product_name, band_name = full_name.split(":")
    p.write_measurement_numpy(
        array=data,
        grid_spec=images.GridSpec(
            shape=g.shape,
            transform=Affine.from_gdal(*g.attrs["geotransform"]),
            crs=CRS.from_wkt(g.attrs["crs_wkt"]),
        ),
        nodata=g.attrs.get("no_data_value"),
        overviews=overviews,
        overview_resampling=overview_resampling,
        expand_valid_data=expand_valid_data,
        file_id=file_id,
        # Because of our strange sub-products and filename standards, we want the
        # product_name to be included in the recorded band metadata,
        # but not in its filename.
        # So we manually calculate a filename without the extra product name prefix.
        name=full_name,
        path=p.names.measurement_filename(band_name, "tif", file_id=file_id),
    )
def test_dataset_no_measurements(tmp_path: Path):
    """Can we make a dataset with no measurements? (e.g. telemetry data)"""
    with DatasetAssembler(tmp_path) as p:
        # A custom label too.
        p.label = "chipmonk_sightings_2019"
        p.datetime = datetime(2019, 1, 1)
        p.product_family = "chipmonk_sightings"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "chipmonk_sightings_2019", "Couldn't override label field"
def test_minimal_s2_dataset(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-2a"
        p.instrument = "msi"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "blueberries"
        p.processed = "2018-11-05T12:23:23"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "s2am_blueberries_2018-11-04", "Unexpected dataset label"
def test_custom_naming(tmp_path: Path):
    """
    We can create naming conventions separately, and later give them to the assembler.
    """
    p = _basic_properties_set()
    convention = namer(properties=p)
    convention.dataset_folder = Path("my/custom/folder/")

    with DatasetAssembler(tmp_path, names=convention) as a:
        dataset_id, metadata_path = a.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "my/custom/folder/ga_s2am_tester_1-2-3_023543_2013-02-03.odc-metadata.yaml"
    )
def test_minimal_s1_dataset(tmp_path: Path):
    """A minimal dataset with sentinel-1a/b platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-1a"
        p.instrument = "c-sar"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "bck"
        p.processed = "2018-11-05T12:23:23"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.safe_load(f)

    assert doc["label"] == "s1ac_bck_2018-11-04", "Unexpected dataset label"
def test_dataset_given_properties(tmp_path: Path):
    """Can we give existing properties to the assembler?"""
    properties = {
        "datetime": datetime(2019, 1, 1),
        "odc:product_family": "chipmonk_sightings",
        "odc:processing_datetime": "2021-06-15T01:33:43.378850",
    }
    names = namer(properties=properties)
    with DatasetAssembler(tmp_path, names=names) as p:
        # It should have normalised properties!
        assert p.processed == datetime(2021, 6, 15, 1, 33, 43, 378850, timezone.utc)

        dataset_id, metadata_path = p.done()

    relative_path = metadata_path.relative_to(tmp_path)
    assert relative_path == Path(
        "chipmonk_sightings/2019/01/01/chipmonk_sightings_2019-01-01.odc-metadata.yaml"
    )
def prepare_and_write(
    ds_path: Path,
    output_yaml: Path,
    producer: str,
) -> Tuple[uuid.UUID, Path]:
    with DatasetAssembler(
        metadata_path=output_yaml,
        dataset_location=ds_path,
    ) as p:
        p.properties["odc:producer"] = producer

        if producer == "esa.int":
            jp2_offsets = _extract_esa_fields(ds_path, p)
        elif producer == "sinergise.com":
            jp2_offsets = _extract_sinergise_fields(ds_path, p)
        else:
            raise NotImplementedError(
                f"Unknown s2 producer {producer}. Expected 'sinergise.com' or 'esa.int'"
            )

        p.dataset_id = _get_stable_id(p)
        p.properties["eo:platform"] = _get_platform_name(p.properties)
        p.properties["eo:instrument"] = "MSI"
        p.properties["odc:dataset_version"] = f"1.0.{p.processed:%Y%m%d}"

        p.properties["odc:file_format"] = "JPEG2000"
        p.properties["odc:product_family"] = "level1"

        for path in jp2_offsets:
            band_number = _extract_band_number(path.stem)
            if band_number.lower() in ("tci", "pvi", "preview"):
                continue
            if band_number not in SENTINEL_MSI_BAND_ALIASES:
                raise RuntimeError(
                    f"Unknown band number {band_number!r} in image {path}"
                )

            p.note_measurement(
                path=path,
                name=SENTINEL_MSI_BAND_ALIASES[band_number],
                relative_to_dataset_location=True,
            )

        return p.done()
def _apply_wagl_metadata(p: DatasetAssembler, wagl_doc: Dict):
    source = wagl_doc["source_datasets"]

    p.datetime = source["acquisition_datetime"]
    p.platform = source["platform_id"]
    p.instrument = source["sensor_id"]

    try:
        p.processed = get_path(wagl_doc, ("system_information", "time_processed"))
    except PathAccessError:
        raise RuntimeError("WAGL dataset contains no processed time.")

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
def test_minimal_s2_dataset_normal(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-2a"
        p.instrument = "msi"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "blueberries"
        p.processed = "2018-11-05T12:23:23"
        p.properties["sentinel:sentinel_tile_id"] = (
            "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05"
        )

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert metadata_path_offset == (
        "s2am_blueberries/2018/11/04/s2am_blueberries_2018-11-04.odc-metadata.yaml"
    )

    assert doc["label"] == "s2am_blueberries_2018-11-04", "Unexpected dataset label"
def test_minimal_package_with_product_name(tmp_path: Path, l1_ls8_folder: Path):
    """
    You can specify an ODC product name manually to avoid most of the name generation.
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    with DatasetAssembler(out) as p:
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_name = "loch_ness_sightings"
        p.processed = datetime(2019, 7, 4, 13, 8, 7)

        p.write_measurement("blue", blue_geotiff_path)

        dataset_id, metadata_path = p.done()

    assert dataset_id is not None
    assert_file_structure(
        out,
        {
            "loch_ness_sightings": {
                "2019": {
                    "07": {
                        "04": {
                            # Set a dataset version to get rid of 'beta' label.
                            "loch_ness_sightings_2019-07-04.odc-metadata.yaml": "",
                            "loch_ness_sightings_2019-07-04.proc-info.yaml": "",
                            "loch_ness_sightings_2019-07-04_blue.tif": "",
                            "loch_ness_sightings_2019-07-04.sha1": "",
                        }
                    }
                }
            }
        },
    )
def prepare_and_write(
    ds_path: Path,
    collection_location: Path,
    # TODO: Can we infer the producer automatically? This is bound to cause mistakes otherwise
    producer="usgs.gov",
) -> Tuple[uuid.UUID, Path]:
    """
    Prepare an eo3 metadata file for a Level 2 dataset.

    Input dataset path can be a folder or a tar file.
    """
    mtl_doc, mtl_filename = get_mtl_content(
        ds_path, root_element="landsat_metadata_file"
    )
    if not mtl_doc:
        raise ValueError(f"No MTL file found for {ds_path}")

    usgs_collection_number = mtl_doc["product_contents"].get("collection_number")
    if usgs_collection_number is None:
        raise NotImplementedError(
            "Dataset has no collection number: pre-collection data is not supported."
        )

    data_format = mtl_doc["product_contents"]["output_format"]
    if data_format.upper() != "GEOTIFF":
        raise NotImplementedError(f"Only GTiff currently supported, got {data_format}")
    file_format = FileFormat.GeoTIFF  # Assumed below.

    if (
        mtl_doc["projection_attributes"]["grid_cell_size_reflective"]
        != mtl_doc["projection_attributes"]["grid_cell_size_thermal"]
    ):
        raise NotImplementedError("reflective and thermal have different cell sizes")
    ground_sample_distance = min(
        value
        for name, value in mtl_doc["projection_attributes"].items()
        if name.startswith("grid_cell_size_")
    )

    with DatasetAssembler(
        collection_location=collection_location,
        # Deterministic ID based on USGS's product id (which changes when the scene is reprocessed by them)
        dataset_id=uuid.uuid5(
            USGS_UUID_NAMESPACE, mtl_doc["product_contents"]["landsat_product_id"]
        ),
        naming_conventions="dea",
        if_exists=IfExists.Overwrite,
    ) as p:
        p.platform = mtl_doc["image_attributes"]["spacecraft_id"]
        p.instrument = mtl_doc["image_attributes"]["sensor_id"]
        p.product_family = "level2"
        p.producer = producer
        p.datetime = "{}T{}".format(
            mtl_doc["image_attributes"]["date_acquired"],
            mtl_doc["image_attributes"]["scene_center_time"],
        )
        # p.processed = mtl_doc["metadata_file_info"]["file_date"]
        p.processed = mtl_doc["level2_processing_record"]["date_product_generated"]
        p.properties["odc:file_format"] = file_format
        p.properties["eo:gsd"] = ground_sample_distance
        p.properties["eo:cloud_cover"] = mtl_doc["image_attributes"]["cloud_cover"]
        p.properties["eo:sun_azimuth"] = mtl_doc["image_attributes"]["sun_azimuth"]
        p.properties["eo:sun_elevation"] = mtl_doc["image_attributes"]["sun_elevation"]
        p.properties["landsat:collection_number"] = usgs_collection_number
        for section, fields in _COPYABLE_MTL_FIELDS:
            for field in fields:
                value = mtl_doc[section].get(field)
                if value is not None:
                    p.properties[f"landsat:{field}"] = value

        p.region_code = f"{p.properties['landsat:wrs_path']:03d}{p.properties['landsat:wrs_row']:03d}"
        org_collection_number = utils.get_collection_number(
            p.producer, p.properties["landsat:collection_number"]
        )
        p.dataset_version = f"{org_collection_number}.0.{p.processed:%Y%m%d}"

        band_aliases = get_band_alias_mappings(p.platform, p.instrument)

        bands = list(_iter_bands_paths(mtl_doc))
        # Uncomment the next line to process just one band (for testing);
        # leave it commented to do all bands.
        # bands = bands[0:1]
        for usgs_band_id, file_location in bands:
            # p.note_measurement(
            #     band_aliases[usgs_band_id],
            #     file_location,
            #     relative_to_dataset_location=True,
            # )
            path_file = os.path.join(ds_path, file_location)
            p.write_measurement(band_aliases[usgs_band_id], path_file)

        p.add_accessory_file("metadata:landsat_mtl", Path(mtl_filename))

        return p.done()
def prepare_and_write(
    ds_path: Path,
    output_yaml_path: Path,
    source_telemetry: Path = None,
    # TODO: Can we infer the producer automatically? This is bound to cause mistakes otherwise
    producer="usgs.gov",
) -> Tuple[uuid.UUID, Path]:
    """
    Prepare an eo3 metadata file for a Level 1 dataset.

    Input dataset path can be a folder or a tar file.
    """
    mtl_doc, mtl_filename = get_mtl_content(ds_path)
    if not mtl_doc:
        raise ValueError(f"No MTL file found for {ds_path}")

    usgs_collection_number = mtl_doc["metadata_file_info"].get("collection_number")
    if usgs_collection_number is None:
        raise NotImplementedError(
            "Dataset has no collection number: pre-collection data is not supported."
        )

    data_format = mtl_doc["product_metadata"]["output_format"]
    if data_format.upper() != "GEOTIFF":
        raise NotImplementedError(f"Only GTiff currently supported, got {data_format}")
    file_format = FileFormat.GeoTIFF  # Assumed below.

    projection_params = mtl_doc["projection_parameters"]
    if (
        "grid_cell_size_thermal" in projection_params
        and "grid_cell_size_reflective" in projection_params
        and (
            projection_params["grid_cell_size_reflective"]
            != projection_params["grid_cell_size_thermal"]
        )
    ):
        raise NotImplementedError("reflective and thermal have different cell sizes")
    ground_sample_distance = min(
        value
        for name, value in projection_params.items()
        if name.startswith("grid_cell_size_")
    )

    with DatasetAssembler(
        metadata_path=output_yaml_path,
        dataset_location=ds_path,
        # Deterministic ID based on USGS's product id (which changes when the scene is reprocessed by them)
        dataset_id=uuid.uuid5(
            USGS_UUID_NAMESPACE, mtl_doc["metadata_file_info"]["landsat_product_id"]
        ),
        naming_conventions="dea",
        if_exists=IfExists.Overwrite,
    ) as p:
        if source_telemetry:
            # Only GA's data has source telemetry...
            assert producer == "ga.gov.au"
            p.add_source_path(source_telemetry)

        p.platform = mtl_doc["product_metadata"]["spacecraft_id"]
        p.instrument = mtl_doc["product_metadata"]["sensor_id"]
        p.product_family = "level1"
        p.producer = producer
        p.datetime = "{}T{}".format(
            mtl_doc["product_metadata"]["date_acquired"],
            mtl_doc["product_metadata"]["scene_center_time"],
        )
        p.processed = mtl_doc["metadata_file_info"]["file_date"]
        p.properties["odc:file_format"] = file_format
        p.properties["eo:gsd"] = ground_sample_distance

        cloud_cover = mtl_doc["image_attributes"]["cloud_cover"]
        # Cloud cover is -1 when missing (such as TIRS-only data)
        if cloud_cover != -1:
            p.properties["eo:cloud_cover"] = cloud_cover
        p.properties["eo:sun_azimuth"] = mtl_doc["image_attributes"]["sun_azimuth"]
        p.properties["eo:sun_elevation"] = mtl_doc["image_attributes"]["sun_elevation"]
        p.properties["landsat:collection_number"] = usgs_collection_number
        for section, fields in _COPYABLE_MTL_FIELDS:
            for field in fields:
                value = mtl_doc[section].get(field)
                if value is not None:
                    p.properties[f"landsat:{field}"] = value

        p.region_code = f"{p.properties['landsat:wrs_path']:03d}{p.properties['landsat:wrs_row']:03d}"
        org_collection_number = utils.get_collection_number(
            p.producer, p.properties["landsat:collection_number"]
        )
        p.dataset_version = f"{org_collection_number}.0.{p.processed:%Y%m%d}"

        # NRT product?
        # Category is one of: T1, T2 or RT ('real time')
        if p.properties["landsat:collection_category"] == "RT":
            p.properties["odc:dataset_maturity"] = "nrt"

        band_aliases = get_band_alias_mappings(p.platform, p.instrument)
        for usgs_band_id, file_location in _iter_bands_paths(mtl_doc):
            p.note_measurement(
                band_aliases[usgs_band_id],
                file_location,
                relative_to_dataset_location=True,
            )

        p.add_accessory_file("metadata:landsat_mtl", Path(mtl_filename))

        return p.done()
def package(
    out_directory: Path,
    granule: Granule,
    included_products: Iterable[str] = DEFAULT_PRODUCTS,
    include_oa: bool = True,
) -> Tuple[UUID, Path]:
    """
    Package an L2 product.

    :param include_oa:

    :param out_directory:
        The base directory for output datasets. A DEA-naming-conventions folder
        hierarchy will be created inside this folder.

    :param granule:
        Granule information. You probably want to make one with Granule.from_path()

    :param included_products:
        A list of imagery products to include in the package.
        Defaults to all products.

    :return:
        The dataset UUID and output metadata path
    """
    included_products = tuple(s.lower() for s in included_products)

    with h5py.File(granule.wagl_hdf5, "r") as fid:
        granule_group = fid[granule.name]

        with DatasetAssembler(
            out_directory,
            # WAGL stamps a good, random ID already.
            dataset_id=granule.wagl_metadata.get("id"),
            naming_conventions="dea",
        ) as p:
            level1 = granule.source_level1_metadata
            p.add_source_dataset(level1, auto_inherit_properties=True)

            # It's a GA ARD product.
            p.producer = "ga.gov.au"
            p.product_family = "ard"

            org_collection_number = utils.get_collection_number(
                p.producer, p.properties["landsat:collection_number"]
            )
            # TODO: wagl's algorithm version should determine our dataset version number, right?
            p.dataset_version = f"{org_collection_number}.0.0"
            p.region_code = _extract_reference_code(p, granule.name)

            _read_wagl_metadata(p, granule_group)
            _read_gqa_doc(p, granule.gqa_doc)
            _read_fmask_doc(p, granule.fmask_doc)

            _unpack_products(p, included_products, granule_group)

            if include_oa:
                with do("Starting OA", heading=True):
                    _unpack_observation_attributes(
                        p,
                        included_products,
                        granule_group,
                        infer_datetime_range=level1.platform.startswith("landsat"),
                    )

                if granule.fmask_image:
                    with do(f"Writing fmask from {granule.fmask_image} "):
                        p.write_measurement(
                            "oa:fmask",
                            granule.fmask_image,
                            expand_valid_data=False,
                            overview_resampling=Resampling.mode,
                        )

            with do("Finishing package"):
                return p.done()
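# Hypothetical invocation sketch only (the output path and product list are invented).
# The `granule` object is assumed to have been constructed beforehand, e.g. with the
# Granule.from_path() helper mentioned in the docstring above.
def _example_package_granule(granule: Granule):
    dataset_uuid, metadata_file = package(
        out_directory=Path("/data/packaged"),  # hypothetical output root
        granule=granule,
        included_products=("nbar", "nbart"),
        include_oa=True,
    )
    return dataset_uuid, metadata_file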
def _create_contiguity(
    p: DatasetAssembler,
    product_list: Iterable[str],
    resolution_yx: Tuple[float, float],
    timedelta_product: str = "nbar",
    timedelta_data: numpy.ndarray = None,
):
    """
    Create the contiguity (all pixels valid) dataset.

    Write a contiguity mask file based on the intersection of valid data pixels
    across all bands from the input files.
    """
    for product in product_list:
        contiguity = None
        for grid, band_name, path in p.iter_measurement_paths():
            if not band_name.startswith(f"{product.lower()}:"):
                continue
            # Only our given res group (no pan band in Landsat)
            if grid.resolution_yx != resolution_yx:
                continue

            with rasterio.open(path) as ds:
                ds: DatasetReader
                if contiguity is None:
                    contiguity = numpy.ones((ds.height, ds.width), dtype="uint8")
                    geobox = GridSpec.from_rio(ds)
                elif ds.shape != contiguity.shape:
                    raise NotImplementedError(
                        "Contiguity from measurements of different shape"
                    )

                for band in ds.indexes:
                    contiguity &= ds.read(band) > 0

        if contiguity is None:
            secho(f"No images found for requested product {product}", fg="red")
            continue

        p.write_measurement_numpy(
            f"oa:{product.lower()}_contiguity",
            contiguity,
            geobox,
            nodata=255,
            overviews=None,
            expand_valid_data=False,
        )

        # Mask timedelta_data with the contiguity mask to get the max and min timedelta
        # within the NBAR product footprint for Landsat sensors. For Sentinel sensors,
        # the datetime range is inherited from the level-1 yaml file.
        if timedelta_data is not None and product.lower() == timedelta_product:
            valid_timedelta_data = numpy.ma.masked_where(contiguity == 0, timedelta_data)

            def offset_from_center(v: numpy.datetime64):
                return p.datetime + timedelta(microseconds=v.astype(float) * 1_000_000.0)

            p.datetime_range = (
                offset_from_center(numpy.ma.min(valid_timedelta_data)),
                offset_from_center(numpy.ma.max(valid_timedelta_data)),
            )
def _take_software_versions(p: DatasetAssembler, doc: Dict):
    versions = doc.pop("software_versions", {})

    for name, o in versions.items():
        p.note_software_version(name, o.get("repo_url"), o.get("version"))
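# Illustrative document shape only (names, URLs and version numbers are invented):
# _take_software_versions() above expects a "software_versions" mapping of
# software name -> {"repo_url", "version"}, which it pops from the doc and records
# via p.note_software_version().
_EXAMPLE_SOFTWARE_VERSIONS_DOC = {
    "software_versions": {
        "wagl": {"repo_url": "https://github.com/GeoscienceAustralia/wagl.git", "version": "1.2.3"},
        "fmask": {"repo_url": "https://github.com/ubarsc/python-fmask.git", "version": "0.5.0"},
    }
}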