def test_report_json_binary_serialization(
    dummy_test_plan_report_with_binary_asserts,
):
    """JSON Serialized & deserialized reports should be equal.

    Binary (bytes) assertion payloads must survive JSON serialization as
    their ``str(bytes)`` representation at the expected tree positions.
    """
    test_plan_schema = TestReportSchema()
    data = test_plan_schema.dumps(dummy_test_plan_report_with_binary_asserts)
    j = json.loads(data)
    # passing assertion
    hx_1_1 = get_path(j, "entries.1.entries.0.entries.1.first")
    hx_1_2 = get_path(j, "entries.1.entries.0.entries.1.second")
    assert str(b"\xF2") == hx_1_1 == hx_1_2
    # failing assertion
    hx_2_1 = get_path(j, "entries.1.entries.0.entries.2.first")
    hx_2_2 = get_path(j, "entries.1.entries.0.entries.2.second")
    assert str(b"\x00\xb1\xC1") == hx_2_1
    assert str(b"\x00\xB2\xC2") == hx_2_2
    # dict.match the schema for that producing list of tuples
    # (each comparison row is a tuple; these are the column offsets)
    KEY_INDEX = 1
    FIRST_INDEX = 3
    SECOND_INDEX = 4
    comps = get_path(j, "entries.1.entries.0.entries.3.comparison")
    assert comps[0][KEY_INDEX] == str(b"binarykey\xB1")
    assert comps[1][FIRST_INDEX][1] == str(b"binary value\xB1")
    assert comps[1][SECOND_INDEX][1] == str(b"binary value\xB1")
    assert comps[3][FIRST_INDEX][1] == str(b"binary\xB1")
    assert comps[3][SECOND_INDEX][1] == str(b"binary\xB1")
    assert comps[7][FIRST_INDEX][1] == str(b"binary\xB1")
    assert comps[7][SECOND_INDEX][1] == str(b"binary\xB1")
def _visit(path, key, value, a, b) -> bool:
    """Traverses all elements in `compare_nested` argument a and b...

    and tries to obtain the path `p` in `b` using
    boltons.iterutils.get_path.

    The following cases can occur:

    1. If the path does not exist in `b` a KeyError will be raised.
    2. If the index `k` does not exist an IndexError is raised.
    3. If the other path exists, a comparison will be made using
       `_compare`. When the elements are not equal traversing `a` will be
       stopped by raising a RuntimeError.
    """
    # Resolve the same container in both trees.
    data_structure = iterutils.get_path(a, path)
    other_data_structure = iterutils.get_path(b, path)
    # May raise KeyError / IndexError when `b` lacks this entry (cases 1, 2).
    other_value = other_data_structure[key]
    if not _EqCompareNested._enter(None, key, value)[1]:
        # NOTE(review): this re-fetch is redundant — the same expression was
        # already assigned unconditionally above; confirm the intended logic
        # (perhaps the first assignment was meant to be inside this branch).
        other_value = other_data_structure[key]
    # check lengths of Sequence types first and raise
    # prior starting a more expensive comparison!
    if isinstance(other_data_structure, (Sequence, Set)) and len(
        other_data_structure
    ) != len(data_structure):
        raise RuntimeError("len does not match")
    # Symmetric-difference of keys: any key present in only one side.
    if isinstance(other_data_structure, Mapping) and any(
        other_data_structure.keys() ^ data_structure.keys()
    ):
        raise RuntimeError("keys do not match")
    if not _EqCompareNested._compare(value, other_value):
        raise RuntimeError("not equal")
    return True
def test_depth_one(self):
    """A single-level path resolves whether given as a tuple or a string."""
    seq_root = ['test']
    for lookup in ((0, ), '0'):
        assert get_path(seq_root, lookup) == 'test'
    map_root = {'key': 'value'}
    for lookup in (('key', ), 'key'):
        assert get_path(map_root, lookup) == 'value'
def test_depth_one(self):
    """get_path resolves one-level lookups from lists and dicts alike."""
    list_container = ['test']
    assert get_path(list_container, (0,)) == 'test'
    # Dotted-string form is equivalent to the tuple form.
    assert get_path(list_container, '0') == 'test'
    dict_container = {'key': 'value'}
    assert get_path(dict_container, ('key',)) == 'value'
    assert get_path(dict_container, 'key') == 'value'
def test_report_json_binary_serialization(
    dummy_test_plan_report_with_binary_asserts,
):
    """JSON Serialized & deserialized reports should be equal.

    Bytes payloads are serialized as lists of hex strings stored under the
    schema's special bytes key, and the full report must round-trip through
    ``dumps``/``loads`` unchanged.
    """
    test_plan_schema = TestReportSchema(strict=True)
    data = test_plan_schema.dumps(
        dummy_test_plan_report_with_binary_asserts).data
    j = json.loads(data)
    # Binary payloads live under this special key as hex-string lists.
    bkey = EntriesField._BYTES_KEY
    # passing assertion
    hx_1_1 = get_path(j, "entries.1.entries.0.entries.1.first")[bkey]
    hx_1_2 = get_path(j, "entries.1.entries.0.entries.1.second")[bkey]
    assert ["0xF2"] == hx_1_1 == hx_1_2
    # failing assertion
    hx_2_1 = get_path(j, "entries.1.entries.0.entries.2.first")[bkey]
    hx_2_2 = get_path(j, "entries.1.entries.0.entries.2.second")[bkey]
    assert ["0x00", "0xB1", "0xC1"] == hx_2_1
    assert ["0x00", "0xB2", "0xC2"] == hx_2_2
    # dict.match the schema for that producing list of tuples
    # (each comparison row is a tuple; these are the column offsets)
    KEY_INDEX = 1
    FIRST_INDEX = 3
    SECOND_INDEX = 4
    comps = get_path(j, "entries.1.entries.0.entries.3.comparison")
    assert comps[0][KEY_INDEX][bkey] == EntriesField._binary_to_hex_list(
        b"binarykey\xB1")
    assert comps[1][FIRST_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary value\xB1")
    assert comps[1][SECOND_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary value\xB1")
    assert comps[3][FIRST_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    assert comps[3][SECOND_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    assert comps[7][FIRST_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    assert comps[7][SECOND_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    # Round-trip: the deserialized report must equal the original.
    deserialized_report = test_plan_schema.loads(data).data
    check_report(
        actual=deserialized_report,
        expected=dummy_test_plan_report_with_binary_asserts,
    )
def _read_wagl_metadata(p: DatasetAssembler, granule_group: h5py.Group):
    """Read the WAGL metadata doc from a granule group into the assembler.

    Records the processing time, ancillary metadata, dataset maturity and
    software versions, then attaches the whole doc as user metadata.
    """
    metadata_paths = [
        pth for pth in _find_h5_paths(granule_group, "SCALAR")
        if "METADATA" in pth
    ]
    # Starred unpacking below needs at least one element.
    if not metadata_paths:
        raise ValueError("No nbar metadata found in granule")
    wagl_path, *ancil_paths = metadata_paths

    [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise ValueError(
            f"WAGL dataset contains no time processed. Path {wagl_path}")

    # Merge each ancillary doc under a numbered "wagl_<i>" section.
    for index, ancil_path in enumerate(ancil_paths, start=2):
        ancillary = list(
            loads_yaml(granule_group[ancil_path][()]))[0]["ancillary"]
        wagl_doc.setdefault(f"wagl_{index}", {}).update(ancillary)

    p.properties["dea:dataset_maturity"] = _determine_maturity(
        p.datetime, p.processed, wagl_doc)

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
def _load_level1_doc(
    wagl_doc: Dict,
    user_specified_l1_path: Optional[Path] = None,
    allow_missing_provenance=False,
):
    """Locate and deserialise the level-1 source metadata document.

    Uses the user-supplied path when given, otherwise the path recorded in
    the WAGL doc. Returns None when the doc is missing and
    `allow_missing_provenance` is set; raises ValueError otherwise.
    """
    if user_specified_l1_path:
        if not user_specified_l1_path.exists():
            raise ValueError(
                f"No level1 metadata found at given path {user_specified_l1_path}"
            )
        level1_path = user_specified_l1_path
    else:
        level1_path = Path(
            get_path(wagl_doc, ("source_datasets", "source_level1")))

    if level1_path.is_dir():
        # A directory: metadata is "<dirname>.odc-metadata.yaml" inside it.
        metadata_path = level1_path / (level1_path.name + ".odc-metadata.yaml")
    elif level1_path.suffix.lower() == ".yaml":
        # Already a metadata document.
        metadata_path = level1_path
    else:
        # A sibling file with the ".odc-metadata.yaml" suffix.
        metadata_path = level1_path.with_suffix(".odc-metadata.yaml")

    if metadata_path.exists():
        return serialise.from_path(metadata_path)

    if allow_missing_provenance:
        return None
    raise ValueError(
        "No level1 found or provided. "
        f"WAGL said it was at path {str(level1_path)!r}. "
        "Which has no metadata doc we can find, and you didn't specify an alternative. "
        f"(allow_missing_provenance={allow_missing_provenance})")
def info(
    self,
    max_rows: int = None,
    max_length: int = None,
    show_values: bool = False,
    path: tuple = None,
):
    """Print the content to the stdout.

    Parameters
    ----------
    max_rows :
        The maximum number of rows that will be printed. If rows are cut,
        a corresponding message will be printed
    max_length :
        The maximum line length. Longer lines will be truncated
    show_values :
        Set to `True` if primitive values should be displayed
    path
        tuple representing the lookup path in the yaml/asdf tree
    """
    # Internal bookkeeping entries are hidden from the listing.
    hidden = ("asdf_library", "history")
    tree = {
        k: v
        for k, v in self._asdf_handle.tree.items() if k not in hidden
    }
    if path is not None:
        tree = get_path(tree, path)
    asdf.info(tree,
              max_rows=max_rows,
              max_cols=max_length,
              show_values=show_values)
def add_record(self, in_dict):
    """Insert one record into the backing SQLite database.

    For each flattened field, the value is pulled out of `in_dict` via its
    lookup path (in column order) and bound to the prepared INSERT
    statement.

    Parameters
    ----------
    in_dict :
        Nested mapping holding the record's values, addressable by the
        paths stored in `self._flat_fields`.
    """
    bindables = [get_path(in_dict, path) for path, _col, _ in self._flat_fields]
    conn = sqlite3.connect(self.file_path)
    try:
        # `with conn` commits on success / rolls back on error...
        with conn:
            conn.execute(self._insert_q, bindables)
    finally:
        # ...but it does NOT close the connection. Close explicitly so we
        # don't leak a connection (and an open file handle) per call.
        conn.close()
def visit(p, k, v):
    """remap visitor: prefer the value found at the same path in `d2`.

    Mappings present on both sides are shallow-merged with `d2` entries
    winning; anything else is replaced outright. Entries absent from `d2`
    are kept unchanged.
    """
    try:
        override = itu.get_path(d2, p)[k]
    except (itu.PathAccessError, KeyError):
        # No counterpart in d2 — keep the original entry.
        return (k, v)
    both_mappings = (isinstance(override, collections.abc.Mapping)
                     and isinstance(v, collections.abc.Mapping))
    if both_mappings:
        return (k, {**v, **override})
    return (k, override)
def validate_paths(
    paths: List[Path], thorough: bool = False
) -> Generator[Tuple[Path, List[ValidationMessage]], None, None]:
    """Validate the list of paths. Product documents can be specified before their datasets.

    Yields one `(path, messages)` pair per input path. Product documents
    encountered along the way are collected and used to validate any
    dataset documents that follow them.
    """
    products: Dict[str, Dict] = {}

    for path in paths:
        # Load yaml. If product, add to products.
        # Otherwise validate.
        doc = serialise.load_yaml(path)

        messages = []
        if is_product(doc):
            messages.extend(validate_product(doc))
            products[doc["name"]] = doc
            yield path, messages
            continue

        # TODO: follow ODC's match rules?
        product = None
        product_name = get_path(doc, ("product", "name"), default=None)
        if products:
            if len(products) == 1:
                # Only one product was supplied: assume it applies.
                [product] = products.values()
            elif product_name is not None:
                product = products.get(product_name)
            if product is None:
                messages.append(
                    _warning(
                        "unknown_product",
                        "Cannot match dataset to product",
                        hint=f"Nothing matches {product_name!r}"
                        if product_name else
                        "No product name in dataset (TODO: field matching)",
                    ))
        else:
            # No products given at all: only an info-level note unless we
            # were asked for a thorough check.
            messages.append(
                ValidationMessage(
                    Level.error if thorough else Level.info,
                    "no_product",
                    "No product provided: validating dataset information alone",
                ))

        messages.extend(
            validate_dataset(
                doc,
                product_definition=product,
                readable_location=path,
                thorough=thorough,
            ))
        yield path, messages
def visit_missing_file(path, key, value):
    """remap visitor that flags namelist file entries that do not exist.

    Returns True to descend into nested sections; a falsy file value
    triggers either a warning (exempted keys) or a ValueError.
    """
    # Descend into nested namelist sections.
    if type(value) is dict:
        return True
    if not value:
        message = ('The namelist file ' + key + ' = ' +
                   str(get_path(nlst, (path))[key]) + ' does not exist')
        # Exempted keys only warn; everything else is fatal.
        if key in [*hrldas_exempt_list, *hydro_exempt_list]:
            warnings.warn(message)
        else:
            raise ValueError(message)
    return False
def _dover_enter(p, k, v):
    """remap enter-hook that overlays defaults onto `config`.

    When the key at this path is absent from `config`, graft the default
    value in directly and stop descending; when it is present (or the path
    cannot be resolved), fall back to the default enter behaviour.
    """
    ret = default_enter(p, k, v)
    # Root node: nothing to overlay.  (Was `k == None` — identity
    # comparison is the correct check for None.)
    if k is None:
        return ret
    try:
        node = get_path(config, p)
        if k not in node:
            node[k] = v
            # Value grafted wholesale: no need to enumerate its children.
            return (ret[0], False)
    except (KeyError, IndexError, TypeError):
        # Path not resolvable in `config` (boltons' PathAccessError
        # subclasses all three) or node is not a container.  Previously a
        # bare `except:` which also swallowed unrelated errors.
        pass
    return ret
def _apply_wagl_metadata(p: DatasetAssembler, wagl_doc: Dict):
    """Copy acquisition and processing fields from a WAGL doc into `p`.

    Raises RuntimeError when the doc lacks a processing timestamp.
    """
    acquisition = wagl_doc["source_datasets"]
    p.datetime = acquisition["acquisition_datetime"]
    p.platform = acquisition["platform_id"]
    p.instrument = acquisition["sensor_id"]

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise RuntimeError("WAGL dataset contains no processed time.")

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
def test_pools_have_monitors(config_path, _):
    """ Require monitor configuration for each service """
    # remove 'class' from config_path, which is always the last element
    service_path = config_path[:-1]
    monitors = iterutils.get_path(tenants, service_path).get("monitors")
    print(f"currently processing: {service_path}, monitors: {monitors}",
          end=" ")
    assert monitors  # not empty and not None
    # make we don't have an empty entry in the list of monitors either!
    for monitor in monitors:
        assert monitor
def view_tree(file: types_path_and_file_like, path: Tuple = None, **kwargs):
    """Display YAML header using IPython JSON display repr.

    This function works in JupyterLab.

    Parameters
    ----------
    file :
        filename or file-like object pointing towards / containing an ASDF
        file.
    path :
        tuple representing the lookup path in the yaml/asdf tree
    kwargs
        kwargs passed down to JSON constructor

    Returns
    -------
    IPython.display.JSON
        JSON object for rich output in JupyterLab

    Examples
    --------
    Visualize the full tree of an existing ASDF file::

        weldx.asdf.utils.view_tree("single_pass_weld_example.asdf")

    Visualize a specific element in the tree structure by proving the path::

        weldx.asdf.utils.view_tree(
            "single_pass_weld_example.asdf", path=("process",)
        )

        weldx.asdf.utils.view_tree(
            "single_pass_weld_example.asdf", path=("process", "welding_process")
        )

    """
    from IPython.display import JSON

    # The display root label: the filename (if known) plus the tree path.
    root = file + "/" if isinstance(file, str) else "/"

    yaml_dict = get_yaml_header(file, parse=True)
    if path:
        root = root + "/".join(path)
        yaml_dict = get_path(yaml_dict, path)
    kwargs["root"] = root

    return JSON(yaml_dict, **kwargs)
def _get_level1_metadata_path(wagl_doc: Dict) -> Optional[Path]:
    """
    Find the expected matching metadata file for the source dataset
    """
    level1 = Path(get_path(wagl_doc, ("source_datasets", "source_level1")))
    if not level1.exists():
        return None
    # If a directory, assume "<dirname>.odc-metadata.yaml" inside it.
    if level1.is_dir():
        return level1 / (level1.name + ".odc-metadata.yaml")
    # Otherwise it's a sibling file with ".odc-metadata.yaml" suffix.
    return level1.with_suffix(".odc-metadata.yaml")
def get(self, key: str, default: Any = None) -> Any:
    """Retrieve config value.

    Args:
        key (str): Key (in dot-notation) of value to return.
        default (Any, optional): Default value to return. Defaults to None.

    Returns:
        Any: Value at key given
    """
    # Dot-notation is translated into a lookup path for get_path.
    return iterutils.get_path(self.config, key.split('.'), default=default)
def diffs_dict(self):
    '''The diffs_dict attribute has getter (@property) and setter methods.
    The get method summarizes all differences across all the attributes of
    the members list attribute and (should) only report member attributes
    when there is at least one difference between members. The setter
    method is meant as a convenient way to specify the differences in
    member attributes across the ensemble.
    '''
    if len(self) == 1:
        # (typo fix: was "lenght")
        print('Ensemble is of length 1, no differences.')
        return {}

    # Every member is diffed against member 0.
    mem_0_ref_dict = dictify(self.members[0])

    all_diff_keys = set({})
    for ii in range(1, len(self)):
        mem_ii_ref_dict = dictify(self.members[ii])
        diff = DeepDiffEq(mem_0_ref_dict,
                          mem_ii_ref_dict,
                          eq_types={pathlib.PosixPath})

        # Anything beyond value changes (added/removed attributes, type
        # changes) is a structural problem.
        unexpected_diffs = set(diff.keys()) - set(['values_changed'])
        if len(unexpected_diffs):
            # BUG FIX: was `diff0[uu]` — an undefined name that raised
            # NameError instead of reporting; the intended lookup is `diff`.
            unexpected_diffs1 = {
                uu: diff[uu]
                for uu in list(unexpected_diffs)
            }
            raise ValueError(
                'Unexpected attribute differences between ensemble members:',
                unexpected_diffs1)

        diff_keys = list(diff['values_changed'].keys())
        all_diff_keys = all_diff_keys | set(
            [ss.replace('root', '') for ss in diff_keys])

    # Translate DeepDiff's "root['a']['b']" keys into real tuples,
    # e.g. "['a']['b']" -> ('a', 'b'), via literal_eval.
    diff_tuples = [ss.replace('][', ',') for ss in list(all_diff_keys)]
    diff_tuples = [ss.replace('[', '(') for ss in list(diff_tuples)]
    diff_tuples = [ss.replace(']', ')') for ss in list(diff_tuples)]
    diff_tuples = [ast.literal_eval(ss) for ss in list(diff_tuples)]

    self.__diffs_dict = {}
    for dd in diff_tuples:
        self.__diffs_dict[dd] = [
            get_path(dictify(mm), dd) for mm in self.members
        ]
    return (self.__diffs_dict)
def remerge_enter(path, key, value):
    """remap enter-hook that merges into the accumulated result `ret`.

    Relies on `ret` from the enclosing scope: when an equivalent container
    already exists at this path in `ret`, it is reused as the parent so
    later sources merge into (rather than replace) earlier ones.
    """
    new_parent, new_items = default_enter(path, key, value)
    # Top of the tree: merge directly into the accumulated result.
    if ret and not path and key is None:
        new_parent = ret
    try:
        # TODO: type check?
        # Reuse the existing container at this path, if any.  boltons'
        # PathAccessError subclasses KeyError, so it is caught here too.
        new_parent = get_path(ret, path + (key,))
    except KeyError:
        pass
    if isinstance(value, list):
        # lists are purely additive. See https://github.com/mahmoud/boltons/issues/81
        new_parent.extend(value)
        new_items = []
    return new_parent, new_items
def asdf_json_repr(file, path: Tuple = None, **kwargs):
    """Display YAML header using IPython JSON display repr.

    This function works in JupyterLab.

    Parameters
    ----------
    file
        filename or BytesIO buffer of ASDF file
    path
        tuple representing the lookup path in the yaml/asdf tree
    kwargs
        kwargs passed down to JSON constructor

    Returns
    -------
    IPython.display.JSON
        JSON object for rich output in JupyterLab

    Examples
    --------
    Visualize the full tree of an existing ASDF file::

        weldx.asdf.utils.asdf_json_repr("single_pass_weld_example.asdf")

    Visualize a specific element in the tree structure by proving the path::

        weldx.asdf.utils.asdf_json_repr(
            "single_pass_weld_example.asdf", path=("process", "welding_process")
        )

    """
    from IPython.display import JSON

    # The display root label: the filename (if known) plus the tree path.
    root = file + "/" if isinstance(file, str) else "/"

    header = get_yaml_header(file)
    tree = yaml.load(header, Loader=yaml.BaseLoader)
    if path:
        root = root + "/".join(path)
        tree = get_path(tree, path)
    kwargs["root"] = root

    return JSON(tree, **kwargs)
def member_diffs(self):
    """Get method for ensemble member differences. Only differences are reported.

    Every member is diffed against member 0; the result maps each
    differing attribute path (as a tuple) to the list of per-member
    values at that path.
    """
    if len(self) == 1:
        print('Ensemble is of length 1, no differences.')
        return {}

    mem_0_ref_dict = dictify(self.members[0])

    # TODO(JLM): Could this be parallelized?
    all_diff_keys = set({})
    for imem, mem in enumerate(self.members):
        if imem == 0:
            continue
        mem_ii_ref_dict = dictify(mem)
        diff = DeepDiffEq(mem_0_ref_dict,
                          mem_ii_ref_dict,
                          eq_types={pathlib.PosixPath})

        # Anything beyond value changes (added/removed attributes, type
        # changes) is treated as a structural problem.
        unexpected_diffs = set(diff.keys()) - set(['values_changed'])
        if len(unexpected_diffs):
            unexpected_diffs1 = {
                uu: diff[uu]
                for uu in list(unexpected_diffs)
            }
            raise ValueError(
                'Unexpected attribute differences between ensemble members:',
                unexpected_diffs1)

        diff_keys = list(diff['values_changed'].keys())
        all_diff_keys = all_diff_keys | set(
            [ss.replace('root', '') for ss in diff_keys])

    # This translates hierarchical dict entries to tuples.
    # e.g. "['a']['b']" -> ('a', 'b') via literal_eval.
    diff_tuples = [ss.replace('][', ',') for ss in list(all_diff_keys)]
    diff_tuples = [ss.replace('[', '(') for ss in list(diff_tuples)]
    diff_tuples = [ss.replace(']', ')') for ss in list(diff_tuples)]
    diff_tuples = [ast.literal_eval(ss) for ss in list(diff_tuples)]

    self.__member_diffs = {}
    for dd in diff_tuples:
        self.__member_diffs[dd] = [
            get_path(dictify(mm), dd) for mm in self.members
        ]
    return (self.__member_diffs)
def validate_eo3_doc( doc: Dict, location: Union[str, Path], products: Dict[str, Dict], thorough: bool = False, expect_extra_measurements=False, ) -> List[ValidationMessage]: messages = [] # TODO: follow ODC's match rules? product = None product_name = get_path(doc, ("product", "name"), default=None) if products: if len(products) == 1: [product] = products.values() elif product_name is not None: product = products.get(product_name) if product is None: messages.append( _warning( "unknown_product", "Cannot match dataset to product", hint=f"Nothing matches {product_name!r}" if product_name else "No product name in dataset (TODO: field matching)", )) else: messages.append( ValidationMessage( Level.error if thorough else Level.info, "no_product", "No product provided: validating dataset information alone", )) messages.extend( validate_dataset( doc, product_definition=product, readable_location=location, thorough=thorough, expect_extra_measurements=expect_extra_measurements, )) return messages
def diffs_dict(self):
    """Summarize per-attribute differences across all ensemble members.

    Every member is diffed against member 0; the result maps each
    differing attribute path (as a tuple) to the list of per-member
    values at that path.
    """
    if len(self) == 1:
        # (typo fix: was "lenght")
        print('Ensemble is of length 1, no differences.')
        return {}

    mem_0_ref_dict = dictify(self.members[0])

    all_diff_keys = set({})
    for ii in range(1, len(self)):
        mem_ii_ref_dict = dictify(self.members[ii])
        diff = DeepDiffEq(mem_0_ref_dict,
                          mem_ii_ref_dict,
                          eq_types={pathlib.PosixPath})

        unexpected_diffs = set(diff.keys()) - set(['values_changed'])
        if len(unexpected_diffs):
            # BUG FIX: was `diff0[uu]` — an undefined name that raised
            # NameError instead of reporting; the intended lookup is `diff`.
            unexpected_diffs1 = {
                uu: diff[uu]
                for uu in list(unexpected_diffs)
            }
            raise ValueError(
                'Unexpected attribute differences between ensemble members:',
                unexpected_diffs1)

        diff_keys = list(diff['values_changed'].keys())
        all_diff_keys = all_diff_keys | set(
            [ss.replace('root', '') for ss in diff_keys])

    # Translate DeepDiff's "root['a']['b']" keys into real tuples.
    diff_tuples = [ss.replace('][', ',') for ss in list(all_diff_keys)]
    diff_tuples = [ss.replace('[', '(') for ss in list(diff_tuples)]
    diff_tuples = [ss.replace(']', ')') for ss in list(diff_tuples)]
    diff_tuples = [ast.literal_eval(ss) for ss in list(diff_tuples)]

    self.__diffs_dict = {}
    for dd in diff_tuples:
        self.__diffs_dict[dd] = [
            get_path(dictify(mm), dd) for mm in self.members
        ]
    return (self.__diffs_dict)
def load_config(config_path: Path, config_key: Tuple = ()) -> Configuration:
    """Load a Configuration from a (possibly nested) section of a TOML file.

    The original path is recorded in the config under "config_path".
    """
    raw = toml.load(config_path)
    section = get_path(raw, config_key, default={})
    section["config_path"] = config_path
    return Configuration.parse_obj(section)
def test_depth_two(self):
    """Two-level paths resolve as tuples and as dotted strings."""
    nested = {'key': ['test']}
    for lookup in (('key', 0), 'key.0'):
        assert get_path(nested, lookup) == 'test'
def __delitem__(self, k: str) -> None:
    """Delete the entry for `k` from its (possibly nested) TOML table."""
    tables, leaf = Conf._extract_table_list(k)
    parent = iterutils.get_path(self._toml_dict, tables)
    del parent[leaf]
def package_non_standard(outdir, granule):
    """
    yaml creator for the ard pipeline.

    Copies the granule's input directory into `outdir`, then assembles an
    ODC dataset doc: source level-1 provenance, the fmask image (converted
    into an HDF5 layer), and every IMAGE dataset found in the WAGL HDF5.
    """
    outdir = Path(outdir) / granule.name
    indir = granule.wagl_hdf5.parent

    if indir.is_file():
        shutil.copy(indir, outdir)
    else:
        shutil.copytree(indir, outdir)

    wagl_h5 = outdir / str(granule.name + ".wagl.h5")
    dataset_doc = outdir / str(granule.name + ".yaml")
    boolean_h5 = Path(str(wagl_h5).replace("wagl.h5", "converted.datasets.h5"))
    fmask_img = outdir / str(granule.name + ".fmask.img")

    # NOTE(review): opened without an explicit mode and never closed in
    # this function; datasets are created on it below, so it presumably
    # needs write access — confirm against the h5py version in use.
    f = h5py.File(boolean_h5)

    with DatasetAssembler(metadata_path=dataset_doc,
                          naming_conventions="dea") as da:
        level1 = granule.source_level1_metadata
        da.add_source_dataset(level1,
                              auto_inherit_properties=True,
                              inherit_geometry=True)
        da.product_family = "ard"
        da.producer = "ga.gov.au"
        da.properties["odc:file_format"] = "HDF5"

        with h5py.File(wagl_h5, "r") as fid:
            img_paths = [ppjoin(fid.name, pth) for pth in find(fid, "IMAGE")]
            granule_group = fid[granule.name]

            try:
                wagl_path, *ancil_paths = [
                    pth for pth in find(granule_group, "SCALAR")
                    if "METADATA" in pth
                ]
            except ValueError:
                raise ValueError("No nbar metadata found in granule")

            [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

            da.processed = get_path(wagl_doc,
                                    ("system_information", "time_processed"))

            platform = da.properties["eo:platform"]
            if platform == "sentinel-2a" or platform == "sentinel-2b":
                org_collection_number = 3
            else:
                org_collection_number = utils.get_collection_number(
                    platform, da.producer,
                    da.properties["landsat:collection_number"])

            da.dataset_version = f"{org_collection_number}.1.0"
            da.region_code = eodatasets3.wagl._extract_reference_code(
                da, granule.name)

            eodatasets3.wagl._read_gqa_doc(da, granule.gqa_doc)
            eodatasets3.wagl._read_fmask_doc(da, granule.fmask_doc)

            # Convert the fmask ERDAS Imagine raster into an HDF5 layer.
            with rasterio.open(fmask_img) as ds:
                fmask_layer = "/{}/OA_FMASK/oa_fmask".format(granule.name)
                data = ds.read(1)
                fmask_ds = f.create_dataset(fmask_layer,
                                            data=data,
                                            compression="lzf",
                                            shuffle=True)
                fmask_ds.attrs["crs_wkt"] = ds.crs.wkt
                fmask_ds.attrs["geotransform"] = ds.transform.to_gdal()
                fmask_ds.attrs[
                    "description"] = "Converted from ERDAS Imagine format to HDF5 to work with the limitations of varied formats within ODC"  # noqa E501

                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=ds.transform,
                    crs=CRS.from_wkt(fmask_ds.attrs["crs_wkt"]),
                )

                measurement_name = "oa_fmask"
                pathname = str(outdir.joinpath(boolean_h5))

                no_data = fmask_ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                da._measurements.record_image(
                    measurement_name,
                    grid_spec,
                    pathname,
                    fmask_ds[:],
                    layer="/{}".format(fmask_layer),
                    nodata=no_data,
                    expand_valid_data=False,
                )

            for pathname in img_paths:
                ds = fid[pathname]
                ds_path = Path(ds.name)

                # eodatasets internally uses this grid spec to group image datasets
                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=Affine.from_gdal(*ds.attrs["geotransform"]),
                    crs=CRS.from_wkt(ds.attrs["crs_wkt"]),
                )

                # product group name; lambertian, nbar, nbart, oa
                if "STANDARDISED-PRODUCTS" in str(ds_path):
                    product_group = ds_path.parent.name
                elif "INTERPOLATED-ATMOSPHERIC-COEFFICIENTS" in str(ds_path):
                    product_group = "oa_{}".format(ds_path.parent.name)
                else:
                    product_group = "oa"

                # spatial resolution group
                # used to separate measurements with the same name
                resolution_group = "rg{}".format(
                    ds_path.parts[2].split("-")[-1])

                measurement_name = ("_".join([
                    resolution_group,
                    product_group,
                    ds.attrs.get("alias", ds_path.name),
                ]).replace("-", "_").lower())  # we don't wan't hyphens in odc land

                # include this band in defining the valid data bounds?
                include = True if "nbart" in measurement_name else False

                no_data = ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                # if we are of type bool, we'll have to convert just for GDAL
                if ds.dtype.name == "bool":
                    pathname = str(outdir.joinpath(boolean_h5))
                    out_ds = f.create_dataset(
                        measurement_name,
                        data=np.uint8(ds[:]),
                        compression="lzf",
                        shuffle=True,
                        chunks=ds.chunks,
                    )

                    for k, v in ds.attrs.items():
                        out_ds.attrs[k] = v

                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        out_ds[:],
                        layer="/{}".format(out_ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )
                else:
                    pathname = str(outdir.joinpath(wagl_h5))

                    # work around as note_measurement doesn't allow us to specify the gridspec
                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        ds[:],
                        layer="/{}".format(ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )

        # the longest part here is generating the valid data bounds vector
        # landsat 7 post SLC-OFF can take a really long time
        return da.done()
def _match_product(
    dataset_doc: Dict, product_definitions: Dict[str, Dict]
) -> Tuple[Optional[Dict], List[ValidationMessage]]:
    """Match the given dataset to a product definition

    Returns the matched product (or None) plus any validation messages
    describing mismatches, ambiguous matches or unsupplied products.
    """
    product = None

    # EO3 datasets often put the product name directly inside.
    specified_product_name = get_path(dataset_doc, ("product", "name"),
                                      default=None)
    specified_product_name = specified_product_name or get_path(
        dataset_doc, ("properties", "odc:product"), default=None)
    if specified_product_name and (specified_product_name
                                   in product_definitions):
        product = product_definitions[specified_product_name]

    # Products whose 'metadata' section is contained in the dataset doc.
    matching_products = {
        name: definition
        for name, definition in product_definitions.items()
        if changes.contains(dataset_doc, definition["metadata"])
    }

    # If we have nothing, give up!
    if (not matching_products) and (not product):
        # Find the product that most closely matches it, to helpfully show the differences!
        closest_product_name = None
        closest_differences = None
        for name, definition in product_definitions.items():
            diffs = tuple(
                _get_product_mismatch_reasons(dataset_doc, definition))
            if (closest_differences
                    is None) or len(diffs) < len(closest_differences):
                closest_product_name = name
                closest_differences = diffs

        difference_hint = _differences_as_hint(closest_differences)
        return None, [
            _error(
                "unknown_product",
                "Dataset does not match the given products",
                hint=
                f"Closest match is {closest_product_name}, with differences:"
                f"\n{difference_hint}",
            )
        ]

    messages = []
    if specified_product_name not in matching_products:
        if product:
            # Claimed product exists but its fields don't match the dataset.
            difference_hint = _differences_as_hint(
                _get_product_mismatch_reasons(dataset_doc, product))
            messages.append(
                _info(
                    "strange_product_claim",
                    f"Dataset claims to be product {specified_product_name!r}, but doesn't match its fields",
                    hint=f"{difference_hint}",
                ))
        else:
            messages.append(
                _info(
                    "unknown_product_claim",
                    f"Dataset claims to be product {specified_product_name!r}, but it wasn't supplied.",
                ))

    if len(matching_products) > 1:
        matching_names = ", ".join(matching_products.keys())
        messages.append(
            _error(
                "product_match_clash",
                "Multiple products match the given dataset",
                hint=f"Maybe you need more fields in the 'metadata' section?\n"
                f"Claims to be a {specified_product_name!r}, and matches {matching_names!r}"
                if specified_product_name else
                f"Maybe you need more fields in the 'metadata' section?\n"
                f"Matches {matching_names!r}",
            ))
        # (We wont pick one from the bunch here. Maybe they already matched one above to use in continuing validation.)

    # Just like ODC, match rules will rule all. Even if their metadata has a "product_name" field.
    if len(matching_products) == 1:
        [product] = matching_products.values()

    return product, messages
def for_path(
    cls,
    wagl_hdf5: Path,
    granule_names: Optional[Sequence[str]] = None,
    level1_metadata_path: Optional[Path] = None,
    fmask_image_path: Optional[Path] = None,
    fmask_doc_path: Optional[Path] = None,
    gqa_doc_path: Optional[Path] = None,
):
    """
    Create granules by scanning the given hdf5 file.

    Optionally specify additional files and level1 path.

    If they are not specified it look for them using WAGL's output naming
    conventions.
    """
    if not wagl_hdf5.exists():
        raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

    with h5py.File(wagl_hdf5, "r") as fid:
        granule_names = granule_names or fid.keys()

        # NOTE(review): the *_path parameters are reassigned inside this
        # loop, so with multiple granules the paths resolved for the first
        # granule carry over to later iterations — confirm whether
        # multi-granule input is expected here.
        for granule_name in granule_names:
            if granule_name not in fid:
                raise ValueError(
                    f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                )

            wagl_doc_field = get_path(
                fid, (granule_name, "METADATA", "CURRENT"))
            if not wagl_doc_field:
                raise ValueError(
                    f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                )

            [wagl_doc] = loads_yaml(wagl_doc_field[()])

            if not level1_metadata_path:
                # Default: sibling metadata doc next to the level1 tar.
                level1_tar_path = Path(
                    get_path(wagl_doc, ("source_datasets", "source_level1")))
                level1_metadata_path = level1_tar_path.with_suffix(
                    ".odc-metadata.yaml")
            if not level1_metadata_path.exists():
                raise ValueError(
                    f"No level1 metadata found at {level1_metadata_path}")

            level1 = serialise.from_path(level1_metadata_path)

            fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                f"{granule_name}.fmask.img")
            if not fmask_image_path.exists():
                raise ValueError(
                    f"No fmask image found at {fmask_image_path}")

            fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                ".yaml")
            if not fmask_doc_path.exists():
                raise ValueError(f"No fmask found at {fmask_doc_path}")
            with fmask_doc_path.open("r") as fl:
                [fmask_doc] = loads_yaml(fl)

            gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                f"{granule_name}.gqa.yaml")
            if not gqa_doc_path.exists():
                raise ValueError(f"No gqa found at {gqa_doc_path}")
            with gqa_doc_path.open("r") as fl:
                [gqa_doc] = loads_yaml(fl)

            yield cls(
                name=granule_name,
                wagl_hdf5=wagl_hdf5,
                wagl_metadata=wagl_doc,
                source_level1_metadata=level1,
                fmask_doc=fmask_doc,
                fmask_image=fmask_image_path,
                gqa_doc=gqa_doc,
            )
def for_path(
    cls,
    wagl_hdf5: Path,
    granule_names: Optional[Sequence[str]] = None,
    level1_metadata_path: Optional[Path] = None,
    fmask_image_path: Optional[Path] = None,
    fmask_doc_path: Optional[Path] = None,
    s2cloudless_prob_path: Optional[Path] = None,
    s2cloudless_mask_path: Optional[Path] = None,
    s2cloudless_doc_path: Optional[Path] = None,
    gqa_doc_path: Optional[Path] = None,
    tesp_doc_path: Optional[Path] = None,
    allow_missing_provenance: bool = False,
):
    """
    Create granules by scanning the given hdf5 file.

    Optionally specify additional files and level1 path.

    If they are not specified it look for them using WAGL's output naming
    conventions.

    :param allow_missing_provenance:
        When set, a missing level1 metadata doc yields None provenance
        instead of raising.
    """
    if not wagl_hdf5.exists():
        raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

    with h5py.File(wagl_hdf5, "r") as fid:
        granule_names = granule_names or fid.keys()

        # NOTE(review): the *_path parameters are reassigned inside this
        # loop, so with multiple granules the paths resolved for the first
        # granule carry over to later iterations — confirm whether
        # multi-granule input is expected here.
        for granule_name in granule_names:
            if granule_name not in fid:
                raise ValueError(
                    f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                )

            wagl_doc_field = get_path(
                fid, (granule_name, "METADATA", "CURRENT"))
            if not wagl_doc_field:
                raise ValueError(
                    f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                )

            [wagl_doc] = loads_yaml(wagl_doc_field[()])

            level1 = _load_level1_doc(wagl_doc, level1_metadata_path,
                                      allow_missing_provenance)

            fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                f"{granule_name}.fmask.img")
            if not fmask_image_path.exists():
                raise ValueError(
                    f"No fmask image found at {fmask_image_path}")

            fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                ".yaml")
            if not fmask_doc_path.exists():
                raise ValueError(f"No fmask found at {fmask_doc_path}")
            with fmask_doc_path.open("r") as fl:
                [fmask_doc] = loads_yaml(fl)

            # s2cloudless outputs only exist for Sentinel platforms.
            if "sentinel" in wagl_doc["source_datasets"][
                    "platform_id"].lower():
                s2cloudless_prob_path = (
                    s2cloudless_prob_path or wagl_hdf5.with_name(
                        f"{granule_name}.prob.s2cloudless.tif"))
                if not s2cloudless_prob_path.exists():
                    raise ValueError(
                        f"No s2cloudless probability image found at {s2cloudless_prob_path}"
                    )

                s2cloudless_mask_path = (
                    s2cloudless_mask_path or wagl_hdf5.with_name(
                        f"{granule_name}.mask.s2cloudless.tif"))
                if not s2cloudless_mask_path.exists():
                    raise ValueError(
                        f"No s2cloudless mask image found at {s2cloudless_mask_path}"
                    )

                s2cloudless_doc_path = s2cloudless_doc_path or wagl_hdf5.with_name(
                    f"{granule_name}.s2cloudless.yaml")
                if not s2cloudless_doc_path.exists():
                    raise ValueError(
                        f"No s2cloudless metadata found at {s2cloudless_doc_path}"
                    )
                with s2cloudless_doc_path.open("r") as fl:
                    [s2cloudless_doc] = loads_yaml(fl)
            else:
                s2cloudless_prob_path = None
                s2cloudless_mask_path = None
                s2cloudless_doc = None

            gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                f"{granule_name}.gqa.yaml")
            if not gqa_doc_path.exists():
                raise ValueError(f"No gqa found at {gqa_doc_path}")
            with gqa_doc_path.open("r") as fl:
                [gqa_doc] = loads_yaml(fl)

            # Optional doc
            if tesp_doc_path:
                # But if they gave us a path, we're strict about it existing.
                if not tesp_doc_path.exists():
                    raise ValueError(
                        f"Supplied tesp doc path doesn't exist: {tesp_doc_path}"
                    )
            else:
                tesp_doc_path = wagl_hdf5.with_name(
                    f"{granule_name}.tesp.yaml")
            # NOTE(review): `tesp_doc` is only bound when this file exists;
            # when the default-named doc is absent the `yield` below would
            # raise NameError — confirm whether an earlier `tesp_doc = None`
            # was lost, or whether the doc is always present in practice.
            if tesp_doc_path.exists():
                with tesp_doc_path.open("r") as fl:
                    [tesp_doc] = loads_yaml(fl)

            yield cls(
                name=granule_name,
                wagl_hdf5=wagl_hdf5,
                wagl_metadata=wagl_doc,
                source_level1_metadata=level1,
                fmask_doc=fmask_doc,
                fmask_image=fmask_image_path,
                s2cloudless_prob=s2cloudless_prob_path,
                s2cloudless_mask=s2cloudless_mask_path,
                s2cloudless_doc=s2cloudless_doc,
                gqa_doc=gqa_doc,
                tesp_doc=tesp_doc,
            )