Example #1
def test_report_json_binary_serialization(
    dummy_test_plan_report_with_binary_asserts,
):
    """JSON Serialized & deserialized reports should be equal."""
    test_plan_schema = TestReportSchema()
    data = test_plan_schema.dumps(dummy_test_plan_report_with_binary_asserts)

    j = json.loads(data)

    # passing assertion
    hx_1_1 = get_path(j, "entries.1.entries.0.entries.1.first")
    hx_1_2 = get_path(j, "entries.1.entries.0.entries.1.second")
    assert str(b"\xF2") == hx_1_1 == hx_1_2

    # failing assertion
    hx_2_1 = get_path(j, "entries.1.entries.0.entries.2.first")
    hx_2_2 = get_path(j, "entries.1.entries.0.entries.2.second")
    assert str(b"\x00\xb1\xC1") == hx_2_1
    assert str(b"\x00\xB2\xC2") == hx_2_2

    # dict.match: the schema serializes this comparison as a list of tuples

    KEY_INDEX = 1
    FIRST_INDEX = 3
    SECOND_INDEX = 4

    comps = get_path(j, "entries.1.entries.0.entries.3.comparison")
    assert comps[0][KEY_INDEX] == str(b"binarykey\xB1")
    assert comps[1][FIRST_INDEX][1] == str(b"binary value\xB1")
    assert comps[1][SECOND_INDEX][1] == str(b"binary value\xB1")
    assert comps[3][FIRST_INDEX][1] == str(b"binary\xB1")
    assert comps[3][SECOND_INDEX][1] == str(b"binary\xB1")
    assert comps[7][FIRST_INDEX][1] == str(b"binary\xB1")
    assert comps[7][SECOND_INDEX][1] == str(b"binary\xB1")
Example #2
    def _visit(path, key, value, a, b) -> bool:
        """Traverses all elements in `compare_nested` argument a and b...

        and tries to obtain the path `p` in `b` using boltons.iterutils.get_path.
        The following cases can occur:
        1. If the path does not exist in `b` a KeyError will be raised.
        2. If the index `k` does not exist an IndexError is raised.
        3. If the other path exists, a comparison will be made using `_compare`.
           When the elements are not equal traversing `a` will be stopped
           by raising a RuntimeError.
        """
        data_structure = iterutils.get_path(a, path)
        other_data_structure = iterutils.get_path(b, path)

        other_value = other_data_structure[key]

        if not _EqCompareNested._enter(None, key, value)[1]:
            # check lengths of Sequence types first and raise
            # prior to starting a more expensive comparison!
            if isinstance(other_data_structure, (Sequence, Set)) and len(
                other_data_structure
            ) != len(data_structure):
                raise RuntimeError("len does not match")
            if isinstance(other_data_structure, Mapping) and any(
                other_data_structure.keys() ^ data_structure.keys()
            ):
                raise RuntimeError("keys do not match")
            if not _EqCompareNested._compare(value, other_value):
                raise RuntimeError("not equal")
        return True
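The callback above is only half of the pattern: it is meant to be driven by boltons.iterutils.remap, which walks `a` and hands every (path, key, value) triple to the visitor. A minimal standalone sketch of the same idea follows; the names are illustrative, it only checks that everything in `a` exists and is equal in `b`, and it omits the length and key-set checks the real `_EqCompareNested` performs above.

from boltons import iterutils

def nested_equal(a, b):
    """Return True when every leaf of `a` exists in `b` with an equal value."""
    def visit(path, key, value):
        # Look up the same (path, key) in `b`; any miss or mismatch aborts.
        try:
            other_value = iterutils.get_path(b, path)[key]
        except (iterutils.PathAccessError, KeyError, IndexError, TypeError):
            raise RuntimeError("path missing in b")
        is_container = isinstance(value, (dict, list, tuple, set))
        if not is_container and value != other_value:
            raise RuntimeError("not equal")
        return True

    try:
        iterutils.remap(a, visit=visit)
    except RuntimeError:
        return False
    return True

assert nested_equal({"x": [1, {"y": 2}]}, {"x": [1, {"y": 2}]})
assert not nested_equal({"x": [1, {"y": 2}]}, {"x": [1, {"y": 3}]})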
Example #3
    def test_depth_one(self):
        root = ['test']
        assert get_path(root, (0, )) == 'test'
        assert get_path(root, '0') == 'test'

        root = {'key': 'value'}
        assert get_path(root, ('key', )) == 'value'
        assert get_path(root, 'key') == 'value'
Example #4
    def test_depth_one(self):
        root = ['test']
        assert get_path(root, (0,)) == 'test'
        assert get_path(root, '0') == 'test'

        root = {'key': 'value'}
        assert get_path(root, ('key',)) == 'value'
        assert get_path(root, 'key') == 'value'
Example #5
def test_report_json_binary_serialization(
    dummy_test_plan_report_with_binary_asserts, ):
    """JSON Serialized & deserialized reports should be equal."""
    test_plan_schema = TestReportSchema(strict=True)
    data = test_plan_schema.dumps(
        dummy_test_plan_report_with_binary_asserts).data

    j = json.loads(data)
    bkey = EntriesField._BYTES_KEY

    # passing assertion
    hx_1_1 = get_path(j, "entries.1.entries.0.entries.1.first")[bkey]
    hx_1_2 = get_path(j, "entries.1.entries.0.entries.1.second")[bkey]
    assert ["0xF2"] == hx_1_1 == hx_1_2

    # failing assertion
    hx_2_1 = get_path(j, "entries.1.entries.0.entries.2.first")[bkey]
    hx_2_2 = get_path(j, "entries.1.entries.0.entries.2.second")[bkey]
    assert ["0x00", "0xB1", "0xC1"] == hx_2_1
    assert ["0x00", "0xB2", "0xC2"] == hx_2_2

    # dict.match: the schema serializes this comparison as a list of tuples

    KEY_INDEX = 1
    FIRST_INDEX = 3
    SECOND_INDEX = 4

    comps = get_path(j, "entries.1.entries.0.entries.3.comparison")
    assert comps[0][KEY_INDEX][bkey] == EntriesField._binary_to_hex_list(
        b"binarykey\xB1")
    assert comps[1][FIRST_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary value\xB1")
    assert comps[1][SECOND_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary value\xB1")
    assert comps[3][FIRST_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    assert comps[3][SECOND_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    assert comps[7][FIRST_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")
    assert comps[7][SECOND_INDEX][1][bkey] == EntriesField._binary_to_hex_list(
        b"binary\xB1")

    deserialized_report = test_plan_schema.loads(data).data
    check_report(
        actual=deserialized_report,
        expected=dummy_test_plan_report_with_binary_asserts,
    )
Example #6
def _read_wagl_metadata(p: DatasetAssembler, granule_group: h5py.Group):
    try:
        wagl_path, *ancil_paths = [
            pth for pth in (_find_h5_paths(granule_group, "SCALAR"))
            if "METADATA" in pth
        ]
    except ValueError:
        raise ValueError("No nbar metadata found in granule")

    [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise ValueError(
            f"WAGL dataset contains no time processed. Path {wagl_path}")

    for i, path in enumerate(ancil_paths, start=2):
        wagl_doc.setdefault(f"wagl_{i}", {}).update(
            list(loads_yaml(granule_group[path][()]))[0]["ancillary"])

    p.properties["dea:dataset_maturity"] = _determine_maturity(
        p.datetime, p.processed, wagl_doc)

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
Example #7
def _load_level1_doc(
    wagl_doc: Dict,
    user_specified_l1_path: Optional[Path] = None,
    allow_missing_provenance=False,
):

    if user_specified_l1_path:
        if not user_specified_l1_path.exists():
            raise ValueError(
                f"No level1 metadata found at given path {user_specified_l1_path}"
            )
        level1_path = user_specified_l1_path
    else:
        level1_path = Path(
            get_path(wagl_doc, ("source_datasets", "source_level1")))

    # If a directory, assume "<dirname>.odc-metadata.yaml"
    if level1_path.is_dir():
        metadata_path = level1_path / (level1_path.name + ".odc-metadata.yaml")
    # Otherwise it's a sibling file with ".odc-metadata.yaml" suffix
    else:
        if level1_path.suffix.lower() == ".yaml":
            metadata_path = level1_path
        else:
            metadata_path = level1_path.with_suffix(".odc-metadata.yaml")

    if not metadata_path.exists():
        if not allow_missing_provenance:
            raise ValueError(
                "No level1 found or provided. "
                f"WAGL said it was at path {str(level1_path)!r}. "
                "Which has no metadata doc we can find, and you didn't specify an alternative. "
                f"(allow_missing_provenance={allow_missing_provenance})")
        return None
    return serialise.from_path(metadata_path)
Example #8
    def info(
        self,
        max_rows: int = None,
        max_length: int = None,
        show_values: bool = False,
        path: tuple = None,
    ):
        """Print the content to the stdout.

        Parameters
        ----------
        max_rows :
            The maximum number of rows that will be printed. If rows are cut, a
            corresponding message will be printed
        max_length :
            The maximum line length. Longer lines will be truncated
        show_values :
            Set to `True` if primitive values should be displayed
        path
            tuple representing the lookup path in the yaml/asdf tree

        """
        tree = {
            key: value
            for key, value in self._asdf_handle.tree.items()
            if key not in ["asdf_library", "history"]
        }
        if path is not None:
            tree = get_path(tree, path)
        asdf.info(tree,
                  max_rows=max_rows,
                  max_cols=max_length,
                  show_values=show_values)
Example #9
 def add_record(self, in_dict):
     bindables = []
     for path, col, _ in self._flat_fields:
         val = get_path(in_dict, path)
         bindables.append(val)
     conn = sqlite3.connect(self.file_path)
     with conn:
         conn.execute(self._insert_q, bindables)
     return
Example #10
 def visit(p, k, v):
     try:
         update_v = itu.get_path(d2, p)[k]
     except (itu.PathAccessError, KeyError):
         return (k, v)
     if isinstance(update_v, collections.abc.Mapping) and isinstance(
             v, collections.abc.Mapping):
         return (k, {**v, **update_v})
     return (k, update_v)
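A visit callback like this is typically handed to boltons.iterutils.remap to produce `d1` updated with the values of `d2`, merging mappings shallowly at each matching path. A hedged, self-contained sketch of that wiring (the wrapper name `nested_update` is illustrative):

import collections.abc
from boltons import iterutils as itu

def nested_update(d1, d2):
    def visit(p, k, v):
        try:
            update_v = itu.get_path(d2, p)[k]
        except (itu.PathAccessError, KeyError):
            return (k, v)
        if isinstance(update_v, collections.abc.Mapping) and isinstance(
                v, collections.abc.Mapping):
            return (k, {**v, **update_v})
        return (k, update_v)
    return itu.remap(d1, visit=visit)

assert nested_update({"a": {"x": 1}, "b": 2}, {"a": {"y": 3}}) == \
    {"a": {"x": 1, "y": 3}, "b": 2}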
Example #12
def validate_paths(
    paths: List[Path],
    thorough: bool = False
) -> Generator[Tuple[Path, List[ValidationMessage]], None, None]:
    """Validate the list of paths. Product documents can be specified before their datasets."""
    products: Dict[str, Dict] = {}

    for path in paths:
        # Load yaml. If product, add to products.
        # Otherwise validate.
        doc = serialise.load_yaml(path)
        messages = []

        if is_product(doc):
            messages.extend(validate_product(doc))
            products[doc["name"]] = doc
            yield path, messages
            continue

        # TODO: follow ODC's match rules?
        product = None
        product_name = get_path(doc, ("product", "name"), default=None)

        if products:
            if len(products) == 1:
                [product] = products.values()
            elif product_name is not None:
                product = products.get(product_name)

            if product is None:
                messages.append(
                    _warning(
                        "unknown_product",
                        "Cannot match dataset to product",
                        hint=f"Nothing matches {product_name!r}"
                        if product_name else
                        "No product name in dataset (TODO: field matching)",
                    ))
        else:
            messages.append(
                ValidationMessage(
                    Level.error if thorough else Level.info,
                    "no_product",
                    "No product provided: validating dataset information alone",
                ))

        messages.extend(
            validate_dataset(
                doc,
                product_definition=product,
                readable_location=path,
                thorough=thorough,
            ))
        yield path, messages
Example #13
 def visit_missing_file(path, key, value):
     if type(value) is dict:
         return True
     if not value:
         message = 'The namelist file ' + key + ' = ' + \
                   str(get_path(nlst, (path))[key]) + ' does not exist'
         if key not in [*hrldas_exempt_list, *hydro_exempt_list]:
             raise ValueError(message)
         else:
             warnings.warn(message)
     return False
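Such a callback is driven by boltons.iterutils.remap over a parallel structure of existence flags, while `nlst` still holds the file names used in the message. A hedged, self-contained sketch with made-up data (`nlst`, `exists`, and the exempt lists below are hypothetical):

import warnings
from boltons.iterutils import remap, get_path

nlst = {"hrldas": {"restart_filename": "/no/such/file"}}
exists = {"hrldas": {"restart_filename": False}}   # False: the file is missing
hrldas_exempt_list, hydro_exempt_list = ["restart_filename"], []

def visit_missing_file(path, key, value):
    if type(value) is dict:
        return True
    if not value:
        message = ('The namelist file ' + key + ' = '
                   + str(get_path(nlst, path)[key]) + ' does not exist')
        if key not in [*hrldas_exempt_list, *hydro_exempt_list]:
            raise ValueError(message)
        warnings.warn(message)
    return False

remap(exists, visit=visit_missing_file)  # warns because the key is exempt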
Example #14
 def _dover_enter(p, k, v):
     ret = default_enter(p, k, v)
     if k is None:
         return ret
     try:
         n = get_path(config, p)
         if (k not in n):
             n[k] = v
             return (ret[0], False)
     except:
         pass
     return ret
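This shows the other remap extension point, the `enter` hook: walk a defaults tree and copy any key missing from `config` at the same path, then stop descending into that branch. A hedged, self-contained sketch of how such a hook might be driven (the variable names are illustrative):

from boltons.iterutils import remap, default_enter, get_path

config = {"a": {"x": 1}}
defaults = {"a": {"x": 0, "y": 2}, "b": 3}

def _dover_enter(p, k, v):
    ret = default_enter(p, k, v)
    if k is None:          # root of the defaults tree
        return ret
    try:
        n = get_path(config, p)
        if k not in n:     # missing in config: copy the default, stop descending
            n[k] = v
            return (ret[0], False)
    except (KeyError, IndexError, TypeError):
        pass
    return ret

remap(defaults, enter=_dover_enter)   # traversal is only for its side effects
assert config == {"a": {"x": 1, "y": 2}, "b": 3}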
Example #15
def _apply_wagl_metadata(p: DatasetAssembler, wagl_doc: Dict):
    source = wagl_doc["source_datasets"]
    p.datetime = source["acquisition_datetime"]
    p.platform = source["platform_id"]
    p.instrument = source["sensor_id"]

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise RuntimeError("WAGL dataset contains no processed time.")

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
Example #16
    def test_pools_have_monitors(config_path, _):
        """
        Require monitor configuration for each service
        """
        # remove 'class' from config_path, which is always the last element
        config_path = config_path[:-1]
        monitors = iterutils.get_path(tenants, config_path).get("monitors")

        print(f"currently processing: {config_path}, monitors: {monitors}",
              end=" ")
        assert monitors  # not empty and not None
        for monitor in monitors:
            assert monitor  # make sure we don't have an empty entry in the list of monitors either!
Example #17
def view_tree(file: types_path_and_file_like, path: Tuple = None, **kwargs):
    """Display YAML header using IPython JSON display repr.

    This function works in JupyterLab.

    Parameters
    ----------
    file :
        filename or file-like object pointing towards / containing an ASDF file.
    path :
        tuple representing the lookup path in the yaml/asdf tree
    kwargs
        kwargs passed down to JSON constructor

    Returns
    -------
    IPython.display.JSON
        JSON object for rich output in JupyterLab

    Examples
    --------
    Visualize the full tree of an existing ASDF file::

        weldx.asdf.utils.view_tree("single_pass_weld_example.asdf")

    Visualize a specific element in the tree structure by providing the path::

        weldx.asdf.utils.view_tree(
            "single_pass_weld_example.asdf", path=("process",)
        )

        weldx.asdf.utils.view_tree(
            "single_pass_weld_example.asdf", path=("process", "welding_process")
        )

    """
    from IPython.display import JSON

    if isinstance(file, str):
        root = file + "/"
    else:
        root = "/"

    yaml_dict = get_yaml_header(file, parse=True)
    if path:
        root = root + "/".join(path)
        yaml_dict = get_path(yaml_dict, path)
    kwargs["root"] = root
    return JSON(yaml_dict, **kwargs)
Example #18
def _get_level1_metadata_path(wagl_doc: Dict) -> Optional[Path]:
    """
    Find the expected matching metadata file for the source dataset
    """
    source_level1 = Path(
        get_path(wagl_doc, ("source_datasets", "source_level1")))
    if not source_level1.exists():
        return None

    # If a directory, assume "<dirname>.odc-metadata.yaml"
    if source_level1.is_dir():
        return source_level1 / (source_level1.name + ".odc-metadata.yaml")
    # Otherwise it's a sibling file with ".odc-metadata.yaml" suffix
    else:
        return source_level1.with_suffix(".odc-metadata.yaml")
Example #19
    def get(self, key: str, default: Any = None) -> Any:
        """Retrieve config value.

        Args:
            key (str): Key (in dot-notation) of value to return.
            default (Any, optional): Default value to return.
                Defaults to None.

        Returns:
            Any: Value at key given

        """
        key_path = key.split('.')
        value = iterutils.get_path(self.config, key_path, default=default)
        return value
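A minimal standalone sketch of the lookup this wrapper performs: get_path walks the split key segments and falls back to the default when any segment is missing (the nested dict below is illustrative):

from boltons.iterutils import get_path

config = {"database": {"host": "localhost", "port": 5432}}
assert get_path(config, "database.port".split("."), default=None) == 5432
assert get_path(config, "database.missing".split("."), default=None) is None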
Example #20
    def diffs_dict(self):
        '''The diffs_dict attribute has getter (@property) and setter methods.
        The get method summarizes all differences across all the attributes of the
        members list attribute and (should) only report member attributes when there
        is at least one difference between members.
        The setter method is meant as a convenient way to specify the differences in
        member attributes across the ensemble.
        '''

        if len(self) == 1:
            print('Ensemble is of length 1, no differences.')
            return {}

        mem_0_ref_dict = dictify(self.members[0])

        all_diff_keys = set({})
        for ii in range(1, len(self)):
            mem_ii_ref_dict = dictify(self.members[ii])
            diff = DeepDiffEq(mem_0_ref_dict,
                              mem_ii_ref_dict,
                              eq_types={pathlib.PosixPath})

            unexpected_diffs = set(diff.keys()) - set(['values_changed'])
            if len(unexpected_diffs):
                unexpected_diffs1 = {
                    uu: diff[uu]
                    for uu in list(unexpected_diffs)
                }
                raise ValueError(
                    'Unexpected attribute differences between ensemble members:',
                    unexpected_diffs1)

            diff_keys = list(diff['values_changed'].keys())
            all_diff_keys = all_diff_keys | set(
                [ss.replace('root', '') for ss in diff_keys])

        diff_tuples = [ss.replace('][', ',') for ss in list(all_diff_keys)]
        diff_tuples = [ss.replace('[', '(') for ss in list(diff_tuples)]
        diff_tuples = [ss.replace(']', ')') for ss in list(diff_tuples)]
        diff_tuples = [ast.literal_eval(ss) for ss in list(diff_tuples)]

        self.__diffs_dict = {}
        for dd in diff_tuples:
            self.__diffs_dict[dd] = [
                get_path(dictify(mm), dd) for mm in self.members
            ]

        return (self.__diffs_dict)
Example #21
    def remerge_enter(path, key, value):
        new_parent, new_items = default_enter(path, key, value)
        if ret and not path and key is None:
            new_parent = ret
        try:
            # TODO: type check?
            new_parent = get_path(ret, path + (key,))
        except KeyError:
            pass

        if isinstance(value, list):
            # lists are purely additive. See https://github.com/mahmoud/boltons/issues/81
            new_parent.extend(value)
            new_items = []

        return new_parent, new_items
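This enter hook comes from the "remerge" recipe referenced in the comment above: `ret` holds the running merge, and each input dict is walked with remap so matching paths are merged and lists are appended. A hedged sketch of the surrounding driver, assuming that recipe:

from boltons.iterutils import remap, default_enter, get_path

def remerge(*dicts):
    ret = None
    for d in dicts:
        def remerge_enter(path, key, value):
            new_parent, new_items = default_enter(path, key, value)
            if ret and not path and key is None:
                new_parent = ret
            try:
                # reuse the already-merged container at this path, if any
                new_parent = get_path(ret, path + (key,))
            except KeyError:
                pass
            if isinstance(value, list):
                # lists are purely additive
                new_parent.extend(value)
                new_items = []
            return new_parent, new_items
        ret = remap(d, enter=remerge_enter)
    return ret

merged = remerge({"a": {"x": 1}, "l": [1]}, {"a": {"y": 2}, "l": [2]})
assert merged == {"a": {"x": 1, "y": 2}, "l": [1, 2]}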
Example #22
def asdf_json_repr(file, path: Tuple = None, **kwargs):
    """Display YAML header using IPython JSON display repr.

    This function works in JupyterLab.

    Parameters
    ----------
    file
        filename or BytesIO buffer of ASDF file
    path
        tuple representing the lookup path in the yaml/asdf tree
    kwargs
        kwargs passed down to JSON constructor

    Returns
    -------
    IPython.display.JSON
        JSON object for rich output in JupyterLab

    Examples
    --------
    Visualize the full tree of an existing ASDF file::

        weldx.asdf.utils.asdf_json_repr("single_pass_weld_example.asdf")

    Visualize a specific element in the tree structure by providing the path::

        weldx.asdf.utils.asdf_json_repr(
            "single_pass_weld_example.asdf", path=("process", "welding_process")
        )


    """
    from IPython.display import JSON

    if isinstance(file, str):
        root = file + "/"
    else:
        root = "/"

    code = get_yaml_header(file)
    yaml_dict = yaml.load(code, Loader=yaml.BaseLoader)
    if path:
        root = root + "/".join(path)
        yaml_dict = get_path(yaml_dict, path)
    kwargs["root"] = root
    return JSON(yaml_dict, **kwargs)
Example #23
    def member_diffs(self):
        """Get method for ensemble member differences. Only differences are reported."""

        if len(self) == 1:
            print('Ensemble is of length 1, no differences.')
            return {}

        mem_0_ref_dict = dictify(self.members[0])

        # TODO(JLM): Could this be parallelized?
        all_diff_keys = set({})
        for imem, mem in enumerate(self.members):
            if imem == 0:
                continue
            mem_ii_ref_dict = dictify(mem)
            diff = DeepDiffEq(mem_0_ref_dict,
                              mem_ii_ref_dict,
                              eq_types={pathlib.PosixPath})

            unexpected_diffs = set(diff.keys()) - set(['values_changed'])
            if len(unexpected_diffs):
                unexpected_diffs1 = {
                    uu: diff[uu]
                    for uu in list(unexpected_diffs)
                }
                raise ValueError(
                    'Unexpected attribute differences between ensemble members:',
                    unexpected_diffs1)

            diff_keys = list(diff['values_changed'].keys())
            all_diff_keys = all_diff_keys | set(
                [ss.replace('root', '') for ss in diff_keys])

        # This translates hierarchical dict entries to tuples.
        diff_tuples = [ss.replace('][', ',') for ss in list(all_diff_keys)]
        diff_tuples = [ss.replace('[', '(') for ss in list(diff_tuples)]
        diff_tuples = [ss.replace(']', ')') for ss in list(diff_tuples)]
        diff_tuples = [ast.literal_eval(ss) for ss in list(diff_tuples)]

        self.__member_diffs = {}
        for dd in diff_tuples:
            self.__member_diffs[dd] = [
                get_path(dictify(mm), dd) for mm in self.members
            ]

        return (self.__member_diffs)
Example #24
def validate_eo3_doc(
    doc: Dict,
    location: Union[str, Path],
    products: Dict[str, Dict],
    thorough: bool = False,
    expect_extra_measurements=False,
) -> List[ValidationMessage]:
    messages = []

    # TODO: follow ODC's match rules?
    product = None
    product_name = get_path(doc, ("product", "name"), default=None)

    if products:
        if len(products) == 1:
            [product] = products.values()
        elif product_name is not None:
            product = products.get(product_name)

        if product is None:
            messages.append(
                _warning(
                    "unknown_product",
                    "Cannot match dataset to product",
                    hint=f"Nothing matches {product_name!r}" if product_name
                    else "No product name in dataset (TODO: field matching)",
                ))
    else:
        messages.append(
            ValidationMessage(
                Level.error if thorough else Level.info,
                "no_product",
                "No product provided: validating dataset information alone",
            ))

    messages.extend(
        validate_dataset(
            doc,
            product_definition=product,
            readable_location=location,
            thorough=thorough,
            expect_extra_measurements=expect_extra_measurements,
        ))
    return messages
Example #25
    def diffs_dict(self):

        if len(self) == 1:
            print('Ensemble is of length 1, no differences.')
            return {}

        mem_0_ref_dict = dictify(self.members[0])

        all_diff_keys = set({})
        for ii in range(1, len(self)):
            mem_ii_ref_dict = dictify(self.members[ii])
            diff = DeepDiffEq(mem_0_ref_dict,
                              mem_ii_ref_dict,
                              eq_types={pathlib.PosixPath})

            unexpected_diffs = set(diff.keys()) - set(['values_changed'])
            if len(unexpected_diffs):
                unexpected_diffs1 = {
                    uu: diff[uu]
                    for uu in list(unexpected_diffs)
                }
                raise ValueError(
                    'Unexpected attribute differences between ensemble members:',
                    unexpected_diffs1)

            diff_keys = list(diff['values_changed'].keys())
            all_diff_keys = all_diff_keys | set(
                [ss.replace('root', '') for ss in diff_keys])

        diff_tuples = [ss.replace('][', ',') for ss in list(all_diff_keys)]
        diff_tuples = [ss.replace('[', '(') for ss in list(diff_tuples)]
        diff_tuples = [ss.replace(']', ')') for ss in list(diff_tuples)]
        diff_tuples = [ast.literal_eval(ss) for ss in list(diff_tuples)]

        self.__diffs_dict = {}
        for dd in diff_tuples:
            self.__diffs_dict[dd] = [
                get_path(dictify(mm), dd) for mm in self.members
            ]

        return (self.__diffs_dict)
Example #26
def load_config(config_path: Path, config_key: Tuple = ()) -> Configuration:
    config = get_path(toml.load(config_path), config_key, default={})
    config["config_path"] = config_path
    return Configuration.parse_obj(config)
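The notable detail is the `default={}` fallback: when `config_key` names a table that is absent from the TOML document, an empty mapping is used instead of raising. A small hedged sketch of just that lookup (the table names are illustrative):

import toml
from boltons.iterutils import get_path

doc = toml.loads("""
[tool.myapp]
debug = true
""")
assert get_path(doc, ("tool", "myapp"), default={}) == {"debug": True}
assert get_path(doc, ("tool", "other"), default={}) == {}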
Example #27
 def test_depth_two(self):
     root = {'key': ['test']}
     assert get_path(root, ('key', 0)) == 'test'
     assert get_path(root, 'key.0') == 'test'
Example #28
 def __delitem__(self, k: str) -> None:
     nested_tables, k = Conf._extract_table_list(k)
     table = iterutils.get_path(self._toml_dict, nested_tables)
     del table[k]
Example #30
def package_non_standard(outdir, granule):
    """
    yaml creator for the ard pipeline.
    """

    outdir = Path(outdir) / granule.name
    indir = granule.wagl_hdf5.parent

    if indir.is_file():
        shutil.copy(indir, outdir)
    else:
        shutil.copytree(indir, outdir)

    wagl_h5 = outdir / str(granule.name + ".wagl.h5")
    dataset_doc = outdir / str(granule.name + ".yaml")
    boolean_h5 = Path(str(wagl_h5).replace("wagl.h5", "converted.datasets.h5"))
    fmask_img = outdir / str(granule.name + ".fmask.img")

    f = h5py.File(boolean_h5, "a")  # opened writable; datasets are created below

    with DatasetAssembler(metadata_path=dataset_doc,
                          naming_conventions="dea") as da:
        level1 = granule.source_level1_metadata
        da.add_source_dataset(level1,
                              auto_inherit_properties=True,
                              inherit_geometry=True)
        da.product_family = "ard"
        da.producer = "ga.gov.au"
        da.properties["odc:file_format"] = "HDF5"

        with h5py.File(wagl_h5, "r") as fid:
            img_paths = [ppjoin(fid.name, pth) for pth in find(fid, "IMAGE")]
            granule_group = fid[granule.name]

            try:
                wagl_path, *ancil_paths = [
                    pth for pth in find(granule_group, "SCALAR")
                    if "METADATA" in pth
                ]
            except ValueError:
                raise ValueError("No nbar metadata found in granule")

            [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

            da.processed = get_path(wagl_doc,
                                    ("system_information", "time_processed"))

            platform = da.properties["eo:platform"]
            if platform == "sentinel-2a" or platform == "sentinel-2b":
                org_collection_number = 3
            else:
                org_collection_number = utils.get_collection_number(
                    platform, da.producer,
                    da.properties["landsat:collection_number"])

            da.dataset_version = f"{org_collection_number}.1.0"
            da.region_code = eodatasets3.wagl._extract_reference_code(
                da, granule.name)

            eodatasets3.wagl._read_gqa_doc(da, granule.gqa_doc)
            eodatasets3.wagl._read_fmask_doc(da, granule.fmask_doc)

            with rasterio.open(fmask_img) as ds:
                fmask_layer = "/{}/OA_FMASK/oa_fmask".format(granule.name)
                data = ds.read(1)
                fmask_ds = f.create_dataset(fmask_layer,
                                            data=data,
                                            compression="lzf",
                                            shuffle=True)
                fmask_ds.attrs["crs_wkt"] = ds.crs.wkt
                fmask_ds.attrs["geotransform"] = ds.transform.to_gdal()

                fmask_ds.attrs[
                    "description"] = "Converted from ERDAS Imagine format to HDF5 to work with the limitations of varied formats within ODC"  # noqa E501

                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=ds.transform,
                    crs=CRS.from_wkt(fmask_ds.attrs["crs_wkt"]),
                )

                measurement_name = "oa_fmask"

                pathname = str(outdir.joinpath(boolean_h5))

                no_data = fmask_ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                da._measurements.record_image(
                    measurement_name,
                    grid_spec,
                    pathname,
                    fmask_ds[:],
                    layer="/{}".format(fmask_layer),
                    nodata=no_data,
                    expand_valid_data=False,
                )

            for pathname in img_paths:
                ds = fid[pathname]
                ds_path = Path(ds.name)

                # eodatasets internally uses this grid spec to group image datasets
                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=Affine.from_gdal(*ds.attrs["geotransform"]),
                    crs=CRS.from_wkt(ds.attrs["crs_wkt"]),
                )

                # product group name; lambertian, nbar, nbart, oa
                if "STANDARDISED-PRODUCTS" in str(ds_path):
                    product_group = ds_path.parent.name
                elif "INTERPOLATED-ATMOSPHERIC-COEFFICIENTS" in str(ds_path):
                    product_group = "oa_{}".format(ds_path.parent.name)
                else:
                    product_group = "oa"

                # spatial resolution group
                # used to separate measurements with the same name
                resolution_group = "rg{}".format(
                    ds_path.parts[2].split("-")[-1])

                measurement_name = ("_".join([
                    resolution_group,
                    product_group,
                    ds.attrs.get("alias", ds_path.name),
                ]).replace("-",
                           "_").lower())  # we don't wan't hyphens in odc land

                # include this band in defining the valid data bounds?
                include = "nbart" in measurement_name

                no_data = ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                # if we are of type bool, we'll have to convert just for GDAL
                if ds.dtype.name == "bool":
                    pathname = str(outdir.joinpath(boolean_h5))
                    out_ds = f.create_dataset(
                        measurement_name,
                        data=np.uint8(ds[:]),
                        compression="lzf",
                        shuffle=True,
                        chunks=ds.chunks,
                    )

                    for k, v in ds.attrs.items():
                        out_ds.attrs[k] = v

                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        out_ds[:],
                        layer="/{}".format(out_ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )
                else:
                    pathname = str(outdir.joinpath(wagl_h5))

                    # work around as note_measurement doesn't allow us to specify the gridspec
                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        ds[:],
                        layer="/{}".format(ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )

        # the longest part here is generating the valid data bounds vector
        # landsat 7 post SLC-OFF can take a really long time
        return da.done()
Example #31
def _match_product(
    dataset_doc: Dict, product_definitions: Dict[str, Dict]
) -> Tuple[Optional[Dict], List[ValidationMessage]]:
    """Match the given dataset to a product definition"""

    product = None

    # EO3 datasets often put the product name directly inside.
    specified_product_name = get_path(dataset_doc, ("product", "name"),
                                      default=None)
    specified_product_name = specified_product_name or get_path(
        dataset_doc, ("properties", "odc:product"), default=None)

    if specified_product_name and (specified_product_name
                                   in product_definitions):
        product = product_definitions[specified_product_name]

    matching_products = {
        name: definition
        for name, definition in product_definitions.items()
        if changes.contains(dataset_doc, definition["metadata"])
    }

    # If we have nothing, give up!
    if (not matching_products) and (not product):

        # Find the product that most closely matches it, to helpfully show the differences!
        closest_product_name = None
        closest_differences = None
        for name, definition in product_definitions.items():
            diffs = tuple(
                _get_product_mismatch_reasons(dataset_doc, definition))
            if (closest_differences is
                    None) or len(diffs) < len(closest_differences):
                closest_product_name = name
                closest_differences = diffs

        difference_hint = _differences_as_hint(closest_differences)
        return None, [
            _error(
                "unknown_product",
                "Dataset does not match the given products",
                hint=
                f"Closest match is {closest_product_name}, with differences:"
                f"\n{difference_hint}",
            )
        ]

    messages = []

    if specified_product_name not in matching_products:
        if product:
            difference_hint = _differences_as_hint(
                _get_product_mismatch_reasons(dataset_doc, product))
            messages.append(
                _info(
                    "strange_product_claim",
                    f"Dataset claims to be product {specified_product_name!r}, but doesn't match its fields",
                    hint=f"{difference_hint}",
                ))
        else:
            messages.append(
                _info(
                    "unknown_product_claim",
                    f"Dataset claims to be product {specified_product_name!r}, but it wasn't supplied.",
                ))

    if len(matching_products) > 1:
        matching_names = ", ".join(matching_products.keys())
        messages.append(
            _error(
                "product_match_clash",
                "Multiple products match the given dataset",
                hint=f"Maybe you need more fields in the 'metadata' section?\n"
                f"Claims to be a {specified_product_name!r}, and matches {matching_names!r}"
                if specified_product_name else
                f"Maybe you need more fields in the 'metadata' section?\n"
                f"Matches {matching_names!r}",
            ))
        # (We won't pick one from the bunch here. Maybe they already matched one above to use in continuing validation.)

    # Just like ODC, match rules will rule all. Even if their metadata has a "product_name" field.
    if len(matching_products) == 1:
        [product] = matching_products.values()

    return product, messages
Example #32
    def for_path(
        cls,
        wagl_hdf5: Path,
        granule_names: Optional[Sequence[str]] = None,
        level1_metadata_path: Optional[Path] = None,
        fmask_image_path: Optional[Path] = None,
        fmask_doc_path: Optional[Path] = None,
        gqa_doc_path: Optional[Path] = None,
    ):
        """
        Create granules by scanning the given hdf5 file.

        Optionally specify additional files and level1 path.

        If they are not specified, it will look for them using WAGL's output naming conventions.
        """
        if not wagl_hdf5.exists():
            raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

        with h5py.File(wagl_hdf5, "r") as fid:
            granule_names = granule_names or fid.keys()

            for granule_name in granule_names:
                if granule_name not in fid:
                    raise ValueError(
                        f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                    )

                wagl_doc_field = get_path(
                    fid, (granule_name, "METADATA", "CURRENT"))
                if not wagl_doc_field:
                    raise ValueError(
                        f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                    )

                [wagl_doc] = loads_yaml(wagl_doc_field[()])

                if not level1_metadata_path:
                    level1_tar_path = Path(
                        get_path(wagl_doc,
                                 ("source_datasets", "source_level1")))
                    level1_metadata_path = level1_tar_path.with_suffix(
                        ".odc-metadata.yaml")
                if not level1_metadata_path.exists():
                    raise ValueError(
                        f"No level1 metadata found at {level1_metadata_path}")

                level1 = serialise.from_path(level1_metadata_path)

                fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                    f"{granule_name}.fmask.img")
                if not fmask_image_path.exists():
                    raise ValueError(
                        f"No fmask image found at {fmask_image_path}")

                fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                    ".yaml")
                if not fmask_doc_path.exists():
                    raise ValueError(f"No fmask found at {fmask_doc_path}")
                with fmask_doc_path.open("r") as fl:
                    [fmask_doc] = loads_yaml(fl)

                gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                    f"{granule_name}.gqa.yaml")
                if not gqa_doc_path.exists():
                    raise ValueError(f"No gqa found at {gqa_doc_path}")
                with gqa_doc_path.open("r") as fl:
                    [gqa_doc] = loads_yaml(fl)

                yield cls(
                    name=granule_name,
                    wagl_hdf5=wagl_hdf5,
                    wagl_metadata=wagl_doc,
                    source_level1_metadata=level1,
                    fmask_doc=fmask_doc,
                    fmask_image=fmask_image_path,
                    gqa_doc=gqa_doc,
                )
Example #33
    def for_path(
        cls,
        wagl_hdf5: Path,
        granule_names: Optional[Sequence[str]] = None,
        level1_metadata_path: Optional[Path] = None,
        fmask_image_path: Optional[Path] = None,
        fmask_doc_path: Optional[Path] = None,
        s2cloudless_prob_path: Optional[Path] = None,
        s2cloudless_mask_path: Optional[Path] = None,
        s2cloudless_doc_path: Optional[Path] = None,
        gqa_doc_path: Optional[Path] = None,
        tesp_doc_path: Optional[Path] = None,
        allow_missing_provenance: bool = False,
    ):
        """
        Create granules by scanning the given hdf5 file.

        Optionally specify additional files and level1 path.

        If they are not specified, it will look for them using WAGL's output naming conventions.
        :param allow_missing_provenance:
        """
        if not wagl_hdf5.exists():
            raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

        with h5py.File(wagl_hdf5, "r") as fid:
            granule_names = granule_names or fid.keys()

            for granule_name in granule_names:
                if granule_name not in fid:
                    raise ValueError(
                        f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                    )

                wagl_doc_field = get_path(
                    fid, (granule_name, "METADATA", "CURRENT"))
                if not wagl_doc_field:
                    raise ValueError(
                        f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                    )

                [wagl_doc] = loads_yaml(wagl_doc_field[()])

                level1 = _load_level1_doc(wagl_doc, level1_metadata_path,
                                          allow_missing_provenance)

                fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                    f"{granule_name}.fmask.img")
                if not fmask_image_path.exists():
                    raise ValueError(
                        f"No fmask image found at {fmask_image_path}")

                fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                    ".yaml")
                if not fmask_doc_path.exists():
                    raise ValueError(f"No fmask found at {fmask_doc_path}")
                with fmask_doc_path.open("r") as fl:
                    [fmask_doc] = loads_yaml(fl)

                if "sentinel" in wagl_doc["source_datasets"][
                        "platform_id"].lower():
                    s2cloudless_prob_path = (
                        s2cloudless_prob_path or wagl_hdf5.with_name(
                            f"{granule_name}.prob.s2cloudless.tif"))
                    if not s2cloudless_prob_path.exists():
                        raise ValueError(
                            f"No s2cloudless probability image found at {s2cloudless_prob_path}"
                        )

                    s2cloudless_mask_path = (
                        s2cloudless_mask_path or wagl_hdf5.with_name(
                            f"{granule_name}.mask.s2cloudless.tif"))
                    if not s2cloudless_mask_path.exists():
                        raise ValueError(
                            f"No s2cloudless mask image found at {s2cloudless_mask_path}"
                        )

                    s2cloudless_doc_path = s2cloudless_doc_path or wagl_hdf5.with_name(
                        f"{granule_name}.s2cloudless.yaml")
                    if not s2cloudless_doc_path.exists():
                        raise ValueError(
                            f"No s2cloudless metadata found at {s2cloudless_doc_path}"
                        )
                    with s2cloudless_doc_path.open("r") as fl:
                        [s2cloudless_doc] = loads_yaml(fl)
                else:
                    s2cloudless_prob_path = None
                    s2cloudless_mask_path = None
                    s2cloudless_doc = None

                gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                    f"{granule_name}.gqa.yaml")
                if not gqa_doc_path.exists():
                    raise ValueError(f"No gqa found at {gqa_doc_path}")
                with gqa_doc_path.open("r") as fl:
                    [gqa_doc] = loads_yaml(fl)

                # Optional doc
                if tesp_doc_path:
                    # But if they gave us a path, we're strict about it existing.
                    if not tesp_doc_path.exists():
                        raise ValueError(
                            f"Supplied tesp doc path doesn't exist: {tesp_doc_path}"
                        )
                else:
                    tesp_doc_path = wagl_hdf5.with_name(
                        f"{granule_name}.tesp.yaml")
                tesp_doc = None
                if tesp_doc_path.exists():
                    with tesp_doc_path.open("r") as fl:
                        [tesp_doc] = loads_yaml(fl)

                yield cls(
                    name=granule_name,
                    wagl_hdf5=wagl_hdf5,
                    wagl_metadata=wagl_doc,
                    source_level1_metadata=level1,
                    fmask_doc=fmask_doc,
                    fmask_image=fmask_image_path,
                    s2cloudless_prob=s2cloudless_prob_path,
                    s2cloudless_mask=s2cloudless_mask_path,
                    s2cloudless_doc=s2cloudless_doc,
                    gqa_doc=gqa_doc,
                    tesp_doc=tesp_doc,
                )