Example 1
File: query.py Project: sixy6e/gost
def query_filepath(path: Path, pattern: str) -> pandas.DataFrame:
    """
    Find datasets by globbing the filesystem.

    :param path:
        The full pathname to the directory containing the product.

    :param pattern:
        A string containing the pattern used to glob the filesystem,
        e.g. `*/*/2019/05/*/*.odc-metadata.yaml`
    """
    files = list(path.rglob(pattern))

    uuid = []
    yaml_pathname = []
    proc_info_pathname = []

    _LOG.info(
        "finding datasets",
        path=path,
        pattern=pattern,
    )

    for fname in files:
        _LOG.info("processing dataset", dataset=str(fname))

        doc = load_odc_metadata(fname)
        uuid.append(doc.parent_uuid)
        yaml_pathname.append(str(fname))

        # processing info document
        pathname = fname.parent.joinpath(doc.proc_info)
        proc_info_pathname.append(str(pathname))

    dataframe = pandas.DataFrame({
        "level1_uuid": uuid,
        "yaml_pathname": yaml_pathname,
        "proc_info_pathname": proc_info_pathname,
    })

    return dataframe
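A minimal usage sketch of `query_filepath`, assuming `Path` and the function are imported; the directory below is hypothetical and the pattern is the one from the docstring:

# Hypothetical example: collect all ODC metadata documents for May 2019.
df = query_filepath(
    Path("/data/product"),
    "*/*/2019/05/*/*.odc-metadata.yaml",
)
print(df[["level1_uuid", "yaml_pathname", "proc_info_pathname"]].head())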
Example 2
def comparison(outdir: Union[str, Path], proc_info: bool) -> None:
    """
    Test and Reference product intercomparison evaluation.
    """

    outdir = Path(outdir)
    if proc_info:
        log_fname = outdir.joinpath(DirectoryNames.LOGS.value,
                                    LogNames.PROC_INFO_INTERCOMPARISON.value)
    else:
        log_fname = outdir.joinpath(DirectoryNames.LOGS.value,
                                    LogNames.MEASUREMENT_INTERCOMPARISON.value)

    out_stream = MPIStreamIO(str(log_fname))
    structlog.configure(processors=DEFAULT_PROCESSORS,
                        logger_factory=MPILoggerFactory(out_stream))

    # processor info
    rank = COMM.Get_rank()
    n_processors = COMM.Get_size()

    results_fname = outdir.joinpath(DirectoryNames.RESULTS.value,
                                    FileNames.RESULTS.value)

    with h5py.File(str(results_fname), "r") as fid:
        dataframe = read_h5_table(fid, DatasetNames.QUERY.value)

    if rank == 0:
        index = dataframe.index.values.tolist()
        blocks = scatter(index, n_processors)

        # some basic attribute information
        doc: Union[Granule, None] = load_odc_metadata(
            Path(dataframe.iloc[0].yaml_pathname_reference))
        attrs: Dict[str, Any] = {
            "framing": doc.framing,
            "thematic": False,
            "proc-info": False,
        }
    else:
        blocks = None
        doc = None
        attrs = dict()

    COMM.Barrier()

    # equally partition the work across all processors
    indices = COMM.scatter(blocks, root=0)

    if proc_info:
        attrs["proc-info"] = True
        if rank == 0:
            _LOG.info("procssing proc-info documents")

        gqa_dataframe, ancillary_dataframe = _process_proc_info(
            dataframe.iloc[indices], rank)

        if rank == 0:
            _LOG.info("saving gqa dataframe results to tables")

            if not results_fname.parent.exists():
                results_fname.parent.mkdir(parents=True)

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(DatasetGroups.INTERCOMPARISON.value,
                                     DatasetNames.GQA_RESULTS.value)

                write_dataframe(gqa_dataframe,
                                str(dataset_name),
                                fid,
                                attrs=attrs)

            _LOG.info("saving ancillary dataframe results to tables")

            if not results_fname.parent.exists():
                results_fname.parent.mkdir(parents=True)

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(
                    DatasetGroups.INTERCOMPARISON.value,
                    DatasetNames.ANCILLARY_RESULTS.value,
                )

                write_dataframe(ancillary_dataframe,
                                str(dataset_name),
                                fid,
                                attrs=attrs)

            _LOG.info("saving software versions dataframe to tables")

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(DatasetNames.SOFTWARE_VERSIONS.value)

                software_attrs = {
                    "description": "ARD Pipeline software versions"
                }
                software_df = compare_proc_info.compare_software(dataframe)
                write_dataframe(software_df,
                                str(dataset_name),
                                fid,
                                attrs=software_attrs)

    else:
        if rank == 0:
            _LOG.info("processing odc-metadata documents")
        results = _process_odc_doc(dataframe.iloc[indices], rank)

        if rank == 0:
            # save each table
            _LOG.info("saving dataframes to tables")
            with h5py.File(str(results_fname), "a") as fid:

                attrs["thematic"] = False
                write_dataframe(
                    results[0],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.GENERAL_RESULTS.value,
                        )),
                    fid,
                    attrs=attrs,
                )

                attrs["thematic"] = True
                write_dataframe(
                    results[1],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.FMASK_RESULTS.value,
                        )),
                    fid,
                    attrs=attrs,
                )

                write_dataframe(
                    results[2],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.CONTIGUITY_RESULTS.value,
                        )),
                    fid,
                    attrs=attrs,
                )

                write_dataframe(
                    results[3],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.SHADOW_RESULTS.value,
                        )),
                    fid,
                    attrs=attrs,
                )

    if rank == 0:
        workflow = "proc-info field" if proc_info else "product measurement"
        msg = f"{workflow} comparison processing finished"
        _LOG.info(msg)
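The `scatter` helper used above is assumed to partition the work indices into one block per processor before `COMM.scatter` distributes them; a minimal sketch of such a helper (the actual implementation in gost may differ):

def scatter(iterable, n):
    """Partition an iterable into n roughly equal-sized blocks (sketch)."""
    items = list(iterable)
    quotient, remainder = divmod(len(items), n)
    blocks = []
    start = 0
    for i in range(n):
        # the first `remainder` blocks carry one extra item
        stop = start + quotient + (1 if i < remainder else 0)
        blocks.append(items[start:stop])
        start = stop
    return blocks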
Example 3
def test_load_odc(doc_path: Path):
    """Check that no exception is raised when loading the yaml document."""
    try:
        load_odc_metadata(doc_path)
    except Exception as exc:
        assert False, f"'load_odc_metadata' raised an exception: {exc}"
Example 4
import math
from typing import Dict, List
import pytest

from gost.odc_documents import load_odc_metadata, load_proc_info
from gost.data_model import Granule, GranuleProcInfo
from . import LS5_ODC_DOC_PATH, LS7_ODC_DOC_PATH, LS8_ODC_DOC_PATH
from . import LS5_ODC_PROC_PATH, LS7_ODC_PROC_PATH, LS8_ODC_PROC_PATH


NaN = math.nan
LS5_GRN = load_odc_metadata(LS5_ODC_DOC_PATH)
LS7_GRN = load_odc_metadata(LS7_ODC_DOC_PATH)
LS8_GRN = load_odc_metadata(LS8_ODC_DOC_PATH)
LS5_PROC = load_proc_info(LS5_ODC_PROC_PATH)
LS7_PROC = load_proc_info(LS7_ODC_PROC_PATH)
LS8_PROC = load_proc_info(LS8_ODC_PROC_PATH)


def _check_nan(a, b):
    if not math.isnan(a):
        result = a == b
    else:
        result = math.isnan(a) == math.isnan(b)

    return result
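`_check_nan` treats two NaNs as equal, which the plain `==` operator does not; for example:

assert _check_nan(NaN, NaN)      # NaN == NaN alone would be False
assert _check_nan(1.0, 1.0)
assert not _check_nan(1.0, NaN)  # a number never equals NaN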


@pytest.mark.parametrize(
    "granule, expected_result", [(LS5_GRN, "WRS2"), (LS7_GRN, "WRS2"), (LS8_GRN, "WRS2")]
)
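# The decorated test function was truncated in the source snippet; an
# assumed body, consistent with the parameters above, would check the
# granule's framing attribute:
def test_framing(granule: Granule, expected_result: str):
    assert granule.framing == expected_result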
Example 5
def process_yamls(
        dataframe: pandas.DataFrame) -> Tuple[Dict[str, List[Any]], ...]:
    """Process dataframe containing records to process."""

    # initialise placeholders for the results
    general_records = GeneralRecords()
    fmask_records = FmaskRecords()
    contiguity_records = ContiguityRecords()
    shadow_records = TerrainShadowRecords()

    for i, row in dataframe.iterrows():
        _LOG.info(
            "processing document",
            yaml_doc_test=row.yaml_pathname_test,
            yaml_doc_reference=row.yaml_pathname_reference,
        )

        doc_test = load_odc_metadata(Path(row.yaml_pathname_test))
        doc_reference = load_odc_metadata(Path(row.yaml_pathname_reference))

        for measurement_name in doc_test.measurements:
            _LOG.info(
                "processing measurement",
                measurement=measurement_name,
            )

            test_measurement = doc_test.measurements[measurement_name]
            reference_measurement = doc_reference.measurements[
                measurement_name]

            if not reference_measurement.pathname().exists():
                _LOG.info(
                    "missing reference measurement",
                    measurement_reference=str(
                        reference_measurement.pathname()),
                    measurement_test=str(test_measurement.pathname()),
                )
                continue

            if not test_measurement.pathname().exists():
                _LOG.info(
                    "missing test measurement",
                    measurement_reference=str(
                        reference_measurement.pathname()),
                    measurement_test=str(test_measurement.pathname()),
                )
                continue

            # open the handler for the datasets
            test_measurement.open()
            reference_measurement.open()

            # size of full image in pixels (null and valid)
            size = numpy.prod(test_measurement.shape)

            # compute results
            if measurement_name in FMASK_MEASUREMENT_NAMES:
                # the idea here is to analyse the thematic data differently
                fmask_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                # thematic evaluation
                fmask_results = evaluate_themes(reference_measurement,
                                                test_measurement, FmaskThemes)
                for key in fmask_results:
                    value = fmask_results[key]
                    getattr(fmask_records, key).append(value)
            elif measurement_name in CONTIGUITY_MEASUREMENT_NAMES:
                # base records
                contiguity_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                # thematic evaluation
                contiguity_results = evaluate_themes(reference_measurement,
                                                     test_measurement,
                                                     ContiguityThemes)
                for key in contiguity_results:
                    value = contiguity_results[key]
                    getattr(contiguity_records, key).append(value)
            elif measurement_name in SHADOW_MEASUREMENT_NAMES:
                # base records
                shadow_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                # thematic evaluation
                shadow_results = evaluate_themes(reference_measurement,
                                                 test_measurement,
                                                 TerrainShadowThemes)
                for key in shadow_results:
                    value = shadow_results[key]
                    getattr(shadow_records, key).append(value)
            else:
                # null data evaluation
                null_info = evaluate_nulls(reference_measurement,
                                           test_measurement)
                general_records.percent_data_2_null.append(null_info[0])
                general_records.percent_null_2_data.append(null_info[1])

                diff = evaluate(reference_measurement, test_measurement)
                abs_diff = numpy.abs(diff)
                h = distribution(diff)
                h_abs = distribution(abs_diff)

                # store results
                general_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                if "nbar" in measurement_name or measurement_name in BAND_IDS:
                    # get difference as a percent reflectance (0->100)
                    general_records.min_residual.append(h["omin"] / 100)
                    general_records.max_residual.append(h["omax"] / 100)
                    general_records.max_absolute.append(h_abs["omax"] / 100)
                else:
                    general_records.min_residual.append(h["omin"])
                    general_records.max_residual.append(h["omax"])
                    general_records.max_absolute.append(h_abs["omax"])

                general_records.percent_different.append(
                    (diff != 0).sum() / diff.size * 100)

                # percentiles of the cumulative distribution
                hist = h_abs["histogram"]
                cdf = numpy.cumsum(hist / hist.sum())
                # indices where the cumulative distribution first
                # reaches 90% and 99%
                p1_idx = numpy.searchsorted(cdf, 0.9)
                p2_idx = numpy.searchsorted(cdf, 0.99)
                pct_90 = h_abs["loc"][p1_idx]
                pct_99 = h_abs["loc"][p2_idx]

                # moments
                mean = numpy.mean(diff)
                stddev = numpy.std(diff, ddof=1)
                skewness = stats.skew(diff)
                kurtosis = stats.kurtosis(diff)

                # percentiles from cumulative distribution
                if "nbar" in measurement_name or measurement_name in BAND_IDS:
                    # get difference as a percent reflectance (0->100)
                    general_records.percentile_90.append(pct_90 / 100)
                    general_records.percentile_99.append(pct_99 / 100)
                    general_records.mean_residual.append(mean / 100)
                    general_records.standard_deviation.append(stddev / 100)
                else:
                    general_records.percentile_90.append(pct_90)
                    general_records.percentile_99.append(pct_99)
                    general_records.mean_residual.append(mean)
                    general_records.standard_deviation.append(stddev)

                general_records.skewness.append(skewness)
                general_records.kurtosis.append(kurtosis)

            # close the handler for the datasets
            test_measurement.close()
            reference_measurement.close()

    results = (
        general_records.records(),
        fmask_records.records(),
        contiguity_records.records(),
        shadow_records.records(),
    )
    return results
Example 6
def process_yamls(
    dataframe: pandas.DataFrame,
) -> Tuple[Dict[str, List[Any]], Dict[str, List[Any]]]:
    """Compare gqa and ancillary fields."""

    doc = load_proc_info(Path(dataframe.iloc[0].proc_info_pathname_test))

    gqa_results: Dict[str, Any] = {
        key: [] for key in doc.geometric_quality.fields
    }
    ancillary_results: Dict[str, Any] = {
        key: [] for key in doc.ancillary.flatten()
    }

    gqa_results["reference_pathname"] = []
    gqa_results["test_pathname"] = []
    gqa_results["region_code"] = []
    gqa_results["granule_id"] = []

    ancillary_results["reference_pathname"] = []
    ancillary_results["test_pathname"] = []
    ancillary_results["region_code"] = []
    ancillary_results["granule_id"] = []

    for _, row in dataframe.iterrows():
        _LOG.info(
            "processing document",
            yaml_doc_test=row.proc_info_pathname_test,
            yaml_doc_reference=row.proc_info_pathname_reference,
        )

        doc_reference = load_odc_metadata(Path(row.yaml_pathname_reference))
        proc_info_test = load_proc_info(Path(row.proc_info_pathname_test))
        proc_info_reference = load_proc_info(
            Path(row.proc_info_pathname_reference))

        gqa_results["region_code"].append(doc_reference.region_code)
        gqa_results["granule_id"].append(doc_reference.granule_id)
        gqa_results["reference_pathname"].append(
            row.proc_info_pathname_reference)
        gqa_results["test_pathname"].append(row.proc_info_pathname_test)

        ancillary_results["region_code"].append(doc_reference.region_code)
        ancillary_results["granule_id"].append(doc_reference.granule_id)
        ancillary_results["reference_pathname"].append(
            row.proc_info_pathname_reference)
        ancillary_results["test_pathname"].append(row.proc_info_pathname_test)

        gqa_result = compare_gqa(proc_info_reference.geometric_quality,
                                 proc_info_test.geometric_quality)
        for key in gqa_result:
            gqa_results[key].append(gqa_result[key])

        ancillary_result = compare_ancillary(proc_info_reference.ancillary,
                                             proc_info_test.ancillary)

        for key in ancillary_result:
            ancillary_results[key].append(ancillary_result[key])

    return gqa_results, ancillary_results
Example 7
File: query.py Project: sixy6e/gost
def query_db(
    env: str,
    product_name: str,
    time: Optional[Tuple[str, str]] = None,
    lon: Optional[Tuple[str, str]] = None,
    lat: Optional[Tuple[str, str]] = None,
    additional_filters: Optional[Dict[str, Any]] = None,
) -> pandas.DataFrame:
    """
    Generic datacube query wrapper.

    :param env:
        Name of the database environment to query.

    :param product_name:
        Name of the product to query.

    :param time:
        The time (earliest, latest) extents (optional).

    :param lon:
        The longitude (left, right) extents (optional).

    :param lat:
        The latitude (top, bottom) extents (optional).

    :param additional_filters:
        A dictionary containing {field: value} pairs that datacube can
        use to further filter datasets (optional).
        e.g. {"region_code": "092084"}
    """

    dc = datacube.Datacube(env=env)

    _LOG.info(
        "finding datasets",
        env=env,
        product_name=product_name,
        time=time,
        lon=lon,
        lat=lat,
    )
    # guard against additional_filters=None before unpacking
    datasets = dc.find_datasets(product_name,
                                time=time,
                                lon=lon,
                                lat=lat,
                                **(additional_filters or {}))

    uuid = []
    yaml_pathname = []
    proc_info_pathname = []

    for dataset in datasets:
        _LOG.info("processing dataset", dataset=str(dataset.local_path))
        doc = load_odc_metadata(dataset.local_path)

        uuid.append(doc.parent_uuid)
        yaml_pathname.append(str(dataset.local_path))

        # processing info document; use a local name so the
        # proc_info_pathname list is not shadowed
        pathname = dataset.local_path.parent.joinpath(doc.proc_info)
        proc_info_pathname.append(str(pathname))

    dataframe = pandas.DataFrame({
        "level1_uuid": uuid,
        "yaml_pathname": yaml_pathname,
        "proc_info_pathname": proc_info_pathname,
    })

    return dataframe
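A minimal usage sketch of `query_db`; the environment name, product name, and time extents below are hypothetical, and the `region_code` value comes from the docstring:

# Hypothetical example: query one month of a product over a single region.
df = query_db(
    env="production",
    product_name="ga_ls8c_ard_3",
    time=("2019-05-01", "2019-05-31"),
    additional_filters={"region_code": "092084"},
)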