def query_filepath(path: Path, pattern: str) -> pandas.DataFrame:
    """
    Find datasets by globbing the filesystem.

    :param path:
        The full pathname to the directory containing the product.

    :param pattern:
        A string containing the pattern used to glob the filesystem.
        e.g. `*/*/2019/05/*/*.odc-metadata.yaml`
    """
    files = list(path.rglob(pattern))
    uuid = []
    yaml_pathname = []
    proc_info_pathname = []

    _LOG.info(
        "finding datasets",
        path=path,
        pattern=pattern,
    )

    for fname in files:
        _LOG.info("processing dataset", dataset=str(fname))

        doc = load_odc_metadata(fname)
        uuid.append(doc.parent_uuid)
        yaml_pathname.append(str(fname))

        # processing info document
        pathname = fname.parent.joinpath(doc.proc_info)
        proc_info_pathname.append(str(pathname))

    dataframe = pandas.DataFrame({
        "level1_uuid": uuid,
        "yaml_pathname": yaml_pathname,
        "proc_info_pathname": proc_info_pathname,
    })

    return dataframe
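# Example usage, a minimal sketch: the directory root shown here is
# hypothetical, the glob pattern is taken from the docstring above.
#
#     df = query_filepath(
#         Path("/data/ard-products"), "*/*/2019/05/*/*.odc-metadata.yaml"
#     )
#     print(df.head())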
def comparison(outdir: Union[str, Path], proc_info: bool) -> None:
    """
    Test and Reference product intercomparison evaluation.
    """
    outdir = Path(outdir)

    if proc_info:
        log_fname = outdir.joinpath(
            DirectoryNames.LOGS.value, LogNames.PROC_INFO_INTERCOMPARISON.value
        )
    else:
        log_fname = outdir.joinpath(
            DirectoryNames.LOGS.value, LogNames.MEASUREMENT_INTERCOMPARISON.value
        )

    out_stream = MPIStreamIO(str(log_fname))
    structlog.configure(
        processors=DEFAULT_PROCESSORS, logger_factory=MPILoggerFactory(out_stream)
    )

    # processor info
    rank = COMM.Get_rank()
    n_processors = COMM.Get_size()

    results_fname = outdir.joinpath(
        DirectoryNames.RESULTS.value, FileNames.RESULTS.value
    )

    with h5py.File(str(results_fname), "r") as fid:
        dataframe = read_h5_table(fid, DatasetNames.QUERY.value)

    if rank == 0:
        index = dataframe.index.values.tolist()
        blocks = scatter(index, n_processors)

        # some basic attribute information
        doc: Union[Granule, None] = load_odc_metadata(
            Path(dataframe.iloc[0].yaml_pathname_reference)
        )
        attrs: Dict[str, Any] = {
            "framing": doc.framing,
            "thematic": False,
            "proc-info": False,
        }
    else:
        blocks = None
        doc = None
        attrs = dict()

    COMM.Barrier()

    # equally partition the work across all processors
    indices = COMM.scatter(blocks, root=0)

    if proc_info:
        attrs["proc-info"] = True

        if rank == 0:
            _LOG.info("processing proc-info documents")

        gqa_dataframe, ancillary_dataframe = _process_proc_info(
            dataframe.iloc[indices], rank
        )

        if rank == 0:
            _LOG.info("saving gqa dataframe results to tables")

            if not results_fname.parent.exists():
                results_fname.parent.mkdir(parents=True)

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(
                    DatasetGroups.INTERCOMPARISON.value,
                    DatasetNames.GQA_RESULTS.value,
                )
                write_dataframe(gqa_dataframe, str(dataset_name), fid, attrs=attrs)

            _LOG.info("saving ancillary dataframe results to tables")

            if not results_fname.parent.exists():
                results_fname.parent.mkdir(parents=True)

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(
                    DatasetGroups.INTERCOMPARISON.value,
                    DatasetNames.ANCILLARY_RESULTS.value,
                )
                write_dataframe(
                    ancillary_dataframe, str(dataset_name), fid, attrs=attrs
                )

            _LOG.info("saving software versions dataframe to tables")

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(DatasetNames.SOFTWARE_VERSIONS.value)
                software_attrs = {
                    "description": "ARD Pipeline software versions"
                }
                software_df = compare_proc_info.compare_software(dataframe)
                write_dataframe(
                    software_df, str(dataset_name), fid, attrs=software_attrs
                )
    else:
        if rank == 0:
            _LOG.info("processing odc-metadata documents")

        results = _process_odc_doc(dataframe.iloc[indices], rank)

        if rank == 0:
            # save each table
            _LOG.info("saving dataframes to tables")

            with h5py.File(str(results_fname), "a") as fid:
                attrs["thematic"] = False
                write_dataframe(
                    results[0],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.GENERAL_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

                attrs["thematic"] = True
                write_dataframe(
                    results[1],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.FMASK_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

                write_dataframe(
                    results[2],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.CONTIGUITY_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

                write_dataframe(
                    results[3],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.SHADOW_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

    if rank == 0:
        workflow = "proc-info field" if proc_info else "product measurement"
        msg = f"{workflow} comparison processing finished"
        _LOG.info(msg)
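# The `scatter` helper used above is assumed to split the dataset index into
# `n_processors` near-equal blocks, one per MPI rank; COMM.scatter then hands
# each rank its own block to work on. A minimal sketch of that assumed
# behaviour (not the project's actual implementation):
#
#     def scatter(iterable, n):
#         """Split `iterable` into `n` near-equal blocks (assumed behaviour)."""
#         return [iterable[i::n] for i in range(n)]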
def test_load_odc(doc_path: Path):
    """Check that no exception is raised when loading the yaml document."""
    try:
        load_odc_metadata(doc_path)
    except Exception as exc:
        assert False, f"'load_odc_metadata' raised an exception: {exc}"
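# A hedged sketch of how this check could be parametrised over the fixture
# paths defined in the test package (the LS*_ODC_DOC_PATH names are the ones
# imported in the test module below; the decorator shown here is illustrative):
#
#     @pytest.mark.parametrize(
#         "doc_path", [LS5_ODC_DOC_PATH, LS7_ODC_DOC_PATH, LS8_ODC_DOC_PATH]
#     )
#     def test_load_odc(doc_path: Path):
#         load_odc_metadata(doc_path)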
import math
from typing import Dict, List

import pytest

from gost.odc_documents import load_odc_metadata, load_proc_info
from gost.data_model import Granule, GranuleProcInfo

from . import LS5_ODC_DOC_PATH, LS7_ODC_DOC_PATH, LS8_ODC_DOC_PATH
from . import LS5_ODC_PROC_PATH, LS7_ODC_PROC_PATH, LS8_ODC_PROC_PATH

NaN = math.nan

LS5_GRN = load_odc_metadata(LS5_ODC_DOC_PATH)
LS7_GRN = load_odc_metadata(LS7_ODC_DOC_PATH)
LS8_GRN = load_odc_metadata(LS8_ODC_DOC_PATH)
LS5_PROC = load_proc_info(LS5_ODC_PROC_PATH)
LS7_PROC = load_proc_info(LS7_ODC_PROC_PATH)
LS8_PROC = load_proc_info(LS8_ODC_PROC_PATH)


def _check_nan(a, b):
    if not math.isnan(a):
        result = a == b
    else:
        result = math.isnan(a) == math.isnan(b)
    return result


@pytest.mark.parametrize(
    "granule, expected_result",
    [(LS5_GRN, "WRS2"), (LS7_GRN, "WRS2"), (LS8_GRN, "WRS2")],
)
def process_yamls(
    dataframe: pandas.DataFrame,
) -> Tuple[Dict[str, List[Any]], ...]:
    """Process dataframe containing records to process."""
    # initialise placeholders for the results
    general_records = GeneralRecords()
    fmask_records = FmaskRecords()
    contiguity_records = ContiguityRecords()
    shadow_records = TerrainShadowRecords()

    for i, row in dataframe.iterrows():
        _LOG.info(
            "processing document",
            yaml_doc_test=row.yaml_pathname_test,
            yaml_doc_reference=row.yaml_pathname_reference,
        )

        doc_test = load_odc_metadata(Path(row.yaml_pathname_test))
        doc_reference = load_odc_metadata(Path(row.yaml_pathname_reference))

        for measurement_name in doc_test.measurements:
            _LOG.info(
                "processing measurement",
                measurement=measurement_name,
            )

            test_measurement = doc_test.measurements[measurement_name]
            reference_measurement = doc_reference.measurements[measurement_name]

            if not reference_measurement.pathname().exists():
                _LOG.info(
                    "missing reference measurement",
                    measurement_reference=str(reference_measurement.pathname()),
                    measurement_test=str(test_measurement.pathname()),
                )
                continue

            if not test_measurement.pathname().exists():
                _LOG.info(
                    "missing test measurement",
                    measurement_reference=str(reference_measurement.pathname()),
                    measurement_test=str(test_measurement.pathname()),
                )
                continue

            # open the handler for the datasets
            test_measurement.open()
            reference_measurement.open()

            # size of full image in pixels (null and valid)
            size = numpy.prod(test_measurement.shape)

            # compute results
            if measurement_name in FMASK_MEASUREMENT_NAMES:
                # the idea here is to analyse the thematic data differently
                fmask_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                # thematic evaluation
                fmask_results = evaluate_themes(
                    reference_measurement, test_measurement, FmaskThemes
                )
                for key in fmask_results:
                    value = fmask_results[key]
                    getattr(fmask_records, key).append(value)
            elif measurement_name in CONTIGUITY_MEASUREMENT_NAMES:
                # base records
                contiguity_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                # thematic evaluation
                contiguity_results = evaluate_themes(
                    reference_measurement, test_measurement, ContiguityThemes
                )
                for key in contiguity_results:
                    value = contiguity_results[key]
                    getattr(contiguity_records, key).append(value)
            elif measurement_name in SHADOW_MEASUREMENT_NAMES:
                # base records
                shadow_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                # thematic evaluation
                shadow_results = evaluate_themes(
                    reference_measurement, test_measurement, TerrainShadowThemes
                )
                for key in shadow_results:
                    value = shadow_results[key]
                    getattr(shadow_records, key).append(value)
            else:
                # null data evaluation
                null_info = evaluate_nulls(reference_measurement, test_measurement)
                general_records.percent_data_2_null.append(null_info[0])
                general_records.percent_null_2_data.append(null_info[1])

                diff = evaluate(reference_measurement, test_measurement)
                abs_diff = numpy.abs(diff)
                h = distribution(diff)
                h_abs = distribution(abs_diff)

                # store results
                general_records.add_base_info(
                    doc_reference,
                    reference_measurement.pathname(),
                    test_measurement.pathname(),
                    size,
                    measurement_name,
                )

                if "nbar" in measurement_name or measurement_name in BAND_IDS:
                    # get difference as a percent reflectance (0->100)
                    general_records.min_residual.append(h["omin"] / 100)
                    general_records.max_residual.append(h["omax"] / 100)
                    general_records.max_absolute.append(h_abs["omax"] / 100)
                else:
                    general_records.min_residual.append(h["omin"])
                    general_records.max_residual.append(h["omax"])
                    general_records.max_absolute.append(h_abs["omax"])

                general_records.percent_different.append(
                    (diff != 0).sum() / diff.size * 100
                )

                # percentiles of the cumulative distribution
                hist = h_abs["histogram"]
                cdf = numpy.cumsum(hist / hist.sum())
                p1_idx = numpy.searchsorted(cdf, 0.9)
                p2_idx = numpy.searchsorted(cdf, 0.99)
                pct_90 = h_abs["loc"][p1_idx]
                pct_99 = h_abs["loc"][p2_idx]

                # moments
                mean = numpy.mean(diff)
                stddev = numpy.std(diff, ddof=1)
                skewness = stats.skew(diff)
                kurtosis = stats.kurtosis(diff)

                # percentiles from cumulative distribution
                if "nbar" in measurement_name or measurement_name in BAND_IDS:
                    # get difference as a percent reflectance (0->100)
                    general_records.percentile_90.append(pct_90 / 100)
                    general_records.percentile_99.append(pct_99 / 100)
                    general_records.mean_residual.append(mean / 100)
                    general_records.standard_deviation.append(stddev / 100)
                else:
                    general_records.percentile_90.append(pct_90)
                    general_records.percentile_99.append(pct_99)
                    general_records.mean_residual.append(mean)
                    general_records.standard_deviation.append(stddev)

                general_records.skewness.append(skewness)
                general_records.kurtosis.append(kurtosis)

            # close the handler for the datasets
            test_measurement.close()
            reference_measurement.close()

    results = (
        general_records.records(),
        fmask_records.records(),
        contiguity_records.records(),
        shadow_records.records(),
    )

    return results
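# Worked sketch of the percentile lookup used above (values are illustrative
# only): the histogram counts over the bin locations `loc` are normalised into
# a cumulative distribution, and searchsorted returns the first bin at which
# the CDF reaches the requested fraction.
#
#     hist = numpy.array([50, 30, 15, 4, 1])
#     loc = numpy.array([0, 1, 2, 3, 4])
#     cdf = numpy.cumsum(hist / hist.sum())   # [0.5, 0.8, 0.95, 0.99, 1.0]
#     loc[numpy.searchsorted(cdf, 0.9)]       # -> 2 (90th percentile bin)
#     loc[numpy.searchsorted(cdf, 0.99)]      # -> 3 (99th percentile bin)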
def process_yamls(
    dataframe: pandas.DataFrame,
) -> Tuple[Dict[str, List[Any]], Dict[str, List[Any]]]:
    """Compare gqa and ancillary fields."""
    doc = load_proc_info(Path(dataframe.iloc[0].proc_info_pathname_test))

    gqa_results: Dict[str, Any] = {
        key: [] for key in doc.geometric_quality.fields
    }
    ancillary_results: Dict[str, Any] = {
        key: [] for key in doc.ancillary.flatten()
    }

    gqa_results["reference_pathname"] = []
    gqa_results["test_pathname"] = []
    gqa_results["region_code"] = []
    gqa_results["granule_id"] = []

    ancillary_results["reference_pathname"] = []
    ancillary_results["test_pathname"] = []
    ancillary_results["region_code"] = []
    ancillary_results["granule_id"] = []

    for _, row in dataframe.iterrows():
        _LOG.info(
            "processing document",
            yaml_doc_test=row.proc_info_pathname_test,
            yaml_doc_reference=row.proc_info_pathname_reference,
        )

        doc_reference = load_odc_metadata(Path(row.yaml_pathname_reference))
        proc_info_test = load_proc_info(Path(row.proc_info_pathname_test))
        proc_info_reference = load_proc_info(
            Path(row.proc_info_pathname_reference)
        )

        gqa_results["region_code"].append(doc_reference.region_code)
        gqa_results["granule_id"].append(doc_reference.granule_id)
        gqa_results["reference_pathname"].append(row.proc_info_pathname_reference)
        gqa_results["test_pathname"].append(row.proc_info_pathname_test)

        ancillary_results["region_code"].append(doc_reference.region_code)
        ancillary_results["granule_id"].append(doc_reference.granule_id)
        ancillary_results["reference_pathname"].append(
            row.proc_info_pathname_reference
        )
        ancillary_results["test_pathname"].append(row.proc_info_pathname_test)

        gqa_result = compare_gqa(
            proc_info_reference.geometric_quality, proc_info_test.geometric_quality
        )
        for key in gqa_result:
            gqa_results[key].append(gqa_result[key])

        ancillary_result = compare_ancillary(
            proc_info_reference.ancillary, proc_info_test.ancillary
        )
        for key in ancillary_result:
            ancillary_results[key].append(ancillary_result[key])

    return gqa_results, ancillary_results
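# Minimal usage sketch: the function returns two column-oriented dicts of
# lists that convert directly to DataFrames. `records_df` is assumed to hold
# the query records with the *_test / *_reference pathname columns referenced
# above.
#
#     gqa_results, ancillary_results = process_yamls(records_df)
#     gqa_df = pandas.DataFrame(gqa_results)
#     ancillary_df = pandas.DataFrame(ancillary_results)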
def query_db(
    env: str,
    product_name: str,
    time: Optional[Tuple[str, str]] = None,
    lon: Optional[Tuple[str, str]] = None,
    lat: Optional[Tuple[str, str]] = None,
    additional_filters: Optional[Dict[str, Any]] = None,
) -> pandas.DataFrame:
    """
    Generic datacube query wrapper.

    :param env:
        Name of the database environment to query.

    :param product_name:
        Name of the product to query.

    :param time:
        The time (earliest, latest) extents (optional).

    :param lon:
        The longitude (left, right) extents (optional).

    :param lat:
        The latitude (top, bottom) extents (optional).

    :param additional_filters:
        A dictionary containing {field: value} pairs that datacube can use
        to further filter datasets (optional).
        e.g. {"region_code": "092084"}
    """
    dc = datacube.Datacube(env=env)

    _LOG.info(
        "finding datasets",
        env=env,
        product_name=product_name,
        time=time,
        lon=lon,
        lat=lat,
    )

    # guard against additional_filters being None (it is optional)
    datasets = dc.find_datasets(
        product_name, time=time, lon=lon, lat=lat, **(additional_filters or {})
    )

    uuid = []
    yaml_pathname = []
    proc_info_pathname = []

    for dataset in datasets:
        _LOG.info("processing dataset", dataset=str(dataset.local_path))

        doc = load_odc_metadata(dataset.local_path)
        uuid.append(doc.parent_uuid)
        yaml_pathname.append(str(dataset.local_path))

        # processing info document
        pathname = dataset.local_path.parent.joinpath(doc.proc_info)
        proc_info_pathname.append(str(pathname))

    dataframe = pandas.DataFrame({
        "level1_uuid": uuid,
        "yaml_pathname": yaml_pathname,
        "proc_info_pathname": proc_info_pathname,
    })

    return dataframe
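# Example usage, a minimal sketch: the environment and product names are
# hypothetical; the region_code value is the one given in the docstring above.
#
#     df = query_db(
#         env="dev-db",
#         product_name="ga_ls8c_ard_3",
#         time=("2019-05-01", "2019-05-31"),
#         additional_filters={"region_code": "092084"},
#     )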