def test_h5_metadata_error(tmpdir):
    """
    Tests exception is raised if unable to retrieve metadata
    document for h5_metadata collection
    """
    with h5py.File(tmpdir.join("test.h5"), "w", driver="core") as f:
        with pytest.raises(MetadataError):
            current_h5_metadata(f)
def test_h5_metadata_single(tmpdir):
    """
    Tests retrieving a metadata document from a simple h5 collection
    """
    test_doc = {"id": "297933b9-d728-464b-be25-c405b1efa49b"}
    with h5py.File(tmpdir.join("test.h5"), "w", driver="core") as f:
        dataset_path = "/{}/{}".format(
            DatasetName.METADATA.value, DatasetName.CURRENT_METADATA.value
        )
        f.create_dataset(dataset_path, (1,), dtype=VLEN_STRING)
        f[dataset_path][()] = yaml.dump(test_doc, default_flow_style=False)

        retrieve = current_h5_metadata(f)
        assert retrieve == test_doc
def test_h5_metadata_collection(tmpdir):
    """
    Tests retrieving a metadata document from a multidataset h5 collection
    """
    h5_dataset = '/1990/JUL/0600'
    test_doc = {'id': '297933b9-d728-464b-be25-c405b1efa49v'}
    with h5py.File(tmpdir.join('test.h5'), 'w', driver='core') as f:
        dataset_path = '/{}{}/{}'.format(
            DatasetName.METADATA.value,
            h5_dataset,
            DatasetName.CURRENT_METADATA.value
        )
        f.create_dataset(dataset_path, (1,), dtype=VLEN_STRING)
        f[dataset_path][()] = yaml.dump(test_doc, default_flow_style=False)

        retrieve = current_h5_metadata(f, dataset_path=h5_dataset)
        assert retrieve == test_doc
def get_pixel(filename, dataset_name, lonlat):
    """
    Return a pixel from the dataset `dataset_name` within `filename`
    at the longitude and latitude given by the tuple `lonlat`.
    """
    with h5py.File(filename, 'r') as fid:
        ds = fid[dataset_name]
        geobox = GriddedGeoBox.from_h5_dataset(ds)

        # convert the lon/lat coordinate to image (x, y) indices
        x, y = [int(v) for v in ~geobox.transform * lonlat]

        if ds.ndim == 3:
            data = ds[:, y, x]
        elif ds.ndim == 2:
            data = ds[y, x]
        else:
            raise NotImplementedError("Only 2 and 3 dimensional data is supported")
        # TODO: cater for the 4D data we pulled from ECMWF
        # for 4D [day, level, y, x] we need another input param `day`
        # data = ds[day, :, y, x]

        # ancillary metadata tracking (uuid of the source collection)
        metadata = current_h5_metadata(fid, dataset_path=dataset_name)

    return data, metadata['id']
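# Illustrative usage sketch only (not part of the module API): how `get_pixel`
# might be called. The filename, dataset path and coordinates below are
# hypothetical placeholders.
def _example_get_pixel_usage():
    fname = "ancillary.h5"          # hypothetical HDF5 ancillary collection
    dataset_name = "/ozone/jan"     # hypothetical 2D dataset within the file
    lonlat = (149.1, -35.3)         # longitude, latitude

    # returns the pixel value(s) and the uuid of the source metadata document
    value, uuid = get_pixel(fname, dataset_name, lonlat)
    return value, uuid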
def get_dsm(
    acquisition,
    pathname,
    buffer_distance=8000,
    out_group=None,
    compression=H5CompressionFilter.LZF,
    filter_opts=None,
):
    """
    Given an acquisition and a national Digital Surface Model,
    extract a subset from the DSM based on the acquisition extents
    plus an x & y margin. The subset is then smoothed with a 3x3
    gaussian filter. A square margin is applied to the extents.

    :param acquisition:
        An instance of an acquisition object.

    :param pathname:
        A string pathname of the DSM with a ':' to separate the
        filename from the HDF5 dataset name.

    :param buffer_distance:
        A number representing the desired distance (in the same
        units as the acquisition) in which to calculate the extra
        number of pixels required to buffer an image.
        Default is 8000.

    :param out_group:
        If set to None (default) then the results will be returned
        as an in-memory hdf5 file, i.e. the `core` driver. Otherwise,
        a writeable HDF5 `Group` object.
        The dataset name will be as follows:

        * DatasetName.DSM_SMOOTHED

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        An opened `h5py.File` object, that is either in-memory using the
        `core` driver, or on disk.
    """
    # Use the 1st acquisition to set up the geobox
    geobox = acquisition.gridded_geo_box()
    shape = geobox.get_shape_yx()

    # buffered image extents/margins
    margins = pixel_buffer(acquisition, buffer_distance)

    # Get the dimensions and geobox of the new image
    dem_cols = shape[1] + margins.left + margins.right
    dem_rows = shape[0] + margins.top + margins.bottom
    dem_shape = (dem_rows, dem_cols)
    dem_origin = geobox.convert_coordinates((0 - margins.left, 0 - margins.top))
    dem_geobox = GriddedGeoBox(
        dem_shape,
        origin=dem_origin,
        pixelsize=geobox.pixelsize,
        crs=geobox.crs.ExportToWkt(),
    )

    # split the DSM filename, dataset name, and load
    fname, dname = pathname.split(":")
    with h5py.File(fname, "r") as dsm_fid:
        dsm_ds = dsm_fid[dname]
        dsm_geobox = GriddedGeoBox.from_dataset(dsm_ds)

        # calculate full border extents into CRS of DSM
        extents = dem_geobox.project_extents(dsm_geobox.crs)
        ul_xy = (extents[0], extents[3])
        ur_xy = (extents[2], extents[3])
        lr_xy = (extents[2], extents[1])
        ll_xy = (extents[0], extents[1])

        # load the subset and corresponding geobox
        subs, subs_geobox = read_subset(
            dsm_ds, ul_xy, ur_xy, lr_xy, ll_xy, edge_buffer=1
        )

        # ancillary metadata tracking
        metadata = current_h5_metadata(dsm_fid, dataset_path=dname)

        # Retrieve the DSM data
        dsm_data = reproject_array_to_array(
            subs, subs_geobox, dem_geobox, resampling=Resampling.bilinear
        )

    # free memory
    subs = None

    # Output the reprojected result
    # Initialise the output files
    if out_group is None:
        fid = h5py.File("dsm-subset.h5", "w", driver="core", backing_store=False)
    else:
        fid = out_group

    if filter_opts is None:
        filter_opts = {}
    else:
        filter_opts = filter_opts.copy()

    if acquisition.tile_size[0] == 1:
        filter_opts["chunks"] = (1, dem_cols)
    else:
        # TODO: rework the tiling regime for larger dsm
        # for non single row based tiles, we won't have ideal
        # matching reads for tiled processing between the acquisition
        # and the DEM
        filter_opts["chunks"] = acquisition.tile_size

    kwargs = compression.config(**filter_opts).dataset_compression_kwargs()

    group = fid.create_group(GroupName.ELEVATION_GROUP.value)
    param_grp = group.create_group("PARAMETERS")
    param_grp.attrs["left_buffer"] = margins.left
    param_grp.attrs["right_buffer"] = margins.right
    param_grp.attrs["top_buffer"] = margins.top
    param_grp.attrs["bottom_buffer"] = margins.bottom

    # dataset attributes
    attrs = {
        "crs_wkt": geobox.crs.ExportToWkt(),
        "geotransform": dem_geobox.transform.to_gdal(),
    }

    # Smooth the DSM
    dsm_data = filter_dsm(dsm_data)
    dname = DatasetName.DSM_SMOOTHED.value
    out_sm_dset = group.create_dataset(dname, data=dsm_data, **kwargs)
    desc = "A subset of a Digital Surface Model smoothed with a gaussian kernel."
    attrs["description"] = desc
    attrs["id"] = numpy.array([metadata["id"]], VLEN_STRING)
    attach_image_attributes(out_sm_dset, attrs)

    if out_group is None:
        return fid
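# Illustrative usage sketch only (not part of the module API): how `get_dsm`
# might be driven. The acquisition object and the "file.h5:/dataset" pathname
# below are hypothetical placeholders.
def _example_get_dsm_usage(acquisition):
    # hypothetical national DSM collection; ':' separates filename from dataset
    pathname = "national_dsm.h5:/SRTM/dsm-1sec"

    # out_group=None returns an in-memory `core` driver file
    fid = get_dsm(acquisition, pathname, buffer_distance=8000)
    group = fid[GroupName.ELEVATION_GROUP.value]
    smoothed = group[DatasetName.DSM_SMOOTHED.value][:]
    return smoothed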
def load_brdf_tile(src_poly, src_crs, fid, dataset_name, fid_mask):
    """
    Summarize BRDF data from a single tile.
    """
    ds = fid[dataset_name]

    def segmentize_src_poly(length_scale):
        src_poly_geom = ogr.CreateGeometryFromWkt(src_poly.wkt)
        src_poly_geom.Segmentize(length_scale)
        return wkt.loads(src_poly_geom.ExportToWkt())

    ds_height, ds_width = ds.shape

    dst_geotransform = rasterio.transform.Affine.from_gdal(*ds.attrs['geotransform'])
    dst_crs = CRS.from_wkt(ds.attrs['crs_wkt'])

    # assumes the length scales are the same (m)
    dst_poly = ops.transform(
        coord_transformer(src_crs, dst_crs),
        segmentize_src_poly(np.sqrt(np.abs(dst_geotransform.determinant)))
    )

    bound_poly = ops.transform(
        lambda x, y: dst_geotransform * (x, y),
        box(0., 0., ds_width, ds_height, ccw=False)
    )
    if not bound_poly.intersects(dst_poly):
        return BrdfTileSummary.empty()

    ocean_poly = ops.transform(
        lambda x, y: fid_mask.transform * (x, y),
        box(0., 0., fid_mask.width, fid_mask.height)
    )
    if not ocean_poly.intersects(dst_poly):
        return BrdfTileSummary.empty()

    # read the ocean mask file for the corresponding tile window
    # land=1, ocean=0
    bound_poly_coords = list(bound_poly.exterior.coords)[:4]
    ocean_mask, _ = read_subset(fid_mask, *bound_poly_coords)
    ocean_mask = ocean_mask.astype(bool)

    # inside=1, outside=0
    roi_mask = rasterize(
        [(dst_poly, 1)],
        fill=0,
        out_shape=(ds_height, ds_width),
        transform=dst_geotransform
    )
    roi_mask = roi_mask.astype(bool)

    # the ocean mask and the ROI mask should have the same shape
    if ocean_mask.shape != roi_mask.shape:
        raise ValueError('ocean mask and ROI mask do not have the same shape')
    if roi_mask.shape != ds.shape:
        raise ValueError('BRDF dataset and ROI mask do not have the same shape')

    roi_mask = roi_mask & ocean_mask

    def layer_sum(param):
        layer = ds[param][:, :]
        common_mask = roi_mask & (layer != ds.attrs['_FillValue'])
        layer = layer.astype('float32')
        layer[~common_mask] = np.nan
        layer = ds.attrs['scale_factor'] * (layer - ds.attrs['add_offset'])
        return {'sum': np.nansum(layer), 'count': np.sum(common_mask)}

    return BrdfTileSummary(
        {param: layer_sum(param.value) for param in BrdfModelParameters},
        [current_h5_metadata(fid)['id']]
    )
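# Illustrative usage sketch only (not part of the module API): one way
# `load_brdf_tile` could be driven for a single BRDF tile. The file names,
# dataset path and region of interest below are hypothetical placeholders.
def _example_load_brdf_tile_usage():
    src_crs = CRS.from_epsg(4326)               # ROI supplied in lon/lat
    src_poly = box(148.0, -36.0, 150.0, -34.0)  # hypothetical region of interest

    with h5py.File("brdf_tile.h5", "r") as fid, \
            rasterio.open("ocean_mask.tif") as fid_mask:
        summary = load_brdf_tile(
            src_poly, src_crs, fid, "/BRDF-Parameters-Band1", fid_mask
        )
    return summary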
def get_aerosol_data(acquisition, aerosol_dict):
    """
    Extract the aerosol value for an acquisition.
    Version 2 retrieves the data from an HDF5 file, and provides
    more control over how the data is selected geometrically, as well
    as better control over timedeltas.
    """
    dt = acquisition.acquisition_datetime
    geobox = acquisition.gridded_geo_box()
    roi_poly = Polygon(
        [geobox.ul_lonlat, geobox.ur_lonlat, geobox.lr_lonlat, geobox.ll_lonlat]
    )

    descr = ["AATSR_PIX", "AATSR_CMP_YEAR_MONTH", "AATSR_CMP_MONTH"]
    names = [
        "ATSR_LF_%Y%m",
        "aot_mean_%b_%Y_All_Aerosols",
        "aot_mean_%b_All_Aerosols",
    ]
    exts = ["/pix", "/cmp", "/cmp"]
    pathnames = [ppjoin(ext, dt.strftime(n)) for ext, n in zip(exts, names)]

    # temporary until we sort out a better default mechanism
    # how do we want to support default values, whilst still supporting provenance
    if "user" in aerosol_dict:
        tier = AerosolTier.USER
        metadata = {"id": numpy.array([], VLEN_STRING), "tier": tier.name}

        return aerosol_dict["user"], metadata

    aerosol_fname = aerosol_dict["pathname"]

    data = None
    delta_tolerance = datetime.timedelta(days=0.5)
    with h5py.File(aerosol_fname, "r") as fid:
        for pathname, description in zip(pathnames, descr):
            tier = AerosolTier[description]
            if pathname in fid:
                df = read_h5_table(fid, pathname)
                aerosol_poly = wkt.loads(fid[pathname].attrs["extents"])

                if aerosol_poly.intersects(roi_poly):
                    if description == "AATSR_PIX":
                        abs_diff = (df["timestamp"] - dt).abs()
                        df = df[abs_diff < delta_tolerance]
                        df.reset_index(inplace=True, drop=True)

                    if df.shape[0] == 0:
                        continue

                    intersection = aerosol_poly.intersection(roi_poly)
                    pts = GeoSeries(
                        [Point(x, y) for x, y in zip(df["lon"], df["lat"])]
                    )
                    idx = pts.within(intersection)
                    data = df[idx]["aerosol"].mean()

                    if numpy.isfinite(data):
                        # ancillary metadata tracking
                        md = current_h5_metadata(fid, dataset_path=pathname)

                        metadata = {
                            "id": numpy.array([md["id"]], VLEN_STRING),
                            "tier": tier.name,
                        }

                        return data, metadata

    # default aerosol value
    data = 0.06
    metadata = {
        "id": numpy.array([], VLEN_STRING),
        "tier": AerosolTier.FALLBACK_DEFAULT.name,
    }

    return data, metadata
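# Illustrative usage sketch only (not part of the module API): the two ways
# `get_aerosol_data` is typically parameterised. The file name below is a
# hypothetical placeholder.
def _example_get_aerosol_usage(acquisition):
    # a user-supplied override bypasses the HDF5 collection entirely
    value, md = get_aerosol_data(acquisition, {"user": 0.05})

    # otherwise point at an aerosol HDF5 collection; the returned metadata
    # carries the source document id and the tier that was matched
    value, md = get_aerosol_data(acquisition, {"pathname": "aerosol.h5"})
    return value, md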