def get_choice(self, x):
    self.show_menu = False
    self.s.close()
    self.m.remove_layer(self.p)
    self.p = None
    choice = x['new']
    if choice == 'Show flow':
        self.show_flow = True
    elif choice == 'Hide flow':
        self.show_flow = False
        self.m.remove_layer(self.io)
        self.io = None
    elif choice == 'Delineate watershed':
        self.show_flow = False
        self.m.remove_layer(self.io)
        self.io = None
        self.label.value = 'Delineating watershed, please wait...'
        delineate(*self.coord)
        self.label.value = 'Watershed delineated'
        ds_mask = xr.open_zarr('tmp/ds_mask/0').compute()
        mask = ds_mask['mask'].values
        polygon = get_polygon(mask, ds_mask.lat.values[0] + 0.5 / 1200, ds_mask.lon.values[0] - 0.5 / 1200)
        self.m.add_layer(polygon)
        self.label.value = 'Watershed displayed'
    elif choice == 'Set marker':
        if self.marker is not None:
            self.m.remove_layer(self.marker)
        self.marker = Marker(location=self.coord)
        self.m.add_layer(self.marker)
    elif choice == 'Close':
        pass

def get_gpm_precipitation(da_mask_gpm):
    ds_gpm = xr.open_zarr(gcsfs.GCSMap('pangeo-data/gpm_imerg_early'))
    da_gpm = ds_gpm['precipitationCal']
    pix_deg_gpm = 0.1
    da_area_gpm = pixel_area(pix_deg_gpm)
    da_mask_gpm = da_area_gpm.reindex_like(da_mask_gpm, method='nearest', tolerance=0.001) * da_mask_gpm
    da_mask_gpm = da_mask_gpm / da_mask_gpm.sum(['lat', 'lon'])
    p_gpm = (da_gpm.reindex_like(da_mask_gpm, method='nearest', tolerance=0.01) * da_mask_gpm).sum(['lat', 'lon'])
    p_gpm = p_gpm.persist()
    return p_gpm

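# Example usage of get_gpm_precipitation above -- an illustrative sketch, not part of
# the original code. The mask store path is hypothetical; persist() assumes a running
# dask scheduler and access to the 'pangeo-data/gpm_imerg_early' GCS bucket.
def example_gpm_precipitation():
    da_mask_gpm = xr.open_zarr('tmp/ds_mask_gpm')['mask']  # hypothetical mask store
    p_gpm = get_gpm_precipitation(da_mask_gpm)  # mask-weighted areal mean per time step
    return p_gpm.compute()
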
def test_dask_distributed_zarr_integration_test(loop):
    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 5}
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            original = create_test_data().chunk(chunks)
            with create_tmp_file(allow_cleanup_failure=ON_WINDOWS,
                                 suffix='.zarr') as filename:
                original.to_zarr(filename)
                with xr.open_zarr(filename) as restored:
                    assert isinstance(restored.var1.data, da.Array)
                    computed = restored.compute()
                    assert_allclose(original, computed)

def get_trmm_precipitation(da_mask_trmm):
    ds_trmm = xr.open_zarr(gcsfs.GCSMap('pangeo-data/trmm_3b42rt'))
    # TRMM data was stored with lon in 0/360 range, rearrange it in -180/180:
    long_0_360 = ds_trmm.lon.values
    ds_trmm.lon.values = np.where(long_0_360 < 180, long_0_360, long_0_360 - 360)
    da_trmm = ds_trmm['precipitation'].sortby('lon')
    pix_deg_trmm = 0.25
    da_area_trmm = pixel_area(pix_deg_trmm)
    da_mask_trmm = da_area_trmm.reindex_like(da_mask_trmm, method='nearest', tolerance=0.001) * da_mask_trmm
    da_mask_trmm = da_mask_trmm / da_mask_trmm.sum(['lat', 'lon'])
    p_trmm = (da_trmm.reindex_like(da_mask_trmm, method='nearest', tolerance=0.001) * da_mask_trmm).sum(['lat', 'lon'])
    p_trmm = p_trmm.persist()
    return p_trmm

def get_gpm_mask(labels, gcs_path):
    pix_deg_flow = 1 / 1200
    pix_deg_gpm = 0.1
    ratio = int(pix_deg_gpm / pix_deg_flow)
    da_mask_gpm = []
    for label in labels:
        ds = xr.open_zarr(gcsfs.GCSMap(f'{gcs_path}/{label}'))
        da1 = ds['mask'].compute()
        da2 = adjust_bbox(da1, {'lat': (pix_deg_gpm, -pix_deg_flow), 'lon': (pix_deg_gpm, pix_deg_flow)})
        da3 = aggregate_da(da2, {'lat': ratio, 'lon': ratio}) / (ratio * ratio)
        da3 = da3.rename({'lat_agg': 'lat', 'lon_agg': 'lon'})
        da3.lon.values = np.round(da3.lon.values, 2)
        da3.lat.values = np.round(da3.lat.values, 2)
        da_mask_gpm.append(da3)
    da_mask_gpm = xr.concat(da_mask_gpm, 'label').assign_coords(label=labels)
    return da_mask_gpm

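# Illustrative sketch (not in the original code) chaining get_gpm_mask with
# get_gpm_precipitation: the labels and GCS path are assumptions, following the
# 'tmp/ds_mask/<label>' layout that delineate() writes.
def example_masks_to_precipitation():
    labels = ['0', '0,0', '0,1']  # hypothetical watershed labels
    da_mask_gpm = get_gpm_mask(labels, 'my-bucket/ds_mask')  # hypothetical GCS path
    return get_gpm_precipitation(da_mask_gpm)
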
def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param zarr_kwargs: kwargs passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        with open(level_path, "r") as fp:
            level_path = fp.read()
        # if level_path is a relative path, resolve it against the levels directory
        if not os.path.isabs(level_path):
            base_dir = os.path.dirname(self._dir_path)
            level_path = os.path.join(base_dir, level_path)
    with measure_time(tag=f"opened local dataset {level_path} for level {index}"):
        return assert_cube(xr.open_zarr(level_path, **zarr_kwargs), name=level_path)

def test_vars2dim(self):
    result = self.invoke_cli(["vars2dim", TEST_ZARR_DIR])
    output_path = self.TEST_OUTPUT
    self.assertEqual(0, result.exit_code)
    self.assertTrue(os.path.isdir(output_path))
    ds = xr.open_zarr(output_path)
    self.assertIn("var", ds.dims)
    self.assertEqual(3, ds.dims["var"])
    self.assertIn("var", ds.coords)
    self.assertIn("data", ds.data_vars)
    var_names = ds["var"]
    self.assertEqual(("var",), var_names.dims)
    self.assertTrue(hasattr(var_names, "encoding"))
    self.assertEqual("<U13", str(var_names.dtype))
    self.assertEqual(3, len(var_names))
    self.assertIn("precipitation", str(var_names[0]))
    self.assertIn("soil_moisture", str(var_names[1]))
    self.assertIn("temperature", str(var_names[2]))

def test_dataset_to_zarr(self):
    dataset = xarray.Dataset(
        {'foo': ('x', np.arange(0, 60, 10))},
        coords={'x': np.arange(6)},
        attrs={'meta': 'data'},
    )
    chunked = dataset.chunk({'x': 3})

    temp_dir = self.create_tempdir().full_path
    (test_util.EagerPipeline() | xbeam.DatasetToZarr(chunked, temp_dir))
    actual = xarray.open_zarr(temp_dir, consolidated=True)
    xarray.testing.assert_identical(actual, dataset)

    temp_dir = self.create_tempdir().full_path
    with self.assertRaisesRegex(
        ValueError,
        'template does not have any variables chunked with Dask',
    ):
        (test_util.EagerPipeline() | xbeam.DatasetToZarr(dataset, temp_dir))

def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
    output = tmp_path.joinpath("vcf.zarr").as_posix()

    vcf_to_zarr(path, output, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (1,)
    assert ds["call_genotype"].shape == (19910, 1, 2)
    assert ds["call_genotype_mask"].shape == (19910, 1, 2)
    assert ds["call_genotype_phased"].shape == (19910, 1)
    assert ds["variant_allele"].shape == (19910, 4)
    assert ds["variant_contig"].shape == (19910,)
    assert ds["variant_id"].shape == (19910,)
    assert ds["variant_id_mask"].shape == (19910,)
    assert ds["variant_position"].shape == (19910,)

    assert ds["variant_allele"].dtype == "O"
    assert ds["variant_id"].dtype == "O"

def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path):
    path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path)
    output: MutableMapping[str, bytes] = {}

    vcf_to_zarr(path, output, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (1,)
    assert ds["call_genotype"].shape == (19910, 1, 2)
    assert ds["call_genotype_mask"].shape == (19910, 1, 2)
    assert ds["call_genotype_phased"].shape == (19910, 1)
    assert ds["variant_allele"].shape == (19910, 4)
    assert ds["variant_contig"].shape == (19910,)
    assert ds["variant_id"].shape == (19910,)
    assert ds["variant_id_mask"].shape == (19910,)
    assert ds["variant_position"].shape == (19910,)

    assert ds["variant_allele"].dtype == "O"
    assert ds["variant_id"].dtype == "O"

def read_multiple_obs(obs_files, x_data):
    """
    read and format multiple observation files. we read in the pretrain data
    to make sure we have the same indexing.
    :param obs_files: [list] list of filenames of observation files
    :param x_data: [xr dataset] the pre-training data, used to align indexing
    :return: [xr dataset] the observations, aligned to the same time/segment index
    """
    obs = [x_data.sortby(["seg_id_nat", "date"])]
    for filename in obs_files:
        ds = xr.open_zarr(filename)
        if "site_id" in ds.variables:
            del ds["site_id"]
        obs.append(ds)
    obs = xr.merge(obs, join="left")
    obs = obs[["temp_c", "discharge_cms"]]
    obs = obs.rename(
        {"temp_c": "seg_tave_water", "discharge_cms": "seg_outflow"}
    )
    return obs

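# Illustrative usage sketch (not part of the original module); the zarr store names
# are hypothetical, and x_data must carry 'seg_id_nat' and 'date' coordinates.
def example_read_obs(x_data):
    return read_multiple_obs(["obs_temp.zarr", "obs_flow.zarr"], x_data)
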
def fmt_preds_obs(pred_data, obs_file, variable):
    """
    combine predictions and observations in one dataframe
    :param pred_data: [str] filepath to the predictions file
    :param obs_file: [str] filepath to the observations file
    :param variable: [str] either 'flow' or 'temp'
    """
    obs_var, seg_var = get_var_names(variable)
    pred_data = load_if_not_df(pred_data)
    # pred_data.loc[:, "seg_id_nat"] = pred_data["seg_id_nat"].astype(int)
    if {"date", "seg_id_nat"}.issubset(pred_data.columns):
        pred_data.set_index(["date", "seg_id_nat"], inplace=True)
    obs = xr.open_zarr(obs_file).to_dataframe()
    obs_cln = obs[[obs_var]]
    obs_cln.columns = ["obs"]
    preds = pred_data[[seg_var]]
    preds.columns = ["pred"]
    obs_cln_trim = trim_obs(obs_cln, preds)
    combined = preds.join(obs_cln_trim)
    return combined

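# Illustrative sketch (not in the original module): per-segment mean error for stream
# temperature; both file paths are hypothetical.
def example_pred_obs_error():
    combined = fmt_preds_obs("preds.feather", "obs.zarr", "temp")
    return (combined["pred"] - combined["obs"]).groupby("seg_id_nat").mean()
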
def test_update_corrupt_cube(self):
    self.write_cube('2019-01-01', 3)

    cube = xr.open_zarr(self.CUBE_PATH)
    t, y, x = cube.precipitation.shape
    new_shape = y, t, x
    t, y, x = cube.precipitation.dims
    new_dims = y, t, x
    cube['precipitation'] = xr.DataArray(cube.precipitation.values.reshape(new_shape),
                                         dims=new_dims,
                                         coords=cube.precipitation.coords)
    cube.to_zarr(self.CUBE_PATH_2)

    with self.assertRaises(ValueError) as cm:
        insert_time_slice(self.CUBE_PATH_2, 2, self.make_slice('2019-01-02T06:30'))
    self.assertEqual("dimension 'time' of variable 'precipitation' must be first dimension",
                     f"{cm.exception}")

def test_vcf_to_zarr__multiple_max_alt_alleles(shared_datadir, is_path, tmp_path):
    paths = [
        path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
        path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path),
    ]
    output = tmp_path.joinpath("vcf_concat.zarr").as_posix()

    with pytest.warns(MaxAltAllelesExceededWarning):
        vcf_to_zarr(
            paths,
            output,
            target_part_size="40KB",
            chunk_length=5_000,
            max_alt_alleles=1,
        )
        ds = xr.open_zarr(output)

        # the maximum number of alt alleles actually seen is stored as an attribute
        assert ds.attrs["max_alt_alleles_seen"] == 7

def test_append_overlapping_append_newer(self):
    for consolidated in False, True:
        with self.subTest(consolidated=consolidated):
            dst_path = "my.zarr"
            self.add_path(dst_path)
            ds1, ds2 = new_append_test_datasets(
                ["2001-01-01", "2001-01-02", "2001-01-03"],
                ["2001-01-02", "2001-01-03", "2001-01-04", "2001-02-05"]
            )
            ds1.to_zarr(dst_path, consolidated=consolidated)
            w = DatasetWriter(dst_path,
                              output_append=True,
                              output_append_dim="t",
                              output_append_mode=AppendMode.newer,
                              output_consolidated=consolidated)
            w.write_dataset(ds2)
            ds3 = xr.open_zarr(dst_path, consolidated=consolidated)
            expected = np.array(["2001-01-01", "2001-01-02", "2001-01-03",
                                 "2001-01-04", "2001-02-05"],
                                dtype="datetime64[ns]")
            np.testing.assert_equal(expected, ds3.t.data)

def read_image(infile):
    """
    Read xarray zarr format image from disk

    Parameters
    ----------
    infile : str
        input zarr image filename

    Returns
    -------
    xarray.core.dataset.Dataset
        New xarray Dataset of image contents
    """
    import os
    from xarray import open_zarr

    infile = os.path.expanduser(infile)
    xds = open_zarr(infile)
    return xds

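# Illustrative usage of read_image (a sketch; the file name is hypothetical):
def example_read_image():
    xds = read_image('~/images/my_image.zarr')
    print(xds.data_vars)
    return xds
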
def __init__(
    self,
    IMC_name,
    ROI_number,
    path=None,
    use_panel_file=False,
    panel_file=None,
    panel_skip_rows=4,
):
    self.IMC_name = IMC_name
    self.ROI_number = ROI_number
    if path:
        self.filepath = path
    else:
        self.filepath = "/data/meds1_d/storage/raw/imc/"
    print("Loading IMC dataset")
    path_to_open = (
        os.path.join(self.filepath, self.IMC_name) + "/Q00" + str(self.ROI_number)
    )
    ds = xr.open_zarr(path_to_open)
    image = ds["Q00" + str(ROI_number)]
    image.load()
    channels = pd.DataFrame(image.meta[0]["q_channels"])
    self.data = image
    self.channels = channels
    self.channels["good"] = np.zeros(len(self.channels))
    if use_panel_file:
        panel = pd.read_excel(panel_file, skiprows=panel_skip_rows)
        for item in self.channels.metal:
            if "(" in item:
                metal = item[:2]
                number = item[3:6]
                tomatch = number + metal
                match = panel[panel["Metal"] == tomatch].Target.tolist()
                if len(match) > 0:
                    self.channels.loc[self.channels.metal == item, "target"] = match
                    self.channels.loc[self.channels.metal == item, "good"] = 1
    else:
        self.channels["good"] = np.ones(len(self.channels))

def test_vcf_to_zarr__fields(shared_datadir, tmp_path):
    path = path_for_test(shared_datadir, "sample.vcf.gz")
    output = tmp_path.joinpath("vcf.zarr").as_posix()

    vcf_to_zarr(
        path,
        output,
        chunk_length=5,
        chunk_width=2,
        fields=["INFO/DP", "INFO/AA", "INFO/DB", "FORMAT/DP"],
    )
    ds = xr.open_zarr(output)

    assert_array_equal(ds["variant_DP"], [-1, -1, 14, 11, 10, 13, 9, -1, -1])
    assert ds["variant_DP"].attrs["comment"] == "Total Depth"

    assert_array_equal(ds["variant_AA"], ["", "", "", "", "T", "T", "G", "", ""])
    assert ds["variant_AA"].attrs["comment"] == "Ancestral Allele"

    assert_array_equal(
        ds["variant_DB"],
        [False, False, True, False, True, False, False, False, False],
    )
    assert ds["variant_DB"].attrs["comment"] == "dbSNP membership, build 129"

    dp = np.array(
        [
            [-1, -1, -1],
            [-1, -1, -1],
            [1, 8, 5],
            [3, 5, 3],
            [6, 0, 4],
            [-1, 4, 2],
            [4, 2, 3],
            [-1, -1, -1],
            [-1, -1, -1],
        ],
        dtype="i4",
    )
    assert_array_equal(ds["call_DP"], dp)
    assert ds["call_DP"].attrs["comment"] == "Read Depth"

def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    consolidated = False
    if isinstance(path, str):
        endpoint_url = None
        region_name = None
        root = None
        if 'endpoint_url' in kwargs:
            endpoint_url = kwargs.pop('endpoint_url')
            root = path
        if 'region_name' in kwargs:
            region_name = kwargs.pop('region_name')
        if path.startswith("http://") or path.startswith("https://"):
            import urllib3.util
            url = urllib3.util.parse_url(path_or_store)
            if url.port is not None:
                endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
            else:
                endpoint_url = f'{url.scheme}://{url.host}'
            root = url.path
            if root.startswith('/'):
                root = root[1:]
        if endpoint_url and root is not None:
            s3 = s3fs.S3FileSystem(anon=True,
                                   client_kwargs=dict(endpoint_url=endpoint_url,
                                                      region_name=region_name))
            consolidated = s3.exists(f'{root}/.zmetadata')
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(path_or_store,
                                                       max_size=max_cache_size)
        else:
            consolidated = os.path.exists(os.path.join(path_or_store, '.zmetadata'))
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)

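# Minimal standalone sketch of the consolidated-store detection used by read() above
# (an assumption-labeled example, not the original API); endpoint_url and root are
# caller-supplied placeholders.
def example_open_s3_zarr(endpoint_url, root):
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url=endpoint_url))
    consolidated = s3.exists(f'{root}/.zmetadata')  # is consolidated metadata present?
    store = s3fs.S3Map(root=root, s3=s3, check=False)
    return xr.open_zarr(store, consolidated=consolidated)
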
def load(self, filename: Optional[str] = None):
    """Load pose tracks estimated using DeepPoseKit.

    Args:
        filename (str): name of the file produced by DeepPoseKit

    Returns:
        poses_ego (xarray.DataArray): poses in EGOcentric (centered around thorax, head aligned straight upwards), [frames, flies, bodypart, x/y]
        poses_allo (xarray.DataArray): poses in ALLOcentric (frame) coordinates, [frames, flies, bodypart, x/y]
        partnames (List[str]): list of names for each body part (is already part of the poses_ego/allo xrs)
        first_pose_frame, last_pose_frame (int): frame corresponding to first and last item in the poses_ego/allo arrays (could attach this info as xr dim)
    """
    if filename is None:
        filename = self.path
    with zarr.ZipStore(filename, mode='r') as zarr_store:
        ds = xr.open_zarr(zarr_store).load()  # the final `load()` prevents lazy reads from the store after it is closed
    nb_flies = len(ds.flies)
    box_size = np.array(ds.attrs['box_size'])
    poses_allo = ds.poses + ds.box_centers - box_size / 2

    first_pose_frame = int(np.argmin(np.isnan(ds.poses.data[:, 0, 0, 0]).data))
    last_pose_frame = int(np.argmin(~np.isnan(np.array(ds.poses.data[first_pose_frame:, 0, 0, 0]).data)) + first_pose_frame)
    if last_pose_frame == first_pose_frame or last_pose_frame == 0:
        last_pose_frame = ds.poses.shape[0]

    # CUT to first/last frame with poses
    poses_ego = ds.poses[first_pose_frame:last_pose_frame, ...]
    poses_allo = poses_allo[first_pose_frame:last_pose_frame, ...]

    poses_ego = poses_ego - poses_ego.sel(poseparts='thorax')  # CENTER egocentric poses around thorax

    # ROTATE egocentric poses such that the angle between head and thorax is 0 degrees (straight upwards)
    head_thorax_angle = 270 + np.arctan2(poses_ego.sel(poseparts='head', coords='y'),
                                         poses_ego.sel(poseparts='head', coords='x')) * 180 / np.pi
    for cnt, (a, p_ego) in enumerate(zip(head_thorax_angle.data, poses_ego.data)):
        for fly in range(nb_flies):
            poses_ego.data[cnt, fly, ...] = rotate_pose(p_ego[fly], -a[fly])
    return poses_ego, poses_allo, ds.poseparts, first_pose_frame, last_pose_frame

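# Illustrative usage sketch (not in the original class); `loader` is assumed to be an
# instance of the surrounding class and the zip-store file name is hypothetical.
def example_load_poses(loader):
    poses_ego, poses_allo, parts, first, last = loader.load('pose_tracks.zarr.zip')
    print(parts.values, first, last)
    return poses_ego, poses_allo
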
def test_replace_time_slice(self):
    self.write_cube('2019-01-02', 10)

    replace_time_slice(self.CUBE_PATH, 5, self.make_slice('2019-01-06T02:00'))
    replace_time_slice(self.CUBE_PATH, 9, self.make_slice('2019-01-11T02:00'))
    replace_time_slice(self.CUBE_PATH, 0, self.make_slice('2019-01-01T02:00'))

    cube = xr.open_zarr(self.CUBE_PATH)
    expected = np.array(['2019-01-01T14:00', '2019-01-03T12:00', '2019-01-04T12:00',
                         '2019-01-05T12:00', '2019-01-06T12:00', '2019-01-06T14:00',
                         '2019-01-08T12:00', '2019-01-09T12:00', '2019-01-10T12:00',
                         '2019-01-11T14:00'], dtype=cube.time.dtype)
    self.assertEqual(10, cube.time.size)
    self.assertEqual(None, cube.time.chunks)
    np.testing.assert_equal(cube.time.values, expected)

def perform_cmip6_query(conf, query_string):
    df_sub = conf.df.query(query_string)
    if df_sub.zstore.values.size == 0:
        return df_sub

    mapper = conf.fs.get_mapper(df_sub.zstore.values[-1])
    ds = xr.open_zarr(mapper, consolidated=True)

    time_object = ds["time"].values[0]

    # Convert if necessary
    if time_object.year == 1:
        times = ds["time"].values
        times_plus_2000 = []
        for t in times:
            times_plus_2000.append(
                cftime.DatetimeNoLeap(t.year + 2000, t.month, t.day, t.hour)
            )
        ds["time"].values = times_plus_2000
        ds = xr.decode_cf(ds)
    return ds

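# Illustrative sketch (not in the original module): query one source/experiment/
# variable combination. The column names follow the pangeo CMIP6 catalog, which is an
# assumption here; `conf` must expose the catalog dataframe (df) and filesystem (fs).
def example_cmip6_tas(conf):
    q = ("source_id=='MPI-ESM1-2-LR' and experiment_id=='historical' "
         "and table_id=='Amon' and variable_id=='tas' and member_id=='r1i1p1f1'")
    return perform_cmip6_query(conf, q)
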
def get_sst():
    import fsspec
    import xarray as xr

    # climatology years
    cyr1, cyr2 = '1993-01-01', '2018-12-31'

    # sst
    file_location = 's3://mur-sst/zarr'
    ds = xr.open_zarr(fsspec.get_mapper(file_location, anon=True), consolidated=True)
    ds_sst = ds.drop({'analysis_error', 'mask', 'sea_ice_fraction'})
    tem = ds_sst.analysed_sst.attrs
    tem['var_name'] = 'mur_sst'
    ds_sst.analysed_sst.attrs = tem
    ds_sst_clim = ds_sst.sel(time=slice(cyr1, cyr2))
    ds_sst_clim = ds_sst_clim.groupby('time.dayofyear').mean('time', keep_attrs=True, skipna=False)

    # put data into a dictionary
    data_dict = {'sst': ds_sst}
    clim_dict = {'sst_clim': ds_sst_clim}
    return data_dict, clim_dict

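# Illustrative sketch (not part of the original function): compute an SST anomaly
# relative to the day-of-year climatology returned by get_sst().
def example_sst_anomaly():
    data_dict, clim_dict = get_sst()
    sst = data_dict['sst'].analysed_sst
    clim = clim_dict['sst_clim'].analysed_sst
    return sst.groupby('time.dayofyear') - clim
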
def test_upsample_with_multiple_methods(self):
    result = self.invoke_cli(['resample',
                              '--variables', 'temperature',
                              '-F', '12H',
                              '-T', '6H',
                              # '-K', 'quadratic',
                              # '-M', 'interpolate',
                              '-M', 'nearest',
                              TEST_ZARR_DIR])
    self.assertEqual(0, result.exit_code)
    self.assertTrue(os.path.isdir('out.zarr'))
    ds = xr.open_zarr('out.zarr')
    assert_cube(ds)
    # self.assertIn('temperature_interpolate', ds)
    self.assertIn('temperature_nearest', ds)

def test_zarr_mapping_set_2d(dtype=int):
    arr = np.array([2.0], dtype=dtype)
    schema = xr.Dataset({"a": (["x"], arr)}).chunk()
    coords = {"time": [0, 1, 2], "space": list("xyz")}
    store = {}
    m = vcm.ZarrMapping.from_schema(store, schema, dims=["time", "space"], coords=coords)
    for time, space in product(coords["time"], coords["space"]):
        m[time, space] = schema
    ds = xr.open_zarr(store)

    assert list(ds.a.dims) == ["time", "space", "x"]

    # check all data
    for time, space in product(coords["time"], coords["space"]):
        a = ds.sel(space=space, time=time).drop(["time", "space"]).load()
        b = schema.load()
        xr.testing.assert_equal(a, b)

def test_dask_distributed_zarr_integration_test(loop, consolidated, compute):
    if consolidated:
        zarr = pytest.importorskip('zarr', minversion="2.2.1.dev2")
        write_kwargs = dict(consolidated=True)
        read_kwargs = dict(consolidated=True)
    else:
        write_kwargs = read_kwargs = {}
    chunks = {'dim1': 4, 'dim2': 3, 'dim3': 5}
    with cluster() as (s, [a, b]):
        with Client(s['address'], loop=loop) as c:
            original = create_test_data().chunk(chunks)
            with create_tmp_file(allow_cleanup_failure=ON_WINDOWS,
                                 suffix='.zarrc') as filename:
                maybe_futures = original.to_zarr(filename, compute=compute,
                                                 **write_kwargs)
                if not compute:
                    maybe_futures.compute()
                with xr.open_zarr(filename, **read_kwargs) as restored:
                    assert isinstance(restored.var1.data, da.Array)
                    computed = restored.compute()
                    assert_allclose(original, computed)

def __init__(self, dataset_name, img_res=(512, 512), downsize_factor=(4, 4), local_path=None):
    self.dataset_name = dataset_name
    self.img_res = img_res
    self.downsize_factor = downsize_factor

    # f-string prefix was missing in the original, so {tstep:010d} was never filled in
    uris = [
        f'/rigel/ocp/projects/shared_data/sst_superresolution/LLC4320/SST.{tstep:010d}.zarr'
        for tstep in range(0, 4088 + 1, 73)
    ][:2]
    dsets = [xr.open_zarr(uri, consolidated=True) for uri in uris]
    ds = xr.combine_nested(dsets, 'timestep')
    print(ds)
    sst_coarse = ds.SST.coarsen(x=self.downsize_factor[0], y=self.downsize_factor[1]).mean()
    print(sst_coarse)
    region = 306
    timestep = 0  # `timestep` was undefined in the original; use the first timestep to match self.hr
    self.hr = ds.SST[timestep, region].load().values
    self.lr = sst_coarse[timestep, region].load().values

def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path):
    paths = [
        path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path),
        path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path),
    ]
    output = tmp_path.joinpath("vcf_concat.zarr").as_posix()

    vcf_to_zarr(paths, output, target_part_size=None, chunk_length=5_000)
    ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]

    assert ds["sample_id"].shape == (1,)
    assert ds["call_genotype"].shape == (19910, 1, 2)
    assert ds["call_genotype_mask"].shape == (19910, 1, 2)
    assert ds["call_genotype_phased"].shape == (19910, 1)
    assert ds["variant_allele"].shape == (19910, 4)
    assert ds["variant_contig"].shape == (19910,)
    assert ds["variant_id"].shape == (19910,)
    assert ds["variant_id_mask"].shape == (19910,)
    assert ds["variant_position"].shape == (19910,)

    assert ds.chunks["variants"] == (5000, 5000, 5000, 4910)

def data_tile_meta(prefix, var, tile_pos):
    z, x, y = tile_pos
    tile_meta = mercantile.Tile(x=x, y=y, z=z)
    min_lon, min_lat, max_lon, max_lat = mercantile.bounds(tile_meta)
    data_tile = xr.open_zarr(store=get_store(prefix))[var]
    data_tile = data_tile.sel(longitude=slice(min_lon, max_lon),
                              latitude=slice(max_lat, min_lat))
    return {
        "variable": var,
        "z": z,
        "x": x,
        "y": y,
        "lats": [min_lat, max_lat],
        "lons": [min_lon, max_lon],
        f"{var}Max": float(data_tile.max().compute()),
        f"{var}Min": float(data_tile.min().compute()),
        "units": data_tile.units,
        "description": data_tile.long_name,
    }

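# Illustrative usage sketch (prefix and variable name are hypothetical):
def example_tile_meta():
    return data_tile_meta('my-dataset', 'temperature', (3, 2, 5))  # tile_pos is (z, x, y)
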
def test_NetCDFtoZarrMultiVarSequentialRecipe(
    daily_xarray_dataset, netcdf_local_paths_by_variable, tmp_target, tmp_cache
):
    paths, items_per_file, fnames_by_variable, path_format = netcdf_local_paths_by_variable
    pattern = VariableSequencePattern(
        path_format, keys={"variable": ["foo", "bar"], "n": list(range(len(paths) // 2))}
    )
    r = recipe.NetCDFtoZarrMultiVarSequentialRecipe(
        input_pattern=pattern,
        sequence_dim="time",
        inputs_per_chunk=1,
        nitems_per_input=items_per_file,
        target=tmp_target,
        input_cache=tmp_cache,
    )
    _manually_execute_recipe(r)
    ds_target = xr.open_zarr(tmp_target.get_mapper(), consolidated=True).compute()
    print(ds_target)
    print(daily_xarray_dataset)
    assert ds_target.identical(daily_xarray_dataset)

def delineate(lat, lon, sub_latlon=[], sub_nb=None, acc_delta=np.inf, progress=False):
    pix_deg = 1 / 1200
    tile_deg = 5
    lat = (lat // pix_deg) * pix_deg + pix_deg
    lon = (lon // pix_deg) * pix_deg
    for ll in sub_latlon:
        ll[0] = (ll[0] // pix_deg) * pix_deg + pix_deg
        ll[1] = (ll[1] // pix_deg) * pix_deg
    if sub_nb is not None:
        dir_tile, acc_tile, y, x = get_tile(lat, lon, 2, pix_deg, tile_deg)
        acc_delta = acc_tile[y, x] / (sub_nb + 1)
    getSubBass = True
    sample_i = 0
    samples = np.empty((1024, 2), dtype=np.float64)
    lengths = np.empty(1024, dtype=np.float64)
    areas = np.empty(1024, dtype=np.float64)
    labels = np.empty((1024, 3), dtype=np.int32)
    dirNeighbors = np.empty(1024, dtype=np.uint8)
    accNeighbors = np.empty(1024, dtype=np.float64)
    ws_latlon = np.empty(2, dtype=np.float64)
    # output mask ->
    mxw = 10000  # bytes
    myw = mxw * 8  # bits
    mm = np.empty((myw, mxw), dtype=np.uint8)
    mm_back = np.empty((myw, mxw), dtype=np.uint8)
    mx0_deg = 0
    my0_deg = 0
    # <- output mask
    simple_delineation = False
    if len(sub_latlon) == 0:
        _sub_latlon = np.empty((1, 2), dtype=np.float64)
        _sub_latlon[0, :] = [lat, lon]
        if not np.isfinite(acc_delta):
            simple_delineation = True
    else:
        _sub_latlon = np.empty((len(sub_latlon), 2), dtype=np.float64)
        _sub_latlon[:, :] = sub_latlon
    tile_size = int(round(tile_deg / pix_deg))
    if simple_delineation:
        sample_size = 1
        samples[0] = [lat, lon]
    else:
        if progress:
            print('Getting basin partition...')
        samples, labels, areas, lengths, sample_size, mx0_deg, my0_deg, ws_mask, ws_latlon, dirNeighbors, accNeighbors = cdelineate(lat, lon, getSubBass, sample_i, samples, labels, areas, lengths, pix_deg, tile_deg, acc_delta, _sub_latlon, mm, mm_back, mx0_deg, my0_deg, dirNeighbors, accNeighbors)
        if not is_empty_latlon(_sub_latlon):
            print("WARNING: not all subbasins have been processed. This means that they don't fall into different pixels, or that they are not located in the basin.\nPlease check their lat/lon coordinates.")
            for i in range(_sub_latlon.shape[0]):
                if _sub_latlon[i, 0] > -900:
                    print(_sub_latlon[i, 0], _sub_latlon[i, 1])
    #print('Delineating sub-basins...')
    mask, latlon = [], []
    getSubBass = False
    lat_min = np.inf
    lat_max = -np.inf
    lon_min = np.inf
    lon_max = -np.inf
    new_labels = []
    shutil.rmtree('tmp/ws', ignore_errors=True)
    shutil.rmtree('tmp/ds_mask', ignore_errors=True)
    this_range = range(sample_size)
    if progress:
        this_range = tqdm(this_range)
    for sample_i in this_range:
        _, _, _, _, _, mx0_deg, my0_deg, ws_mask, ws_latlon, dirNeighbors, accNeighbors = cdelineate(lat, lon, getSubBass, sample_i, samples, labels, areas, lengths, pix_deg, tile_deg, acc_delta, _sub_latlon, mm, mm_back, mx0_deg, my0_deg, dirNeighbors, accNeighbors)
        clat = np.array([ws_latlon[0] - (i + 0.5) * pix_deg for i in range(ws_mask.shape[0])])
        clon = np.array([ws_latlon[1] + (i + 0.5) * pix_deg for i in range(ws_mask.shape[1])])
        da_mask = xr.DataArray(ws_mask, coords=[clat, clon], dims=['lat', 'lon'])
        if sample_i == 0:
            new_labels.append('0')
        else:
            i = labels[sample_i][0]
            new_labels.append(new_labels[i] + ',' + str(labels[sample_i][2]))
        ds_mask = da_mask.to_dataset(name='mask')
        ds_mask.to_zarr(store=f'tmp/ws/{sample_i}', mode='w')
        #lat_min = min(lat_min, clat[-1])
        #lat_max = max(lat_max, clat[0])
        #lon_min = min(lon_min, clon[0])
        #lon_max = max(lon_max, clon[-1])
    #vmin = {'lat': lat_min, 'lon': lon_min}
    #vmax = {'lat': lat_max, 'lon': lon_max}
    #new_lat = np.arange(lat_max, lat_min-tolerance, -pix_deg)
    #new_lon = np.arange(lon_min, lon_max+tolerance, pix_deg)
    #for sample_i in range(sample_size):
    #    label = new_labels[sample_size-1-sample_i]
    #    ds_mask = xr.open_zarr(f'tmp/ws/{sample_i}').compute()
    #    da_mask = ds_mask[str(sample_i)]
    #    ilat = (np.abs(new_lat - da_mask.lat.values[0])).argmin()
    #    ilon = (np.abs(new_lon - da_mask.lon.values[0])).argmin()
    #    nlat = 1 + int(round((lat_max - lat_min) / pix_deg))
    #    nlon = 1 + int(round((lon_max - lon_min) / pix_deg))
    #    mask = da.zeros((nlat, nlon), chunks=(1000, 1000), dtype='uint8')
    #    #mask = zarr.zeros((nlat, nlon), chunks=(nlat, nlon), dtype='uint8')
    #    da_mask2 = xr.DataArray(mask, coords=[new_lat, new_lon], dims=['lat', 'lon'])
    #    da_mask2[ilat:ilat+da_mask.shape[0], ilon:ilon+da_mask.shape[1]] = da_mask.values
    #    if sample_i == 0:
    #        ds_mask = da_mask2.to_dataset(name=label)
    #    else:
    #        ds_mask[label] = da_mask2
    #ds_mask.to_zarr(store=f'tmp/ds_ws', mode='w')
    #shutil.rmtree('tmp/ds_ws', ignore_errors=True)
    for sample_i in range(sample_size):
        label = new_labels[sample_size - 1 - sample_i]
        ds_mask = xr.open_zarr(f'tmp/ws/{sample_i}').compute()
        da_mask = ds_mask['mask']
        olatlon = [samples[sample_i][0] - pix_deg / 2, samples[sample_i][1] + pix_deg / 2]
        da_mask.attrs = {'outlet': olatlon, 'area': areas[sample_size - 1 - sample_i], 'length': lengths[sample_size - 1 - sample_i], 'label': label}
        ds_mask = da_mask.to_dataset()
        ds_mask.to_zarr(store=f'tmp/ds_mask/{label}', mode='w')
        shutil.rmtree(f'tmp/ws/{sample_i}', ignore_errors=True)

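# Illustrative usage sketch for delineate() (not in the original code): the
# coordinates are hypothetical, and the flow-direction/accumulation tiles read by
# get_tile() and cdelineate() must be available locally.
def example_delineate():
    delineate(lat=43.5, lon=5.4, sub_nb=5, progress=True)
    return xr.open_zarr('tmp/ds_mask/0').compute()  # root watershed mask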