def test_raise(self):
    """Invalid subset requests must raise rather than silently succeed."""
    # An end date earlier than the start date is rejected.
    tas = open_dataset(self.nc_poslons).tas
    with pytest.raises(ValueError):
        subset.subset_bbox(
            tas,
            lon_bnds=self.lonGCM,
            lat_bnds=self.latGCM,
            start_date="2056",
            end_date="2055",
        )

    # With the lon/lat coordinates dropped, spatial subsetting cannot work.
    tasmax_no_coords = open_dataset(self.nc_2dlonlat).tasmax.drop_vars(
        names=["lon", "lat"]
    )
    with pytest.raises(Exception):
        subset.subset_bbox(tasmax_no_coords, lon_bnds=self.lon, lat_bnds=self.lat)
def test_single_bounds_rectilinear(self):
    """Subsetting along only one axis leaves the other axis untouched."""
    tasmax = open_dataset(self.nc_file).tasmax

    # Longitude-only subset: latitudes pass through unchanged.
    out = subset.subset_bbox(tasmax, lon_bnds=self.lon)
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    np.testing.assert_array_equal(out.lat, tasmax.lat)
    assert np.all(out.lon <= np.max(self.lon))
    assert np.all(out.lon.values >= np.min(self.lon))

    # Latitude-only subset: longitudes pass through unchanged.
    out = subset.subset_bbox(tasmax, lat_bnds=self.lat)
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    np.testing.assert_array_equal(out.lon, tasmax.lon)
    assert np.all(out.lat <= np.max(self.lat))
    assert np.all(out.lat.values >= np.min(self.lat))
def test_single_bounds_curvilinear(self):
    """One-sided bbox subsetting on a curvilinear (2-D lat/lon) grid.

    Out-of-bbox cells are NaN-masked rather than dropped, so bounds are
    checked only on the non-NaN cells of the first time step.
    """
    tasmax = open_dataset(self.nc_2dlonlat).tasmax

    # Longitude-only subset: valid cells must respect the lon bounds.
    out = subset.subset_bbox(tasmax, lon_bnds=self.lon)
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    valid = ~(np.isnan(out.sel(time=out.time[0])))
    assert np.all(out.lon.values[valid.values] <= np.max(self.lon))
    assert np.all(out.lon.values[valid.values] >= np.min(self.lon))

    # Latitude-only subset: valid cells must respect the lat bounds.
    out = subset.subset_bbox(tasmax, lat_bnds=self.lat)
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    valid = ~(np.isnan(out.sel(time=out.time[0])))
    assert np.all(out.lat.values[valid.values] <= np.max(self.lat))
    assert np.all(out.lat.values[valid.values] >= np.min(self.lat))
def test_positive_lons(self):
    """Negative lon bounds on a 0-360 grid are mapped onto that convention."""
    tas = open_dataset(self.nc_poslons).tas
    shifted = np.asarray(self.lonGCM) + 360

    out = subset.subset_bbox(tas, lon_bnds=self.lonGCM, lat_bnds=self.latGCM)
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    # The requested [-180, 180] bounds end up expressed in 0-360 longitudes.
    assert np.all(out.lon >= np.min(shifted))
    assert np.all(out.lon <= np.max(shifted))
    assert np.all(out.lat >= np.min(self.latGCM))
    assert np.all(out.lat <= np.max(self.latGCM))

    # Passing bounds already shifted into 0-360 gives the same region.
    out = subset.subset_bbox(
        tas, lon_bnds=np.array(self.lonGCM) + 360, lat_bnds=self.latGCM
    )
    assert np.all(out.lon >= np.min(shifted))
def test_time(self):
    """Temporal subsetting by year and by full date, combined with a bbox.

    The spatial- and temporal-bound assertions were duplicated verbatim for
    the two date formats; they are factored into a local helper.
    """
    da = open_dataset(self.nc_poslons).tas
    da = da.assign_coords(lon=(da.lon - 360))

    def _check(out, start, end):
        # The subset must be non-empty and lie inside the requested bbox.
        assert out.lon.values.size != 0
        assert out.lat.values.size != 0
        assert np.all(out.lon >= np.min(self.lonGCM))
        assert np.all(out.lon <= np.max(self.lonGCM))
        assert np.all(out.lat >= np.min(self.latGCM))
        assert np.all(out.lat <= np.max(self.latGCM))
        # The time axis must span exactly the requested (y, m, d) period.
        np.testing.assert_array_equal(out.time.min().dt.year, start[0])
        np.testing.assert_array_equal(out.time.min().dt.month, start[1])
        np.testing.assert_array_equal(out.time.min().dt.day, start[2])
        np.testing.assert_array_equal(out.time.max().dt.year, end[0])
        np.testing.assert_array_equal(out.time.max().dt.month, end[1])
        np.testing.assert_array_equal(out.time.max().dt.day, end[2])

    # Year-only bounds expand to the first/last day of the year.
    out = subset.subset_bbox(
        da,
        lon_bnds=self.lonGCM,
        lat_bnds=self.latGCM,
        start_date="2050",
        end_date="2059",
    )
    _check(out, (2050, 1, 1), (2059, 12, 31))

    # Full "%Y-%m-%d" dates are honoured exactly.
    out = subset.subset_bbox(
        da,
        lon_bnds=self.lonGCM,
        lat_bnds=self.latGCM,
        start_date="2050-02-05",
        end_date="2059-07-15",
    )
    _check(out, (2050, 2, 5), (2059, 7, 15))
def test_irregular_dataset(self):
    """On a Dataset, only the variable on the lat/lon grid is masked."""
    ds = open_dataset(self.nc_2dlonlat)
    out = subset.subset_bbox(ds, lon_bnds=[-150, 100], lat_bnds=[10, 60])

    # only tasmax should be subsetted/masked others should remain untouched
    untouched = [v for v in ds.data_vars if v != "tasmax"]
    for v in untouched:
        assert out[v].dims == ds[v].dims
        np.testing.assert_array_equal(out[v], ds[v])

    # ensure results are equal to previous test on DataArray only
    out1 = subset.subset_bbox(ds.tasmax, lon_bnds=[-150, 100], lat_bnds=[10, 60])
    np.testing.assert_array_equal(out1, out.tasmax)

    # additional test if dimensions have no coordinates
    ds = ds.drop_vars(["rlon", "rlat"])
    subset.subset_bbox(ds.tasmax, lon_bnds=[-150, 100], lat_bnds=[10, 60])
def test_badly_named_latlons(self):
    """subset_bbox must recognise common alternative lat/lon dim names."""
    da = open_dataset(self.nc_file)
    # Each case: rename map applied to the dataset, and the dims expected
    # to survive in the subsetted output.
    cases = (
        ({"lat": "latitude", "lon": "longitude"}, {"latitude", "longitude"}),
        ({"lon": "long"}, {"long"}),
        ({"lon": "lons", "lat": "lats"}, {"lons", "lats"}),
    )
    for rename_map, expected_dims in cases:
        renamed = da.rename(rename_map)
        out = subset.subset_bbox(renamed, lon_bnds=self.lon, lat_bnds=self.lat)
        assert expected_dims.issubset(out.dims)
def test_dataset(self):
    """A multi-file, multi-variable dataset is subsetted consistently."""
    ds = xr.open_mfdataset(
        [self.nc_file, self.nc_file.replace("tasmax", "tasmin")],
        combine="by_coords",
    )
    out = subset.subset_bbox(ds, lon_bnds=self.lon, lat_bnds=self.lat)
    # Remaining coordinates fall inside the requested bbox.
    assert np.all(out.lon >= np.min(self.lon))
    assert np.all(out.lon <= np.max(self.lon))
    assert np.all(out.lat >= np.min(self.lat))
    assert np.all(out.lat <= np.max(self.lat))
    # Both variables end up with the same subsetted shape.
    np.testing.assert_array_equal(out.tasmin.shape, out.tasmax.shape)
def test_warnings(self):
    """Removed int-year kwargs raise; string dates emit no deprecation warning."""
    # Local import: only this test needs stdlib warning capture.
    import warnings

    da = open_dataset(self.nc_poslons).tas
    da = da.assign_coords(lon=(da.lon - 360))

    # The removed integer-year keywords must be rejected outright.
    with pytest.raises(TypeError):
        subset.subset_bbox(
            da, lon_bnds=self.lon, lat_bnds=self.lat, start_yr=2050, end_yr=2059
        )

    # BUGFIX: `pytest.warns(None)` was deprecated in pytest 7 and removed in
    # pytest 8; capture warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        subset.subset_bbox(
            da,
            lon_bnds=self.lon,
            lat_bnds=self.lat,
            start_date="2050",
            end_date="2055",
        )
    assert (
        '"start_yr" and "end_yr" (type: int) are being deprecated. Temporal subsets will soon exclusively'
        ' support "start_date" and "end_date" (type: str) using formats of "%Y", "%Y-%m" or "%Y-%m-%d".'
        not in [str(q.message) for q in record]
    )
def test_irregular(self):
    """Full bbox subsetting on a curvilinear (2-D lat/lon) grid."""
    tasmax = open_dataset(self.nc_2dlonlat).tasmax
    out = subset.subset_bbox(tasmax, lon_bnds=self.lon, lat_bnds=self.lat)

    # for irregular lat lon grids data matrix remains rectangular in native proj
    # but with data outside bbox assigned nans. This means it can have lon and
    # lats outside the bbox. Check only non-nans gridcells using mask
    valid = ~(np.isnan(out.sel(time=out.time[0])))
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    assert np.all(out.lon.values[valid.values] >= np.min(self.lon))
    assert np.all(out.lon.values[valid.values] <= np.max(self.lon))
    assert np.all(out.lat.values[valid.values] >= np.min(self.lat))
    assert np.all(out.lat.values[valid.values] <= np.max(self.lat))
def test_inverted_coords(self):
    """Coordinates stored in descending order must still subset correctly."""
    lon = np.linspace(-90, -60, 200)
    lat = np.linspace(40, 80, 100)
    # Build a synthetic dataset whose lon/lat axes are flipped (descending).
    ds = xr.Dataset(
        data_vars=None,
        coords={"lon": np.flip(lon), "lat": np.flip(lat)},
    )
    ds["data"] = xr.DataArray(
        np.random.rand(lon.size, lat.size), dims=["lon", "lat"]
    )

    out = subset.subset_bbox(ds, lon_bnds=self.lon, lat_bnds=self.lat)
    assert out.lon.values.size != 0
    assert out.lat.values.size != 0
    assert np.all(out.lon >= np.min(np.asarray(self.lon)))
    assert np.all(out.lon <= np.max(np.asarray(self.lon)))
    assert np.all(out.lat >= np.min(self.lat))
    assert np.all(out.lat <= np.max(self.lat))
def _subset(resource):
    """Open one resource, subset it by bbox/time, and write it to NetCDF.

    Runs once per input file; `count`, `output_files`, the bbox corners,
    dates and helpers all come from the enclosing scope (note `nonlocal`).
    """
    nonlocal count
    # if not subsetting by time, it's not necessary to decode times
    time_subset = start_date is not None or end_date is not None
    dataset = try_opendap(resource, decode_times=time_subset)
    with lock:
        # The lock guards the shared progress counter and log ordering —
        # presumably this closure runs from multiple workers; TODO confirm.
        count += 1
        write_log(
            process,
            f"Subsetting file {count} of {n_files} ({getattr(resource, resource.prop)})",
            subtask_percentage=(count - 1) * 100 // n_files,
        )
    # Restrict to the requested variables when a selection was given.
    dataset = dataset[variables] if variables else dataset
    try:
        subsetted = subset_bbox(
            dataset,
            lon_bnds=[lon0, lon1],
            lat_bnds=[lat0, lat1],
            start_date=start_date,
            end_date=end_date,
        )
    except ValueError:
        # A ValueError from subset_bbox is treated as an empty subset.
        subsetted = False
    # Skip the file when the subset failed or any dimension has zero length.
    if subsetted is False or not all(subsetted.dims.values()):
        LOGGER.warning(f"Subset is empty for dataset: {resource.url}")
        return
    p = make_subset_file_name(resource)
    output_filename = Path(process.workdir) / p
    dataset_to_netcdf(subsetted, output_filename)
    output_files.append(output_filename)
def test_simple(self):
    """Basic bbox subsetting, with and without temporal bounds.

    The six bbox assertions were duplicated four times; they are factored
    into a local helper.
    """

    def _check_bounds(out, lons, lats):
        # The subset must be non-empty and lie inside the requested bbox.
        assert out.lon.values.size != 0
        assert out.lat.values.size != 0
        assert np.all(out.lon >= np.min(lons))
        assert np.all(out.lon <= np.max(lons))
        assert np.all(out.lat >= np.min(lats))
        assert np.all(out.lat <= np.max(lats))

    da = open_dataset(self.nc_file).tasmax
    out = subset.subset_bbox(da, lon_bnds=self.lon, lat_bnds=self.lat)
    _check_bounds(out, self.lon, self.lat)

    da = open_dataset(self.nc_poslons).tas
    da = da.assign_coords(lon=(da.lon - 360))
    yr_st = 2050
    yr_ed = 2059

    # Both start and end dates given: time axis clipped at both ends.
    out = subset.subset_bbox(
        da,
        lon_bnds=self.lonGCM,
        lat_bnds=self.latGCM,
        start_date=str(yr_st),
        end_date=str(yr_ed),
    )
    _check_bounds(out, self.lonGCM, self.latGCM)
    np.testing.assert_array_equal(out.time.dt.year.max(), yr_ed)
    np.testing.assert_array_equal(out.time.dt.year.min(), yr_st)

    # Only a start date: the series runs to the end of the data.
    out = subset.subset_bbox(
        da, lon_bnds=self.lon, lat_bnds=self.lat, start_date=str(yr_st)
    )
    _check_bounds(out, self.lon, self.lat)
    np.testing.assert_array_equal(out.time.dt.year.max(), da.time.dt.year.max())
    np.testing.assert_array_equal(out.time.dt.year.min(), yr_st)

    # Only an end date: the series starts at the beginning of the data.
    out = subset.subset_bbox(
        da, lon_bnds=self.lon, lat_bnds=self.lat, end_date=str(yr_ed)
    )
    _check_bounds(out, self.lon, self.lat)
    np.testing.assert_array_equal(out.time.dt.year.max(), yr_ed)
    np.testing.assert_array_equal(out.time.dt.year.min(), da.time.dt.year.min())
def get_subsetted_forecast(region_coll, ds, times, is_caspar): """ This function takes a dataset, a region and the time sampling array and returns the subsetted values for the given region and times Parameters ---------- region_coll : fiona.collection.Collection The region vectors. ds : xarray.Dataset The dataset containing the raw, worldwide forecast data times: dt.datetime The array of times required to do the forecast. is_caspar: boolean True if the data comes from Caspar, false otherwise. Used to define lat/lon on rotated grid. Returns ------- forecast : xararray.Dataset The forecast dataset. """ # Extract the bounding box to subset the entire forecast grid to something # more manageable lon_min = region_coll.bounds[0] lon_max = region_coll.bounds[2] lat_min = region_coll.bounds[1] lat_max = region_coll.bounds[3] # Subset the data to the desired location (bounding box) and times ds = subset.subset_bbox(ds, lon_bnds=[lon_min, lon_max], lat_bnds=[lat_min, lat_max]).sel(time=times) # Rioxarray requires CRS definitions for variables # Get CRS, e.g. 4326 crs = int(re.match(r"epsg:(\d+)", region_coll.crs["init"]).group(1)) # Here the name of the variable could differ based on the Caspar file processing tas = ds.tas.rio.write_crs(crs) pr = ds.pr.rio.write_crs(crs) ds = xr.merge([tas, pr]) # Now apply the mask of the basin contour and average the values to get a single time series if is_caspar: ds.rio.set_spatial_dims("rlon", "rlat") ds["rlon"] = ds["rlon"] - 360 # clip the netcdf and average across space. shdf = [next(iter(region_coll))["geometry"]] forecast = ds.rio.clip(shdf, crs=crs) forecast = forecast.mean(dim={"rlat", "rlon"}, keep_attrs=True) else: ds.rio.set_spatial_dims("lon", "lat") ds["lon"] = ds["lon"] - 360 # clip the netcdf and average across space. shdf = [next(iter(region_coll))["geometry"]] forecast = ds.rio.clip(shdf, crs=crs) forecast = forecast.mean(dim={"lat", "lon"}, keep_attrs=True) return forecast