def test_regridder_save(self, tmp_path):
    size_reference = (10, 10)
    size_target = (100, 100)

    reference_ds, _, _ = _make_dataset(size_reference)
    target_ds, _, _ = _make_dataset(size_target)

    processor = BasePreProcessor(tmp_path)
    processor.regrid(target_ds, reference_ds)

    # xesmf names the weight file {method}_{input_shape}_{output_shape}.nc
    weight_filename = 'nearest_s2d_100x100_10x10.nc'
    assert (processor.preprocessed_folder / weight_filename).exists() is False, \
        'Regridder weight file not deleted!'
def test_incorrect_method(self, tmp_path):
    size_reference = (10, 10)
    size_target = (100, 100)

    reference_ds, _, _ = _make_dataset(size_reference)
    target_ds, _, _ = _make_dataset(size_target)

    processor = BasePreProcessor(tmp_path)
    with pytest.raises(AssertionError) as e:
        processor.regrid(target_ds, reference_ds, method='woops!')

    expected_message_contains = 'not an acceptable regridding method. Must be one of'
    assert expected_message_contains in str(e.value), \
        f'Expected {e.value} to contain {expected_message_contains}'
def test_regridding(self, tmp_path):
    size_reference = (10, 10)
    size_target = (20, 20)

    reference_ds, _, _ = _make_dataset(size_reference)
    target_ds, _, _ = _make_dataset(size_target)

    processor = BasePreProcessor(tmp_path)
    regridded_ds = processor.regrid(target_ds, reference_ds)

    # shape[1:] skips the leading time dimension
    assert regridded_ds.VHI.values.shape[1:] == size_reference, \
        f'Expected regridded Dataset to have shape {size_reference}, ' \
        f'got {regridded_ds.VHI.values.shape}'
def test_resampling(self):
    monthly_in, _, _ = _make_dataset(size=(10, 10))

    # input is already monthly, so resampling to 'M' should preserve length
    monthly = BasePreProcessor.resample_time(monthly_in, resample_length='M')
    assert len(monthly_in.time) == len(monthly.time)
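# ------------------------------------------------------------------------------
# resample_time presumably wraps xarray's resample; since the input above is
# already monthly, 'M' is length-preserving. A minimal sketch in pure xarray
# (synthetic data, not the project helper) of what happens to daily data:
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xarray as xr

# 90 daily values collapse to 3 monthly means when resampled to 'M'
daily = xr.Dataset(
    {'VHI': (['time'], np.random.rand(90))},
    coords={'time': pd.date_range('2000-01-01', periods=90, freq='D')},
)
monthly = daily.resample(time='M').mean()
assert len(monthly.time) == 3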
def test_load_regridder(self, tmp_path):
    test_dataset, _, _ = _make_dataset(size=(10, 10))
    test_dataset.to_netcdf(tmp_path / 'regridder.nc')

    output = BasePreProcessor.load_reference_grid(tmp_path / 'regridder.nc')
    assert set(output.variables) == {'lat', 'lon'}, \
        f'Got extra variables: {output.variables}'
def test_chop_roi(self, tmp_path):
    size_original = (80, 80)
    original_ds, _, _ = _make_dataset(size_original)
    original_shape = original_ds.VHI.shape

    processor = BasePreProcessor(tmp_path)
    subset_str = 'east_africa'
    new_ds = processor.chop_roi(ds=original_ds, subset_str=subset_str)
    output_shape = new_ds.VHI.shape

    assert original_shape != output_shape, \
        f"chop_roi should produce a smaller dataset than the original. " \
        f"Expected output_shape: {output_shape} to be different from " \
        f"original_shape: {original_shape}"
    assert (new_ds.lat.values.min() >= -11) & (new_ds.lat.values.max() <= 23), \
        f"Expected latitude to be in the range -11 : 23. Currently: " \
        f"{new_ds.lat.values.min()} : {new_ds.lat.values.max()}"
    assert (new_ds.lon.values.min() >= 21) & (new_ds.lon.values.max() <= 51.8), \
        f"Expected longitude to be in the range 21 : 51.8. Currently: " \
        f"{new_ds.lon.values.min()} : {new_ds.lon.values.max()}"
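# ------------------------------------------------------------------------------
# The tests above all build synthetic inputs with a `_make_dataset` helper
# defined elsewhere in the test suite. A minimal sketch of what it is assumed
# to return -- the coordinate ranges, time span and random data here are
# illustrative, not the project's actual defaults:
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xarray as xr

def _make_dataset(size, lonmin=-180.0, lonmax=180.0, latmin=-55.0, latmax=75.0):
    # build a (time, lat, lon) Dataset with a random VHI variable and
    # return it together with its lon/lat bounds
    lat_len, lon_len = size
    latitudes = np.linspace(latmin, latmax, lat_len)
    longitudes = np.linspace(lonmin, lonmax, lon_len)
    times = pd.date_range('2000-01-31', '2001-12-31', freq='M')

    vhi = np.random.randn(len(times), lat_len, lon_len)
    ds = xr.Dataset(
        {'VHI': (['time', 'lat', 'lon'], vhi)},
        coords={'time': times, 'lat': latitudes, 'lon': longitudes},
    )
    return ds, (lonmin, lonmax), (latmin, latmax)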
import numpy as np
import pandas as pd
import xarray as xr

# ds_new, ds1 and ds2 are assumed to be loaded earlier in the session

# create forecast_horizon
fh = pd.to_timedelta(ds_new.time.values - ds_new.initialisation_date.values)
ds_new["time"] = fh
ds_new = ds_new.rename({"time": "forecast_horizon"})

# create a new coord: the valid time of each forecast step
valid_time = ds_new.initialisation_date + ds_new.forecast_horizon
ds_new = ds_new.assign_coords(valid_time=valid_time)

# ------------------------------------------------------------------------------
# Test PREPROCESSING
# ------------------------------------------------------------------------------
from src.preprocess.base import BasePreProcessor

b = BasePreProcessor()
ds1_kenya = b.chop_roi(ds1, inverse_lat=True)
ds2_kenya = b.chop_roi(ds2, inverse_lat=True)

# concat across initialisation dates
ds_kenya = xr.concat([ds1_kenya, ds2_kenya], dim="initialisation_date")
stacked = ds_kenya.stack(time=("initialisation_date", "forecast_horizon"))

# stack each individually
k1 = ds1_kenya.stack(time=("initialisation_date", "forecast_horizon"))
k2 = ds2_kenya.stack(time=("initialisation_date", "forecast_horizon"))

# test selectors
stacked.sel(forecast_horizon=np.timedelta64(28, "D"))
stacked.sel(initialisation_date="1997-01-01")
stacked.swap_dims({"time": "valid_time"}).sel(valid_time="1997-04")
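# ------------------------------------------------------------------------------
# A self-contained toy version of the initialisation_date -> forecast_horizon ->
# valid_time round-trip above, runnable without the forecast files. The variable
# name and dates are illustrative only:
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import xarray as xr

# one initialisation date with 28 daily forecast steps
times = pd.date_range("1997-01-01", periods=28, freq="D")
toy = xr.Dataset(
    {"precip": (["time"], np.random.rand(len(times)))},
    coords={"time": times, "initialisation_date": np.datetime64("1997-01-01")},
)

# time -> forecast_horizon (timedelta since initialisation)
toy["time"] = pd.to_timedelta(toy.time.values - toy.initialisation_date.values)
toy = toy.rename({"time": "forecast_horizon"})

# recover the valid time of each step and select on the horizon
toy = toy.assign_coords(valid_time=toy.initialisation_date + toy.forecast_horizon)
print(toy.sel(forecast_horizon=np.timedelta64(27, "D")))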
from pathlib import Path

import xarray as xr

from src.engineer import Engineer
from src.preprocess.base import BasePreProcessor

data_path = Path("/Volumes/Lees_Extend/data/ecmwf_sowc/data")
engineer = Engineer(data_path)
# engineer.engineer(test_year=1994, target_variable='VHI')  # wrong shapes!

datasets = engineer._get_preprocessed_files()
ds_list = [xr.open_dataset(ds) for ds in datasets]
dims_list = [[dim for dim in ds.dims] for ds in ds_list]
variable_list = [
    [var for var in ds.variables if var not in dims_list[i]][0]
    for i, ds in enumerate(ds_list)
]
da_list = [ds[variable_list[i]] for i, ds in enumerate(ds_list)]

pp = BasePreProcessor(data_path)

c_ds = ds_list[0]
e_ds = ds_list[1]
v_ds = ds_list[2]

v_ds = pp.resample_time(v_ds)
c_ds = pp.regrid(c_ds, v_ds)
c_ds = pp.resample_time(c_ds)

# vhi_path / chirps_path are assumed to be defined earlier in the session
v_ds.to_netcdf(vhi_path.parent / "vhi_kenya_regrid.nc")
c_ds.to_netcdf(chirps_path.parent / "chirps_kenya_regrid.nc")

# engineer process
engineer._get_preprocessed_files()
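# The per-file variable extraction above can be written more directly with
# xarray's `data_vars` mapping (each preprocessed file holds a single data
# variable); a minimal equivalent of the two comprehensions:
variable_list = [list(ds.data_vars)[0] for ds in ds_list]
da_list = [ds[var] for ds, var in zip(ds_list, variable_list)]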
"""NOTE: https://github.com/esowc/ml_drought for the `src` code""" from pathlib import Path from src.preprocess.base import BasePreProcessor era5_dir = Path('/soge-home/data/analysis/era5/0.28125x0.28125/hourly/') winds = ['u_component_of_wind', 'v_component_of_wind'] wind_component = ['u', 'v'] base_our_dir = Path('/soge-home/projects/crop_yield/hackathon/') processor = BasePreProcessor(Path('/soge-home/users/chri4118'))) for wind, component in zip(winds, wind_component): out_dir = base_our_dir / wind + '_surface' if not out_dir.exists(): out_dir.mkdir(exist_ok=True, parents=True) ds = xr.open_mfdataset(base_our_dir / wind / '*.nc') ds = ds.isel(level=0) ds.to_netcdf(out_dir / 'data_africa.nc')
import xarray as xr
import sys

sys.path.append("../..")

from scripts.utils import get_data_path
from src.preprocess.base import BasePreProcessor

if __name__ == "__main__":
    data_dir = get_data_path()

    vci = xr.open_dataset(data_dir / "interim/VCI_preprocessed/data_india.nc")
    regrid_ds = xr.open_dataset(
        data_dir / "interim/reanalysis-era5-land_preprocessed/data_india.nc"
    )

    print("** Begin Regridding **")
    processor = BasePreProcessor(data_dir)
    vci = processor.regrid(ds=vci, reference_ds=regrid_ds)

    print("** Saving file **")
    vci.to_netcdf(data_dir / "interim/VCI_preprocessed/regrid_data_india.nc")