def process_seas5():
    """Preprocess raw SEAS5 total_precipitation data, regridded to the ERA5 grid."""
    # if the working directory is already ml_drought we don't need ../data
    cwd_name = Path(".").absolute().as_posix().split("/")[-1]
    data_path = Path("data") if cwd_name == "ml_drought" else Path("../data")

    # reference grid: the preprocessed ERA5 monthly-means kenya file
    regrid_path = (
        data_path
        / "interim/reanalysis-era5-single-levels-monthly-means_preprocessed/data_kenya.nc"
    )
    assert regrid_path.exists(), f"{regrid_path} not available"

    # every raw dataset directory with "seasonal" in its name
    seasonal_datasets = [
        d.name for d in (data_path / "raw").iterdir() if "seasonal" in d.name
    ]
    for dataset_name in seasonal_datasets:
        for variable_dir in (data_path / "raw" / dataset_name).glob("*"):
            # only total_precipitation is preprocessed here
            if variable_dir.name != "total_precipitation":
                continue
            S5Preprocessor(data_path).preprocess(
                subset_str="kenya",
                regrid=regrid_path,
                resample_time=None,
                upsampling=False,
                variable=variable_dir.name,
            )
def test_initialisation(self, tmp_path):
    """S5Preprocessor construction should create its interim output directories."""
    data_dir = tmp_path / "data"
    if not data_dir.exists():
        data_dir.mkdir(exist_ok=True, parents=True)

    S5Preprocessor(data_dir)

    # the preprocessor is expected to build both interim sub-folders
    interim_dir = data_dir / "interim"
    assert (interim_dir / "s5_preprocessed").exists()
    assert (interim_dir / "s5_interim").exists()
def process_seas5():
    """Preprocess SEAS5 data, regridding to the preprocessed CHIRPS kenya grid."""
    data_path = get_data_path()

    # reference grid: the already-preprocessed CHIRPS kenya netcdf
    regrid_path = data_path / "interim/chirps_preprocessed/chirps_kenya.nc"
    assert regrid_path.exists(), f"{regrid_path} not available"

    S5Preprocessor(data_path).preprocess(
        subset_str="kenya",
        regrid=regrid_path,
        resample_time="M",
        upsampling=False,
    )
def preprocess_s5_ouce():
    """Preprocess daily SEAS5 precipitation read directly from the OUCE server."""
    # if the working directory is already ml_drought we don't need ../data
    in_project_root = Path(".").absolute().as_posix().split("/")[-1] == "ml_drought"
    data_path = Path("data") if in_project_root else Path("../data")

    # daily 1.0x1.0 SEAS5 files live on the shared OUCE filesystem
    daily_s5_dir = Path("/soge-home/data/model/seas5/1.0x1.0/daily")

    preprocessor = S5Preprocessor(data_path, ouce_server=True)
    preprocessor.preprocess(
        variable="total_precipitation",
        regrid=None,
        resample_time=None,
        ouce_dir=daily_s5_dir,
        infer=True,
    )
def test_find_grib_file(self, tmp_path):
    """The preprocessor should locate a raw .grib file under data/raw."""
    # create grib file to test if it can be found by s5 preprocessor
    _ = save_dummy_seas5(
        tmp_path,
        "2018-01-01",
        to_grib=True,
        dataset="seasonal-monthly-pressure-levels",
        variable="temperature",
    )
    out_dir = tmp_path / "data" / "raw" / "seasonal-monthly-pressure-levels"
    out_dir = out_dir / "temperature" / "2018"
    assert (out_dir / "01.grib").exists()

    processor = S5Preprocessor(tmp_path / "data")
    # check the preprocessor can find the grib file created
    fpaths = processor.get_filepaths(
        grib=True, target_folder=processor.raw_folder, variable="temperature"
    )
    # FIX: the two implicitly-concatenated message literals previously joined
    # without a space ("datasetat ...") and the first carried a pointless
    # f-prefix with no placeholders.
    assert fpaths[0].name == "01.grib", (
        "unable to find the created dataset "
        "at data/raw/s5/seasonal-monthly-pressure-levels"
    )
def test_preprocess(self, tmp_path):
    """End-to-end preprocess of dummy OUCE S5 data: subset, regrid, stack, cleanup."""
    out_dir = tmp_path / "data" / "raw" / "s5"
    out_dir = (
        out_dir / "seasonal-monthly-pressure-levels" / "2m_temperature" / str(2018)
    )
    if not out_dir.exists():
        out_dir.mkdir(exist_ok=True, parents=True)

    # preprocessor working with pretend ouce data (because writing to .grib is failing)
    ouce_dir = make_dummy_ouce_s5_data(tmp_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    # the reference dataset to regrid to
    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    # run the preprocessing
    processor = S5Preprocessor(tmp_path / "data", ouce_server=True)
    processor.preprocess(
        subset_str="kenya",
        regrid=regrid_path,
        variable="2m_temperature",
        cleanup=True,
        **dict(ouce_dir=ouce_dir.parents[2], infer=True),
    )

    # check preprocessed file exists
    # FIX: added the missing space between the concatenated message literals
    # (previously rendered "...netcdf fileat the preprocessed...")
    assert (
        processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
    ).exists(), (
        "Expecting to find the kenyan_subset netcdf file "
        "at the preprocessed / s5_preprocessed / s5_{variable}_{subset_str}.nc"
    )

    # open the data
    out_data = xr.open_dataset(
        processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
    )

    # check the subsetting happened properly
    expected_dims = [
        "lat",
        "lon",
        "initialisation_date",
        "forecast_horizon",
        "number",
    ]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    # check the lat/lon is the correct shape
    assert out_data.t2m.values.shape[-2:] == (20, 20)

    # test the stacking to select the forecast time
    # NOTE: this is how you select data from the S5 data for the `real time`
    out_data["valid_time"] = (
        out_data.initialisation_date + out_data.forecast_horizon
    )
    stacked = out_data.stack(time=("initialisation_date", "forecast_horizon"))
    assert stacked.time.shape == (10,), "should be a 1D vector"

    selected = stacked.swap_dims({"time": "valid_time"}).sel(valid_time="2008-03")
    # FIX: added the missing space in "...enter into the month..." (was "intothe")
    assert selected.time.size == 6, (
        "Should have only selected 6 timesteps"
        " for the month 2008-03. The calculation of valid_time is "
        "complicated but it should select the forecasts that enter into "
        "the month of interest."
    )

    # check the cleanup has worked
    # FIX: dropped the pointless f-prefix (message has no placeholders)
    assert (
        not processor.interim.exists()
    ), "Interim S5 folder should have been deleted"
# IPython/notebook session script — the `%` lines are IPython magics, so this
# file only runs inside an IPython shell or Jupyter, not as plain Python.
# Reload edited modules automatically while developing:
%load_ext autoreload
%autoreload 2

from src.preprocess import S5Preprocessor
from pathlib import Path

# assumes the shell's working directory is the project root — TODO confirm
data_path = Path('data')
p = S5Preprocessor(data_path)

# exploratory call kept for reference:
# p.get_filepaths(p.raw_folder, variable='total_precipitation')

# regrid target: the already-preprocessed CHIRPS kenya netcdf
regrid_ds = data_path / "interim" / "chirps_preprocessed" / "chirps_kenya.nc"
p.preprocess(variable='total_precipitation', regrid=regrid_ds)

# exploratory call kept for reference:
# p.merge_and_resample(
#     variable='tp'
# )