Example #1
def process_seas5():
    # if the working directory is already ml_drought we don't need ../data
    if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")
    regrid_path = (
        data_path /
        "interim/reanalysis-era5-single-levels-monthly-means_preprocessed/data_kenya.nc"
    )
    assert regrid_path.exists(), f"{regrid_path} not available"

    datasets = [
        d.name for d in (data_path / "raw").iterdir() if "seasonal" in d.name
    ]
    for dataset in datasets:
        variables = [v.name for v in (data_path / "raw" / dataset).glob("*")]

        for variable in variables:
            if variable == "total_precipitation":
                processor = S5Preprocessor(data_path)
                processor.preprocess(
                    subset_str="kenya",
                    regrid=regrid_path,
                    resample_time=None,
                    upsampling=False,
                    variable=variable,
                )
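
Note: the loop assumes the raw download layout data/raw/&lt;dataset&gt;/&lt;variable&gt;/&lt;year&gt;/&lt;month&gt;.grib, which matches the dummy files created in Example #5 below.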
Example #2
    def test_initialisation(self, tmp_path):
        data_dir = tmp_path / "data"
        data_dir.mkdir(exist_ok=True, parents=True)

        S5Preprocessor(data_dir)
        assert (data_dir / "interim" / "s5_preprocessed").exists()
        assert (data_dir / "interim" / "s5_interim").exists()
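
The assertions above pin down the side effect of construction: S5Preprocessor is expected to create its interim working directories on initialisation. A minimal sketch of that idiom, assuming the real constructor in src.preprocess does something equivalent:

from pathlib import Path

def _make_s5_dirs(data_dir: Path) -> None:
    # build both interim working directories; parents=True creates
    # "interim" itself, exist_ok=True keeps the call idempotent
    (data_dir / "interim" / "s5_preprocessed").mkdir(parents=True, exist_ok=True)
    (data_dir / "interim" / "s5_interim").mkdir(parents=True, exist_ok=True)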
Example #3
def process_seas5():
    data_path = get_data_path()

    regrid_path = data_path / "interim/chirps_preprocessed/chirps_kenya.nc"
    assert regrid_path.exists(), f"{regrid_path} not available"

    processor = S5Preprocessor(data_path)
    processor.preprocess(
        subset_str="kenya", regrid=regrid_path, resample_time="M", upsampling=False
    )
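
get_data_path() is a repository helper not shown here; a plausible sketch, consistent with the inline logic in Examples #1 and #4 (an assumption, the actual helper may differ):

from pathlib import Path

def get_data_path() -> Path:
    # if the working directory is already ml_drought we don't need ../data
    if Path(".").absolute().name == "ml_drought":
        return Path("data")
    return Path("../data")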
Example #4
def preprocess_s5_ouce():
    if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")
    variable = "total_precipitation"
    daily_s5_dir = Path("/soge-home/data/model/seas5/1.0x1.0/daily")
    s = S5Preprocessor(data_path, ouce_server=True)
    s.preprocess(
        variable=variable,
        regrid=None,
        resample_time=None,
        ouce_dir=daily_s5_dir,
        infer=True,
    )
Example #5
    def test_find_grib_file(self, tmp_path):
        # create grib file to test if it can be found by s5 preprocessor
        _ = save_dummy_seas5(
            tmp_path,
            "2018-01-01",
            to_grib=True,
            dataset="seasonal-monthly-pressure-levels",
            variable="temperature",
        )
        out_dir = tmp_path / "data" / "raw" / "seasonal-monthly-pressure-levels"
        out_dir = out_dir / "temperature" / "2018"
        assert (out_dir / "01.grib").exists()

        processor = S5Preprocessor(tmp_path / "data")

        # check the preprocessor can find the grib file created
        fpaths = processor.get_filepaths(
            grib=True, target_folder=processor.raw_folder, variable="temperature"
        )
        assert fpaths[0].name == "01.grib", (
            "unable to find the created dataset "
            "at data/raw/seasonal-monthly-pressure-levels"
        )
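
get_filepaths is only called here, not defined. A hypothetical stand-in showing the kind of recursive glob it would need to perform (the variable filtering is an assumption):

from pathlib import Path
from typing import List

def find_grib_files(raw_folder: Path, variable: str) -> List[Path]:
    # recursively collect .grib files whose path contains the variable
    # folder, e.g. raw/<dataset>/<variable>/<year>/<month>.grib
    return sorted(p for p in raw_folder.glob("**/*.grib") if variable in p.parts)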
Example #6
    def test_preprocess(self, tmp_path):
        out_dir = tmp_path / "data" / "raw" / "s5"
        out_dir = (
            out_dir / "seasonal-monthly-pressure-levels" / "2m_temperature" / str(2018)
        )
        out_dir.mkdir(exist_ok=True, parents=True)

        # run the preprocessor on dummy OUCE data (because writing to .grib is failing)
        ouce_dir = make_dummy_ouce_s5_data(tmp_path)
        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        # the reference dataset to regrid to
        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        # run the preprocessing
        processor = S5Preprocessor(tmp_path / "data", ouce_server=True)

        processor.preprocess(
            subset_str="kenya",
            regrid=regrid_path,
            variable="2m_temperature",
            cleanup=True,
            ouce_dir=ouce_dir.parents[2],
            infer=True,
        )

        # check preprocessed file exists
        assert (
            processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
        ).exists(), (
            "Expecting to find the kenyan_subset netcdf file"
            "at the preprocessed / s5_preprocessed / s5_{variable}_{subset_str}.nc"
        )

        # open the data
        out_data = xr.open_dataset(
            processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
        )

        # check the subsetting happened properly
        expected_dims = [
            "lat",
            "lon",
            "initialisation_date",
            "forecast_horizon",
            "number",
        ]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        # check the lat/lon is the correct shape
        assert out_data.t2m.values.shape[-2:] == (20, 20)

        # test the stacking to select the forecast time
        # NOTE: this is how you select S5 forecast data by its `real time` (valid time)
        out_data["valid_time"] = (
            out_data.initialisation_date + out_data.forecast_horizon
        )
        stacked = out_data.stack(time=("initialisation_date", "forecast_horizon"))
        assert stacked.time.shape == (10,), "should be a 1D vector"
        selected = stacked.swap_dims({"time": "valid_time"}).sel(valid_time="2008-03")

        assert selected.time.size == 6, (
            "Should have only selected 6 timesteps"
            " for the month 2008-03. The calculation of valid_time is "
            "complicated but it should select the forecasts that enter into"
            "the month of interest."
        )

        # check the cleanup has worked
        assert (
            not processor.interim.exists()
        ), f"Interim S5 folder should have been deleted"
Example #7
%load_ext autoreload
%autoreload 2

from src.preprocess import S5Preprocessor
from pathlib import Path

data_path = Path('data')
p = S5Preprocessor(data_path)

# p.get_filepaths(p.raw_folder, variable='total_precipitation')

regrid_ds = data_path / "interim" / "chirps_preprocessed" / "chirps_kenya.nc"
p.preprocess(variable='total_precipitation', regrid=regrid_ds)
# p.merge_and_resample(
#    variable='tp'
# )