Example #1
    @staticmethod
    def _make_boku_ndvi_dataset(
        size,
        lonmin=-180.0,
        lonmax=180.0,
        latmin=-55.152,
        latmax=75.024,
        kenya_only=False,
    ):
        lat_len, lon_len = size
        if kenya_only:
            kenya = get_kenya()
            latmin = kenya.latmin
            latmax = kenya.latmax
            lonmin = kenya.lonmin
            lonmax = kenya.lonmax

        # create the vector
        longitudes = np.linspace(lonmin, lonmax, lon_len)
        latitudes = np.linspace(latmin, latmax, lat_len)

        dims = ["lat", "lon"]
        coords = {"lat": latitudes, "lon": longitudes}

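        # valid BOKU NDVI values run 1-251; 255 is assumed here to be the
        # missing-data flag, following the MODIS convention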
        modis_vals = np.append(np.arange(1, 252), 255)
        data = np.random.choice(modis_vals, size=size)

        return xr.Dataset({"boku_ndvi": (dims, data)}, coords=coords)

    def test_preprocess(self, tmp_path):

        (tmp_path / "raw/boku_ndvi_1000").mkdir(parents=True)

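        # raw filenames encode the timestep as tYYYYPP: the year followed by
        # the index of the 10-day compositing period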
        RAW_FILES = [
            "MCD13A2.t200915.006.EAv1.1_km_10_days_NDVI.O1.nc",
            "MCD13A2.t201107.006.EAv1.1_km_10_days_NDVI.O1.nc",
            "MCD13A2.t201330.006.EAv1.1_km_10_days_NDVI.O1.nc",
            "MCD13A2.t201733.006.EAv1.1_km_10_days_NDVI.O1.nc",
        ]

        for raw_file in RAW_FILES:
            data_path = tmp_path / f"raw/boku_ndvi_1000/{raw_file}"
            dataset = self._make_boku_ndvi_dataset(size=(100, 100),
                                                   kenya_only=True)
            dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor = BokuNDVIPreprocessor(tmp_path)
        processor.preprocess(subset_str="kenya",
                             regrid=regrid_path,
                             cleanup=True)

        expected_out_path = (
            tmp_path / "interim/boku_ndvi_1000_preprocessed/data_kenya.nc"
        )
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax), "Latitudes not correctly subset"

        assert out_data["boku_ndvi"].values.shape[1:] == (20, 20)

        assert (
            not processor.interim.exists()
        ), "Interim boku_ndvi folder should have been deleted"
Example #3
    def test_preprocess(self, tmp_path, cleanup):

        (tmp_path / 'raw/esa_cci_landcover').mkdir(parents=True)
        data_path = tmp_path / 'raw/esa_cci_landcover/1992-v2.0.7b_testy_test.nc'
        dataset = self._make_ESA_CCI_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        legend_path = tmp_path / 'raw/esa_cci_landcover/legend.csv'
        self._make_ESA_CCI_legend().to_csv(legend_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(size=(20, 20),
                                             latmin=kenya.latmin,
                                             latmax=kenya.latmax,
                                             lonmin=kenya.lonmin,
                                             lonmax=kenya.lonmax)

        regrid_path = tmp_path / 'regridder.nc'
        regrid_dataset.to_netcdf(regrid_path)

        processor = ESACCIPreprocessor(tmp_path)
        processor.preprocess(subset_str='kenya',
                             regrid=regrid_path,
                             cleanup=cleanup)

        expected_out_path = (tmp_path /
                             'interim/static/esa_cci_landcover_interim'
                             '/1992_1992-v2.0.7b_testy_test_kenya.nc')
        if not cleanup:
            assert expected_out_path.exists(), \
                f'Expected processed file to be saved to {expected_out_path}'

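        # the land-cover classes defined in legend.csv are one-hot encoded in
        # the processed output (hence the `_one_hot` suffix)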
        expected_out_processed = (tmp_path /
                                  'interim/static/esa_cci_landcover_'
                                  'preprocessed' /
                                  'esa_cci_landcover_kenya_one_hot.nc')
        assert expected_out_processed.exists(), \
            'expected the processed output file to exist'

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_processed)
        expected_dims = ['lat', 'lon']
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(out_data.dims), \
                f'Expected {dim} to be in the processed dataset dims'

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
            'Longitudes not correctly subset'

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
            'Latitudes not correctly subset'

        if cleanup:
            assert not processor.interim.exists(), \
                'Interim esa_cci_landcover folder should have been deleted'
Example #4
    def test_preprocess(self, tmp_path):

        (tmp_path / "raw/reanalysis-era5-land/"
         "2m_temperature/1979_2019").mkdir(parents=True)
        data_path = (tmp_path / "raw/reanalysis-era5-land/"
                     "2m_temperature/1979_2019/01_12.nc")
        dataset = self._make_era5_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor = ERA5LandPreprocessor(tmp_path)
        processor.preprocess(
            subset_str="kenya",
            regrid=regrid_path,
            parallel_processes=1,
            variable="2m_temperature",
        )

        expected_out_path = (
            tmp_path
            / "interim/reanalysis-era5-land_preprocessed/reanalysis-era5-land_kenya.nc"
        )
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax), "Latitudes not correctly subset"

        assert out_data.t2m.values.shape[1:] == (20, 20)

        assert (
            not processor.interim.exists()
        ), "Interim era5 folder should have been deleted"
Example #5
    def test_preprocess(self, tmp_path):

        (tmp_path / "raw/era5POS/global").mkdir(parents=True)
        data_path = tmp_path / "raw/era5POS/global/testy_test.nc"
        dataset = self._make_era5POS_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor = PlanetOSPreprocessor(tmp_path)
        processor.preprocess(subset_str="kenya", regrid=regrid_path, parallel=False)

        expected_out_path = tmp_path / "interim/era5POS_preprocessed/data_kenya.nc"
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        assert out_data.VHI.values.shape[1:] == (20, 20)
        assert out_data.precip.values.shape[1:] == (20, 20)

        assert (
            not processor.interim.exists()
        ), "Interim era5 folder should have been deleted"
Example #6
    def test_preprocess(self, tmp_path):

        (tmp_path / "raw/gleam/monthly").mkdir(parents=True)
        data_path = tmp_path / "raw/gleam/monthly/testy_test.nc"
        dataset = self._make_gleam_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor = GLEAMPreprocessor(tmp_path)
        processor.preprocess(subset_str="kenya", regrid=regrid_path)

        expected_out_path = tmp_path / "interim/gleam_preprocessed/data_kenya.nc"
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        assert set(out_data.data_vars) == {"E"}, "Got unexpected variables!"

        assert (
            not processor.interim.exists()
        ), "Interim gleam folder should have been deleted"

    def test_preprocess(self, tmp_path):

        (tmp_path / 'raw/reanalysis-era5-single-levels-monthly-means/'
                    '2m_temperature/1979_2019').mkdir(parents=True)
        data_path = tmp_path / 'raw/reanalysis-era5-single-levels-monthly-means/' \
                               '2m_temperature/1979_2019/01_12.nc'
        dataset = self._make_era5_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(size=(20, 20),
                                             latmin=kenya.latmin, latmax=kenya.latmax,
                                             lonmin=kenya.lonmin, lonmax=kenya.lonmax)

        regrid_path = tmp_path / 'regridder.nc'
        regrid_dataset.to_netcdf(regrid_path)

        processor = ERA5MonthlyMeanPreprocessor(tmp_path)
        processor.preprocess(subset_str='kenya', regrid=regrid_path,
                             parallel=False)

        expected_out_path = tmp_path / 'interim/reanalysis-era5-single-levels-monthly-' \
                                       'means_preprocessed/data_kenya.nc'
        assert expected_out_path.exists(), \
            f'Expected processed file to be saved to {expected_out_path}'

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ['lat', 'lon', 'time']
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(out_data.dims), \
                f'Expected {dim} to be in the processed dataset dims'

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
            'Longitudes not correctly subset'

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
            'Latitudes not correctly subset'

        assert out_data.t2m.values.shape[1:] == (20, 20)

        assert not processor.interim.exists(), \
            'Interim era5 folder should have been deleted'
Example #8
    def test_preprocess(self, tmp_path):

        (tmp_path / 'raw/gleam/monthly').mkdir(parents=True)
        data_path = tmp_path / 'raw/gleam/monthly/testy_test.nc'
        dataset = self._make_gleam_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(size=(20, 20),
                                             latmin=kenya.latmin,
                                             latmax=kenya.latmax,
                                             lonmin=kenya.lonmin,
                                             lonmax=kenya.lonmax)

        regrid_path = tmp_path / 'regridder.nc'
        regrid_dataset.to_netcdf(regrid_path)

        processor = GLEAMPreprocessor(tmp_path)
        processor.preprocess(subset_str='kenya', regrid=regrid_path)

        expected_out_path = tmp_path / 'interim/gleam_preprocessed/data_kenya.nc'
        assert expected_out_path.exists(), \
            f'Expected processed file to be saved to {expected_out_path}'

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ['lat', 'lon', 'time']
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(out_data.dims), \
                f'Expected {dim} to be in the processed dataset dims'

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
            'Longitudes not correctly subset'

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
            'Latitudes not correctly subset'

        assert set(out_data.data_vars) == {'E'}, 'Got unexpected variables!'

        assert not processor.interim.exists(), \
            'Interim gleam folder should have been deleted'

    def test_preprocess(self, tmp_path):

        (tmp_path / 'raw/chirps/global').mkdir(parents=True)
        data_path = tmp_path / 'raw/chirps/global/testy_test.nc'
        dataset = self._make_chirps_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(size=(20, 20),
                                             latmin=kenya.latmin, latmax=kenya.latmax,
                                             lonmin=kenya.lonmin, lonmax=kenya.lonmax)

        regrid_path = tmp_path / 'regridder.nc'
        regrid_dataset.to_netcdf(regrid_path)

        processor = CHIRPSPreprocesser(tmp_path)
        processor.preprocess(subset_str='kenya', regrid=regrid_path,
                             parallel=False)

        expected_out_path = tmp_path / 'interim/chirps_preprocessed/data_kenya.nc'
        assert expected_out_path.exists(), \
            f'Expected processed file to be saved to {expected_out_path}'

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ['lat', 'lon', 'time']
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(out_data.dims), \
                f'Expected {dim} to be in the processed dataset dims'

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
            'Longitudes not correctly subset'

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
            'Latitudes not correctly subset'

        assert out_data.VHI.values.shape[1:] == (20, 20)

        assert not processor.interim.exists(), \
            'Interim chirps folder should have been deleted'
Example #10
    def test_preprocess(self, tmp_path):
        out_dir = tmp_path / "data" / "raw" / "s5"
        out_dir = (
            out_dir / "seasonal-monthly-pressure-levels" / "2m_temperature" / str(2018)
        )
        out_dir.mkdir(exist_ok=True, parents=True)

        # the preprocessor works from pretend OUCE data (writing test data to .grib fails)
        ouce_dir = make_dummy_ouce_s5_data(tmp_path)
        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        # the reference dataset to regrid to
        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        # run the preprocessing
        processor = S5Preprocessor(tmp_path / "data", ouce_server=True)

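        # ouce_dir.parents[2] walks back up to the top of the dummy OUCE
        # directory tree created above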
        processor.preprocess(
            subset_str="kenya",
            regrid=regrid_path,
            variable="2m_temperature",
            cleanup=True,
            ouce_dir=ouce_dir.parents[2],
            infer=True,
        )

        # check preprocessed file exists
        assert (
            processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
        ).exists(), (
            "Expecting to find the Kenya-subset netcdf file at"
            " preprocessed / s5_preprocessed / s5_{variable}_{subset_str}.nc"
        )

        # open the data
        out_data = xr.open_dataset(
            processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
        )

        # check the subsetting happened properly
        expected_dims = [
            "lat",
            "lon",
            "initialisation_date",
            "forecast_horizon",
            "number",
        ]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        # check the lat/lon is the correct shape
        assert out_data.t2m.values.shape[-2:] == (20, 20)

        # test the stacking to select the forecast time
        # NOTE: this is how you select S5 forecasts valid at a given `real time`
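        # valid_time is the date each forecast step refers to: the
        # initialisation date plus the forecast lead time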
        out_data["valid_time"] = (
            out_data.initialisation_date + out_data.forecast_horizon
        )
        stacked = out_data.stack(time=("initialisation_date", "forecast_horizon"))
        assert stacked.time.shape == (10,), "should be a 1D vector"
        selected = stacked.swap_dims({"time": "valid_time"}).sel(valid_time="2008-03")

        assert selected.time.size == 6, (
            "Should have only selected 6 timesteps"
            " for the month 2008-03. The calculation of valid_time is"
            " complicated, but it should select the forecasts that fall within"
            " the month of interest."
        )

        # check the cleanup has worked
        assert (
            not processor.interim.exists()
        ), "Interim S5 folder should have been deleted"
Example #11
    def test_preprocess(self, tmp_path, cleanup):

        (tmp_path / "raw/esa_cci_landcover").mkdir(parents=True)
        data_path = tmp_path / "raw/esa_cci_landcover/1992-v2.0.7b_testy_test.nc"
        dataset = self._make_ESA_CCI_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        legend_path = tmp_path / "raw/esa_cci_landcover/legend.csv"
        self._make_ESA_CCI_legend().to_csv(legend_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor = ESACCIPreprocessor(tmp_path)
        processor.preprocess(subset_str="kenya",
                             regrid=regrid_path,
                             cleanup=cleanup)

        expected_out_path = (tmp_path /
                             "interim/static/esa_cci_landcover_interim"
                             "/1992_1992-v2.0.7b_testy_test_kenya.nc")
        if not cleanup:
            assert (
                expected_out_path.exists()
            ), f"Expected processed file to be saved to {expected_out_path}"

        expected_out_processed = (tmp_path /
                                  "interim/static/esa_cci_landcover_"
                                  "preprocessed" /
                                  "esa_cci_landcover_kenya_one_hot.nc")
        assert expected_out_processed.exists(), "expected the processed output file to exist"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_processed)
        expected_dims = ["lat", "lon"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax), "Latitudes not correctly subset"

        if cleanup:
            assert (
                not processor.interim.exists()
            ), "Interim esa_cci_landcover folder should have been deleted"
# NOTE: `era` is assumed to be an EventDetector over the preprocessed ERA5
# data; the original snippet begins mid-call here
era.detect(
    variable='tp', time_period='dayofyear', hilo='low', method='std'
)

e_runs = era.calculate_runs()

chirp = EventDetector(chirps_dir)
chirp.detect(
    variable='precip', time_period='month', hilo='low', method='std'
)

c_runs = chirp.calculate_runs()

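# keep only the pixels that are valid (unmasked) in the underlying precip data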
mask = get_ds_mask(chirp.ds.precip)
c_runs = c_runs.where(~mask)

kenya = get_kenya()

# ------------------------------------------------------------------------------
# Plotting
# ------------------------------------------------------------------------------
c = chirp

fig, ax = plt.subplots()
c.clim.mean(dim=['lat', 'lon']).precip.plot(ax=ax)
c.thresh.mean(dim=['lat', 'lon']).precip.plot(ax=ax)
ax.set_title('Threshold & Climatology Values for Precip (monthly) [mm day-1]')


fig, ax = plot_geog_location(kenya, lakes=False, borders=True, rivers=True, scale=0.8)
c_runs.mean(dim='time').plot(ax=ax)
ax.set_title('Mean Run Length (Consecutive Months with -1 STD)')
%load_ext autoreload
%autoreload 2

data_dir = Path('data')

# Initialise the ERA5 exporter
e = ERA5Exporter(data_dir)

# valid SEAS5 dataset names
valid_datasets = [
    'seasonal-original-single-levels', 'seasonal-original-pressure-levels',
    'seasonal-monthly-single-levels', 'seasonal-monthly-pressure-levels',
]

kenya_region = get_kenya()

variable = 'total_precipitation'
dataset = 'seasonal-original-single-levels'
area = e.create_area(kenya_region)


# times
years = list(range(2017, 2019))
months = list(range(1, 13))

# leadtime_hour (1 - 215 days)
leadtime_hours = [days * 24 for days in range(1, 20)]
all_leadtimes = [days * 24 for days in range(1, 216)]
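
# NOTE: a minimal sketch of how these settings might be passed to the
# exporter; `export` and the `selection_request` keys below are assumptions
# about the ERA5Exporter / CDS API rather than confirmed usage
e.export(
    variable=variable,
    dataset=dataset,
    selection_request={
        'year': years,
        'month': months,
        'leadtime_hour': leadtime_hours,
        'area': area,
    },
)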

Example #14
    def test_preprocess(self, tmp_path, granularity):
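        # the same checks run for both the monthly-means and hourly ERA5 datasets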
        if granularity == "monthly":
            basename = "reanalysis-era5-single-levels-monthly-means"
            processor = ERA5MonthlyMeanPreprocessor(tmp_path)
        elif granularity == "hourly":
            basename = "reanalysis-era5-single-levels"
            processor = ERA5HourlyPreprocessor(tmp_path)

        (tmp_path / f"raw/{basename}/2m_temperature/1979_2019").mkdir(parents=True)
        data_path = tmp_path / f"raw/{basename}/2m_temperature/1979_2019/01_12.nc"
        if granularity == "hourly":
            dataset = self._make_era5_dataset(size=(100, 100), monthly=False)
        else:
            dataset = self._make_era5_dataset(size=(100, 100), monthly=True)
        dataset.to_netcdf(path=data_path)

        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor.preprocess(subset_str="kenya", regrid=regrid_path, parallel=False)

        expected_out_path = (
            tmp_path / f"interim/{basename}_preprocessed/data_kenya.nc"
        )
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        assert out_data.t2m.values.shape[1:] == (20, 20)

        assert (
            not processor.interim.exists()
        ), "Interim era5 folder should have been deleted"