def _create_dummy_true_preds_data(tmp_path):
    # save the preds
    parent_dir = tmp_path / 'models' / 'one_month_forecast' / 'ealstm'
    parent_dir.mkdir(exist_ok=True, parents=True)
    save_fnames = ['preds_2018_1.nc', 'preds_2018_2.nc', 'preds_2018_3.nc']
    times = ['2018-01-31', '2018-02-28', '2018-03-31']
    for fname, time in zip(save_fnames, times):
        ds, _, _ = _make_dataset((30, 30),
                                 variable_name='VHI',
                                 lonmin=30,
                                 lonmax=35,
                                 latmin=-2,
                                 latmax=2,
                                 start_date=time,
                                 end_date=time)
        ds.to_netcdf(parent_dir / fname)

    # save the TRUTH (test files)
    save_dnames = ['2018_1', '2018_2', '2018_3']
    parent_dir = tmp_path / 'features' / 'one_month_forecast' / 'test'
    parent_dir.mkdir(exist_ok=True, parents=True)
    for dname, time in zip(save_dnames, times):
        ds, _, _ = _make_dataset((30, 30),
                                 variable_name='VHI',
                                 lonmin=30,
                                 lonmax=35,
                                 latmin=-2,
                                 latmax=2,
                                 start_date=time,
                                 end_date=time)

        (parent_dir / dname).mkdir(exist_ok=True, parents=True)
        ds.to_netcdf(parent_dir / dname / 'y.nc')
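
These snippets come from a test suite and omit their imports. A sketch of what they assume (the project-specific names and module paths are assumptions inferred from usage, not confirmed by the source):

import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

# project-specific helpers and model classes, e.g. (paths assumed):
# from tests.utils import _make_dataset
# from src.models import LinearNetwork, RecurrentNetwork, EARecurrentNetwork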
def make_test_datasets(tmp_path, experiment="one_month_forecast"):
    x_pred, _, _ = _make_dataset(size=(5, 5))
    x_coeff, _, _ = _make_dataset(size=(5, 5), variable_name="precip")

    x = xr.merge([x_pred, x_coeff])
    y = x_pred.isel(time=[0])

    data_dir = tmp_path / experiment
    if not data_dir.exists():
        data_dir.mkdir(parents=True, exist_ok=True)

    # save netcdf
    x_file = data_dir / "x.nc"
    y_file = data_dir / "y.nc"
    if not x_file.exists():
        x.to_netcdf(x_file)
    if not y_file.exists():
        y.to_netcdf(y_file)

    # make normalising dictionary
    norm_dict = {}
    for var in x.data_vars:
        norm_dict[var] = {
            "mean": x[var].mean(dim=["lat", "lon"], skipna=True).values,
            "std": x[var].std(dim=["lat", "lon"], skipna=True).values,
        }

    # the computed dict was previously unused; persist it alongside the data
    # (assumed intent, mirroring the other helpers in this suite)
    with (data_dir / "normalizing_dict.pkl").open("wb") as f:
        pickle.dump(norm_dict, f)

    return data_dir
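
A minimal usage sketch, assuming pytest's tmp_path fixture and the signature above:

def test_make_test_datasets(tmp_path):
    data_dir = make_test_datasets(tmp_path, experiment="one_month_forecast")
    assert (data_dir / "x.nc").exists()
    assert (data_dir / "y.nc").exists()
    assert (data_dir / "normalizing_dict.pkl").exists()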
Example #3
    def test_predict(self, tmp_path, use_pred_months, predict_delta):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / "features/one_month_forecast/train/1980_1"
        train_features.mkdir(parents=True)

        test_features = tmp_path / "features/one_month_forecast/test/1980_1"
        test_features.mkdir(parents=True)

        norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
              ).open("wb") as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / "x.nc")
        y.to_netcdf(test_features / "y.nc")

        x.to_netcdf(train_features / "x.nc")
        y.to_netcdf(train_features / "y.nc")

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / "data.nc")

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path /
              f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        dense_features = [10]
        hidden_size = 128
        rnn_dropout = 0.25

        model = RecurrentNetwork(
            hidden_size=hidden_size,
            dense_features=dense_features,
            rnn_dropout=rnn_dropout,
            data_folder=tmp_path,
            predict_delta=predict_delta,
        )
        model.train()
        test_arrays_dict, pred_dict = model.predict()

        # the foldername "1980_1" is the only one which should be in the dictionaries
        assert ("1980_1" in test_arrays_dict.keys()) and (len(test_arrays_dict)
                                                          == 1)
        assert ("1980_1" in pred_dict.keys()) and (len(pred_dict) == 1)

        if not predict_delta:
            # _make_dataset with const=True returns all ones
            assert (test_arrays_dict["1980_1"]["y"] == 1).all()
Example #4
    def test_predict(self, tmp_path, use_pred_months):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / 'features/one_month_forecast/train/hello'
        train_features.mkdir(parents=True)

        test_features = tmp_path / 'features/one_month_forecast/test/hello'
        test_features.mkdir(parents=True)

        norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
        with (tmp_path / 'features/one_month_forecast/normalizing_dict.pkl'
              ).open('wb') as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / 'x.nc')
        y.to_netcdf(test_features / 'y.nc')

        x.to_netcdf(train_features / 'x.nc')
        y.to_netcdf(train_features / 'y.nc')

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / 'features/static'
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / 'data.nc')

        static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
        with (tmp_path /
              'features/static/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(static_norm_dict, f)

        dense_features = [10]
        hidden_size = 128
        rnn_dropout = 0.25

        model = EARecurrentNetwork(hidden_size=hidden_size,
                                   dense_features=dense_features,
                                   rnn_dropout=rnn_dropout,
                                   data_folder=tmp_path)
        model.train()
        test_arrays_dict, pred_dict = model.predict()

        # the foldername "hello" is the only one which should be in the dictionaries
        assert ('hello' in test_arrays_dict.keys()) and (len(test_arrays_dict)
                                                         == 1)
        assert ('hello' in pred_dict.keys()) and (len(pred_dict) == 1)

        # _make_dataset with const=True returns all ones
        assert (test_arrays_dict['hello']['y'] == 1).all()
Example #5
    def test_train(self, tmp_path, capsys, use_pred_months, predict_delta,
                   static):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        test_features = tmp_path / "features/one_month_forecast/train/1980_1"
        test_features.mkdir(parents=True)

        norm_dict = {"VHI": {"mean": 0, "std": 1}}
        with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
              ).open("wb") as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / "x.nc")
        y.to_netcdf(test_features / "y.nc")

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / "data.nc")

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path /
              f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        dense_features = [10]
        hidden_size = 128
        rnn_dropout = 0.25

        model = RecurrentNetwork(
            hidden_size=hidden_size,
            dense_features=dense_features,
            rnn_dropout=rnn_dropout,
            data_folder=tmp_path,
            include_monthly_aggs=True,
            predict_delta=predict_delta,
            static=static,
        )
        check_inversion = False
        model.train(check_inversion=check_inversion)

        captured = capsys.readouterr()
        expected_stdout = "Epoch 1, train smooth L1:"
        assert expected_stdout in captured.out

        assert type(model.model) == RNN, "Model attribute not an RNN!"
Example #6
    def _create_dummy_landcover_data(tmp_path):
        parent_dir = tmp_path / "interim" / "static" / "esa_cci_landcover_preprocessed"
        parent_dir.mkdir(exist_ok=True, parents=True)
        fname = "esa_cci_landcover_kenya_one_hot.nc"
        vars = [
            "Cropland, irrigated or post-flooding_one_hot",
            "Herbaceous cover_one_hot",
            "No data_one_hot",
            "Tree or shrub cover_one_hot",
        ]
        # create non-overlapping groups
        # https://stackoverflow.com/a/52356978/9940782
        groups = np.random.randint(0, 4, (30, 30))
        masks = (groups[..., None] == np.arange(4)[None, :]).T.astype(int)

        all_ds = []
        for group, var in enumerate(vars):
            ds, _, _ = _make_dataset(
                (30, 30),
                variable_name=var,
                lonmin=30,
                lonmax=35,
                latmin=-2,
                latmax=2,
                add_times=False,
                const=True,
            )
            # assign the values from the mask to the da.values
            ds[var].values = masks[group, :, :]
            all_ds.append(ds)

        ds = xr.merge([*all_ds])
        ds.to_netcdf(parent_dir / fname)
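
The broadcast comparison above (from the linked Stack Overflow answer) turns an integer group map into one binary mask per class. A quick standalone check of the shapes and the non-overlap property:

import numpy as np

groups = np.random.randint(0, 4, (30, 30))  # integer class label per pixel
masks = (groups[..., None] == np.arange(4)[None, :]).T.astype(int)

assert masks.shape == (4, 30, 30)        # one (30, 30) mask per class
assert (masks.sum(axis=0) == 1).all()    # every pixel belongs to exactly one class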
Example #7
    def test_dataset(self, tmp_path):
        target_variable = "target"
        input_variables = ["feature"]
        for path in [
            Path("tests/testconfigs/test_config_simulate.yml"),
            Path("tests/testconfigs/test_config.yml"),
        ]:
            cfg = Config(path)
            cfg._cfg["forecast_variables"] = cfg.input_variables

            create_and_assign_temp_run_path_to_config(cfg, tmp_path)
            raw_ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
            static = create_static(cfg=cfg, ds=raw_ds)
            ds = XarrayDataset(
                raw_ds, cfg=cfg, mode="train", DEBUG=True, static_data=static
            )

            assert ds.target == target_variable
            # parenthesise the conditional so the equality check applies in
            # both branches (the previous else-branch asserted a truthy list)
            assert ds.inputs == (
                input_variables + ["autoregressive"]
                if cfg.autoregressive
                else input_variables
            )

            x_features = (
                len(input_variables) + 1 if cfg.autoregressive else len(input_variables)
            )
            seq_length = cfg.seq_length
            for i in range(10):
                data = ds.__getitem__(i)
                x, y = data["x_d"], data["y"]

                assert y.shape == (1, 1)
                expected_x_shape = (
                    seq_length,
                    x_features + 2 if cfg.encode_doys else x_features,
                )
                assert x.shape == expected_x_shape, (
                    f"Shape Mismatch! Expect: {expected_x_shape} Got: {x.shape}"
                )

                meta = data["meta"]
                times = (
                    meta["target_time"]
                    .detach()
                    .numpy()
                    .astype("datetime64[ns]")
                    .flatten()
                )
                pixel, _ = ds.lookup_table[int(meta["index"])]
                latlon = tuple([float(l) for l in str(pixel).split("_")])

                y_unnorm = (
                    ds.normalizer.individual_inverse(y, pixel, variable="target")
                    .detach()
                    .numpy()
                )

                #  extract from the original xr.Dataset
                y_exp = raw_ds.sel(
                    lat=latlon[0], lon=latlon[1], time=times, method="nearest"
                )[cfg.target_variable].values
                assert np.isclose(y_unnorm.reshape(y_exp.shape), y_exp, atol=1e-5)
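
The final comparison relies on a per-pixel normalizer round-trip: y was normalized per pixel, and individual_inverse undoes that before checking against the raw xr.Dataset. The idea in toy form (values illustrative, not the project's API):

pixel_mean, pixel_std = 2.0, 0.5
y_raw = 3.0
y_norm = (y_raw - pixel_mean) / pixel_std
assert abs((y_norm * pixel_std + pixel_mean) - y_raw) < 1e-9  # inverse recovers the input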
Example #8
    def test_dataloader(self, tmp_path):
        ds = _make_dataset()
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        static = create_static(cfg=cfg, ds=ds)
        dl = PixelDataLoader(
            ds,
            cfg=cfg,
            num_workers=1,
            mode="train",
            batch_size=cfg.batch_size,
            static_data=static,
        )

        assert dl.batch_size == cfg.batch_size

        seq_length = cfg.seq_length
        autoregressive = cfg.autoregressive
        data = next(iter(dl))
        x, y = data["x_d"], data["y"]
        n_features = len(["feature"])  # one dynamic input variable
        n_inputs = n_features + 1 if autoregressive else n_features

        expected_shape = (
            cfg.batch_size,
            seq_length,
            n_inputs + 2 if cfg.encode_doys else n_inputs,
        )
        assert x.shape == expected_shape, (
            f"Size Mismatch! Expected: {expected_shape} Got: {x.shape}"
        )
Example #9
    def test_static_inputs(self, tmp_path):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        ds_static = ds.mean(dim="time")

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        # test body not yet implemented
        assert False
Example #10
def _create_dummy_precip_data(tmp_path):
    data_dir = tmp_path / "data" / "interim" / "chirps_preprocessed"
    if not data_dir.exists():
        data_dir.mkdir(parents=True, exist_ok=True)

    precip, _, _ = _make_dataset((30, 30), variable_name="precip")
    precip.to_netcdf(data_dir / "chirps_kenya.nc")

    return data_dir / "chirps_kenya.nc"
Example #11
    def test_forecast_inputs(self, tmp_path):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        ds_forecast = (
            ds.shift(time=1).rename({"feature": "feature_fcast1"}).drop("target")
        )
        ds = xr.merge([ds, ds_forecast])

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        # test body not yet implemented
        assert False
Example #12
    def test_train(self, tmp_path, capsys, use_pred_months):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        test_features = tmp_path / 'features/one_month_forecast/train/hello'
        test_features.mkdir(parents=True)

        norm_dict = {'VHI': {'mean': 0, 'std': 1}}
        with (tmp_path / 'features/one_month_forecast/normalizing_dict.pkl'
              ).open('wb') as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / 'x.nc')
        y.to_netcdf(test_features / 'y.nc')

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / 'features/static'
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / 'data.nc')

        static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
        with (tmp_path /
              'features/static/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(static_norm_dict, f)

        dense_features = [10]
        hidden_size = 128
        rnn_dropout = 0.25

        model = EARecurrentNetwork(hidden_size=hidden_size,
                                   dense_features=dense_features,
                                   rnn_dropout=rnn_dropout,
                                   data_folder=tmp_path)
        model.train()

        captured = capsys.readouterr()
        expected_stdout = 'Epoch 1, train smooth L1: 0.'
        assert expected_stdout in captured.out

        assert type(model.model) == EALSTM, \
            'Model attribute not an EALSTM!'
Example #13
def make_test_data(data_dir, experiment='one_month_forecast'):
    # create data (X, y)
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    y = x.isel(time=[-1])

    x_add1, _, _ = _make_dataset(size=(5, 5), const=True, variable_name='precip')
    x_add2, _, _ = _make_dataset(size=(5, 5), const=True, variable_name='temp')

    x = xr.merge([x, x_add1, x_add2])

    # calculate normalising dictionaries
    norm_dict = {'VHI': {'mean': 0, 'std': 1},
                 'precip': {'mean': 0, 'std': 1},
                 'temp': {'mean': 0, 'std': 1}}
    static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}

    # make the appropriate folders
    test_features = data_dir / f'features/{experiment}/train/hello'
    test_features.mkdir(parents=True, exist_ok=True)
    pred_features = data_dir / f'features/{experiment}/test/hello'
    pred_features.mkdir(parents=True, exist_ok=True)
    static_features = data_dir / 'features/static'
    static_features.mkdir(parents=True, exist_ok=True)

    # write the data out
    with (
        data_dir / f'features/{experiment}/normalizing_dict.pkl'
    ).open('wb') as f:
        pickle.dump(norm_dict, f)

    with (
        data_dir / 'features/static/normalizing_dict.pkl'
    ).open('wb') as f:
        pickle.dump(static_norm_dict, f)

    x.to_netcdf(test_features / 'x.nc')
    x.to_netcdf(pred_features / 'x.nc')
    y.to_netcdf(test_features / 'y.nc')
    y.to_netcdf(pred_features / 'y.nc')
    x_static.to_netcdf(static_features / 'data.nc')
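
For orientation, make_test_data leaves this layout under data_dir (paths read directly from the code above):

data_dir/
└── features/
    ├── {experiment}/              # 'one_month_forecast' by default
    │   ├── normalizing_dict.pkl
    │   ├── train/hello/           # x.nc, y.nc
    │   └── test/hello/            # x.nc, y.nc
    └── static/
        ├── normalizing_dict.pkl
        └── data.nc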
Example #14
    def test_get_background(self, tmp_path):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / "features/one_month_forecast/train/1980_1"
        train_features.mkdir(parents=True)

        x.to_netcdf(train_features / "x.nc")
        y.to_netcdf(train_features / "y.nc")

        norm_dict = {"VHI": {"mean": 0, "std": 1}}
        with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
              ).open("wb") as f:
            pickle.dump(norm_dict, f)

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / "data.nc")

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path /
              f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        model = LinearNetwork(
            data_folder=tmp_path,
            layer_sizes=[100],
            dropout=0.25,
            include_pred_month=True,
        )
        background = model._get_background(sample_size=3)
        assert (background[0].shape[0] == 3
                ), f"Got {background[0].shape[0]} samples back, expected 3"
        assert (background[1].shape[0] == 3
                ), f"Got {background[1].shape[0]} samples back, expected 3"
        assert (len(background[1].shape) == 2
                ), f"Expected 2 dimensions, got {len(background[1].shape)}"
Example #15
    def _create_dummy_true_preds_data(tmp_path):
        # save the preds
        parent_dir = tmp_path / "models" / "one_month_forecast" / "ealstm"
        parent_dir.mkdir(exist_ok=True, parents=True)
        save_fnames = ["preds_2018_1.nc", "preds_2018_2.nc", "preds_2018_3.nc"]
        times = ["2018-01-31", "2018-02-28", "2018-03-31"]
        for fname, time in zip(save_fnames, times):
            ds, _, _ = _make_dataset(
                (30, 30),
                variable_name="VHI",
                lonmin=30,
                lonmax=35,
                latmin=-2,
                latmax=2,
                start_date=time,
                end_date=time,
            )
            ds.to_netcdf(parent_dir / fname)

        # save the TRUTH (test files)
        save_dnames = ["2018_1", "2018_2", "2018_3"]
        parent_dir = tmp_path / "features" / "one_month_forecast" / "test"
        parent_dir.mkdir(exist_ok=True, parents=True)
        for dname, time in zip(save_dnames, times):
            ds, _, _ = _make_dataset(
                (30, 30),
                variable_name="VHI",
                lonmin=30,
                lonmax=35,
                latmin=-2,
                latmax=2,
                start_date=time,
                end_date=time,
            )

            (parent_dir / dname).mkdir(exist_ok=True, parents=True)
            ds.to_netcdf(parent_dir / dname / "y.nc")
Example #16
    def test_get_background(self, tmp_path):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / 'features/one_month_forecast/train/hello'
        train_features.mkdir(parents=True)

        x.to_netcdf(train_features / 'x.nc')
        y.to_netcdf(train_features / 'y.nc')

        norm_dict = {'VHI': {'mean': 0, 'std': 1}}
        with (tmp_path / 'features/one_month_forecast/normalizing_dict.pkl'
              ).open('wb') as f:
            pickle.dump(norm_dict, f)

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / 'features/static'
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / 'data.nc')

        static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
        with (tmp_path /
              'features/static/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(static_norm_dict, f)

        model = LinearNetwork(data_folder=tmp_path,
                              layer_sizes=[100],
                              dropout=0.25,
                              include_pred_month=True)
        background = model._get_background(sample_size=3)
        assert background[0].shape[0] == 3, \
            f'Got {background[0].shape[0]} samples back, expected 3'
        assert background[1].shape[0] == 3, \
            f'Got {background[1].shape[0]} samples back, expected 3'
        assert len(background[1].shape) == 2, \
            f'Expected 2 dimensions, got {len(background[1].shape)}'
Example #17
    def test_stack_xarray(self):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        stacked, sample = _stack_xarray(ds, spatial_coords=["lat", "lon"])

        #  check that stacking works
        unstacked = sample.unstack()
        pixel = unstacked.isel(
            lat=np.random.choice(len(unstacked["lat"].values)),
            lon=np.random.choice(len(unstacked["lon"].values)),
        )
        lat, lon = [float(ll) for ll in str(pixel.values).split("_")]

        assert np.allclose(
            [lat, lon], [float(pixel.lat.values), float(pixel.lon.values)]
        )
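
_stack_xarray itself is not shown here; from this test it appears to collapse the spatial dims into a single sample dimension keyed by "{lat}_{lon}" strings (hence the split("_") above). A rough xarray sketch of that idea — names and behaviour are assumptions, not the project's actual implementation:

def stack_spatial(ds, spatial_coords=("lat", "lon")):
    # collapse (lat, lon) into one "sample" dimension
    stacked = ds.stack(sample=spatial_coords)
    # encode each pixel as a "{lat}_{lon}" string id
    ids = [f"{lat}_{lon}" for lat, lon in stacked["sample"].values]
    return stacked.reset_index("sample").assign_coords(sample=("sample", ids))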
Example #18
    def _create_dummy_admin_boundaries_data(tmp_path, prefix: str):
        ds, _, _ = _make_dataset((30, 30), variable_name='VHI',
                                 lonmin=30, lonmax=35,
                                 latmin=-2, latmax=2,
                                 add_times=False)
        ds['VHI'] = ds['VHI'].astype(int)  # keep the cast (it was previously discarded)

        (tmp_path / 'analysis' / 'boundaries_preprocessed').mkdir(
            exist_ok=True, parents=True
        )
        ds.attrs['keys'] = ', '.join([str(i) for i in range(3)])
        ds.attrs['values'] = ', '.join([f'region_{i}' for i in np.arange(0, 3)])
        ds.to_netcdf(
            tmp_path / 'analysis' / 'boundaries_preprocessed' / f'province_l{prefix}_kenya.nc'
        )
Example #19
    def test_linear_regression_forward_pass(self, tmp_path):
        ds = _make_dataset()
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)

        model = LinearRegression(
            input_size=(dl.input_size + dl.static_input_size + dl.forecast_input_size)
            * cfg.seq_length,
            output_size=dl.output_size,
            forecast_horizon=dl.horizon,
        )
        data = next(iter(dl))
        y_hat = model(data)

        assert isinstance(y_hat, dict)
        assert y_hat["y_hat"].shape == (1, 1)
Example #20
    def _create_dummy_admin_boundaries_data(tmp_path, prefix: str):
        ds, _, _ = _make_dataset(
            (30, 30),
            variable_name="VHI",
            lonmin=30,
            lonmax=35,
            latmin=-2,
            latmax=2,
            add_times=False,
        )
        ds["VHI"] = ds["VHI"].astype(int)  # keep the cast (it was previously discarded)

        (tmp_path / "analysis" / "boundaries_preprocessed").mkdir(
            exist_ok=True, parents=True)
        ds.attrs["keys"] = ", ".join([str(i) for i in range(3)])
        ds.attrs["values"] = ", ".join(
            [f"region_{i}" for i in np.arange(0, 3)])
        ds.to_netcdf(tmp_path / "analysis" / "boundaries_preprocessed" /
                     f"province_l{prefix}_kenya.nc")
Example #21
    def test_train(self, tmp_path, capsys, use_pred_months, use_latlons,
                   experiment, monthly_agg, static):
        # make the x, y data (5*5 latlons, 36 timesteps, 3 features)
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        x_add1, _, _ = _make_dataset(size=(5, 5),
                                     const=True,
                                     variable_name='precip')
        x_add2, _, _ = _make_dataset(size=(5, 5),
                                     const=True,
                                     variable_name='temp')
        x = xr.merge([x, x_add1, x_add2])

        norm_dict = {
            'VHI': {'mean': 0, 'std': 1},
            'precip': {'mean': 0, 'std': 1},
            'temp': {'mean': 0, 'std': 1},
        }

        test_features = tmp_path / f'features/{experiment}/train/hello'
        test_features.mkdir(parents=True, exist_ok=True)

        # make the normalising dictionary
        with (tmp_path /
              f'features/{experiment}/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / 'x.nc')
        y.to_netcdf(test_features / 'y.nc')

        if static:
            x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
            static_features = tmp_path / 'features/static'
            static_features.mkdir(parents=True)
            x_static.to_netcdf(static_features / 'data.nc')

            static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
            with (tmp_path /
                  'features/static/normalizing_dict.pkl').open('wb') as f:
                pickle.dump(static_norm_dict, f)

        layer_sizes = [10]
        dropout = 0.25

        model = LinearNetwork(data_folder=tmp_path,
                              layer_sizes=layer_sizes,
                              dropout=dropout,
                              experiment=experiment,
                              include_pred_month=use_pred_months,
                              include_latlons=use_latlons,
                              include_monthly_aggs=monthly_agg,
                              include_static=static)

        model.train()

        # check the number of input features is properly initialised
        n_input_features = next(
            iter(model.model.dense_layers.parameters())).shape[-1]

        # Expect to have 12 more features if use_pred_months
        if experiment == 'nowcast':
            n_expected = 107
        else:
            # NOTE: data hasn't been through `src.Engineer` therefore including
            #  current data (hence why more features than `nowcast`)
            n_expected = 108

        if monthly_agg:
            n_expected *= 2
        if use_pred_months:
            n_expected += 12
        if use_latlons:
            n_expected += 2

        n_expected += 3  # +3 for the yearly means

        if static:
            n_expected += 1  # for the static variable

        assert n_input_features == n_expected, \
            f"Expected the number of input features to be: {n_expected}. " \
            f"Got: {n_input_features}"

        captured = capsys.readouterr()
        expected_stdout = 'Epoch 1, train smooth L1: '
        assert expected_stdout in captured.out

        assert type(model.model) == LinearModel, \
            'Model attribute not a linear regression!'
Example #22
    def test_predict_and_explain(self, tmp_path, use_pred_months,
                                 predict_delta):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / "features/one_month_forecast/train/1980_1"
        train_features.mkdir(parents=True)

        test_features = tmp_path / "features/one_month_forecast/test/1980_1"
        test_features.mkdir(parents=True)

        norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
              ).open("wb") as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / "x.nc")
        y.to_netcdf(test_features / "y.nc")

        x.to_netcdf(train_features / "x.nc")
        y.to_netcdf(train_features / "y.nc")

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / "data.nc")

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path /
              f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        dense_features = [10]
        hidden_size = 128
        rnn_dropout = 0.25

        model = EARecurrentNetwork(
            hidden_size=hidden_size,
            dense_features=dense_features,
            rnn_dropout=rnn_dropout,
            data_folder=tmp_path,
            predict_delta=predict_delta,
            normalize_y=True,
        )
        model.train()
        test_arrays_dict, pred_dict = model.predict()

        # the foldername "1980_1" is the only one which should be in the dictionaries
        assert ("1980_1" in test_arrays_dict.keys()) and (len(test_arrays_dict)
                                                          == 1)
        assert ("1980_1" in pred_dict.keys()) and (len(pred_dict) == 1)

        if not predict_delta:
            # _make_dataset with const=True returns all ones
            assert (test_arrays_dict["1980_1"]["y"] == 1).all()
        else:
            # _make_dataset with const=True & predict_delta
            # returns a change of 0
            assert (test_arrays_dict["1980_1"]["y"] == 0).all()

        # test the Morris explanation works
        test_dl = next(
            iter(
                model.get_dataloader(mode="test",
                                     to_tensor=True,
                                     shuffle_data=False)))

        for key, val in test_dl.items():
            output_m = model.explain(val.x,
                                     save_explanations=True,
                                     method="morris")
            assert type(output_m) is TrainData
            assert (model.model_dir /
                    "analysis/morris_value_historical.npy").exists()
Example #23
    def test_predict(self, tmp_path, use_pred_months, use_latlons, experiment):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / f"features/{experiment}/train/1980_1"
        train_features.mkdir(parents=True)

        test_features = tmp_path / f"features/{experiment}/test/1980_1"
        test_features.mkdir(parents=True)

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / "data.nc")

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path /
              f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        # if nowcast we need another x feature
        if experiment == "nowcast":
            x_add1, _, _ = _make_dataset(size=(5, 5),
                                         const=True,
                                         variable_name="precip")
            x_add2, _, _ = _make_dataset(size=(5, 5),
                                         const=True,
                                         variable_name="temp")
            x = xr.merge([x, x_add1, x_add2])

            norm_dict = {
                "VHI": {"mean": 0, "std": 1},
                "precip": {"mean": 0, "std": 1},
                "temp": {"mean": 0, "std": 1},
            }
        else:
            norm_dict = {"VHI": {"mean": 0, "std": 1}}

        with (tmp_path /
              f"features/{experiment}/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / "x.nc")
        y.to_netcdf(test_features / "y.nc")

        x.to_netcdf(train_features / "x.nc")
        y.to_netcdf(train_features / "y.nc")

        layer_sizes = [10]
        dropout = 0.25

        model = LinearNetwork(
            data_folder=tmp_path,
            layer_sizes=layer_sizes,
            dropout=dropout,
            experiment=experiment,
            include_pred_month=use_pred_months,
            include_latlons=use_latlons,
        )
        model.train()
        test_arrays_dict, pred_dict = model.predict()

        # the foldername "1980_1" is the only one which should be in the dictionaries
        assert ("1980_1" in test_arrays_dict.keys()) and (len(test_arrays_dict)
                                                          == 1)
        assert ("1980_1" in pred_dict.keys()) and (len(pred_dict) == 1)

        # _make_dataset with const=True returns all ones
        assert (test_arrays_dict["1980_1"]["y"] == 1).all()
Example #24
    def test_train(
        self,
        tmp_path,
        capsys,
        use_pred_months,
        use_latlons,
        experiment,
        monthly_agg,
        static,
        predict_delta,
    ):
        # make the x, y data (5*5 latlons, 36 timesteps, 3 features)
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        x_add1, _, _ = _make_dataset(size=(5, 5),
                                     const=True,
                                     variable_name="precip")
        x_add2, _, _ = _make_dataset(size=(5, 5),
                                     const=True,
                                     variable_name="temp")
        x = xr.merge([x, x_add1, x_add2])

        norm_dict = {
            "VHI": {"mean": 0, "std": 1},
            "precip": {"mean": 0, "std": 1},
            "temp": {"mean": 0, "std": 1},
        }

        test_features = tmp_path / f"features/{experiment}/train/1980_1"
        test_features.mkdir(parents=True, exist_ok=True)

        # make the normalising dictionary
        with (tmp_path /
              f"features/{experiment}/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / "x.nc")
        y.to_netcdf(test_features / "y.nc")

        if static:
            x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
            static_features = tmp_path / f"features/static"
            static_features.mkdir(parents=True)
            x_static.to_netcdf(static_features / "data.nc")

            static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
            with (tmp_path /
                  f"features/static/normalizing_dict.pkl").open("wb") as f:
                pickle.dump(static_norm_dict, f)

        layer_sizes = [10]
        dropout = 0.25

        model = LinearNetwork(
            data_folder=tmp_path,
            layer_sizes=layer_sizes,
            dropout=dropout,
            experiment=experiment,
            include_pred_month=use_pred_months,
            include_latlons=use_latlons,
            include_monthly_aggs=monthly_agg,
            static="embeddings",
            predict_delta=predict_delta,
        )

        model.train()

        captured = capsys.readouterr()
        expected_stdout = "Epoch 1, train smooth L1: "
        assert expected_stdout in captured.out

        assert (type(model.model) == LinearModel
                ), "Model attribute not a linear regression!"
Example #25
    def test_train(
        self,
        tmp_path,
        capsys,
        use_pred_months,
        use_static_embedding,
        static,
        check_inversion,
    ):
        # make directories
        for ts in ["2001_11", "2001_12"]:
            test_features = tmp_path / f"features/one_month_forecast/train/{ts}"
            test_features.mkdir(parents=True)

        norm_dict = {"VHI": {"mean": 0, "std": 1}}
        with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl").open(
            "wb"
        ) as f:
            pickle.dump(norm_dict, f)

        # save the X, y data pairs
        x, _, _ = _make_dataset(size=(5, 5), const=True)

        for ts in ["2001_11", "2001_12"]:
            if ts == "2001_12":
                y = x.sel(time="2001-12")
                x_save = x.sel(time=slice("2000-12", "2001-11"))
            else:
                y = x.sel(time="2001-11")
                x_save = x.sel(time=slice("2000-11", "2001-10"))
            x_save.to_netcdf(test_features / "x.nc")
            y.to_netcdf(test_features / "y.nc")

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / f"features/static"
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / "data.nc")

        static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
        with (tmp_path / f"features/static/normalizing_dict.pkl").open("wb") as f:
            pickle.dump(static_norm_dict, f)

        dense_features = [10]
        hidden_size = 128
        rnn_dropout = 0.25

        model = EARecurrentNetwork(
            hidden_size=hidden_size,
            dense_features=dense_features,
            rnn_dropout=rnn_dropout,
            data_folder=tmp_path,
            static_embedding_size=use_static_embedding,
            normalize_y=True,
            include_yearly_aggs=False,
            static=static,
        )
        model.train(check_inversion=check_inversion)

        captured = capsys.readouterr()
        expected_stdout = "Epoch 1, train smooth L1: 0."
        assert expected_stdout in captured.out

        assert type(model.model) == EALSTM, f"Model attribute not an EALSTM!"

        # ------------------
        # Check static embedding
        # -------------------
        if use_static_embedding is not None:
            all_e, (all_static_x, all_latlons, all_pred_months) = get_static_embedding(
                ealstm=model
            )
            assert (
                all_e[0].shape[0] == 25
            ), f"Expect 25 latlon values (pixels). Got: {all_e[0].shape}"
            assert (
                all_latlons[0].shape[0] == 25
            ), f"Expect 25 latlon values (pixels). Got: {all_latlons[0].shape}"

            # Moved the PredMonth OHE to the dynamic data, so only one static
            # dimension remains (was 13)
            assert all_static_x[0].shape == (
                25,
                1,
            ), f"Expect (25, 1) static data. Got: {all_static_x[0].shape}"
Example #26
    def test_correct_data_returned(self, tmp_path):
        #  create dummy config path
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg._cfg["encode_doys"] = True
        cfg._cfg["static_inputs"] = "embedding"
        cfg._cfg["forecast_variables"] = cfg.input_variables
        #  create temporary run directory (normally done elsewhere in the pipeline)
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        #  create dummy dataset
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

        #  initialise the dataloader
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)
        #  one sample from the dataloader
        data = next(iter(dl))
        x, y = data["x_d"], data["y"]

        #  recreate the stacked dataset
        stacked_ds = dl.dataset.ds

        if cfg.encode_doys:
            stacked_ds, _ = add_doy_encoding_as_feature_to_dataset(
                stacked_ds, inputs=cfg.input_variables, target=cfg.target_variable
            )

        #  get the current_time_index and pixel from the __getitem__() call
        getitem_call = int(data["meta"]["index"])
        pixel, current_time_index = dl.dataset.lookup_table[getitem_call]

        # check that the returned data is valid
        #  TODO: wrap into function for getting the valid times!
        est_target_time = pd.to_datetime(
            np.array(data["meta"]["target_time"]).astype("datetime64[ns]")
        )[0]

        #  rounding error because of storing as float
        input_data_times = pd.to_datetime(stacked_ds.time.values)
        true_target_index = input_data_times.get_loc(est_target_time, method="nearest")
        true_target_time = input_data_times[true_target_index]

        assert current_time_index + cfg.horizon == true_target_index

        # :: RECREATE TARGET DATA ::
        all_expected_y = stacked_ds.sel(sample=pixel)["target"].values

        expected_y = stacked_ds.sel(sample=pixel, time=true_target_time)[
            cfg.target_variable
        ].values
        expected_y_index = (
            stacked_ds.sel(sample=pixel)
            .isel(time=true_target_index)[cfg.target_variable]
            .values
        )
        assert expected_y == expected_y_index
        assert np.isclose(y.flatten()[-1], expected_y)

        ## :: RECREATE INPUT DATA ::
        # max_input_ix should be the CURRENT TIME (+ 1 because of exclusive upper indexing)
        max_input_ix = int(true_target_index - cfg.horizon)
        assert max_input_ix == current_time_index
        max_input_time = input_data_times[max_input_ix]

        #  min_input_ix = the first input time
        min_input_ix = int(max_input_ix - cfg.seq_length) + 1
        min_input_time = input_data_times[min_input_ix]

        input_vars = (
            cfg.input_variables + ["autoregressive"]
            if cfg.autoregressive
            else cfg.input_variables
        )
        input_vars = (
            input_vars + ["sin_doy", "cos_doy"] if cfg.encode_doys else input_vars
        )

        # has x been drawn from the actual underlying data?
        all_expected_x = stacked_ds.sel(sample=pixel)["feature"].values
        _expected_x = all_expected_x[min_input_ix:max_input_ix]
        # assert x == _expected_x

        # assert all(
        #     np.isin(
        #         np.round(x.numpy().flatten(), 3).astype("float64"),
        #         np.round(all_expected_x.flatten(), 3).astype("float64"),
        #     )
        # )

        # get the exact expected input vector
        # NOTE: xarray's .sel(time=slice(...)) is inclusive of both endpoints,
        # so this selects exactly seq_length timesteps
        expected_x_feature = (
            stacked_ds.sel(sample=pixel, time=slice(min_input_time, max_input_time))[
                input_vars
            ]
            .to_array()
            .values.T
        )

        x_feature = np.array(x)
        x_feature = x_feature.reshape(expected_x_feature.shape)

        assert np.allclose(x_feature, expected_x_feature)
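
To make the index bookkeeping above concrete, a toy example with seq_length=3 and horizon=1 (values chosen purely for illustration):

seq_length, horizon = 3, 1
current_time_index = 2                            # the last input timestep
true_target_index = current_time_index + horizon  # -> 3
max_input_ix = true_target_index - horizon        # -> 2 (== current_time_index)
min_input_ix = max_input_ix - seq_length + 1      # -> 0
assert max_input_ix - min_input_ix + 1 == seq_length  # inclusive window of 3 steps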
Example #27
    def test_predict(self, tmp_path, use_pred_months, use_latlons, experiment):
        x, _, _ = _make_dataset(size=(5, 5), const=True)
        y = x.isel(time=[-1])

        train_features = tmp_path / f'features/{experiment}/train/hello'
        train_features.mkdir(parents=True)

        test_features = tmp_path / f'features/{experiment}/test/hello'
        test_features.mkdir(parents=True)

        # static
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / 'features/static'
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / 'data.nc')

        static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
        with (tmp_path /
              'features/static/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(static_norm_dict, f)

        # if nowcast we need another x feature
        if experiment == 'nowcast':
            x_add1, _, _ = _make_dataset(size=(5, 5),
                                         const=True,
                                         variable_name='precip')
            x_add2, _, _ = _make_dataset(size=(5, 5),
                                         const=True,
                                         variable_name='temp')
            x = xr.merge([x, x_add1, x_add2])

            norm_dict = {
                'VHI': {'mean': 0, 'std': 1},
                'precip': {'mean': 0, 'std': 1},
                'temp': {'mean': 0, 'std': 1},
            }
        else:
            norm_dict = {'VHI': {'mean': 0, 'std': 1}}

        with (tmp_path /
              f'features/{experiment}/normalizing_dict.pkl').open('wb') as f:
            pickle.dump(norm_dict, f)

        x.to_netcdf(test_features / 'x.nc')
        y.to_netcdf(test_features / 'y.nc')

        x.to_netcdf(train_features / 'x.nc')
        y.to_netcdf(train_features / 'y.nc')

        layer_sizes = [10]
        dropout = 0.25

        model = LinearNetwork(data_folder=tmp_path,
                              layer_sizes=layer_sizes,
                              dropout=dropout,
                              experiment=experiment,
                              include_pred_month=use_pred_months,
                              include_latlons=use_latlons)
        model.train()
        test_arrays_dict, pred_dict = model.predict()

        # the foldername "hello" is the only one which should be in the dictionaries
        assert ('hello' in test_arrays_dict.keys()) and (len(test_arrays_dict)
                                                         == 1)
        assert ('hello' in pred_dict.keys()) and (len(pred_dict) == 1)

        # _make_dataset with const=True returns all ones
        assert (test_arrays_dict['hello']['y'] == 1).all()