def _create_dummy_true_preds_data(tmp_path):
    """Write dummy VHI prediction files and matching truth files.

    One dataset per month-end date (January to March 2018) is saved under
    ``tmp_path / models / one_month_forecast / ealstm`` as
    ``preds_2018_<m>.nc``; a second set is saved under
    ``tmp_path / features / one_month_forecast / test / 2018_<m> / y.nc``.
    """
    month_ends = ['2018-01-31', '2018-02-28', '2018-03-31']

    def _vhi_for_day(day):
        # start and end date are the same day
        dataset, _, _ = _make_dataset((30, 30), variable_name='VHI',
                                      lonmin=30, lonmax=35,
                                      latmin=-2, latmax=2,
                                      start_date=day, end_date=day)
        return dataset

    # predictions
    preds_dir = tmp_path / 'models' / 'one_month_forecast' / 'ealstm'
    preds_dir.mkdir(exist_ok=True, parents=True)
    pred_files = ['preds_2018_1.nc', 'preds_2018_2.nc', 'preds_2018_3.nc']
    for pred_file, day in zip(pred_files, month_ends):
        _vhi_for_day(day).to_netcdf(preds_dir / pred_file)

    # truth (test files)
    truth_root = tmp_path / 'features' / 'one_month_forecast' / 'test'
    truth_root.mkdir(exist_ok=True, parents=True)
    truth_folders = ['2018_1', '2018_2', '2018_3']
    for folder, day in zip(truth_folders, month_ends):
        truth = _vhi_for_day(day)
        month_dir = truth_root / folder
        month_dir.mkdir(exist_ok=True, parents=True)
        truth.to_netcdf(month_dir / 'y.nc')
def make_test_datasets(tmp_dir, experiment="one_month_forecast"):
    """Write a dummy ``x.nc`` / ``y.nc`` pair and return their folder.

    Args:
        tmp_dir: base directory (a ``pathlib.Path``) to write below.
        experiment: name of the sub-folder of ``tmp_dir`` that receives the
            files. Previously the body read an undefined ``experiment``
            (and an undefined ``tmp_path``), so the function could not run.

    Returns:
        The directory ``tmp_dir / experiment`` containing ``x.nc`` and
        ``y.nc``.
    """
    x_pred, _, _ = _make_dataset(size=(5, 5))
    x_coeff, _, _ = _make_dataset(size=(5, 5), variable_name="precip")
    x = xr.merge([x_pred, x_coeff])
    y = x_pred.isel(time=[0])

    data_dir = tmp_dir / experiment
    data_dir.mkdir(parents=True, exist_ok=True)

    # save netcdf, never overwriting files that are already present
    x_file = data_dir / "x.nc"
    y_file = data_dir / "y.nc"
    if not x_file.exists():
        x.to_netcdf(x_file)
    if not y_file.exists():
        y.to_netcdf(y_file)

    # make normalising dictionary
    # NOTE(review): this dictionary is neither written to disk nor returned;
    # confirm whether it should be pickled next to the data.
    norm_dict = {
        var: {
            "mean": x[var].mean(dim=["lat", "lon"], skipna=True).values,
            "std": x[var].std(dim=["lat", "lon"], skipna=True).values,
        }
        for var in x.data_vars
    }

    return data_dir
def test_predict(self, tmp_path, use_pred_months, predict_delta):
    """Train a ``RecurrentNetwork`` on dummy data and check ``predict``.

    The same x/y files are written to a ``1980_1`` folder in both the train
    and the test split, so both returned dictionaries must hold exactly
    that key.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / "features/one_month_forecast/train/1980_1"
    train_dir.mkdir(parents=True)
    test_dir = tmp_path / "features/one_month_forecast/test/1980_1"
    test_dir.mkdir(parents=True)

    norm_path = tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
    with norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    for split_dir in (test_dir, train_dir):
        inputs.to_netcdf(split_dir / "x.nc")
        target.to_netcdf(split_dir / "y.nc")

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / "features/static"
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / "data.nc")
    static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
    with static_norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    model = RecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
        predict_delta=predict_delta,
    )
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "1980_1" is the only one which should be in the dictionaries
    for returned in (test_arrays_dict, pred_dict):
        assert ("1980_1" in returned.keys()) and (len(returned) == 1)

    if predict_delta:
        return
    # _make_dataset with const=True returns all ones
    assert (test_arrays_dict["1980_1"]["y"] == 1).all()
def test_predict(self, tmp_path, use_pred_months):
    """Train an ``EARecurrentNetwork`` on dummy data and check ``predict``.

    Train and test splits each get a single ``hello`` folder with the same
    x/y files; both returned dictionaries must contain only that key.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / 'features/one_month_forecast/train/hello'
    train_dir.mkdir(parents=True)
    test_dir = tmp_path / 'features/one_month_forecast/test/hello'
    test_dir.mkdir(parents=True)

    norm_path = tmp_path / 'features/one_month_forecast/normalizing_dict.pkl'
    with norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0.0, 'std': 1.0}}, fh)

    for split_dir in (test_dir, train_dir):
        inputs.to_netcdf(split_dir / 'x.nc')
        target.to_netcdf(split_dir / 'y.nc')

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / 'features/static'
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / 'data.nc')
    static_norm_path = tmp_path / 'features/static/normalizing_dict.pkl'
    with static_norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0.0, 'std': 1.0}}, fh)

    model = EARecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
    )
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "hello" is the only one which should be in the dictionaries
    for returned in (test_arrays_dict, pred_dict):
        assert ('hello' in returned.keys()) and (len(returned) == 1)

    # _make_dataset with const=True returns all ones
    assert (test_arrays_dict['hello']['y'] == 1).all()
def test_train(self, tmp_path, capsys, use_pred_months, predict_delta, static):
    """Train a ``RecurrentNetwork`` for the dummy data and check the log output.

    Passes when the captured stdout contains the first-epoch loss line and
    the trained ``model.model`` is exactly an ``RNN``.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / "features/one_month_forecast/train/1980_1"
    train_dir.mkdir(parents=True)

    norm_path = tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
    with norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0, "std": 1}}, fh)

    inputs.to_netcdf(train_dir / "x.nc")
    target.to_netcdf(train_dir / "y.nc")

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / "features/static"
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / "data.nc")
    static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
    with static_norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    model = RecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
        include_monthly_aggs=True,
        predict_delta=predict_delta,
        static=static,
    )
    model.train(check_inversion=False)

    stdout = capsys.readouterr().out
    assert "Epoch 1, train smooth L1:" in stdout

    assert type(model.model) == RNN, f"Model attribute not an RNN!"
def _create_dummy_landcover_data(tmp_path):
    """Write a dummy one-hot landcover file below ``tmp_path``.

    Each of the four landcover variables receives one slice of a set of
    non-overlapping 0/1 masks, and the merged dataset is saved as
    ``interim/static/esa_cci_landcover_preprocessed/esa_cci_landcover_kenya_one_hot.nc``.
    """
    out_dir = tmp_path / "interim" / "static" / "esa_cci_landcover_preprocessed"
    out_dir.mkdir(exist_ok=True, parents=True)

    landcover_names = [
        "Cropland, irrigated or post-flooding_one_hot",
        "Herbaceous cover_one_hot",
        "No data_one_hot",
        "Tree or shrub cover_one_hot",
    ]

    # create non-overlapping groups
    # https://stackoverflow.com/a/52356978/9940782
    groups = np.random.randint(0, 4, (30, 30))
    is_group = groups[..., None] == np.arange(4)[None, :]
    masks = is_group.T.astype(int)

    def _one_hot_layer(index, name):
        layer, _, _ = _make_dataset(
            (30, 30),
            variable_name=name,
            lonmin=30,
            lonmax=35,
            latmin=-2,
            latmax=2,
            add_times=False,
            const=True,
        )
        # overwrite the generated values with this group's mask
        layer[name].values = masks[index, :, :]
        return layer

    layers = [
        _one_hot_layer(index, name)
        for index, name in enumerate(landcover_names)
    ]

    merged = xr.merge([*layers])
    merged.to_netcdf(out_dir / "esa_cci_landcover_kenya_one_hot.nc")
def test_dataset(self, tmp_path):
    """Check the samples ``XarrayDataset`` returns for two test configs.

    For each config the test verifies the target/input names, the shape of
    the first ten samples, and that un-normalising ``y`` reproduces the
    value stored in the raw dataset at the sample's pixel and target time.
    """
    target_variable = "target"
    input_variables = ["feature"]

    config_paths = [
        Path("tests/testconfigs/test_config_simulate.yml"),
        Path("tests/testconfigs/test_config.yml"),
    ]
    for path in config_paths:
        cfg = Config(path)
        cfg._cfg["forecast_variables"] = cfg.input_variables
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        raw_ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        static = create_static(cfg=cfg, ds=raw_ds)
        ds = XarrayDataset(
            raw_ds, cfg=cfg, mode="train", DEBUG=True, static_data=static
        )

        assert ds.target == target_variable

        # Build the expectation first: written inline, ``assert a == b if c
        # else d`` parses as ``(a == b) if c else d`` and the assertion is
        # vacuously true whenever ``c`` is false.
        if cfg.autoregressive:
            expected_inputs = input_variables + ["autoregressive"]
        else:
            expected_inputs = input_variables
        assert ds.inputs == expected_inputs

        x_features = len(expected_inputs)
        # two extra columns are expected when day-of-year encoding is on
        expected_x_shape = (
            cfg.seq_length,
            x_features + 2 if cfg.encode_doys else x_features,
        )

        for i in range(10):
            data = ds[i]
            x, y = data["x_d"], data["y"]

            assert y.shape == (1, 1)
            assert (
                x.shape == expected_x_shape
            ), f"Shape Mismatch! Expect: {expected_x_shape} Got: {x.shape}"

            meta = data["meta"]
            times = (
                meta["target_time"]
                .detach()
                .numpy()
                .astype("datetime64[ns]")
                .flatten()
            )
            pixel, _ = ds.lookup_table[int(meta["index"])]
            # the pixel label is parsed as "<lat>_<lon>"
            latlon = tuple(float(part) for part in str(pixel).split("_"))

            y_unnorm = (
                ds.normalizer.individual_inverse(y, pixel, variable="target")
                .detach()
                .numpy()
            )

            # extract from the original xr.Dataset
            y_exp = raw_ds.sel(
                lat=latlon[0], lon=latlon[1], time=times, method="nearest"
            )[cfg.target_variable].values

            assert np.isclose(
                y_unnorm.reshape(y_exp.shape), y_exp, atol=1e-5
            ).all()
def test_dataloader(self, tmp_path):
    """Check the batch size and ``x_d`` shape of the first training batch."""
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    ds = _make_dataset()
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)
    static = create_static(cfg=cfg, ds=ds)

    dl = PixelDataLoader(
        ds,
        cfg=cfg,
        num_workers=1,
        mode="train",
        batch_size=cfg.batch_size,
        static_data=static,
    )
    assert dl.batch_size == cfg.batch_size

    seq_length = cfg.seq_length
    autoregressive = cfg.autoregressive

    data = next(iter(dl))
    x = data["x_d"]
    y = data["y"]

    # one input feature, plus one more column when autoregressive
    n_inputs = len(["features"])
    if autoregressive:
        n_inputs += 1

    # two more columns when day-of-year encoding is enabled
    n_columns = n_inputs + 2 if cfg.encode_doys else n_inputs
    assert x.shape == (
        cfg.batch_size,
        seq_length,
        n_columns,
    ), f"Size Mismatch! Expected: {(cfg.batch_size, seq_length, n_inputs)} Got: {x.shape}"
def test_static_inputs(self, tmp_path):
    """Placeholder for a static-inputs test; it currently always fails.

    Only the fixtures are prepared (a cropped dataset, its time mean and a
    config with a temporary run path) before the unconditional failure.
    """
    full_ds = _make_dataset()
    ds = full_ds.isel(lat=slice(0, 2), lon=slice(0, 1))
    ds_static = ds.mean(dim="time")

    config_path = Path("tests/testconfigs/test_config.yml")
    cfg = Config(config_path)
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # no assertions have been written for this test yet
    assert False
def _create_dummy_precip_data(tmp_path):
    """Write a dummy ``precip`` dataset and return the path of the file.

    The file is saved as ``data/interim/chirps_preprocessed/chirps_kenya.nc``
    below ``tmp_path``.
    """
    target_dir = tmp_path / "data" / "interim" / "chirps_preprocessed"
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)

    out_file = target_dir / "chirps_kenya.nc"
    precip, _, _ = _make_dataset((30, 30), variable_name="precip")
    precip.to_netcdf(out_file)

    return out_file
def test_forecast_inputs(self, tmp_path):
    """Placeholder for a forecast-inputs test; it currently always fails.

    A forecast variable is derived by shifting the dataset one step along
    ``time`` and renaming ``feature`` to ``feature_fcast1``; it is merged
    back into the dataset before the unconditional failure.
    """
    ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

    shifted = ds.shift(time=1)
    renamed = shifted.rename({"feature": "feature_fcast1"})
    ds_forecast = renamed.drop("target")
    ds = xr.merge([ds, ds_forecast])

    config_path = Path("tests/testconfigs/test_config.yml")
    cfg = Config(config_path)
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # no assertions have been written for this test yet
    assert False
def test_train(self, tmp_path, capsys, use_pred_months):
    """Train an ``EARecurrentNetwork`` on dummy data and check the log output.

    Passes when stdout contains the first-epoch loss line and the trained
    ``model.model`` is exactly an ``EALSTM``.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / 'features/one_month_forecast/train/hello'
    train_dir.mkdir(parents=True)

    norm_path = tmp_path / 'features/one_month_forecast/normalizing_dict.pkl'
    with norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0, 'std': 1}}, fh)

    inputs.to_netcdf(train_dir / 'x.nc')
    target.to_netcdf(train_dir / 'y.nc')

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / 'features/static'
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / 'data.nc')
    static_norm_path = tmp_path / 'features/static/normalizing_dict.pkl'
    with static_norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0.0, 'std': 1.0}}, fh)

    model = EARecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
    )
    model.train()

    stdout = capsys.readouterr().out
    assert 'Epoch 1, train smooth L1: 0.' in stdout

    assert type(model.model) == EALSTM, \
        f'Model attribute not an EALSTM!'
def make_test_data(data_dir, experiment='one_month_forecast'):
    """Write dummy train/test/static data and normalising dictionaries.

    Below ``data_dir / features`` this creates ``<experiment>/train/hello``
    and ``<experiment>/test/hello`` (each with ``x.nc`` and ``y.nc``),
    ``static/data.nc``, and a ``normalizing_dict.pkl`` for both the
    experiment and the static folder.
    """
    # create data (X, y); the x variables are VHI, precip and temp
    vhi, _, _ = _make_dataset(size=(5, 5), const=True)
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    y = vhi.isel(time=[-1])
    precip, _, _ = _make_dataset(size=(5, 5), const=True,
                                 variable_name='precip')
    temp, _, _ = _make_dataset(size=(5, 5), const=True,
                               variable_name='temp')
    x = xr.merge([vhi, precip, temp])

    # normalising dictionaries
    norm_dict = {'VHI': {'mean': 0, 'std': 1},
                 'precip': {'mean': 0, 'std': 1},
                 'temp': {'mean': 0, 'std': 1}}
    static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}

    # make the appropriate folders
    train_dir = data_dir / f'features/{experiment}/train/hello'
    pred_dir = data_dir / f'features/{experiment}/test/hello'
    static_dir = data_dir / 'features/static'
    for folder in (train_dir, pred_dir, static_dir):
        folder.mkdir(parents=True, exist_ok=True)

    # write the data out
    pickle_targets = [
        (data_dir / f'features/{experiment}/normalizing_dict.pkl', norm_dict),
        (data_dir / 'features/static/normalizing_dict.pkl', static_norm_dict),
    ]
    for pickle_path, payload in pickle_targets:
        with pickle_path.open('wb') as fh:
            pickle.dump(payload, fh)

    for folder in (train_dir, pred_dir):
        x.to_netcdf(folder / 'x.nc')
    for folder in (train_dir, pred_dir):
        y.to_netcdf(folder / 'y.nc')

    x_static.to_netcdf(static_dir / 'data.nc')
def test_get_background(self, tmp_path):
    """Check ``LinearNetwork._get_background`` returns the requested sample count.

    The first two returned arrays must each have three rows, and the second
    must be two-dimensional.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / "features/one_month_forecast/train/1980_1"
    train_dir.mkdir(parents=True)
    inputs.to_netcdf(train_dir / "x.nc")
    target.to_netcdf(train_dir / "y.nc")

    norm_path = tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
    with norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0, "std": 1}}, fh)

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / "features/static"
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / "data.nc")
    static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
    with static_norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    model = LinearNetwork(
        data_folder=tmp_path,
        layer_sizes=[100],
        dropout=0.25,
        include_pred_month=True,
    )
    background = model._get_background(sample_size=3)

    first_rows = background[0].shape[0]
    assert first_rows == 3, f"Got {background[0].shape[0]} samples back, expected 3"

    second_rows = background[1].shape[0]
    assert second_rows == 3, f"Got {background[1].shape[0]} samples back, expected 3"

    second_ndim = len(background[1].shape)
    assert second_ndim == 2, f"Expected 2 dimensions, got {len(background[1].shape)}"
def _create_dummy_true_preds_data(tmp_path):
    """Write dummy VHI prediction files and matching truth files.

    Predictions are saved as ``preds_2018_<m>.nc`` under
    ``tmp_path / models / one_month_forecast / ealstm``; truth files are
    saved as ``y.nc`` in a ``2018_<m>`` folder under
    ``tmp_path / features / one_month_forecast / test``.
    """
    times = ["2018-01-31", "2018-02-28", "2018-03-31"]
    grid_kwargs = {
        "variable_name": "VHI",
        "lonmin": 30,
        "lonmax": 35,
        "latmin": -2,
        "latmax": 2,
    }

    # save the preds
    preds_dir = tmp_path / "models" / "one_month_forecast" / "ealstm"
    preds_dir.mkdir(exist_ok=True, parents=True)
    pred_names = ["preds_2018_1.nc", "preds_2018_2.nc", "preds_2018_3.nc"]
    for pred_name, day in zip(pred_names, times):
        preds, _, _ = _make_dataset(
            (30, 30), start_date=day, end_date=day, **grid_kwargs
        )
        preds.to_netcdf(preds_dir / pred_name)

    # save the TRUTH (test files)
    truth_root = tmp_path / "features" / "one_month_forecast" / "test"
    truth_root.mkdir(exist_ok=True, parents=True)
    folder_names = ["2018_1", "2018_2", "2018_3"]
    for folder_name, day in zip(folder_names, times):
        truth, _, _ = _make_dataset(
            (30, 30), start_date=day, end_date=day, **grid_kwargs
        )
        month_dir = truth_root / folder_name
        month_dir.mkdir(exist_ok=True, parents=True)
        truth.to_netcdf(month_dir / "y.nc")
def test_get_background(self, tmp_path):
    """Check ``LinearNetwork._get_background`` returns the requested sample count.

    The first two returned arrays must each have three rows, and the second
    must be two-dimensional.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / 'features/one_month_forecast/train/hello'
    train_dir.mkdir(parents=True)
    inputs.to_netcdf(train_dir / 'x.nc')
    target.to_netcdf(train_dir / 'y.nc')

    norm_path = tmp_path / 'features/one_month_forecast/normalizing_dict.pkl'
    with norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0, 'std': 1}}, fh)

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / 'features/static'
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / 'data.nc')
    static_norm_path = tmp_path / 'features/static/normalizing_dict.pkl'
    with static_norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0.0, 'std': 1.0}}, fh)

    model = LinearNetwork(
        data_folder=tmp_path,
        layer_sizes=[100],
        dropout=0.25,
        include_pred_month=True,
    )
    background = model._get_background(sample_size=3)

    first_rows = background[0].shape[0]
    assert first_rows == 3, \
        f'Got {background[0].shape[0]} samples back, expected 3'

    second_rows = background[1].shape[0]
    assert second_rows == 3, \
        f'Got {background[1].shape[0]} samples back, expected 3'

    second_ndim = len(background[1].shape)
    assert second_ndim == 2, \
        f'Expected 2 dimensions, got {len(background[1].shape)}'
def test_stack_xarray(self):
    """Check that sample labels from ``_stack_xarray`` encode the pixel's lat/lon.

    A randomly chosen pixel of the unstacked ``sample`` array must hold a
    ``"<lat>_<lon>"`` string matching that pixel's own coordinates.
    """
    ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
    stacked, sample = _stack_xarray(ds, spatial_coords=["lat", "lon"])

    # check that stacking works
    unstacked = sample.unstack()
    lat_ix = np.random.choice(len(unstacked["lat"].values))
    lon_ix = np.random.choice(len(unstacked["lon"].values))
    pixel = unstacked.isel(lat=lat_ix, lon=lon_ix)

    label_parts = str(pixel.values).split("_")
    lat, lon = [float(part) for part in label_parts]

    expected = [float(pixel.lat.values), float(pixel.lon.values)]
    assert np.allclose([lat, lon], expected)
def _create_dummy_admin_boundaries_data(tmp_path, prefix: str):
    """Write a dummy admin-boundaries file below ``tmp_path``.

    Args:
        tmp_path: base directory (a ``pathlib.Path``).
        prefix: inserted into the file name ``province_l<prefix>_kenya.nc``.

    The dataset carries ``keys`` ("0, 1, 2") and ``values``
    ("region_0, region_1, region_2") attributes and is saved in
    ``analysis/boundaries_preprocessed``.
    """
    ds, _, _ = _make_dataset((30, 30), variable_name='VHI',
                             lonmin=30, lonmax=35,
                             latmin=-2, latmax=2,
                             add_times=False)
    # ``astype`` returns a new object; the original discarded the result,
    # so the integer cast never took effect.
    ds['VHI'] = ds['VHI'].astype(int)

    out_dir = tmp_path / 'analysis' / 'boundaries_preprocessed'
    out_dir.mkdir(exist_ok=True, parents=True)

    region_ids = range(3)
    ds.attrs['keys'] = ', '.join(str(i) for i in region_ids)
    ds.attrs['values'] = ', '.join(f'region_{i}' for i in region_ids)

    ds.to_netcdf(out_dir / f'province_l{prefix}_kenya.nc')
def test_linear_regression_forward_pass(self, tmp_path):
    """Run one batch through ``LinearRegression`` and check the output shape.

    The model input size is the sum of the loader's dynamic, static and
    forecast input sizes multiplied by the configured sequence length.
    """
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    ds = _make_dataset()
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)

    features_per_step = (
        dl.input_size + dl.static_input_size + dl.forecast_input_size
    )
    model = LinearRegression(
        input_size=features_per_step * cfg.seq_length,
        output_size=dl.output_size,
        forecast_horizon=dl.horizon,
    )

    batch = next(iter(dl))
    y_hat = model(batch)

    assert isinstance(y_hat, dict)
    assert y_hat["y_hat"].shape == (1, 1)
def _create_dummy_admin_boundaries_data(tmp_path, prefix: str):
    """Write a dummy admin-boundaries file below ``tmp_path``.

    Args:
        tmp_path: base directory (a ``pathlib.Path``).
        prefix: inserted into the file name ``province_l<prefix>_kenya.nc``.

    The dataset carries ``keys`` ("0, 1, 2") and ``values``
    ("region_0, region_1, region_2") attributes and is saved in
    ``analysis/boundaries_preprocessed``.
    """
    ds, _, _ = _make_dataset(
        (30, 30),
        variable_name="VHI",
        lonmin=30,
        lonmax=35,
        latmin=-2,
        latmax=2,
        add_times=False,
    )
    # ``astype`` returns a new object; the original discarded the result,
    # so the integer cast never took effect.
    ds["VHI"] = ds["VHI"].astype(int)

    out_dir = tmp_path / "analysis" / "boundaries_preprocessed"
    out_dir.mkdir(exist_ok=True, parents=True)

    region_ids = range(3)
    ds.attrs["keys"] = ", ".join(str(i) for i in region_ids)
    ds.attrs["values"] = ", ".join(f"region_{i}" for i in region_ids)

    ds.to_netcdf(out_dir / f"province_l{prefix}_kenya.nc")
def test_train(self, tmp_path, capsys, use_pred_months, use_latlons,
               experiment, monthly_agg, static):
    """Train a ``LinearNetwork`` and check its input width and log output.

    The expected number of input features is derived from the fixture
    flags; the test also checks the first-epoch loss line on stdout and
    that ``model.model`` is exactly a ``LinearModel``.
    """
    # make the x, y data (5*5 latlons, 36 timesteps, 3 features)
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])
    x_add1, _, _ = _make_dataset(size=(5, 5), const=True,
                                 variable_name='precip')
    x_add2, _, _ = _make_dataset(size=(5, 5), const=True,
                                 variable_name='temp')
    x = xr.merge([x, x_add1, x_add2])

    norm_dict = {
        name: {'mean': 0, 'std': 1} for name in ('VHI', 'precip', 'temp')
    }

    test_features = tmp_path / f'features/{experiment}/train/hello'
    test_features.mkdir(parents=True, exist_ok=True)

    # make the normalising dictionary
    norm_path = tmp_path / f'features/{experiment}/normalizing_dict.pkl'
    with norm_path.open('wb') as f:
        pickle.dump(norm_dict, f)

    x.to_netcdf(test_features / 'x.nc')
    y.to_netcdf(test_features / 'y.nc')

    if static:
        x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_features = tmp_path / 'features/static'
        static_features.mkdir(parents=True)
        x_static.to_netcdf(static_features / 'data.nc')

        static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
        static_norm_path = tmp_path / 'features/static/normalizing_dict.pkl'
        with static_norm_path.open('wb') as f:
            pickle.dump(static_norm_dict, f)

    model = LinearNetwork(data_folder=tmp_path,
                          layer_sizes=[10],
                          dropout=0.25,
                          experiment=experiment,
                          include_pred_month=use_pred_months,
                          include_latlons=use_latlons,
                          include_monthly_aggs=monthly_agg,
                          include_static=static)
    model.train()

    # check the number of input features is properly initialised: read the
    # last dimension of the first parameter of the dense layers
    first_param = next(iter(model.model.dense_layers.parameters()))
    n_input_features = first_param.shape[-1]

    if experiment == 'nowcast':
        n_expected = 107
    else:
        # NOTE: data hasn't been through `src.Engineer` therefore including
        # current data (hence why more features than `nowcast`)
        n_expected = 108

    if monthly_agg:
        n_expected *= 2
    if use_pred_months:
        # Expect to have 12 more features if use_pred_months
        n_expected += 12
    if use_latlons:
        n_expected += 2

    n_expected += 3  # +3 for the yearly means

    if static:
        n_expected += 1  # for the static variable

    # The original message concatenated its pieces without spaces
    # ("numberof ... 108Got: ...").
    assert n_input_features == n_expected, (
        f'Expected the number of input features to be: {n_expected}. '
        f'Got: {n_input_features}'
    )

    captured = capsys.readouterr()
    expected_stdout = 'Epoch 1, train smooth L1: '
    assert expected_stdout in captured.out

    assert type(model.model) == LinearModel, \
        'Model attribute not a linear regression!'
def test_predict_and_explain(self, tmp_path, use_pred_months, predict_delta):
    """Train an ``EARecurrentNetwork``, then check ``predict`` and ``explain``.

    The prediction dictionaries must contain only the ``1980_1`` key, the
    test ``y`` values must be all ones (or all zeros when ``predict_delta``
    is set), and the Morris explanation must return a ``TrainData`` and
    write its ``.npy`` file.
    """
    inputs, _, _ = _make_dataset(size=(5, 5), const=True)
    target = inputs.isel(time=[-1])

    train_dir = tmp_path / "features/one_month_forecast/train/1980_1"
    train_dir.mkdir(parents=True)
    test_dir = tmp_path / "features/one_month_forecast/test/1980_1"
    test_dir.mkdir(parents=True)

    norm_path = tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
    with norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    for split_dir in (test_dir, train_dir):
        inputs.to_netcdf(split_dir / "x.nc")
        target.to_netcdf(split_dir / "y.nc")

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / "features/static"
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / "data.nc")
    static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
    with static_norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    model = EARecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
        predict_delta=predict_delta,
        normalize_y=True,
    )
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "1980_1" is the only one which should be in the dictionaries
    for returned in (test_arrays_dict, pred_dict):
        assert ("1980_1" in returned.keys()) and (len(returned) == 1)

    # _make_dataset with const=True returns all ones; with predict_delta the
    # expected change is therefore 0
    expected_y = 0 if predict_delta else 1
    assert (test_arrays_dict["1980_1"]["y"] == expected_y).all()

    # test the Morris explanation works
    loader = model.get_dataloader(mode="test", to_tensor=True, shuffle_data=False)
    test_dl = next(iter(loader))
    morris_file = model.model_dir / "analysis/morris_value_historical.npy"
    for val in test_dl.values():
        output_m = model.explain(val.x, save_explanations=True, method="morris")
        assert type(output_m) is TrainData
        assert morris_file.exists()
def test_predict(self, tmp_path, use_pred_months, use_latlons, experiment):
    """Train a ``LinearNetwork`` on dummy data and check ``predict``.

    For the ``nowcast`` experiment two extra x variables (precip, temp)
    are merged in. Both returned dictionaries must contain only the
    ``1980_1`` key and the test ``y`` values must all equal one.
    """
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])

    train_dir = tmp_path / f"features/{experiment}/train/1980_1"
    train_dir.mkdir(parents=True)
    test_dir = tmp_path / f"features/{experiment}/test/1980_1"
    test_dir.mkdir(parents=True)

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / "features/static"
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / "data.nc")
    static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
    with static_norm_path.open("wb") as fh:
        pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    norm_dict = {"VHI": {"mean": 0, "std": 1}}
    if experiment == "nowcast":
        # if nowcast we need another x feature
        precip, _, _ = _make_dataset(size=(5, 5), const=True,
                                     variable_name="precip")
        temp, _, _ = _make_dataset(size=(5, 5), const=True,
                                   variable_name="temp")
        x = xr.merge([x, precip, temp])
        norm_dict["precip"] = {"mean": 0, "std": 1}
        norm_dict["temp"] = {"mean": 0, "std": 1}

    norm_path = tmp_path / f"features/{experiment}/normalizing_dict.pkl"
    with norm_path.open("wb") as fh:
        pickle.dump(norm_dict, fh)

    for split_dir in (test_dir, train_dir):
        x.to_netcdf(split_dir / "x.nc")
        y.to_netcdf(split_dir / "y.nc")

    model = LinearNetwork(
        data_folder=tmp_path,
        layer_sizes=[10],
        dropout=0.25,
        experiment=experiment,
        include_pred_month=use_pred_months,
        include_latlons=use_latlons,
    )
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "1980_1" is the only one which should be in the dictionaries
    for returned in (test_arrays_dict, pred_dict):
        assert ("1980_1" in returned.keys()) and (len(returned) == 1)

    # _make_dataset with const=True returns all ones
    assert (test_arrays_dict["1980_1"]["y"] == 1).all()
def test_train(
    self,
    tmp_path,
    capsys,
    use_pred_months,
    use_latlons,
    experiment,
    monthly_agg,
    static,
    predict_delta,
):
    """Train a ``LinearNetwork`` on dummy data and check the log output.

    Passes when stdout contains the first-epoch loss line and
    ``model.model`` is exactly a ``LinearModel``.
    """
    # make the x, y data (5*5 latlons, 36 timesteps, 3 features)
    vhi, _, _ = _make_dataset(size=(5, 5), const=True)
    y = vhi.isel(time=[-1])
    precip, _, _ = _make_dataset(size=(5, 5), const=True,
                                 variable_name="precip")
    temp, _, _ = _make_dataset(size=(5, 5), const=True,
                               variable_name="temp")
    x = xr.merge([vhi, precip, temp])

    norm_dict = {
        "VHI": {"mean": 0, "std": 1},
        "precip": {"mean": 0, "std": 1},
        "temp": {"mean": 0, "std": 1},
    }

    train_dir = tmp_path / f"features/{experiment}/train/1980_1"
    train_dir.mkdir(parents=True, exist_ok=True)

    # make the normalising dictionary
    norm_path = tmp_path / f"features/{experiment}/normalizing_dict.pkl"
    with norm_path.open("wb") as fh:
        pickle.dump(norm_dict, fh)

    x.to_netcdf(train_dir / "x.nc")
    y.to_netcdf(train_dir / "y.nc")

    if static:
        static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
        static_dir = tmp_path / "features/static"
        static_dir.mkdir(parents=True)
        static_data.to_netcdf(static_dir / "data.nc")

        static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
        with static_norm_path.open("wb") as fh:
            pickle.dump({"VHI": {"mean": 0.0, "std": 1.0}}, fh)

    # NOTE(review): the model always receives static="embeddings", whatever
    # the value of the ``static`` fixture -- confirm this is intended.
    model = LinearNetwork(
        data_folder=tmp_path,
        layer_sizes=[10],
        dropout=0.25,
        experiment=experiment,
        include_pred_month=use_pred_months,
        include_latlons=use_latlons,
        include_monthly_aggs=monthly_agg,
        static="embeddings",
        predict_delta=predict_delta,
    )
    model.train()

    stdout = capsys.readouterr().out
    assert "Epoch 1, train smooth L1: " in stdout

    assert (type(model.model) == LinearModel
            ), f"Model attribute not a linear regression!"
def test_train(
    self,
    tmp_path,
    capsys,
    use_pred_months,
    use_static_embedding,
    static,
    check_inversion,
):
    """Train an ``EARecurrentNetwork`` on two months of dummy data.

    Checks the first-epoch loss line on stdout, that ``model.model`` is
    exactly an ``EALSTM`` and, when a static embedding size is given, the
    shapes returned by ``get_static_embedding``.
    """
    # target month and x window for every training folder
    windows = {
        "2001_11": ("2001-11", slice("2000-11", "2001-10")),
        "2001_12": ("2001-12", slice("2000-12", "2001-11")),
    }

    # make directories
    train_dirs = {}
    for ts in windows:
        train_dirs[ts] = tmp_path / f"features/one_month_forecast/train/{ts}"
        train_dirs[ts].mkdir(parents=True)

    norm_dict = {"VHI": {"mean": 0, "std": 1}}
    norm_path = tmp_path / "features/one_month_forecast/normalizing_dict.pkl"
    with norm_path.open("wb") as f:
        pickle.dump(norm_dict, f)

    # save the X, y data pairs
    # Each pair goes to its OWN folder. The original reused the directory
    # variable left over from the mkdir loop, so both months were written
    # into "2001_12" and "2001_11" stayed empty.
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    for ts, (target_month, x_window) in windows.items():
        y = x.sel(time=target_month)
        x_save = x.sel(time=x_window)
        x_save.to_netcdf(train_dirs[ts] / "x.nc")
        y.to_netcdf(train_dirs[ts] / "y.nc")

    # static
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_features = tmp_path / "features/static"
    static_features.mkdir(parents=True)
    x_static.to_netcdf(static_features / "data.nc")

    static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
    static_norm_path = tmp_path / "features/static/normalizing_dict.pkl"
    with static_norm_path.open("wb") as f:
        pickle.dump(static_norm_dict, f)

    model = EARecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
        static_embedding_size=use_static_embedding,
        normalize_y=True,
        include_yearly_aggs=False,
        static=static,
    )
    model.train(check_inversion=check_inversion)

    captured = capsys.readouterr()
    expected_stdout = "Epoch 1, train smooth L1: 0."
    assert expected_stdout in captured.out

    assert type(model.model) == EALSTM, "Model attribute not an EALSTM!"

    # ------------------
    # Check static embedding
    # -------------------
    if use_static_embedding is not None:
        all_e, (all_static_x, all_latlons, all_pred_months) = get_static_embedding(
            ealstm=model
        )

        # 5 x 5 grid -> 25 pixels
        assert (
            all_e[0].shape[0] == 25
        ), f"Expect 25 latlon values (pixels). Got: {all_e[0].shape}"
        assert (
            all_latlons[0].shape[0] == 25
        ), f"Expect 25 latlon values (pixels). Got: {all_latlons[0].shape}"

        # Moved the PredMonth OHE to the dynamic data
        assert all_static_x[0].shape == (
            25,
            1,
        ), f"Expect static shape (25, 1). Got: {all_static_x[0].shape}"
def test_correct_data_returned(self, tmp_path):
    """Check that one dataloader sample matches the underlying dataset.

    The target ``y`` and the input window ``x_d`` of the first sample are
    rebuilt directly from the loader's stacked dataset (with day-of-year
    features added) and compared with what the loader returned.
    """
    # create dummy config path
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    cfg._cfg["encode_doys"] = True
    cfg._cfg["static_inputs"] = "embedding"
    cfg._cfg["forecast_variables"] = cfg.input_variables

    # create temporary run directory
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # create dummy dataset
    ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

    # initialise the dataloader and draw one sample
    dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)
    data = next(iter(dl))
    x, y = data["x_d"], data["y"]

    # recreate the stacked dataset
    stacked_ds = dl.dataset.ds
    if cfg.encode_doys:
        stacked_ds, _ = add_doy_encoding_as_feature_to_dataset(
            stacked_ds, inputs=cfg.input_variables, target=cfg.target_variable
        )

    # get the current_time_index and pixel from the __getitem__() call
    getitem_call = int(data["meta"]["index"])
    pixel, current_time_index = dl.dataset.lookup_table[getitem_call]

    # check that the returned data is valid
    # TODO: wrap into function for getting the valid times!
    est_target_time = pd.to_datetime(
        np.array(data["meta"]["target_time"]).astype("datetime64[ns]")
    )[0]

    # rounding error because of storing as float, so take the nearest time.
    # ``Index.get_loc(..., method="nearest")`` was removed in pandas 2.0;
    # ``get_indexer`` accepts ``method`` on old and new versions alike.
    input_data_times = pd.to_datetime(stacked_ds.time.values)
    true_target_index = int(
        input_data_times.get_indexer([est_target_time], method="nearest")[0]
    )
    true_target_time = input_data_times[true_target_index]

    assert current_time_index + cfg.horizon == true_target_index

    # :: RECREATE TARGET DATA ::
    pixel_ds = stacked_ds.sel(sample=pixel)
    expected_y = stacked_ds.sel(sample=pixel, time=true_target_time)[
        cfg.target_variable
    ].values
    expected_y_index = pixel_ds.isel(time=true_target_index)[
        cfg.target_variable
    ].values

    assert expected_y == expected_y_index
    assert np.isclose(y.flatten()[-1], expected_y)

    # :: RECREATE INPUT DATA ::
    # max_input_ix is the CURRENT TIME, i.e. the last input timestep
    max_input_ix = int(true_target_index - cfg.horizon)
    assert max_input_ix == current_time_index
    max_input_time = input_data_times[max_input_ix]

    # min_input_ix = the first input time
    min_input_ix = int(max_input_ix - cfg.seq_length) + 1
    min_input_time = input_data_times[min_input_ix]

    input_vars = list(cfg.input_variables)
    if cfg.autoregressive:
        input_vars = input_vars + ["autoregressive"]
    if cfg.encode_doys:
        input_vars = input_vars + ["sin_doy", "cos_doy"]

    # get the exact expected input vector
    # NOTE: label-based slices include BOTH end points, so the window runs
    # from min_input_time to max_input_time inclusive
    expected_x_feature = (
        stacked_ds.sel(sample=pixel, time=slice(min_input_time, max_input_time))[
            input_vars
        ]
        .to_array()
        .values.T
    )

    x_feature = np.array(x).reshape(expected_x_feature.shape)
    assert np.allclose(x_feature, expected_x_feature)
def test_predict(self, tmp_path, use_pred_months, use_latlons, experiment):
    """Train a ``LinearNetwork`` on dummy data and check ``predict``.

    For the ``nowcast`` experiment two extra x variables (precip, temp)
    are merged in. Both returned dictionaries must contain only the
    ``hello`` key and the test ``y`` values must all equal one.
    """
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])

    train_dir = tmp_path / f'features/{experiment}/train/hello'
    train_dir.mkdir(parents=True)
    test_dir = tmp_path / f'features/{experiment}/test/hello'
    test_dir.mkdir(parents=True)

    # static data and its normalising dictionary
    static_data, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / 'features/static'
    static_dir.mkdir(parents=True)
    static_data.to_netcdf(static_dir / 'data.nc')
    static_norm_path = tmp_path / 'features/static/normalizing_dict.pkl'
    with static_norm_path.open('wb') as fh:
        pickle.dump({'VHI': {'mean': 0.0, 'std': 1.0}}, fh)

    norm_dict = {'VHI': {'mean': 0, 'std': 1}}
    if experiment == 'nowcast':
        # if nowcast we need another x feature
        precip, _, _ = _make_dataset(size=(5, 5), const=True,
                                     variable_name='precip')
        temp, _, _ = _make_dataset(size=(5, 5), const=True,
                                   variable_name='temp')
        x = xr.merge([x, precip, temp])
        norm_dict['precip'] = {'mean': 0, 'std': 1}
        norm_dict['temp'] = {'mean': 0, 'std': 1}

    norm_path = tmp_path / f'features/{experiment}/normalizing_dict.pkl'
    with norm_path.open('wb') as fh:
        pickle.dump(norm_dict, fh)

    for split_dir in (test_dir, train_dir):
        x.to_netcdf(split_dir / 'x.nc')
        y.to_netcdf(split_dir / 'y.nc')

    model = LinearNetwork(
        data_folder=tmp_path,
        layer_sizes=[10],
        dropout=0.25,
        experiment=experiment,
        include_pred_month=use_pred_months,
        include_latlons=use_latlons,
    )
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "hello" is the only one which should be in the dictionaries
    for returned in (test_arrays_dict, pred_dict):
        assert ('hello' in returned.keys()) and (len(returned) == 1)

    # _make_dataset with const=True returns all ones
    assert (test_arrays_dict['hello']['y'] == 1).all()