def earnn(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    pretrained=True,
    ignore_vars=None,
):
    """Train (or load) an EA-LSTM and run explanations on one test file.

    With ``pretrained=True`` a previously saved model is loaded from disk;
    otherwise a fresh ``EARecurrentNetwork`` is trained, evaluated and saved
    before the explanation step runs.
    """
    data_path = get_data_path()

    if pretrained:
        predictor = load_model(data_path / f"models/{experiment}/ealstm/model.pt")
    else:
        predictor = EARecurrentNetwork(
            hidden_size=128,
            data_folder=data_path,
            experiment=experiment,
            include_pred_month=include_pred_month,
            surrounding_pixels=surrounding_pixels,
            ignore_vars=ignore_vars,
        )
        predictor.train(num_epochs=50, early_stopping=5)
        predictor.evaluate(save_preds=True)
        predictor.save_model()

    # explain a single known test timestep (March 2018)
    test_file = data_path / f"features/{experiment}/test/2018_3"
    assert test_file.exists()
    all_explanations_for_file(test_file, predictor, batch_size=100)
def earnn(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    pretrained=True,
    ignore_vars=None,
    include_static=True,
):
    """Train (or load) an EA-LSTM for ``experiment``.

    When ``pretrained`` is False, trains a fresh ``EARecurrentNetwork``,
    evaluates it (saving predictions) and saves the model to disk; when True,
    only loads the previously saved model.
    """
    # if the working directory is already ml_drought we don't need ../data
    # (idiomatic replacement for splitting the absolute posix path on "/")
    if Path.cwd().name == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")

    if not pretrained:
        predictor = EARecurrentNetwork(
            hidden_size=128,
            data_folder=data_path,
            experiment=experiment,
            include_pred_month=include_pred_month,
            surrounding_pixels=surrounding_pixels,
            ignore_vars=ignore_vars,
            include_static=include_static,
        )
        predictor.train(num_epochs=50, early_stopping=10)
        predictor.evaluate(save_preds=True)
        predictor.save_model()
    else:
        # NOTE(review): the loaded predictor is not used further here —
        # presumably callers rely on side effects elsewhere; confirm intent
        predictor = load_model(data_path / f"models/{experiment}/ealstm/model.pt")
def earnn(experiment='one_month_forecast', include_pred_month=True,
          surrounding_pixels=None, pretrained=True):
    """Train or load an EA-LSTM, then compute SHAP values for one test file."""
    # run from inside the repo root ("ml_drought") or from one level below it
    in_repo_root = Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought'
    data_path = Path('data') if in_repo_root else Path('../data')

    if pretrained:
        predictor = load_model(data_path / f'models/{experiment}/ealstm/model.pt')
    else:
        predictor = EARecurrentNetwork(
            hidden_size=128,
            data_folder=data_path,
            experiment=experiment,
            include_pred_month=include_pred_month,
            surrounding_pixels=surrounding_pixels,
        )
        predictor.train(num_epochs=50, early_stopping=5)
        predictor.evaluate(save_preds=True)
        predictor.save_model()

    # SHAP explanations for a single known test timestep (March 2018)
    test_file = data_path / f'features/{experiment}/test/2018_3'
    assert test_file.exists()
    all_shap_for_file(test_file, predictor, batch_size=100)
def test_predict(self, tmp_path, use_pred_months):
    """predict() should return exactly one entry, keyed by the test folder name."""
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])

    # identical (x, y) pairs under one train and one test timestep folder
    train_features = tmp_path / 'features/one_month_forecast/train/hello'
    test_features = tmp_path / 'features/one_month_forecast/test/hello'
    for folder in (train_features, test_features):
        folder.mkdir(parents=True)
        x.to_netcdf(folder / 'x.nc')
        y.to_netcdf(folder / 'y.nc')

    norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
    with (tmp_path / 'features/one_month_forecast/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(norm_dict, f)

    # static data + its normalizing dict
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_features = tmp_path / f'features/static'
    static_features.mkdir(parents=True)
    x_static.to_netcdf(static_features / 'data.nc')

    static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
    with (tmp_path / f'features/static/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(static_norm_dict, f)

    model = EARecurrentNetwork(hidden_size=128,
                               dense_features=[10],
                               rnn_dropout=0.25,
                               data_folder=tmp_path)
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "hello" is the only one which should be in the dictionaries
    assert list(test_arrays_dict.keys()) == ['hello']
    assert list(pred_dict.keys()) == ['hello']

    # _make_dataset with const=True returns all ones
    assert (test_arrays_dict['hello']['y'] == 1).all()
def test_train(self, tmp_path, capsys, use_pred_months, use_static_embedding):
    """train() should run one epoch, print its loss, and build an EALSTM."""
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])

    # a single training timestep folder (note: lives under train/, despite
    # the original variable name)
    train_dir = tmp_path / "features/one_month_forecast/train/1980_1"
    train_dir.mkdir(parents=True)

    norm_dict = {"VHI": {"mean": 0, "std": 1}}
    with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(norm_dict, f)

    x.to_netcdf(train_dir / "x.nc")
    y.to_netcdf(train_dir / "y.nc")

    # static data + its normalizing dict
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / f"features/static"
    static_dir.mkdir(parents=True)
    x_static.to_netcdf(static_dir / "data.nc")

    static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
    with (tmp_path / f"features/static/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(static_norm_dict, f)

    model = EARecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
        static_embedding_size=use_static_embedding,
        normalize_y=True,
    )
    model.train()

    # the first epoch's smooth-L1 loss is printed to stdout
    captured = capsys.readouterr()
    assert "Epoch 1, train smooth L1: 0." in captured.out

    assert type(model.model) == EALSTM, f"Model attribute not an EALSTM!"
def earnn(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    pretrained=False,
    explain=False,
    static="features",
    ignore_vars=None,
    num_epochs=50,
    early_stopping=5,
    static_embedding_size=10,
    hidden_size=128,
    predict_delta=False,
    spatial_mask=None,
    include_latlons=False,
    normalize_y=True,
    include_prev_y=True,
    include_yearly_aggs=True,  # new
    clear_nans=True,
    weight_observations=False,
    pred_month_static=False,
):
    """Train (or load) an EA-LSTM, optionally explaining one test file.

    Most keyword arguments are forwarded verbatim to ``EARecurrentNetwork``;
    ``num_epochs``/``early_stopping`` control training, ``pretrained`` skips
    training in favour of a saved model, and ``explain`` triggers the
    explanation step afterwards.
    """
    data_path = get_data_path()

    if pretrained:
        predictor = load_model(data_path / f"models/{experiment}/ealstm/model.pt")
    else:
        predictor = EARecurrentNetwork(
            hidden_size=hidden_size,
            data_folder=data_path,
            experiment=experiment,
            include_pred_month=include_pred_month,
            surrounding_pixels=surrounding_pixels,
            static=static,
            static_embedding_size=static_embedding_size,
            ignore_vars=ignore_vars,
            predict_delta=predict_delta,
            spatial_mask=spatial_mask,
            include_latlons=include_latlons,
            normalize_y=normalize_y,
            include_prev_y=include_prev_y,
            include_yearly_aggs=include_yearly_aggs,
            clear_nans=clear_nans,
            weight_observations=weight_observations,
            pred_month_static=pred_month_static,
        )
        predictor.train(num_epochs=num_epochs, early_stopping=early_stopping)
        predictor.evaluate(save_preds=True)
        predictor.save_model()

    if explain:
        # explain a single known test timestep (March 2018)
        test_file = data_path / f"features/{experiment}/test/2018_3"
        assert test_file.exists()
        all_explanations_for_file(test_file, predictor, batch_size=100)
def test_train(self, tmp_path, capsys, use_pred_months):
    """train() should run one epoch, print its loss, and build an EALSTM."""
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])

    # one training timestep folder with its (x, y) pair
    train_dir = tmp_path / 'features/one_month_forecast/train/hello'
    train_dir.mkdir(parents=True)
    x.to_netcdf(train_dir / 'x.nc')
    y.to_netcdf(train_dir / 'y.nc')

    norm_dict = {'VHI': {'mean': 0, 'std': 1}}
    with (tmp_path / 'features/one_month_forecast/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(norm_dict, f)

    # static data + its normalizing dict
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_dir = tmp_path / f'features/static'
    static_dir.mkdir(parents=True)
    x_static.to_netcdf(static_dir / 'data.nc')

    static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}
    with (tmp_path / f'features/static/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(static_norm_dict, f)

    model = EARecurrentNetwork(hidden_size=128,
                               dense_features=[10],
                               rnn_dropout=0.25,
                               data_folder=tmp_path)
    model.train()

    # the first epoch's smooth-L1 loss is printed to stdout
    captured = capsys.readouterr()
    assert 'Epoch 1, train smooth L1: 0.' in captured.out

    assert type(model.model) == EALSTM, \
        f'Model attribute not an EALSTM!'
def run_models(data_path, experiment):
    """Train, evaluate and save the full model suite for one experiment.

    Runs (in order): persistence baseline, linear regression, linear neural
    network, LSTM, and EA-LSTM. Each model's predictions are saved via
    ``evaluate(save_preds=True)``.
    """
    # NOTE: this model is the same for all experiments
    print("Running persistence model")
    predictor = Persistence(data_path, experiment=experiment)
    predictor.evaluate(save_preds=True)

    # linear regression
    print(f"Running Linear Regression model: {experiment}")
    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    # FIX: previously the linear regression was trained but never evaluated,
    # so (unlike every other model here) its predictions were never saved
    predictor.evaluate(save_preds=True)

    # linear network
    print(f"Running Linear Neural Network model: {experiment}")
    predictor = LinearNetwork(
        data_folder=data_path,
        experiment=experiment,
        layer_sizes=[100],
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()

    # recurrent network
    print(f"Running RNN (LSTM) model: {experiment}")
    predictor = RecurrentNetwork(
        data_folder=data_path,
        hidden_size=128,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()

    # EA LSTM
    print(f"Running Entity Aware LSTM model: {experiment}")
    predictor = EARecurrentNetwork(
        data_folder=data_path,
        hidden_size=128,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()
def test_save(self, tmp_path, monkeypatch):
    """save_model() should persist weights plus every constructor setting."""
    features_per_month = 5
    dense_features = [10]
    input_dense_features = copy(dense_features)
    hidden_size = 128
    rnn_dropout = 0.25
    include_latlons = True
    include_pred_month = True
    include_yearly_aggs = True
    yearly_agg_size = 3

    # stub out real training: just attach a freshly-built EALSTM
    def mocktrain(self):
        self.model = EALSTM(features_per_month, dense_features, hidden_size,
                            rnn_dropout, include_latlons, include_pred_month,
                            experiment='one_month_forecast',
                            yearly_agg_size=yearly_agg_size)
        self.features_per_month = features_per_month
        self.yearly_agg_size = yearly_agg_size

    monkeypatch.setattr(EARecurrentNetwork, 'train', mocktrain)

    model = EARecurrentNetwork(hidden_size=hidden_size,
                               dense_features=dense_features,
                               include_pred_month=include_pred_month,
                               include_latlons=include_latlons,
                               rnn_dropout=rnn_dropout,
                               data_folder=tmp_path,
                               include_yearly_aggs=include_yearly_aggs)
    model.train()
    model.save_model()

    saved_path = tmp_path / 'models/one_month_forecast/ealstm/model.pt'
    assert saved_path.exists(), f'Model not saved!'

    model_dict = torch.load(model.model_dir / 'model.pt', map_location='cpu')

    # every saved tensor must match the in-memory state dict
    for key, val in model_dict['model']['state_dict'].items():
        assert (model.model.state_dict()[key] == val).all()

    assert model_dict['model']['features_per_month'] == features_per_month
    assert model_dict['model']['yearly_agg_size'] == yearly_agg_size

    # table-driven check of the persisted hyperparameters
    expected = {
        'hidden_size': hidden_size,
        'rnn_dropout': rnn_dropout,
        'dense_features': input_dense_features,
        'include_pred_month': include_pred_month,
        'include_latlons': include_latlons,
        'include_yearly_aggs': include_yearly_aggs,
        'experiment': 'one_month_forecast',
    }
    for key, val in expected.items():
        assert model_dict[key] == val
def test_save(self, tmp_path, monkeypatch):
    """save_model() should persist weights plus every constructor setting."""
    features_per_month = 5
    dense_features = [10]
    input_dense_features = copy(dense_features)
    hidden_size = 128
    rnn_dropout = 0.25
    include_latlons = True
    include_pred_month = True
    include_yearly_aggs = True
    yearly_agg_size = 3
    include_prev_y = True
    normalize_y = False

    # stub out real training: just attach a freshly-built EALSTM
    def mocktrain(self):
        self.model = EALSTM(
            features_per_month,
            dense_features,
            hidden_size,
            rnn_dropout,
            include_latlons,
            include_pred_month,
            experiment="one_month_forecast",
            yearly_agg_size=yearly_agg_size,
            include_prev_y=include_prev_y,
        )
        self.features_per_month = features_per_month
        self.yearly_agg_size = yearly_agg_size

    monkeypatch.setattr(EARecurrentNetwork, "train", mocktrain)

    model = EARecurrentNetwork(
        hidden_size=hidden_size,
        dense_features=dense_features,
        include_pred_month=include_pred_month,
        include_latlons=include_latlons,
        rnn_dropout=rnn_dropout,
        data_folder=tmp_path,
        include_yearly_aggs=include_yearly_aggs,
        normalize_y=normalize_y,
    )
    model.train()
    model.save_model()

    saved_path = tmp_path / "models/one_month_forecast/ealstm/model.pt"
    assert saved_path.exists(), f"Model not saved!"

    model_dict = torch.load(model.model_dir / "model.pt", map_location="cpu")

    # every saved tensor must match the in-memory state dict
    for key, val in model_dict["model"]["state_dict"].items():
        assert (model.model.state_dict()[key] == val).all()

    assert model_dict["model"]["features_per_month"] == features_per_month
    assert model_dict["model"]["yearly_agg_size"] == yearly_agg_size

    # table-driven check of the persisted hyperparameters
    expected = {
        "hidden_size": hidden_size,
        "rnn_dropout": rnn_dropout,
        "dense_features": input_dense_features,
        "include_pred_month": include_pred_month,
        "include_latlons": include_latlons,
        "include_yearly_aggs": include_yearly_aggs,
        "experiment": "one_month_forecast",
        "include_prev_y": include_prev_y,
        "normalize_y": normalize_y,
    }
    for key, val in expected.items():
        assert model_dict[key] == val
def test_predict_and_explain(self, tmp_path, use_pred_months, predict_delta):
    """predict() keys on the test folder; Morris explanations write to disk."""
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    y = x.isel(time=[-1])

    # identical (x, y) pairs under one train and one test timestep folder
    train_features = tmp_path / "features/one_month_forecast/train/1980_1"
    test_features = tmp_path / "features/one_month_forecast/test/1980_1"
    for folder in (train_features, test_features):
        folder.mkdir(parents=True)
        x.to_netcdf(folder / "x.nc")
        y.to_netcdf(folder / "y.nc")

    norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
    with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(norm_dict, f)

    # static data + its normalizing dict
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_features = tmp_path / f"features/static"
    static_features.mkdir(parents=True)
    x_static.to_netcdf(static_features / "data.nc")

    static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
    with (tmp_path / f"features/static/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(static_norm_dict, f)

    model = EARecurrentNetwork(
        hidden_size=128,
        dense_features=[10],
        rnn_dropout=0.25,
        data_folder=tmp_path,
        predict_delta=predict_delta,
        normalize_y=True,
    )
    model.train()
    test_arrays_dict, pred_dict = model.predict()

    # the foldername "1980_1" is the only one which should be in the dictionaries
    assert list(test_arrays_dict.keys()) == ["1980_1"]
    assert list(pred_dict.keys()) == ["1980_1"]

    if predict_delta:
        # _make_dataset with const=True & predict_delta returns a change of 0
        assert (test_arrays_dict["1980_1"]["y"] == 0).all()
    else:
        # _make_dataset with const=True returns all ones
        assert (test_arrays_dict["1980_1"]["y"] == 1).all()

    # test the Morris explanation works
    test_dl = next(
        iter(model.get_dataloader(mode="test", to_tensor=True, shuffle_data=False))
    )
    for key, val in test_dl.items():
        output_m = model.explain(val.x, save_explanations=True, method="morris")
        assert type(output_m) is TrainData
        assert (model.model_dir / "analysis/morris_value_historical.npy").exists()
def test_train(
    self,
    tmp_path,
    capsys,
    use_pred_months,
    use_static_embedding,
    static,
    check_inversion,
):
    """train() over two timesteps should run, print its loss, and (optionally)
    produce a static embedding of the expected shape."""
    timesteps = ["2001_11", "2001_12"]

    # make directories
    for ts in timesteps:
        (tmp_path / f"features/one_month_forecast/train/{ts}").mkdir(parents=True)

    norm_dict = {"VHI": {"mean": 0, "std": 1}}
    with (tmp_path / "features/one_month_forecast/normalizing_dict.pkl").open(
        "wb"
    ) as f:
        pickle.dump(norm_dict, f)

    # save the X, y data pairs
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    for ts in timesteps:
        if ts == "2001_12":
            y = x.sel(time="2001-12")
            x_save = x.sel(time=slice("2000-12", "2001-11"))
        else:
            y = x.sel(time="2001-11")
            x_save = x.sel(time=slice("2000-11", "2001-10"))
        # FIX: previously both pairs were written to the directory left over
        # from the first loop (always 2001_12), leaving 2001_11 empty and
        # overwriting 2001_12; write each pair into its own timestep folder
        ts_dir = tmp_path / f"features/one_month_forecast/train/{ts}"
        x_save.to_netcdf(ts_dir / "x.nc")
        y.to_netcdf(ts_dir / "y.nc")

    # static data + its normalizing dict
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    static_features = tmp_path / f"features/static"
    static_features.mkdir(parents=True)
    x_static.to_netcdf(static_features / "data.nc")

    static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}
    with (tmp_path / f"features/static/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(static_norm_dict, f)

    dense_features = [10]
    hidden_size = 128
    rnn_dropout = 0.25

    model = EARecurrentNetwork(
        hidden_size=hidden_size,
        dense_features=dense_features,
        rnn_dropout=rnn_dropout,
        data_folder=tmp_path,
        static_embedding_size=use_static_embedding,
        normalize_y=True,
        include_yearly_aggs=False,
        static=static,
    )
    model.train(check_inversion=check_inversion)

    # the first epoch's smooth-L1 loss is printed to stdout
    captured = capsys.readouterr()
    expected_stdout = "Epoch 1, train smooth L1: 0."
    assert expected_stdout in captured.out

    assert type(model.model) == EALSTM, f"Model attribute not an EALSTM!"

    # ------------------
    # Check static embedding
    # -------------------
    if use_static_embedding is not None:
        all_e, (all_static_x, all_latlons, all_pred_months) = get_static_embedding(
            ealstm=model
        )
        assert (
            all_e[0].shape[0] == 25
        ), f"Expect 25 latlon values (pixels). Got: {all_e[0].shape}"
        assert (
            all_latlons[0].shape[0] == 25
        ), f"Expect 25 latlon values (pixels). Got: {all_e[0].shape}"

        # Moved the PredMonth OHE to the dynamic data
        assert all_static_x[0].shape == (
            25,
            1,  # 13,
        ), f"Expect 13 static dimensions Got: {all_static_x[0].shape}"
# dynamic_ignore_vars=dynamic_ignore_vars, # logy=True, # test_years=test_years, # target_variable=target_var, # ) # print("** Run the Engineer! **") # MODELS from src.models import EARecurrentNetwork ealstm = EARecurrentNetwork( data_folder=data_dir, batch_size=1000, hidden_size=128, experiment='one_timestep_forecast', dynamic=True, seq_length=365, dynamic_ignore_vars=dynamic_ignore_vars, static_ignore_vars=static_ignore_vars, target_var='discharge_spec', test_years=np.arange(2011, 2017), ) print("** Initialised Models! **") # test the training functionality ealstm.train( num_epochs=1, # 100 # early_stopping=10 ) # test the prediction functionality # test_arrays_dict, preds_dict = ealstm.predict()
# TODO: update `evaluate` / `predict` / `get_dataloader` / `train` # TODO: do for EALSTM first and get some results asap rocky data_dir = Path('/Volumes/Lees_Extend/data/ecmwf_sowc/data') dynamic_ignore_vars = ['temperature', 'discharge_vol', 'discharge_spec', 'pet', 'humidity', 'shortwave_rad', 'longwave_rad', 'windspeed'] # import cProfile # import profile from src.models import EARecurrentNetwork ealstm = EARecurrentNetwork( data_folder=data_dir, batch_size=1000, hidden_size=128, experiment='one_timestep_forecast', dynamic=True, seq_length=365, dynamic_ignore_vars=dynamic_ignore_vars, static_ignore_vars=static_ignore_vars, target_var='discharge_spec', test_years=np.arange(2011, 2017), ) # test the training functionality ealstm.train() # test the prediction functionality ealstm.predict() # captured = capsys.readouterr() expected_stdout = "`include_yearly_aggs` does not yet work for dynamic dataloder. Setting to False" assert (