def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    explain=False,
    static="features",
    ignore_vars=None,
    predict_delta=False,
    spatial_mask=None,
    include_latlons=False,
):
    predictor = LinearRegression(
        get_data_path(),
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        static=static,
        ignore_vars=ignore_vars,
        predict_delta=predict_delta,
        spatial_mask=spatial_mask,
        include_latlons=include_latlons,
    )
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    if explain:
        predictor.explain(save_shap_values=True)
def test_big_mean(self, tmp_path, monkeypatch):

    def mockiter(self):

        class MockIterator:
            def __init__(self):
                self.idx = 0
                self.max_idx = 10

            def __iter__(self):
                return self

            def __next__(self):
                if self.idx < self.max_idx:
                    # batch_size = 10, timesteps = 2, num_features = 1
                    self.idx += 1
                    return (np.ones((10, 2, 1)),
                            np.ones((10,), dtype=np.int8),
                            np.ones((10, 2)),
                            np.ones((10, 2)),
                            np.ones((10, 2)),
                            np.ones((10, 2))), None
                else:
                    raise StopIteration()

        return MockIterator()

    def do_nothing(self, data_path, batch_file_size, shuffle_data, mode,
                   pred_months, surrounding_pixels, ignore_vars):
        pass

    monkeypatch.setattr(DataLoader, '__iter__', mockiter)
    monkeypatch.setattr(DataLoader, '__init__', do_nothing)

    model = LinearRegression(tmp_path)
    calculated_mean = model._calculate_big_mean()

    # 1 for the 2 features and for the first month, 0 for the rest
    expected_mean = np.array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                              0., 0., 1., 1., 1., 1.])
    # np.isclose because of rounding
    assert np.isclose(calculated_mean, expected_mean).all()
def run_models(data_path, experiment):
    # NOTE: this model is the same for all experiments
    print("Running persistence model")
    predictor = Persistence(data_path, experiment=experiment)
    predictor.evaluate(save_preds=True)

    # linear regression
    print(f"Running Linear Regression model: {experiment}")
    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)

    # linear network
    print(f"Running Linear Neural Network model: {experiment}")
    predictor = LinearNetwork(
        data_folder=data_path,
        experiment=experiment,
        layer_sizes=[100],
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()

    # recurrent network
    print(f"Running RNN (LSTM) model: {experiment}")
    predictor = RecurrentNetwork(
        data_folder=data_path,
        hidden_size=128,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()

    # EA LSTM
    print(f"Running Entity Aware LSTM model: {experiment}")
    predictor = EARecurrentNetwork(
        data_folder=data_path,
        hidden_size=128,
        experiment=experiment,
        include_pred_month=True,
        surrounding_pixels=1,
    )
    predictor.train(num_epochs=10, early_stopping=3)
    predictor.evaluate(save_preds=True)
    predictor.save_model()
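# Hypothetical usage sketch for run_models, not part of the original script: the
# "data" directory and the "one_month_forecast" experiment name are assumptions
# borrowed from the other snippets in this collection.
if __name__ == "__main__":
    from pathlib import Path

    run_models(Path("data"), experiment="one_month_forecast")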
def regression(experiment='one_month_forecast',
               include_pred_month=True,
               surrounding_pixels=1):
    # if the working directory is already ml_drought we don't need ../data
    if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
        data_path = Path('data')
    else:
        data_path = Path('../data')

    predictor = LinearRegression(data_path, experiment=experiment,
                                 include_pred_month=include_pred_month,
                                 surrounding_pixels=surrounding_pixels)
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    predictor.explain(save_shap_values=True)
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    ignore_vars=None,
):
    data_path = get_data_path()
    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        ignore_vars=ignore_vars,
        static="embeddings",
        spatial_mask=data_path / "interim/boundaries_preprocessed/kenya_asal_mask.nc",
    )
    predictor.train()
    predictor.evaluate(save_preds=True)

    # mostly to test it works
    predictor.explain(save_shap_values=True)
def test_save(self, tmp_path, monkeypatch):
    coef_array = np.array([1, 1, 1, 1, 1])
    intercept_array = np.array([2])

    def mocktrain(self):
        class MockModel:
            @property
            def coef_(self):
                return coef_array

            @property
            def intercept_(self):
                return intercept_array

        self.model = MockModel()

    monkeypatch.setattr(LinearRegression, "train", mocktrain)

    model = LinearRegression(
        tmp_path, experiment="one_month_forecast", normalize_y=False
    )
    model.train()
    model.save_model()

    assert (
        tmp_path / "models/one_month_forecast/linear_regression/model.pkl"
    ).exists(), "Model not saved!"

    with (tmp_path / "models/one_month_forecast/linear_regression/model.pkl").open(
        "rb"
    ) as f:
        model_dict = pickle.load(f)

    assert np.array_equal(
        coef_array, model_dict["model"]["coef"]
    ), "Different coef array saved!"
    assert np.array_equal(
        intercept_array, model_dict["model"]["intercept"]
    ), "Different intercept array saved!"
    assert (
        model_dict["experiment"] == "one_month_forecast"
    ), "Different experiment saved!"
def regression(
    experiment="one_month_forecast",
    include_pred_month=True,
    surrounding_pixels=None,
    ignore_vars=None,
    include_static=True,
):
    # if the working directory is already ml_drought we don't need ../data
    if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")

    predictor = LinearRegression(
        data_path,
        experiment=experiment,
        include_pred_month=include_pred_month,
        surrounding_pixels=surrounding_pixels,
        ignore_vars=ignore_vars,
        include_static=include_static,
    )
    predictor.train(early_stopping=5)
    predictor.evaluate(save_preds=True)
def test_save(self, tmp_path, monkeypatch):
    coef_array = np.array([1, 1, 1, 1, 1])
    intercept_array = np.array([2])

    def mocktrain(self):
        class MockModel:
            @property
            def coef_(self):
                return coef_array

            @property
            def intercept_(self):
                return intercept_array

        self.model = MockModel()

    monkeypatch.setattr(LinearRegression, 'train', mocktrain)

    model = LinearRegression(tmp_path, experiment='one_month_forecast')
    model.train()
    model.save_model()

    assert (tmp_path / 'models/one_month_forecast/linear_regression/model.pkl').exists(), \
        'Model not saved!'

    with (tmp_path / 'models/one_month_forecast/linear_regression/model.pkl').open('rb') as f:
        model_dict = pickle.load(f)

    assert np.array_equal(coef_array, model_dict['model']['coef']), \
        'Different coef array saved!'
    assert np.array_equal(intercept_array, model_dict['model']['intercept']), \
        'Different intercept array saved!'
    assert model_dict['experiment'] == 'one_month_forecast', \
        'Different experiment saved!'
def test_train(
    self, tmp_path, capsys, use_pred_months, experiment, monthly_agg, predict_delta
):
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    y = x.isel(time=[-1])

    x_add1, _, _ = _make_dataset(size=(5, 5), const=True, variable_name="precip")
    x_add1 = x_add1 * 2
    x_add2, _, _ = _make_dataset(size=(5, 5), const=True, variable_name="temp")
    x_add2 = x_add2 * 3
    x = xr.merge([x, x_add1, x_add2])

    norm_dict = {
        "VHI": {"mean": 0, "std": 1},
        "precip": {"mean": 0, "std": 1},
        "temp": {"mean": 0, "std": 1},
    }
    static_norm_dict = {"VHI": {"mean": 0.0, "std": 1.0}}

    test_features = tmp_path / f"features/{experiment}/train/2001_12"
    test_features.mkdir(parents=True)
    pred_features = tmp_path / f"features/{experiment}/test/2001_12"
    pred_features.mkdir(parents=True)
    static_features = tmp_path / "features/static"
    static_features.mkdir(parents=True)

    with (tmp_path / f"features/{experiment}/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(norm_dict, f)
    with (tmp_path / "features/static/normalizing_dict.pkl").open("wb") as f:
        pickle.dump(static_norm_dict, f)

    x.to_netcdf(test_features / "x.nc")
    x.to_netcdf(pred_features / "x.nc")
    y.to_netcdf(test_features / "y.nc")
    y.to_netcdf(pred_features / "y.nc")
    x_static.to_netcdf(static_features / "data.nc")

    model = LinearRegression(
        tmp_path,
        include_pred_month=use_pred_months,
        experiment=experiment,
        include_monthly_aggs=monthly_agg,
        predict_delta=predict_delta,
        normalize_y=True,
    )
    model.train()

    captured = capsys.readouterr()
    expected_stdout = "Epoch 1, train RMSE: "
    assert (
        expected_stdout in captured.out
    ), f"Expected stdout to be {expected_stdout}, got {captured.out}"

    assert (
        type(model.model) == linear_model.SGDRegressor
    ), "Model attribute not a linear regression!"

    if experiment == "nowcast":
        coef_size = (3 * 35) + 2
    elif experiment == "one_month_forecast":
        coef_size = 3 * 36

    if monthly_agg:
        # doubled including the mean, tripled including the std
        coef_size *= 2
    if use_pred_months:
        coef_size += 12

    coef_size += 3  # for the yearly aggs
    coef_size += 1  # for the static variable
    coef_size += 1  # for the prev_y_var

    assert model.model.coef_.size == coef_size, "Got unexpected coef size"

    test_arrays_dict, preds_dict = model.predict()
    assert (
        test_arrays_dict["2001_12"]["y"].size == preds_dict["2001_12"].shape[0]
    ), "Expected length of test arrays to be the same as the predictions"

    # test saving the model outputs
    model.evaluate(save_preds=True)

    save_path = model.data_path / "models" / experiment / "linear_regression"
    assert (save_path / "preds_2001_12.nc").exists()
    assert (save_path / "results.json").exists()

    pred_ds = xr.open_dataset(save_path / "preds_2001_12.nc")
    assert np.isin(["lat", "lon", "time"], [c for c in pred_ds.coords]).all()
    assert y.time == pred_ds.time
def test_big_mean(self, tmp_path, monkeypatch):

    def mockiter(self):

        class MockIterator:
            def __init__(self):
                self.idx = 0
                self.max_idx = 10

            def __iter__(self):
                return self

            def __next__(self):
                if self.idx < self.max_idx:
                    # batch_size = 10, timesteps = 2, num_features = 1
                    self.idx += 1
                    return (
                        (
                            np.ones((10, 2, 1)),
                            np.ones((10,), dtype=np.int8),
                            np.ones((10, 2)),
                            np.ones((10, 2)),
                            np.ones((10, 2)),
                            np.ones((10, 2)),
                            np.ones((10, 1)),
                        ),
                        None,
                    )
                else:
                    raise StopIteration()

        return MockIterator()

    def do_nothing(
        self, data_path, batch_file_size, mode, shuffle_data, clear_nans,
        normalize, experiment, mask, pred_months, to_tensor, surrounding_pixels,
        ignore_vars, monthly_aggs, static, device, predict_delta, spatial_mask,
        normalize_y, incl_yearly_aggs
    ):
        pass

    monkeypatch.setattr(DataLoader, "__iter__", mockiter)
    monkeypatch.setattr(DataLoader, "__init__", do_nothing)

    model = LinearRegression(tmp_path, normalize_y=False)
    calculated_mean = model._calculate_big_mean()

    # 1 for the 2 features and for the first month, 0 for the rest
    expected_mean = np.array(
        [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    )
    # np.isclose because of rounding
    assert np.isclose(calculated_mean, expected_mean).all()
# train models
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.models import LinearRegression, LinearNetwork, Persistence
from src.models.data import DataLoader

data_path = Path("data")

l = LinearRegression(data_path)
l.train()

ln = LinearNetwork(layer_sizes=[100], data_folder=data_path)
ln.train(num_epochs=10)

# ------------------------------------------------------------------------------
# try and explain the LinearRegression model
# ------------------------------------------------------------------------------
test_arrays_loader = DataLoader(
    data_path=data_path, batch_file_size=1, shuffle_data=False, mode="test"
)
key, val = list(next(iter(test_arrays_loader)).items())[0]
explanations = l.explain(val.x)

# plot the SHAP explanations
# 1. mean spatial and temporal response
mean_expl = explanations.mean(axis=0).mean(axis=0)
x_vars = val.x_vars
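# A minimal sketch (not from the original script) of one way to visualise the
# mean SHAP response computed above, assuming `mean_expl` holds one value per
# variable in `x_vars`; it only uses the matplotlib import already made above.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(range(len(x_vars)), mean_expl)  # one bar per input variable
ax.set_xticks(range(len(x_vars)))
ax.set_xticklabels(x_vars, rotation=45, ha="right")
ax.set_ylabel("mean SHAP value")
ax.set_title("Mean spatial and temporal SHAP response (LinearRegression)")
plt.tight_layout()
plt.show()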
import xarray as xr
from pathlib import Path

from src.models import Persistence, LinearRegression

data_path = Path("data")

# high level api
predictor = Persistence(data_path)
predictor.evaluate(save_preds=True)

predictor = LinearRegression(data_path)
predictor.evaluate(save_preds=True)

# 1 LOAD_TRAIN_DATA get into the guts
x, y = predictor.load_train_arrays()

# 2 ds_folder_to_np (load x/y from the train data)
train_data_path = predictor.data_path / "features/train"
out_x, out_y = [], []
for subtrain in train_data_path.iterdir():
    if (subtrain / "x.nc").exists() and (subtrain / "y.nc").exists():
        arrays = predictor.ds_folder_to_np(
            subtrain, clear_nans=True, return_latlons=False
        )
        out_x.append(arrays.x)
        out_y.append(arrays.y)

# 3 open a single train folder directly
folder = subtrain  # e.g. the last train subfolder visited above
x, y = xr.open_dataset(folder / "x.nc"), xr.open_dataset(folder / "y.nc")
x_np, y_np = x.to_array().values, y.to_array().values
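# A hedged sketch, not part of the original notes: stack the per-folder arrays
# collected in step 2 into single train matrices, assuming each `arrays.x` /
# `arrays.y` is a numpy array with samples on the first axis.
import numpy as np

train_x = np.concatenate(out_x, axis=0)
train_y = np.concatenate(out_y, axis=0)
print(train_x.shape, train_y.shape)  # (num_samples, ...) and (num_samples, ...)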
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pickle

from src.analysis import plot_shap_values
from src.models import Persistence, LinearRegression, LinearNetwork
from src.models.data import DataLoader

%load_ext autoreload
%autoreload 2

data_dir = Path('/Volumes/Lees_Extend/data/ecmwf_sowc/data')

predictor = LinearRegression(data_folder=data_dir, experiment='nowcast')
predictor.train()
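# Hedged continuation, not in the original notebook cell: the evaluate/explain
# calls mirror the regression() helpers elsewhere in this collection, and the
# save_shap_values flag is assumed to behave as it does there.
predictor.evaluate(save_preds=True)
predictor.explain(save_shap_values=True)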
def test_train(self, tmp_path, capsys, use_pred_months, experiment, monthly_agg):
    x, _, _ = _make_dataset(size=(5, 5), const=True)
    x_static, _, _ = _make_dataset(size=(5, 5), add_times=False)
    y = x.isel(time=[-1])

    x_add1, _, _ = _make_dataset(size=(5, 5), const=True, variable_name='precip')
    x_add2, _, _ = _make_dataset(size=(5, 5), const=True, variable_name='temp')
    x = xr.merge([x, x_add1, x_add2])

    norm_dict = {
        'VHI': {'mean': 0, 'std': 1},
        'precip': {'mean': 0, 'std': 1},
        'temp': {'mean': 0, 'std': 1},
    }
    static_norm_dict = {'VHI': {'mean': 0.0, 'std': 1.0}}

    test_features = tmp_path / f'features/{experiment}/train/hello'
    test_features.mkdir(parents=True)
    pred_features = tmp_path / f'features/{experiment}/test/hello'
    pred_features.mkdir(parents=True)
    static_features = tmp_path / 'features/static'
    static_features.mkdir(parents=True)

    with (tmp_path / f'features/{experiment}/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(norm_dict, f)
    with (tmp_path / 'features/static/normalizing_dict.pkl').open('wb') as f:
        pickle.dump(static_norm_dict, f)

    x.to_netcdf(test_features / 'x.nc')
    x.to_netcdf(pred_features / 'x.nc')
    y.to_netcdf(test_features / 'y.nc')
    y.to_netcdf(pred_features / 'y.nc')
    x_static.to_netcdf(static_features / 'data.nc')

    model = LinearRegression(tmp_path, include_pred_month=use_pred_months,
                             experiment=experiment,
                             include_monthly_aggs=monthly_agg)
    model.train()

    captured = capsys.readouterr()
    expected_stdout = 'Epoch 1, train RMSE: '
    assert expected_stdout in captured.out, \
        f'Expected stdout to be {expected_stdout}, got {captured.out}'

    assert type(model.model) == linear_model.SGDRegressor, \
        'Model attribute not a linear regression!'

    if experiment == 'nowcast':
        coef_size = (3 * 35) + 2
    elif experiment == 'one_month_forecast':
        coef_size = (3 * 36)

    if monthly_agg:
        # doubled including the mean, tripled including the std
        coef_size *= 2
    if use_pred_months:
        coef_size += 12

    coef_size += 3  # for the yearly aggs
    coef_size += 1  # for the static variable

    assert model.model.coef_.size == coef_size, 'Got unexpected coef size'

    test_arrays_dict, preds_dict = model.predict()
    assert (
        test_arrays_dict['hello']['y'].size == preds_dict['hello'].shape[0]
    ), 'Expected length of test arrays to be the same as the predictions'

    # test saving the model outputs
    model.evaluate(save_preds=True)

    save_path = model.data_path / 'models' / experiment / 'linear_regression'
    assert (save_path / 'preds_hello.nc').exists()
    assert (save_path / 'results.json').exists()

    pred_ds = xr.open_dataset(save_path / 'preds_hello.nc')
    assert np.isin(['lat', 'lon', 'time'], [c for c in pred_ds.coords]).all()
    assert y.time == pred_ds.time