def test_mask(self, tmp_path):
    for i in range(5):
        (tmp_path / f"features/one_month_forecast/train/{i}").mkdir(parents=True)
        (tmp_path / f"features/one_month_forecast/train/{i}/x.nc").touch()
        (tmp_path / f"features/one_month_forecast/train/{i}/y.nc").touch()

    mask_train = [True, True, False, True, False]
    mask_val = [False, False, True, False, True]

    train_paths = DataLoader._load_datasets(
        tmp_path,
        mode="train",
        experiment="one_month_forecast",
        shuffle_data=True,
        mask=mask_train,
    )
    val_paths = DataLoader._load_datasets(
        tmp_path,
        mode="train",
        experiment="one_month_forecast",
        shuffle_data=True,
        mask=mask_val,
    )

    assert (
        len(set(train_paths).intersection(set(val_paths))) == 0
    ), "Got the same file in both train and val set!"
    assert len(train_paths) + len(val_paths) == 5, "Not all files loaded!"
def test_pred_months(self, tmp_path):
    for i in range(1, 13):
        (tmp_path / f"features/one_month_forecast/train/2018_{i}").mkdir(parents=True)
        (tmp_path / f"features/one_month_forecast/train/2018_{i}/x.nc").touch()
        (tmp_path / f"features/one_month_forecast/train/2018_{i}/y.nc").touch()

    pred_months = [4, 5, 6]

    train_paths = DataLoader._load_datasets(
        tmp_path,
        mode="train",
        shuffle_data=True,
        pred_months=pred_months,
        experiment="one_month_forecast",
    )

    assert len(train_paths) == len(
        pred_months
    ), f"Got {len(train_paths)} filepaths back, expected {len(pred_months)}"

    for return_file in train_paths:
        subfolder = return_file.parts[-1]
        month = int(str(subfolder)[5:])
        assert month in pred_months, f"{month} not in {pred_months}, got {return_file}"
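# ------------------------------------------------------------------------------
# Note (illustrative sketch, not project code): the scripts below call a
# `train_val_mask` helper to build complementary boolean masks like the
# `mask_train` / `mask_val` lists exercised in `test_mask` above. Its real
# implementation lives elsewhere in the repo; the function below only sketches
# the assumed behaviour (a random split of file indices into train/val).
# ------------------------------------------------------------------------------
import random
from typing import List, Tuple


def sketch_train_val_mask(num_files: int, val_ratio: float) -> Tuple[List[bool], List[bool]]:
    """Randomly mark roughly `val_ratio` of the files as validation.

    The two returned masks are complementary, so no file is picked up by both
    the train and the validation DataLoader.
    """
    val_indices = set(random.sample(range(num_files), int(num_files * val_ratio)))
    train_mask = [i not in val_indices for i in range(num_files)]
    val_mask = [i in val_indices for i in range(num_files)]
    return train_mask, val_mask


# e.g. sketch_train_val_mask(5, 0.4) might return
# ([True, True, False, True, False], [False, False, True, False, True])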
model = GBDT(
    data_dir,
    include_pred_month=False,
    experiment='one_month_forecast',
    include_monthly_aggs=False,
)
model.train()
model.evaluate(save_preds=True)

# ------------------------
# Inside the weeds
# ------------------------

# ------------------------
# DATA LOADER
# ------------------------
len_mask = len(
    DataLoader._load_datasets(
        data_path, mode='train', shuffle_data=False, experiment='one_month_forecast'
    )
)

# validation split = 10%
val_split = 0.1
train_mask, val_mask = train_val_mask(len_mask, val_split)

ignore_vars = []
train_dataloader = DataLoader(
    data_path=model.data_path,
    batch_file_size=model.batch_size,
    experiment=model.experiment,
    shuffle_data=True,
    mode='train',
    pred_months=model.pred_months,
    mask=train_mask,
    ignore_vars=model.ignore_vars,
    monthly_aggs=model.include_monthly_aggs,
# l.dense_layers
layer_sizes = [100]
dense_layers = nn.ModuleList(
    [
        LinearBlock(
            in_features=layer_sizes[i - 1],
            out_features=layer_sizes[i],
            dropout=dropout,
        )
        for i in range(1, len(layer_sizes))
    ]
)

# final dense layer
# l.final_dense
final_dense = nn.Linear(in_features=layer_sizes[-1], out_features=1)

# DataLoader
len_mask = len(
    DataLoader._load_datasets(data_path, mode="train", shuffle_data=False)
)
train_mask, val_mask = train_val_mask(len_mask, 0.3)

batch_size = 256  # batch_size=5
train_dataloader = DataLoader(
    data_path=data_path,
    batch_file_size=batch_size,
    shuffle_data=True,
    mode="train",
    mask=train_mask,
    to_tensor=True,
)
val_dataloader = DataLoader(
    data_path=data_path,
    batch_file_size=batch_size,
    shuffle_data=True,
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.models import LinearRegression, LinearNetwork, Persistence
from src.models.data import DataLoader

data_path = Path("data")

l = LinearRegression(data_path)
l.train()

ln = LinearNetwork(layer_sizes=[100], data_folder=data_path)
ln.train(num_epochs=10)

# ------------------------------------------------------------------------------
# try and explain the LinearRegression model
# ------------------------------------------------------------------------------
test_arrays_loader = DataLoader(
    data_path=data_path, batch_file_size=1, shuffle_data=False, mode="test"
)
key, val = list(next(iter(test_arrays_loader)).items())[0]
explanations = l.explain(val.x)

# plot the SHAP explanations
# 1. mean spatial and temporal response
mean_expl = explanations.mean(axis=0).mean(axis=0)
x_vars = val.x_vars
df = pd.DataFrame(dict(variables=x_vars, values=mean_expl))
sns.barplot(x="variables", y="values", data=df)
fig = plt.gcf()
plt.title(f"{key} {val.y_var} mean SHAP Values for Linear Regression")
fig.savefig("scripts/mean_variable_importance_linear_regression.png", dpi=300)
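# ------------------------------------------------------------------------------
# Follow-up sketch (assumption): if `explanations` is shaped
# (samples, timesteps, n_features), as the double mean above implies, then
# averaging over samples only keeps the temporal dimension, so variable
# importance can also be inspected per input timestep instead of collapsed to
# a single bar per variable. The axis layout is inferred from the code above,
# not confirmed against the DataLoader internals.
# ------------------------------------------------------------------------------
temporal_expl = explanations.mean(axis=0)  # -> (timesteps, n_features)
temporal_df = pd.DataFrame(temporal_expl, columns=x_vars)
temporal_df.plot()  # one line per input variable, x-axis = input timestep
plt.title(f"{key} {val.y_var} SHAP values per input timestep")
plt.xlabel("input timestep")
plt.ylabel("mean SHAP value")
plt.gcf().savefig(
    "scripts/temporal_variable_importance_linear_regression.png", dpi=300
)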