def test_dump_config(self, tmp_path: Path):
    """Dump a loaded config into a run directory and inspect the written file.

    Checks that ``config.yml`` is created by ``cfg.dump_config`` and that the
    keys listed below are present in it when read back with a safe YAML
    loader.
    """
    run_dir = tmp_path / "runs"
    run_dir.mkdir(exist_ok=True, parents=True)

    cfg = Config(cfg_path=Path("tests/testconfigs/test_config.yml"))
    cfg.run_dir = run_dir

    # the dump has to create the file inside the run directory
    cfg.dump_config(run_dir)
    written_names = [entry.name for entry in run_dir.glob("*")]
    assert "config.yml" in written_names

    # read the dumped file back in
    with (run_dir / "config.yml").open("r") as fp:
        dumped = YAML(typ="safe").load(fp)

    # keys the test expects to find in the dump (the original test labels
    # them as values coming from defaults)
    dumped_keys = list(dumped.keys())
    for key in (
        "autoregressive",
        "pixel_dims",
        "num_workers",
        "seed",
        "device",
        "learning_rate",
        "time_str",
        "run_dir",
    ):
        assert key in dumped_keys
def test_dataset(self, tmp_path):
    """Check ``XarrayDataset`` samples against the raw dataset for two configs.

    For each config the first ten samples are drawn with ``__getitem__``.
    The shapes of ``x_d`` and ``y`` are checked, and the un-normalised
    target is compared with the value selected from the raw ``xr.Dataset``
    at the sample's pixel and target time.
    """
    target_variable = "target"
    input_variables = ["feature"]
    config_paths = [
        Path("tests/testconfigs/test_config_simulate.yml"),
        Path("tests/testconfigs/test_config.yml"),
    ]

    for path in config_paths:
        cfg = Config(path)
        cfg._cfg["forecast_variables"] = cfg.input_variables
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        raw_ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        static = create_static(cfg=cfg, ds=raw_ds)
        ds = XarrayDataset(
            raw_ds, cfg=cfg, mode="train", DEBUG=True, static_data=static
        )

        assert ds.target == target_variable

        # Build the expectation first and compare afterwards. Writing
        # `assert (a == b if cond else c)` parses as `(a == b) if cond else c`,
        # which asserts a non-empty list (always truthy) whenever `cond` is
        # False, so the non-autoregressive case was never really checked.
        expected_inputs = (
            input_variables + ["autoregressive"]
            if cfg.autoregressive
            else input_variables
        )
        assert ds.inputs == expected_inputs

        x_features = (
            len(input_variables) + 1 if cfg.autoregressive else len(input_variables)
        )
        seq_length = cfg.seq_length
        # two extra columns are expected when day-of-year encoding is on
        expected_x_shape = (
            seq_length,
            x_features + 2 if cfg.encode_doys else x_features,
        )

        for i in range(10):
            data = ds.__getitem__(i)
            x, y = data["x_d"], data["y"]

            assert y.shape == (1, 1)
            assert (
                x.shape == expected_x_shape
            ), f"Shape Mismatch! Expect: {expected_x_shape} Got: {x.shape}"

            meta = data["meta"]
            times = (
                meta["target_time"]
                .detach()
                .numpy()
                .astype("datetime64[ns]")
                .flatten()
            )
            pixel, _ = ds.lookup_table[int(meta["index"])]
            # the pixel id is split on "_" into floats used as (lat, lon)
            latlon = tuple(float(part) for part in str(pixel).split("_"))

            y_unnorm = (
                ds.normalizer.individual_inverse(y, pixel, variable="target")
                .detach()
                .numpy()
            )

            # extract from the original xr.Dataset
            y_exp = raw_ds.sel(
                lat=latlon[0], lon=latlon[1], time=times, method="nearest"
            )[cfg.target_variable].values

            # allclose reduces to a single bool, so the assert stays valid
            # even if more than one value is ever compared
            assert np.allclose(y_unnorm.reshape(y_exp.shape), y_exp, atol=1e-5)
def test_pollution(self, tmp_path):
    """Build a ``Trainer`` on the Beijing pollution data and check it loaded data.

    The training dataset behind ``trainer.train_dl`` must expose a
    non-empty ``lookup_table``, ``y`` and ``x_d``.
    """
    ds = get_pollution_data_beijing().to_xarray()
    cfg = Config(cfg_path=Path("tests/testconfigs/pollution.yml"))
    cfg.run_dir = tmp_path

    trainer = Trainer(cfg, ds)

    # The original also built a manual train-period selection of `ds` that
    # was never used; it has been dropped as dead code.
    train_dataset = trainer.train_dl.dataset
    assert train_dataset.lookup_table != {}
    assert train_dataset.y != {}
    assert train_dataset.x_d != {}
def test_train_test_split(self, tmp_path):
    """Smoke-test ``train_test_split`` for the three subsets.

    NOTE(review): nothing is asserted about the returned subsets. The test
    only checks that the three calls and the six date attributes of the
    config can be evaluated without raising. Real assertions need the
    contract of ``train_test_split``, which is not visible here.
    """
    ds = create_linear_ds().isel(lat=slice(0, 5), lon=slice(0, 5))
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    cfg.run_dir = tmp_path

    # one split per subset, in the same order as before
    subsets = {
        name: train_test_split(ds, cfg, subset=name)
        for name in ("train", "test", "validation")
    }

    # touch every date boundary on the config (results are discarded)
    for attribute in (
        "train_start_date",
        "train_end_date",
        "validation_start_date",
        "validation_end_date",
        "test_start_date",
        "test_end_date",
    ):
        getattr(cfg, attribute)
def test_dataloader(self, tmp_path):
    """Draw one batch from ``PixelDataLoader`` and check the shape of ``x_d``."""
    ds = _make_dataset()
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    loader = PixelDataLoader(
        ds,
        cfg=cfg,
        num_workers=1,
        mode="train",
        batch_size=cfg.batch_size,
        static_data=create_static(cfg=cfg, ds=ds),
    )
    assert loader.batch_size == cfg.batch_size

    seq_length = cfg.seq_length
    autoregressive = cfg.autoregressive

    batch = next(iter(loader))
    # both keys are read, although only the dynamic inputs are checked
    x, _ = batch["x_d"], batch["y"]

    # one input feature, plus one more column when autoregressive
    n_inputs = len(["features"])
    if autoregressive:
        n_inputs += 1

    # two further columns are expected when `cfg.encode_doys` is set
    n_columns = n_inputs + 2 if cfg.encode_doys else n_inputs
    assert x.shape == (
        cfg.batch_size,
        seq_length,
        n_columns,
    ), f"Size Mismatch! Expected: {(cfg.batch_size, seq_length, n_inputs)} Got: {x.shape}"
def test_kenya_data(self, tmp_path):
    """Check the first batch drawn from the pickled Kenya dataset.

    Runs only when ``TEST_REAL_DATA`` is truthy; otherwise it returns
    immediately, which matches the original ``else: pass``.
    """
    if not TEST_REAL_DATA:
        return

    # Close the file handle deterministically instead of leaking it.
    # NOTE: pickle is only acceptable here because this is a local fixture.
    with Path("data/kenya.pkl").open("rb") as fp:
        ds = pickle.load(fp).isel(lat=slice(0, 5), lon=slice(0, 5))

    cfg = Config(Path("tests/testconfigs/config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)
    dl = PixelDataLoader(
        ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
    )

    data = next(iter(dl))
    x, _ = data["x_d"], data["y"]

    # values the config file is expected to contain
    batch_size = 256
    seq_length = cfg.seq_length
    input_variables = ["precip", "t2m", "SMsurf"]
    autoregressive = True
    n_inputs = len(input_variables) + 1 if autoregressive else len(input_variables)

    assert cfg.batch_size == batch_size
    assert cfg.autoregressive == autoregressive
    assert x.shape == (
        batch_size,
        seq_length,
        n_inputs,
    ), f"X Data Mismatch! Expected: {(batch_size, seq_length, n_inputs)} Got: {x.shape}"
def test_runoff_data(self, tmp_path):
    """Check the first training batch built from the runoff NetCDF file.

    Runs only when ``TEST_REAL_DATA`` is truthy; otherwise it returns
    immediately, which matches the original ``else: pass``.
    """
    if not TEST_REAL_DATA:
        return

    ds = xr.open_dataset("data/ALL_dynamic_ds.nc").isel(station_id=slice(0, 5))
    cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # train period
    input_variables = [] if cfg.input_variables is None else cfg.input_variables
    train_ds = ds[input_variables + [cfg.target_variable]].sel(
        time=slice(cfg.train_start_date, cfg.train_end_date)
    )
    train_dl = PixelDataLoader(
        train_ds,
        cfg=cfg,
        mode="train",
        num_workers=4,
        batch_size=cfg.batch_size,
    )

    # check data is loaded properly
    data = next(iter(train_dl))
    x, y = data["x_d"], data["y"]

    # Count via the None-safe local: `len(cfg.input_variables)` would raise
    # TypeError in exactly the case the guard above was written for.
    n_in_vars = len(input_variables) + 1 if cfg.autoregressive else len(input_variables)

    assert x.shape == (cfg.batch_size, cfg.seq_length, n_in_vars)
    assert y.shape == (cfg.batch_size, 1, 1)
def test_static_inputs(self, tmp_path):
    """Placeholder for a static-inputs test: builds fixtures, then fails.

    NOTE(review): the final ``assert False`` makes this test fail
    unconditionally, presumably as a marker that it is unfinished.
    """
    dynamic = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
    # time-mean of the dynamic data; not consumed by anything yet
    time_mean = dynamic.mean(dim="time")

    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    assert False
def test_dataset_beijing(self, tmp_path):
    """Build an ``XarrayDataset`` from the Beijing pollution data.

    The body only runs when ``is_connected()`` is truthy; otherwise the
    test returns without asserting anything.
    """
    if not is_connected():
        return

    cfg = Config(Path("tests/testconfigs/pollution.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # keep only the first 1000 time steps
    frame = get_pollution_data_beijing()
    raw_ds = frame.to_xarray().isel(time=slice(0, 1000))

    dataset = XarrayDataset(raw_ds, cfg=cfg, mode="train", DEBUG=True)
    assert dataset.y != {}
def test_forecast_inputs(self, tmp_path):
    """Placeholder for a forecast-inputs test: builds fixtures, then fails.

    NOTE(review): the final ``assert False`` makes this test fail
    unconditionally, presumably as a marker that it is unfinished.
    """
    base = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

    # shift by one time step, rename the feature and drop the target
    shifted = base.shift(time=1)
    forecast = shifted.rename({"feature": "feature_fcast1"}).drop("target")
    merged = xr.merge([base, forecast])

    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    assert False
def test_normalizer(self, tmp_path):
    """Placeholder: unpickle both saved normalizers, then fail.

    Both pickles are read from ``cfg.run_dir``. ``tmp_path`` is not used.

    NOTE(review): the final ``assert False`` makes this test fail
    unconditionally, presumably as a marker that it is unfinished.
    """
    cfg = Config(Path("tests/testconfigs/test_config.yml"))

    # Load each pickle inside a context manager so the file handles are
    # closed, and keep the two objects apart (the second load used to
    # overwrite the first).
    with (cfg.run_dir / "normalizer.pkl").open("rb") as fp:
        normalizer = pickle.load(fp)
    with (cfg.run_dir / "static_normalizer.pkl").open("rb") as fp:
        static_normalizer = pickle.load(fp)

    # As before, only the static normalizer's fitted attributes are touched.
    static_normalizer.std_
    static_normalizer.mean_

    assert False
def test_longer_horizon_fcast(self, tmp_path):
    """Check the target shape of one batch under the horizon config."""
    cfg = Config(Path("tests/testconfigs/test_1d_config_horizon.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    loader = PixelDataLoader(
        load_test_jena_data_as_dataset(),
        cfg=cfg,
        num_workers=1,
        mode="train",
        batch_size=cfg.batch_size,
    )

    batch = next(iter(loader))
    # both keys are read; only the target is checked
    _, y = batch["x_d"], batch["y"]

    assert y.shape == (cfg.batch_size, 1, 1)
def test_runoff_example(self, tmp_path):
    """Train for three epochs and test on a three-basin runoff subset.

    Returns:
        The ``(losses, preds)`` pair from ``trainer.train_and_validate()``
        and ``tester.run_test()``.
    """
    cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # override data locations, static inputs and the epoch count
    overrides = {
        "data_path": Path("data/ALL_dynamic_ds.nc"),
        "static_data_path": Path("data/camels_static.nc"),
        "static_inputs": ["p_mean", "pet_mean", "area", "gauge_elev"],
        "n_epochs": 3,
    }
    for key, value in overrides.items():
        cfg._cfg[key] = value

    ds, static = load_data(cfg)

    # select subset of 3 basins
    basins = [1001, 2001, 2002]
    ds = ds.sel(station_id=basins)
    static = static.sel(station_id=basins)

    trainer = Trainer(cfg, ds, static_data=static)
    train_period = slice(cfg.train_start_date, cfg.train_end_date)
    self.check_loaded_data(cfg, trainer, data=ds.sel(time=train_period))

    losses = trainer.train_and_validate()
    preds = Tester(cfg, ds, static_data=static).run_test()
    return losses, preds
def test_kenya_vci_example(self, tmp_path):
    """Train for three epochs and test on the Kenya NetCDF data.

    Returns:
        The ``(losses, preds)`` pair from ``trainer.train_and_validate()``
        and ``tester.run_test()``.
    """
    cfg = Config(Path("tests/testconfigs/config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # point the config at the data file and shorten the training
    for key, value in (("data_path", Path("data/kenya.nc")), ("n_epochs", 3)):
        cfg._cfg[key] = value

    ds, static = load_data(cfg)

    trainer = Trainer(cfg, ds, static_data=static)
    train_period = slice(cfg.train_start_date, cfg.train_end_date)
    self.check_loaded_data(cfg, trainer, data=ds.sel(time=train_period))

    losses = trainer.train_and_validate()
    preds = Tester(cfg, ds, static_data=static).run_test()
    return losses, preds
def test_linear_example(self):
    """Train and test on the noisy linear dataset with two static inputs.

    Loss curves are saved once after training; the prediction timeseries
    is saved twice after testing.
    """
    ds = create_linear_ds(epsilon_sigma=10)
    static_data = create_static_example_data(ds)

    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    cfg._cfg["static_inputs"] = ["static_const", "static_rand"]

    # Train
    trainer = Trainer(cfg, ds, static_data=static_data)
    train_period = slice(cfg.train_start_date, cfg.train_end_date)
    self.check_loaded_data(cfg, trainer, data=ds.sel(time=train_period))
    save_loss_curves(trainer.train_and_validate(), cfg)

    # Test
    preds = Tester(cfg, ds, static_data=static_data).run_test()

    # written out twice, exactly as the original loop did
    save_timeseries(preds, cfg)
    save_timeseries(preds, cfg)
def test_1D_data(self, tmp_path):
    """Check batch shapes for the 1D Jena dataset (built from pandas)."""
    jena_ds = load_test_jena_data_as_dataset()

    cfg = Config(Path("tests/testconfigs/test_1d_config_horizon.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    loader = PixelDataLoader(
        jena_ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
    )
    batch = next(iter(loader))
    x, y = batch["x_d"], batch["y"]

    expected_x_shape = (cfg.batch_size, cfg.seq_length, len(cfg.input_variables))
    assert x.shape == expected_x_shape
    assert y.shape == (cfg.batch_size, 1, 1)
def test_lstm_forward_pass(self, tmp_path):
    """Run one batch through ``LSTM`` and check the keys of its output.

    The model output must contain ``"h_n"``, ``"c_n"`` and ``"y_hat"``.
    """
    # Close the file handle deterministically instead of leaking it.
    # NOTE: pickle is only acceptable here because this is a local fixture.
    with Path("data/kenya.pkl").open("rb") as fp:
        ds = pickle.load(fp)

    cfg = Config(Path("tests/testconfigs/config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    dl = PixelDataLoader(ds, cfg=cfg, mode="train")
    model = LSTM(
        # the three input sizes reported by the loader are summed
        input_size=dl.input_size + dl.static_input_size + dl.forecast_input_size,
        hidden_size=cfg.hidden_size,
        output_size=dl.output_size,
        forecast_horizon=dl.horizon,
    )

    data = next(iter(dl))
    y_hat = model(data)

    assert all(np.isin(["h_n", "c_n", "y_hat"], list(y_hat.keys())))
def test_linear_regression_forward_pass(self, tmp_path):
    """Run one batch through ``LinearRegression`` and check its output.

    The output must be a dict whose ``"y_hat"`` entry has shape ``(1, 1)``.
    """
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    loader = PixelDataLoader(_make_dataset(), cfg=cfg, mode="train", DEBUG=True)

    # per-timestep input width, multiplied by the sequence length
    features_per_step = (
        loader.input_size + loader.static_input_size + loader.forecast_input_size
    )
    model = LinearRegression(
        input_size=features_per_step * cfg.seq_length,
        output_size=loader.output_size,
        forecast_horizon=loader.horizon,
    )

    prediction = model(next(iter(loader)))

    assert isinstance(prediction, dict)
    assert prediction["y_hat"].shape == (1, 1)
def test_config(self):
    """Load every yml under ``tests/testconfigs`` and check its attributes.

    For each config: ``test_start_date`` is a ``pd.Timestamp``, ``data_dir``
    is a ``Path``, every mandatory key is both an attribute and a key of
    ``cfg._cfg``, and every default key is an attribute. Afterwards,
    ``pixel_dims`` is removed from one config to check the fallback value.

    NOTE(review): SOURCE was whitespace-flattened, so the nesting of the
    final two statements (after the loop vs. inside it) is reconstructed.
    Confirm against the original file.
    """
    paths = list(Path("tests/testconfigs").glob("*.yml"))
    for path in paths:
        cfg = Config(cfg_path=path)
        assert isinstance(cfg.test_start_date, pd.Timestamp)
        assert isinstance(cfg.data_dir, Path)
        # mandatory keys must be reachable as attributes and as raw keys
        assert all(np.isin(cfg._mandatory_keys, list(dir(cfg))))
        assert all(np.isin(cfg._mandatory_keys, list(cfg._cfg.keys())))
        # every default must be reachable as an attribute
        assert all(np.isin(list(cfg._defaults.keys()), list(dir(cfg))))
        if cfg.file_path.name == "test_1d_config.yml":
            assert cfg.pixel_dims == ["pixel"]

    # TODO: test default args
    # Uses whichever config the loop handled last; `original` is not read
    # again afterwards.
    original = cfg._cfg.pop("pixel_dims", None)
    assert cfg.pixel_dims == ["lat", "lon"], "Expect to return the default"
def check_output_files(tmp_path: Path):
    """Assert that the latest ``runs/test*`` directory holds the run outputs.

    The directory sorted last among ``tmp_path / "runs/test*"`` is
    inspected for:

    * one ``model_epoch`` file per configured epoch,
    * ``static_normalizer.pkl`` when the config has static inputs,
    * ``normalizer.pkl``,
    * at least one ``*.nc`` file.

    Args:
        tmp_path: directory that contains the ``runs`` folder.
    """
    # check the saved files (model and optimizer epochs)
    candidate_dirs = sorted(tmp_path.glob("runs/test*"))
    test_dir = candidate_dirs[-1]
    created_files = sorted(entry.name for entry in test_dir.iterdir())

    cfg = Config(test_dir / "config.yml")

    n_model_files = sum("model_epoch" in name for name in created_files)
    assert n_model_files == cfg.n_epochs

    if cfg.static_inputs is not None:
        assert (
            "static_normalizer.pkl" in created_files
        ), f"Expected the static normalizer to be saved. Not found in: {pformat(created_files)}"

    assert (
        "normalizer.pkl" in created_files
    ), f"Expected the normalizer to be saved. Not found in: {pformat(created_files)}"

    netcdf_files = list(test_dir.glob("*.nc"))
    assert netcdf_files, "Output NetCDF not saved to disk!"
def test_tester(self, tmp_path):
    """Train one epoch, run the ``Tester`` and inspect the saved NetCDF.

    The ``*.nc`` file sorted last in the run directory must carry the
    configured horizon, and its first and last time stamps (rounded to
    days) must match the dates derived from the test period.
    """

    def _ymd(stamp):
        # calendar date of a timestamp as a comparable tuple
        return stamp.year, stamp.month, stamp.day

    ds = create_linear_ds().isel(lat=slice(0, 5), lon=slice(0, 5))
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    for key, value in (("n_epochs", 1), ("num_workers", 1), ("horizon", 5)):
        cfg._cfg[key] = value
    cfg.run_dir = tmp_path

    # initialise the train directory!
    Trainer(cfg, ds).train_and_validate()

    # TODO: test the tester evaluation loop
    Tester(cfg=cfg, ds=ds).run_test()

    # TODO: test that plots created, outputs saved
    outfile = sorted(cfg.run_dir.glob("*.nc"))[-1]
    out_ds = xr.open_dataset(outfile)
    assert int(out_ds.horizon.values) == cfg.horizon

    # Check that the times are correct
    out_times = out_ds.time.values

    min_time = pd.to_datetime(out_times.min()).round("D")
    exp_min_time = cfg.test_start_date + DateOffset(
        months=(cfg.seq_length + cfg.horizon)
    )
    assert _ymd(min_time) == _ymd(exp_min_time)

    max_time = pd.to_datetime(out_times.max()).round("D")
    exp_max_time = cfg.test_end_date - DateOffset(months=1)
    assert _ymd(max_time) == _ymd(exp_max_time)
def test_linear_example(self, tmp_path):
    """Test the linear dataset.

    Builds a noise-free linear dataset (``alpha=0``, ``beta=2``,
    ``epsilon_sigma=0``), loads it through ``PixelDataLoader`` and compares
    one sample (``SAMPLE = 1``) with values re-derived from the stacked,
    normalised ``xr.Dataset``.

    NOTE(review): SOURCE was whitespace-flattened; statement boundaries
    next to inline comments were reconstructed and should be confirmed
    against the original file.

    Args:
        tmp_path: base directory passed to
            ``create_and_assign_temp_run_path_to_config``.
    """
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # Create linear dataset
    alpha = 0
    beta = 2
    epsilon_sigma = 0
    ds = create_linear_ds(
        horizon=cfg.horizon, alpha=alpha, beta=beta, epsilon_sigma=epsilon_sigma
    ).isel(lat=slice(0, 2), lon=slice(0, 2))
    static = create_static(cfg=cfg, ds=ds)
    dl = PixelDataLoader(
        ds,
        cfg=cfg,
        num_workers=1,
        mode="train",
        batch_size=cfg.batch_size,
        DEBUG=True,
        static_data=static,
    )

    # load all of the data into memory
    data = load_all_data_from_dl_into_memory(dl)
    x = data["x_d"]
    # (n_samples, n_features, seq_length)
    # NOTE(review): the comment above describes three dimensions, but the
    # expected shape below is a 2-tuple; one of the two looks out of date.
    assert x.shape == (
        len(cfg.input_variables) + 2
        if cfg.encode_doys
        else len(cfg.input_variables),
        cfg.seq_length,
    )
    assert x.shape[-1] == cfg.seq_length
    y = data["y"]
    times = pd.to_datetime(data["time"].astype("datetime64[ns]").flatten())

    # matching batch dims (n_samples) for all samples
    assert x.shape[0] == y.shape[0]

    # test ONE SINGLE (x, y) sample
    SAMPLE = 1

    # get metadata for sample
    idx = int(data["index"][SAMPLE])
    pixel, valid_current_time_index = dl.dataset.lookup_table[idx]
    # the pixel id is split on "_" into floats used as (lat, lon)
    latlon = tuple([float(l) for l in str(pixel).split("_")])
    target_time = times[SAMPLE]
    # current_time = times[valid_current_time_index][0]

    # get the correct times (weird indexing because of imperfect translation of float -> datetime64[ns])
    max_time = target_time - DateOffset(months=cfg.horizon) + DateOffset(days=2)
    min_time = max_time - DateOffset(months=cfg.seq_length)
    input_times = pd.date_range(min_time, max_time, freq="M")[-cfg.seq_length :]

    # recreate the data that should be loaded from the raw xr.Dataset
    stacked, _ = _stack_xarray(ds, spatial_coords=cfg.pixel_dims)
    normalizer = dl.normalizer
    norm_stacked = normalizer.transform(stacked)

    all_y = norm_stacked["target"].sel(sample=pixel)
    _y = all_y.sel(time=target_time, method="nearest")
    all_x = norm_stacked["feature"].sel(sample=pixel)
    _x_d = all_x.sel(time=input_times, method="nearest")

    # check that the dataloader saves & returns the correct values
    assert np.allclose(
        dl.dataset.y[pixel], (all_y.values)
    ), "The DataLoader saves incorrect y values to memory"
    assert np.isclose(
        _y.values, y[SAMPLE]
    ), "The DataLoader returns an incorrect value from the Dataset"

    # input (X) data
    dataset_loaded = dl.dataset.x_d[pixel]
    # assert dataset_loaded.shape == (, cfg.seq_length)
    expected = all_x.values.reshape(dataset_loaded.shape)
    # compare only the entries that are not NaN in the expected values
    mask = np.isnan(expected)
    expected = expected[~mask]
    dataset_loaded = dataset_loaded[~mask]
    assert np.allclose(
        dataset_loaded, expected
    ), f"The dataloader is saving the wrong data to the lookup table. {dataset_loaded[:10]} {expected[:10]}"

    # get input X data from INDEX (not times)
    max_input_ix = int(valid_current_time_index)
    min_input_ix = int(max_input_ix - cfg.seq_length) + 1
    _x_d_index_values = all_x.values[min_input_ix : max_input_ix + 1]
    assert np.allclose(_x_d_index_values, _x_d.values)

    # TODO: Why does this not work?
    # NOTE(review): `_x_d_index_values` is sliced from `all_x.values`; if
    # that is a NumPy array it has no `.values` attribute, which would
    # explain the failure. Confirm and drop the `.values` if so.
    assert np.allclose(
        _x_d_index_values.values, x[SAMPLE]
    ), "The dynamic data is not the data we expect"

    # check that the raw data is the linear combination we expect
    # "target" should be linear combination of previous timestep "feature"
    # (y = x @ [0, 2])
    zeros = np.zeros((cfg.seq_length - 1, 1))
    betas = np.append(zeros, beta).reshape(-1, 1)

    unnorm_x = dl.dataset.normalizer.individual_inverse(
        x[SAMPLE], pixel_id=pixel, variable=cfg.input_variables[0]
    )
    unnorm_y = dl.dataset.normalizer.individual_inverse(
        y[SAMPLE], pixel_id=pixel, variable=cfg.target_variable
    )

    # time=target_time,
    # NOTE(review): the selection below is evaluated and discarded.
    ds.sel(lat=latlon[0], lon=latlon[1], method="nearest")[cfg.target_variable]
    assert np.isclose(unnorm_x @ betas, unnorm_y)
def test_single_train_step(self, tmp_path):
    """Take one optimiser step on an ``LSTM`` and check shapes and loss.

    For each config a fresh loader and model are built, one Adam step is
    taken on the first batch yielded by ``tqdm(dl)``, and the MSE on a
    fixed reference batch is compared before and after the step. The loss
    comparison is only asserted when ``cfg.horizon == 1``.
    """
    torch.manual_seed(1)
    np.random.seed(1)
    hidden_size = 64

    # Close the file handle deterministically instead of leaking it.
    # NOTE: pickle is only acceptable here because this is a local fixture.
    with Path("data/kenya.pkl").open("rb") as fp:
        ds = pickle.load(fp).isel(lat=slice(0, 2), lon=slice(0, 4))

    paths = [
        Path("tests/testconfigs/config.yml"),
        Path("tests/testconfigs/config_multi_horizon.yml"),
    ]

    for path in paths:
        cfg = Config(path)
        cfg._cfg["static_inputs"] = "embedding"
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        dl = PixelDataLoader(
            ds,
            mode="train",
            cfg=cfg,
            num_workers=1,
            batch_size=cfg.batch_size,
        )

        # a single sample must expose the static inputs under "x_s"
        sample = dl.dataset.__getitem__(0)
        sample["x_s"]

        # Reference batch: the before and after losses are both computed on
        # it. The training loop used to rebind `data`, so the "after"
        # prediction could come from a different batch than the target `y`
        # it was compared against.
        reference_batch = next(iter(dl))
        y = reference_batch["y"]

        model = (
            LSTM(
                input_size=dl.input_size
                + dl.static_input_size
                + dl.forecast_input_size,
                hidden_size=hidden_size,
                output_size=dl.output_size,
                forecast_horizon=dl.horizon,
            )
            .float()
            .to(cfg.device)
        )
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss_obj = F.mse_loss

        before = model.forward(reference_batch)

        # exactly one optimisation step
        for batch in tqdm(dl):
            target = batch["y"]
            optimizer.zero_grad()
            yhat = model.forward(batch)

            # shape = [batch_size, seq_length, forecast_horizon]
            assert yhat["y_hat"].shape == (cfg.batch_size, 1, 1)

            # get the final predictions to calculate loss
            loss = loss_obj(yhat["y_hat"], target)
            loss.backward()
            optimizer.step()
            break

        after = model.forward(reference_batch)
        loss_bf = loss_obj(before["y_hat"], y)
        loss_af = loss_obj(after["y_hat"], y)

        # NOTE: the LSTM only returns the final hidden and cell state layer NOT each timestep
        # TODO: why is the LSTM returning a hidden array of shape (seq_length, 1, hs)
        assert before["h_n"].shape == (1, cfg.batch_size, hidden_size)
        assert before["y_hat"].shape == (cfg.batch_size, 1, 1)

        if cfg.horizon == 1:
            assert (
                loss_af < loss_bf
            ), "The model did not learn anything after one epoch of training"
# Script: train a model on the India regions dataset, then test it and save
# the loss curves and the prediction timeseries.
from pathlib import Path
import xarray as xr
from spatio_temporal.config import Config
from spatio_temporal.training.trainer import Trainer
from spatio_temporal.training.tester import Tester
from spatio_temporal.training.eval_utils import (
    _plot_loss_curves,
    save_loss_curves,
    save_timeseries,
)

if __name__ == "__main__":
    # LOAD IN DATA (sorted along the time coordinate)
    ds = xr.open_dataset("data/data_india_regions.nc").sortby("time")
    cfg = Config(Path("configs/india_region.yml"))
    # override the number of epochs from the yml
    cfg._cfg["n_epochs"] = 150
    trainer = Trainer(cfg, ds)

    # TRAIN
    losses = trainer.train_and_validate()
    save_loss_curves(losses, cfg)

    # TEST
    tester = Tester(cfg, ds)
    preds = tester.run_test(unnormalize=True)
    # the timeseries are saved twice
    for _ in range(2):
        save_timeseries(preds, cfg)
def test_lr_scheduler(self):
    """Check that the test config exposes ``learning_rate`` as a float."""
    cfg = Config(cfg_path=Path("tests/testconfigs/test_config.yml"))
    learning_rate = cfg.learning_rate
    assert isinstance(learning_rate, float)
def test_correct_data_returned(self, tmp_path):
    """Re-derive one loader sample from the stacked dataset and compare.

    One batch is drawn from a ``PixelDataLoader`` (day-of-year encoding
    on, static inputs set to ``"embedding"``). Its target and its input
    window are then recomputed from ``dl.dataset.ds`` using the pixel and
    time index stored in the dataset's lookup table.
    """
    # create dummy config path
    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    cfg._cfg["encode_doys"] = True
    cfg._cfg["static_inputs"] = "embedding"
    cfg._cfg["forecast_variables"] = cfg.input_variables

    # create temporary run directory
    create_and_assign_temp_run_path_to_config(cfg, tmp_path)

    # create dummy dataset
    ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

    # initialise the dataloader
    dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)

    # one sample from the dataloader
    data = next(iter(dl))
    x, y = data["x_d"], data["y"]

    # recreate the stacked dataset
    stacked_ds = dl.dataset.ds
    if cfg.encode_doys:
        stacked_ds, _ = add_doy_encoding_as_feature_to_dataset(
            stacked_ds, inputs=cfg.input_variables, target=cfg.target_variable
        )

    # get the current_time_index and pixel from the __getitem__() call
    getitem_call = int(data["meta"]["index"])
    pixel, current_time_index = dl.dataset.lookup_table[getitem_call]

    # check that the returned data is valid
    # TODO: wrap into function for getting the valid times!
    est_target_time = pd.to_datetime(
        np.array(data["meta"]["target_time"]).astype("datetime64[ns]")
    )[0]

    # rounding error because of storing as float, hence the nearest match
    input_data_times = pd.to_datetime(stacked_ds.time.values)
    # `Index.get_loc(..., method="nearest")` was removed in pandas 2.0;
    # `get_indexer` offers the same nearest lookup on old and new versions.
    true_target_index = int(
        input_data_times.get_indexer([est_target_time], method="nearest")[0]
    )
    true_target_time = input_data_times[true_target_index]

    assert current_time_index + cfg.horizon == true_target_index

    # :: RECREATE TARGET DATA ::
    # selecting by label and by position must give the same target value
    expected_y = stacked_ds.sel(sample=pixel, time=true_target_time)[
        cfg.target_variable
    ].values
    expected_y_index = (
        stacked_ds.sel(sample=pixel)
        .isel(time=true_target_index)[cfg.target_variable]
        .values
    )
    assert expected_y == expected_y_index
    assert np.isclose(y.flatten()[-1], expected_y)

    ## :: RECREATE INPUT DATA ::
    # max_input_ix is the CURRENT TIME
    max_input_ix = int(true_target_index - cfg.horizon)
    assert max_input_ix == current_time_index
    max_input_time = input_data_times[max_input_ix]

    # min_input_ix = the first input time
    min_input_ix = int(max_input_ix - cfg.seq_length) + 1
    min_input_time = input_data_times[min_input_ix]

    input_vars = (
        cfg.input_variables + ["autoregressive"]
        if cfg.autoregressive
        else cfg.input_variables
    )
    input_vars = (
        input_vars + ["sin_doy", "cos_doy"] if cfg.encode_doys else input_vars
    )

    # get the exact expected input vector
    # NOTE: the label-based slice is NOT exclusive of its upper bound, so
    # `max_input_time` itself is part of the selection
    expected_x_feature = (
        stacked_ds.sel(sample=pixel, time=slice(min_input_time, max_input_time))[
            input_vars
        ]
        .to_array()
        .values.T
    )

    x_feature = np.array(x).reshape(expected_x_feature.shape)
    assert np.allclose(x_feature, expected_x_feature)
from spatio_temporal.training.train_utils import _to_device


def get_save_dir() -> Path:
    """Return the output directory chosen from the current hostname.

    Returns:
        ``/home/tommy/spatio_temporal/runs/`` when the hostname is
        ``"GPU_MachineLearning"``; otherwise ``/Users/tommylees/Downloads/``.
    """
    if socket.gethostname() == "GPU_MachineLearning":
        save_dir = Path("/home/tommy/spatio_temporal/runs/")
    else:
        save_dir = Path("/Users/tommylees/Downloads/")
    return save_dir


# NOTE(review): the script below looks truncated in this view (`epochs` and
# `losses` are set up but never used). Confirm against the full file.
if __name__ == "__main__":
    ds = xr.open_dataset(Path("data/ALL_dynamic_ds.nc"))
    # ds = ds.isel(station_id=slice(0, 10))
    # cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
    cfg = Config(Path("configs/runoff.yml"))
    cfg._cfg["scheduler"] = "step"
    trainer = Trainer(cfg, ds)

    # overfit on one epoch
    epochs = 100
    # pull the training components off the trainer
    model = trainer.model
    optimizer = trainer.optimizer
    loss_fn = trainer.loss_fn
    dl = trainer.train_dl
    scheduler = trainer.scheduler
    losses = []

    # take a single batch and move it to the configured device
    data = dl.__iter__().__next__()
    data = _to_device(data, cfg.device)
    x = data["x_d"]
def test_trainer(self, tmp_path: Path):
    """Smoke test: a ``Trainer`` can be constructed on the linear dataset."""
    linear_ds = create_linear_ds().isel(lat=slice(0, 5), lon=slice(0, 5))

    cfg = Config(Path("tests/testconfigs/test_config.yml"))
    cfg.run_dir = tmp_path

    # construction alone is the check; the instance is discarded
    Trainer(cfg=cfg, ds=linear_ds)
# ds = get_pollution_data_beijing().to_xarray() ## india experiments # ds = xr.open_dataset("data/data_india_regions.nc").sortby("time") # ds = xr.open_dataset("data/data_india_full.nc").sortby("time") ## river level data # ds = xr.open_dataset("data/camels_river_level_data.nc") # Run Training and Evaluation expt_class: Union[Trainer, Tester] if mode == "train": config_file = Path(args["config_file"]) assert config_file.exists(), f"Expect config file at {config_file}" cfg = Config(cfg_path=config_file) # Load in data ds, static = load_data(cfg) # Train test split expt_class = trainer = Trainer(cfg, ds, static_data=static) tester = Tester(cfg, ds, static_data=static) if overfit_test: # run test on training data to check for overfitting overfitting_tester = Tester(cfg, ds, subset="train", static_data=static) if baseline: print("Testing sklearn Linear Regression") train_dl = trainer.train_dl
def create_and_assign_temp_run_path_to_config(cfg: Config, tmp_path: Path) -> None:
    """Create ``tmp_path / "runs"`` if needed and store it as ``cfg.run_dir``.

    Args:
        cfg: config object whose ``run_dir`` attribute is set.
        tmp_path: base directory in which the ``runs`` folder is created.
    """
    run_dir = tmp_path / "runs"
    # parents are created as needed; an existing directory is not an error
    run_dir.mkdir(parents=True, exist_ok=True)
    cfg.run_dir = run_dir