Example no. 1
    def test_dump_config(self, tmp_path: Path):
        run_dir = tmp_path / "runs"
        run_dir.mkdir(exist_ok=True, parents=True)
        path = Path("tests/testconfigs/test_config.yml")

        cfg = Config(cfg_path=path)
        cfg.run_dir = run_dir

        #  check that defaults not specified are written to file
        # check that the file gets created
        cfg.dump_config(run_dir)

        assert "config.yml" in [l.name for l in run_dir.glob("*")]

        cfg_path = run_dir / "config.yml"
        with cfg_path.open("r") as fp:
            yaml = YAML(typ="safe")
            cfg2 = yaml.load(fp)

        expected_keys_with_defaults = [
            "autoregressive",
            "pixel_dims",
            "num_workers",
            "seed",
            "device",
            "learning_rate",
            "time_str",
            "run_dir",
        ]
        for key in expected_keys_with_defaults:
            assert key in cfg2
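A short hedged extension (not part of the original test): since the dumped file is itself a valid config, it could be round-tripped through Config and compared against the in-memory object. The attribute access cfg_reloaded.seed is an assumption based on "seed" appearing in the defaults above.

        # hypothetical round-trip check: the dumped file should load back as a Config
        cfg_reloaded = Config(cfg_path=cfg_path)
        assert cfg_reloaded.seed == cfg.seed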
Example no. 2
    def test_dataset(self, tmp_path):
        target_variable = "target"
        input_variables = ["feature"]
        for path in [
            Path("tests/testconfigs/test_config_simulate.yml"),
            Path("tests/testconfigs/test_config.yml"),
        ]:
            cfg = Config(path)
            cfg._cfg["forecast_variables"] = cfg.input_variables

            create_and_assign_temp_run_path_to_config(cfg, tmp_path)
            raw_ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
            static = create_static(cfg=cfg, ds=raw_ds)
            ds = XarrayDataset(
                raw_ds, cfg=cfg, mode="train", DEBUG=True, static_data=static
            )

            assert ds.target == target_variable
            assert ds.inputs == (
                input_variables + ["autoregressive"]
                if cfg.autoregressive
                else input_variables
            )

            x_features = (
                len(input_variables) + 1 if cfg.autoregressive else len(input_variables)
            )
            seq_length = cfg.seq_length
            for i in range(10):
                data = ds[i]
                x, y = data["x_d"], data["y"]

                assert y.shape == (1, 1)
                assert x.shape == (
                    seq_length,
                    x_features + 2 if cfg.encode_doys else x_features,
                ), f"Shape Mismatch! Expect: {(seq_length, x_features)} Got: {x.shape}"

                meta = data["meta"]
                times = (
                    meta["target_time"]
                    .detach()
                    .numpy()
                    .astype("datetime64[ns]")
                    .flatten()
                )
                pixel, _ = ds.lookup_table[int(meta["index"])]
                latlon = tuple([float(l) for l in str(pixel).split("_")])

                y_unnorm = (
                    ds.normalizer.individual_inverse(y, pixel, variable="target")
                    .detach()
                    .numpy()
                )

                #  extract from the original xr.Dataset
                y_exp = raw_ds.sel(
                    lat=latlon[0], lon=latlon[1], time=times, method="nearest"
                )[cfg.target_variable].values
                assert np.isclose(y_unnorm.reshape(y_exp.shape), y_exp, atol=1e-5)
    def test_pollution(self, tmp_path):
        ds = get_pollution_data_beijing().to_xarray()
        cfg = Config(cfg_path=Path("tests/testconfigs/pollution.yml"))
        cfg.run_dir = tmp_path
        trainer = Trainer(cfg, ds)

        input_variables = [] if cfg.input_variables is None else cfg.input_variables
        train_ds = ds[input_variables + [cfg.target_variable]].sel(
            time=slice(cfg.train_start_date, cfg.train_end_date))

        assert trainer.train_dl.dataset.lookup_table != {}
        assert trainer.train_dl.dataset.y != {}
        assert trainer.train_dl.dataset.x_d != {}
    def test_train_test_split(self, tmp_path):
        ds = create_linear_ds().isel(lat=slice(0, 5), lon=slice(0, 5))
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg.run_dir = tmp_path

        train = train_test_split(ds, cfg, subset="train")
        test = train_test_split(ds, cfg, subset="test")
        valid = train_test_split(ds, cfg, subset="validation")

        # the config must expose all of the split boundaries used by train_test_split
        for attr in (
            "train_start_date",
            "train_end_date",
            "validation_start_date",
            "validation_end_date",
            "test_start_date",
            "test_end_date",
        ):
            assert getattr(cfg, attr) is not None
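A hedged sketch of how the returned subsets could be checked against those boundaries, assuming train_test_split returns xr.Dataset objects with a "time" coordinate and clips each subset strictly to its configured window (warm-up handling in the real implementation may differ):

        # hypothetical extension: each subset should fall inside its configured window
        for subset, start, end in [
            (train, cfg.train_start_date, cfg.train_end_date),
            (valid, cfg.validation_start_date, cfg.validation_end_date),
            (test, cfg.test_start_date, cfg.test_end_date),
        ]:
            times = pd.to_datetime(subset.time.values)
            assert times.min() >= pd.Timestamp(start)
            assert times.max() <= pd.Timestamp(end)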
Example no. 5
    def test_dataloader(self, tmp_path):
        ds = _make_dataset()
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        static = create_static(cfg=cfg, ds=ds)
        dl = PixelDataLoader(
            ds,
            cfg=cfg,
            num_workers=1,
            mode="train",
            batch_size=cfg.batch_size,
            static_data=static,
        )

        assert dl.batch_size == cfg.batch_size

        seq_length = cfg.seq_length
        autoregressive = cfg.autoregressive
        data = next(iter(dl))
        x, y = data["x_d"], data["y"]
        # one dynamic input feature, plus one if the autoregressive lag is appended
        n_inputs = len(["features"]) + 1 if autoregressive else len(["features"])

        assert x.shape == (
            cfg.batch_size,
            seq_length,
            n_inputs + 2 if cfg.encode_doys else n_inputs,
        ), f"Size Mismatch! Expected: {(cfg.batch_size, seq_length, n_inputs)} Got: {x.shape}"
Example no. 6
    def test_kenya_data(self, tmp_path):
        if TEST_REAL_DATA:
            ds = pickle.load(Path("data/kenya.pkl").open("rb")).isel(
                lat=slice(0, 5), lon=slice(0, 5)
            )
            cfg = Config(Path("tests/testconfigs/config.yml"))
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)

            dl = PixelDataLoader(
                ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
            )

            data = next(iter(dl))
            x, _ = data["x_d"], data["y"]

            batch_size = 256
            seq_length = cfg.seq_length
            input_variables = ["precip", "t2m", "SMsurf"]
            autoregressive = True
            n_inputs = (
                len(input_variables) + 1 if autoregressive else len(input_variables)
            )

            assert cfg.batch_size == batch_size
            assert cfg.autoregressive == autoregressive
            assert x.shape == (
                batch_size,
                seq_length,
                n_inputs,
            ), f"X Data Mismatch! Expected: {(batch_size, seq_length, n_inputs)} Got: {x.shape}"
        else:
            pass
Example no. 7
    def test_runoff_data(self, tmp_path):
        if TEST_REAL_DATA:
            ds = xr.open_dataset("data/ALL_dynamic_ds.nc").isel(station_id=slice(0, 5))
            cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)

            # train period
            input_variables = [] if cfg.input_variables is None else cfg.input_variables
            train_ds = ds[input_variables + [cfg.target_variable]].sel(
                time=slice(cfg.train_start_date, cfg.train_end_date)
            )
            train_dl = PixelDataLoader(
                train_ds,
                cfg=cfg,
                mode="train",
                num_workers=4,
                batch_size=cfg.batch_size,
            )

            #  check data is loaded properly
            data = next(iter(train_dl))
            x, y = data["x_d"], data["y"]

            n_in_vars = (
                len(cfg.input_variables) + 1
                if cfg.autoregressive
                else len(cfg.input_variables)
            )
            assert x.shape == (cfg.batch_size, cfg.seq_length, n_in_vars)
            assert y.shape == (cfg.batch_size, 1, 1)
        else:
            pass
Example no. 8
    def test_static_inputs(self, tmp_path):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        ds_static = ds.mean(dim="time")

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        # TODO: test not yet implemented
        assert False
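A hedged sketch of what this placeholder could eventually check, reusing the XarrayDataset(..., static_data=...) pattern and the "x_s" sample key seen in other examples in this file; the "static_inputs" value used here is an assumed illustration, not the project's actual configuration:

        # hypothetical continuation: attach the static data and check each sample exposes "x_s"
        cfg._cfg["static_inputs"] = ["feature"]  # assumed: time-mean of "feature" as a static input
        dataset = XarrayDataset(ds, cfg=cfg, mode="train", DEBUG=True, static_data=ds_static)
        sample = dataset[0]
        assert "x_s" in sample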
Example no. 9
    def test_dataset_beijing(self, tmp_path):
        if is_connected():
            path = Path("tests/testconfigs/pollution.yml")
            cfg = Config(path)
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)
            raw_ds = get_pollution_data_beijing().to_xarray().isel(time=slice(0, 1000))
            ds = XarrayDataset(raw_ds, cfg=cfg, mode="train", DEBUG=True)

            assert ds.y != {}
Example no. 10
    def test_forecast_inputs(self, tmp_path):
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))
        ds_forecast = (
            ds.shift(time=1).rename({"feature": "feature_fcast1"}).drop("target")
        )
        ds = xr.merge([ds, ds_forecast])

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        # TODO: test not yet implemented
        assert False
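Similarly, a hedged sketch for the forecast-inputs placeholder, leaning on the forecast_variables config key and the dataloader's forecast_input_size attribute used elsewhere in these examples; that the size comes out as exactly 1 here is an assumption:

        # hypothetical continuation: declare the shifted feature as a forecast variable
        cfg._cfg["forecast_variables"] = ["feature_fcast1"]
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)
        assert dl.forecast_input_size == 1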
Example no. 11
    def test_normalizer(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/test_config.yml"))

        #  load the normalizers written during training and check they expose their statistics
        dynamic_normalizer = pickle.load((cfg.run_dir / "normalizer.pkl").open("rb"))
        static_normalizer = pickle.load((cfg.run_dir / "static_normalizer.pkl").open("rb"))
        assert dynamic_normalizer is not None
        assert static_normalizer.std_ is not None
        assert static_normalizer.mean_ is not None

        # TODO: test not yet implemented
        assert False
Example no. 12
    def test_longer_horizon_fcast(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/test_1d_config_horizon.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        ds = load_test_jena_data_as_dataset()

        dl = PixelDataLoader(
            ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
        )
        data = next(iter(dl))
        _, y = data["x_d"], data["y"]

        assert y.shape == (cfg.batch_size, 1, 1)
    def test_runoff_example(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        cfg._cfg["data_path"] = Path("data/ALL_dynamic_ds.nc")
        cfg._cfg["static_data_path"] = Path("data/camels_static.nc")
        cfg._cfg["static_inputs"] = ["p_mean", "pet_mean", "area", "gauge_elev"]
        cfg._cfg["n_epochs"] = 3

        ds, static = load_data(cfg)

        #  select subset of 3 basins
        basins = [1001, 2001, 2002]
        ds = ds.sel(station_id=basins)
        static = static.sel(station_id=basins)

        trainer = Trainer(cfg, ds, static_data=static)
        self.check_loaded_data(
            cfg,
            trainer,
            data=ds.sel(time=slice(cfg.train_start_date, cfg.train_end_date)),
        )

        losses = trainer.train_and_validate()

        tester = Tester(cfg, ds, static_data=static)
        preds = tester.run_test()

        return losses, preds
    def test_kenya_vci_example(self, tmp_path):
        cfg = Config(Path("tests/testconfigs/config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        cfg._cfg["data_path"] = Path("data/kenya.nc")
        cfg._cfg["n_epochs"] = 3

        ds, static = load_data(cfg)

        trainer = Trainer(cfg, ds, static_data=static)
        self.check_loaded_data(
            cfg,
            trainer,
            data=ds.sel(time=slice(cfg.train_start_date, cfg.train_end_date)),
        )

        losses = trainer.train_and_validate()

        tester = Tester(cfg, ds, static_data=static)
        preds = tester.run_test()

        return losses, preds
    def test_linear_example(self):
        ds = create_linear_ds(epsilon_sigma=10)
        static_data = create_static_example_data(ds)

        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg._cfg["static_inputs"] = ["static_const", "static_rand"]

        #  Train
        trainer = Trainer(cfg, ds, static_data=static_data)
        self.check_loaded_data(
            cfg,
            trainer,
            data=ds.sel(time=slice(cfg.train_start_date, cfg.train_end_date)),
        )
        losses = trainer.train_and_validate()
        save_loss_curves(losses, cfg)

        # Test
        tester = Tester(cfg, ds, static_data=static_data)
        preds = tester.run_test()
        for _ in range(2):
            save_timeseries(preds, cfg)
Example no. 16
    def test_1D_data(self, tmp_path):
        # convert pandas to xarray object
        ds = load_test_jena_data_as_dataset()
        cfg = Config(Path("tests/testconfigs/test_1d_config_horizon.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        dl = PixelDataLoader(
            ds, cfg=cfg, num_workers=1, mode="train", batch_size=cfg.batch_size
        )

        data = next(iter(dl))
        x, y = data["x_d"], data["y"]

        assert x.shape == (cfg.batch_size, cfg.seq_length, len(cfg.input_variables))
        assert y.shape == (cfg.batch_size, 1, 1)
Example no. 17
    def test_lstm_forward_pass(self, tmp_path):
        ds = pickle.load(Path("data/kenya.pkl").open("rb"))
        cfg = Config(Path("tests/testconfigs/config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        dl = PixelDataLoader(ds, cfg=cfg, mode="train")

        model = LSTM(
            input_size=dl.input_size + dl.static_input_size + dl.forecast_input_size,
            hidden_size=cfg.hidden_size,
            output_size=dl.output_size,
            forecast_horizon=dl.horizon,
        )
        data = next(iter(dl))
        y_hat = model(data)

        assert all(np.isin(["h_n", "c_n", "y_hat"], [k for k in y_hat.keys()]))
Example no. 18
    def test_linear_regression_forward_pass(self, tmp_path):
        ds = _make_dataset()
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)

        model = LinearRegression(
            input_size=(dl.input_size + dl.static_input_size + dl.forecast_input_size)
            * cfg.seq_length,
            output_size=dl.output_size,
            forecast_horizon=dl.horizon,
        )
        data = next(iter(dl))
        y_hat = model(data)

        assert isinstance(y_hat, dict)
        assert y_hat["y_hat"].shape == (1, 1)
Example no. 19
    def test_config(self):
        paths = list(Path("tests/testconfigs").glob("*.yml"))
        for path in paths:
            cfg = Config(cfg_path=path)

            assert isinstance(cfg.test_start_date, pd.Timestamp)
            assert isinstance(cfg.data_dir, Path)

            assert all(np.isin(cfg._mandatory_keys, list(dir(cfg))))
            assert all(np.isin(cfg._mandatory_keys, list(cfg._cfg.keys())))
            assert all(np.isin(list(cfg._defaults.keys()), list(dir(cfg))))

            if cfg.file_path.name == "test_1d_config.yml":
                assert cfg.pixel_dims == ["pixel"]

            # TODO: test default args
            original = cfg._cfg.pop("pixel_dims", None)
            assert cfg.pixel_dims == ["lat", "lon"], "Expect to return the default"
    def check_output_files(tmp_path: Path):
        #  check the saved files (model and optimizer epochs)
        test_dir = sorted([d for d in tmp_path.glob("runs/test*")])[-1]
        created_files = sorted([t.name for t in test_dir.iterdir()])

        cfg = Config(test_dir / "config.yml")
        assert len([f for f in created_files if "model_epoch" in f]) == cfg.n_epochs

        if cfg.static_inputs is not None:
            assert (
                "static_normalizer.pkl" in created_files
            ), f"Expected the static normalizer to be saved. Not found in: {pformat(created_files)}"

        assert (
            "normalizer.pkl" in created_files
        ), f"Expected the normalizer to be saved. Not found in: {pformat(created_files)}"
        assert (
            len([f for f in test_dir.glob("*.nc")]) > 0
        ), "Output NetCDF not saved to disk!"
Example no. 21
    def test_tester(self, tmp_path):
        ds = create_linear_ds().isel(lat=slice(0, 5), lon=slice(0, 5))
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg._cfg["n_epochs"] = 1
        cfg._cfg["num_workers"] = 1
        cfg._cfg["horizon"] = 5
        cfg.run_dir = tmp_path

        # initialise the train directory!
        trainer = Trainer(cfg, ds)
        trainer.train_and_validate()

        tester = Tester(cfg=cfg, ds=ds)

        #  TODO: test the tester evaluation loop
        tester.run_test()
        #  TODO: test that plots created, outputs saved
        outfile = sorted(list(cfg.run_dir.glob("*.nc")))[-1]
        out_ds = xr.open_dataset(outfile)

        assert int(out_ds.horizon.values) == cfg.horizon

        #  Check that the times are correct
        min_time = pd.to_datetime(out_ds.time.values.min()).round("D")
        exp_min_time = cfg.test_start_date + DateOffset(
            months=(cfg.seq_length + cfg.horizon))

        assert all([
            (min_time.year == exp_min_time.year),
            (min_time.month == exp_min_time.month),
            (min_time.day == exp_min_time.day),
        ])

        max_time = pd.to_datetime(out_ds.time.values.max()).round("D")
        exp_max_time = cfg.test_end_date - DateOffset(months=1)

        assert all([
            (max_time.year == exp_max_time.year),
            (max_time.month == exp_max_time.month),
            (max_time.day == exp_max_time.day),
        ])
Example no. 22
    def test_linear_example(self, tmp_path):
        """Test the linear dataset.

        Args:
            tmp_path ([type]): [description]
        """
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)

        #  Create linear dataset
        alpha = 0
        beta = 2
        epsilon_sigma = 0

        ds = create_linear_ds(
            horizon=cfg.horizon, alpha=alpha, beta=beta, epsilon_sigma=epsilon_sigma
        ).isel(lat=slice(0, 2), lon=slice(0, 2))
        static = create_static(cfg=cfg, ds=ds)
        dl = PixelDataLoader(
            ds,
            cfg=cfg,
            num_workers=1,
            mode="train",
            batch_size=cfg.batch_size,
            DEBUG=True,
            static_data=static,
        )

        #  load all of the data into memory
        data = load_all_data_from_dl_into_memory(dl)
        x = data["x_d"]

        # x is stacked as (n_samples, n_features, seq_length)
        n_features = (
            len(cfg.input_variables) + 2
            if cfg.encode_doys
            else len(cfg.input_variables)
        )
        assert x.shape[1:] == (n_features, cfg.seq_length)
        assert x.shape[-1] == cfg.seq_length
        y = data["y"]
        times = pd.to_datetime(data["time"].astype("datetime64[ns]").flatten())

        # matching batch dims (n_samples) for all samples
        assert x.shape[0] == y.shape[0]

        #  test ONE SINGLE (x, y) sample
        SAMPLE = 1

        # get metadata for sample
        idx = int(data["index"][SAMPLE])
        pixel, valid_current_time_index = dl.dataset.lookup_table[idx]
        latlon = tuple([float(l) for l in str(pixel).split("_")])
        target_time = times[SAMPLE]
        # current_time = times[valid_current_time_index][0]

        #  get the correct times (weird indexing because of imperfect translation of float -> datetime64[ns])
        max_time = target_time - DateOffset(months=cfg.horizon) + DateOffset(days=2)
        min_time = max_time - DateOffset(months=cfg.seq_length)
        input_times = pd.date_range(min_time, max_time, freq="M")[-cfg.seq_length :]

        #  recreate the data that should be loaded from the raw xr.Dataset
        stacked, _ = _stack_xarray(ds, spatial_coords=cfg.pixel_dims)
        normalizer = dl.normalizer
        norm_stacked = normalizer.transform(stacked)

        all_y = norm_stacked["target"].sel(sample=pixel)
        _y = all_y.sel(time=target_time, method="nearest")
        all_x = norm_stacked["feature"].sel(sample=pixel)
        _x_d = all_x.sel(time=input_times, method="nearest")

        #  check that the dataloader saves & returns the correct values
        assert np.allclose(
            dl.dataset.y[pixel], (all_y.values)
        ), "The DataLoader saves incorrect y values to memory"
        assert np.isclose(
            _y.values, y[SAMPLE]
        ), "The DataLoader returns an incorrect value from the Dataset"

        #  input (X) data
        dataset_loaded = dl.dataset.x_d[pixel]
        # assert dataset_loaded.shape == (, cfg.seq_length)

        expected = all_x.values.reshape(dataset_loaded.shape)
        mask = np.isnan(expected)
        expected = expected[~mask]
        dataset_loaded = dataset_loaded[~mask]

        assert np.allclose(
            dataset_loaded, expected
        ), f"The dataloader is saving the wrong data to the lookup table. {dataset_loaded[:10]} {expected[:10]}"

        #  get input X data from INDEX (not times)
        max_input_ix = int(valid_current_time_index)
        min_input_ix = int(max_input_ix - cfg.seq_length) + 1
        _x_d_index_values = all_x.values[min_input_ix : max_input_ix + 1]

        assert np.allclose(_x_d_index_values, _x_d.values)

        # TODO: Why does this not work?
        #  (note: `_x_d_index_values` is already a np.ndarray, so the original `.values` access raised AttributeError)
        assert np.allclose(
            _x_d_index_values, x[SAMPLE]
        ), "The dynamic data is not the data we expect"

        #  check that the raw data is the linear combination we expect
        # "target" should be linear combination of previous timestep "feature"
        # (y = x @ [0, 2])
        zeros = np.zeros((cfg.seq_length - 1, 1))
        betas = np.append(zeros, beta).reshape(-1, 1)
        unnorm_x = dl.dataset.normalizer.individual_inverse(
            x[SAMPLE], pixel_id=pixel, variable=cfg.input_variables[0]
        )
        unnorm_y = dl.dataset.normalizer.individual_inverse(
            y[SAMPLE], pixel_id=pixel, variable=cfg.target_variable
        )

        #  time=target_time,
        ds.sel(lat=latlon[0], lon=latlon[1], method="nearest")[cfg.target_variable]
        assert np.isclose(unnorm_x @ betas, unnorm_y)
Example no. 23
    def test_single_train_step(self, tmp_path):
        torch.manual_seed(1)
        np.random.seed(1)

        hidden_size = 64
        ds = pickle.load(Path("data/kenya.pkl").open("rb")).isel(
            lat=slice(0, 2), lon=slice(0, 4)
        )

        paths = [
            Path("tests/testconfigs/config.yml"),
            Path("tests/testconfigs/config_multi_horizon.yml"),
        ]
        for path in paths:
            cfg = Config(path)
            cfg._cfg["static_inputs"] = "embedding"
            create_and_assign_temp_run_path_to_config(cfg, tmp_path)

            dl = PixelDataLoader(
                ds, mode="train", cfg=cfg, num_workers=1, batch_size=cfg.batch_size,
            )

            data1 = dl.dataset[0]
            # the sample must contain the static inputs ("x_s") set via the config above
            assert "x_s" in data1

            data = next(iter(dl))
            x, y = data["x_d"], data["y"]

            # are we working with batches or individual predictions?
            x = x.unsqueeze(0) if x.ndim == 2 else x

            model = (
                LSTM(
                    input_size=dl.input_size
                    + dl.static_input_size
                    + dl.forecast_input_size,
                    hidden_size=hidden_size,
                    output_size=dl.output_size,
                    forecast_horizon=dl.horizon,
                )
                .float()
                .to(cfg.device)
            )

            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            loss_obj = F.mse_loss
            before = model.forward(data)
            for data in tqdm(dl):
                input, target = data["x_d"], data["y"]
                optimizer.zero_grad()
                yhat = model.forward(data)
                #  shape = [batch_size, seq_length, forecast_horizon]
                assert yhat["y_hat"].shape == (cfg.batch_size, 1, 1)

                # get the final predictions to calculate loss
                loss = loss_obj(yhat["y_hat"], target)
                loss.backward()
                optimizer.step()
                break

            after = model.forward(data)

            loss_bf = loss_obj(before["y_hat"], y)
            loss_af = loss_obj(after["y_hat"], y)

            # NOTE: the LSTM only returns the final hidden and cell state layer NOT each timestep
            # TODO: why is the LSTM returning a hidden array of shape (seq_length, 1, hs)
            assert before["h_n"].shape == (1, cfg.batch_size, hidden_size)
            assert before["y_hat"].shape == (cfg.batch_size, 1, 1)

            if cfg.horizon == 1:
                assert (
                    loss_af < loss_bf
                ), "The model did not learn anything after one epoch of training"
Example no. 24
from pathlib import Path
import xarray as xr
from spatio_temporal.config import Config
from spatio_temporal.training.trainer import Trainer
from spatio_temporal.training.tester import Tester
from spatio_temporal.training.eval_utils import (
    _plot_loss_curves,
    save_loss_curves,
    save_timeseries,
)

if __name__ == "__main__":
    #  LOAD IN DATA
    ds = xr.open_dataset("data/data_india_regions.nc").sortby("time")
    cfg = Config(Path("configs/india_region.yml"))
    cfg._cfg["n_epochs"] = 150
    trainer = Trainer(cfg, ds)

    #  TRAIN
    losses = trainer.train_and_validate()
    save_loss_curves(losses, cfg)

    #  TEST
    tester = Tester(cfg, ds)
    preds = tester.run_test(unnormalize=True)
    for _ in range(2):
        save_timeseries(preds, cfg)
Example no. 25
    def test_lr_scheduler(self):
        path = Path("tests/testconfigs/test_config.yml")
        cfg = Config(cfg_path=path)
        assert isinstance(cfg.learning_rate, float)
Example no. 26
    def test_correct_data_returned(self, tmp_path):
        #  create dummy config path
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg._cfg["encode_doys"] = True
        cfg._cfg["static_inputs"] = "embedding"
        cfg._cfg["forecast_variables"] = cfg.input_variables
        #  create temporary run directory (usually done by the )
        create_and_assign_temp_run_path_to_config(cfg, tmp_path)
        #  create dummy dataset
        ds = _make_dataset().isel(lat=slice(0, 2), lon=slice(0, 1))

        #  initialise the dataloader
        dl = PixelDataLoader(ds, cfg=cfg, mode="train", DEBUG=True)
        #  one sample from the dataloader
        data = next(iter(dl))
        x, y = data["x_d"], data["y"]

        #  recreate the stacked dataset
        stacked_ds = dl.dataset.ds

        if cfg.encode_doys:
            stacked_ds, _ = add_doy_encoding_as_feature_to_dataset(
                stacked_ds, inputs=cfg.input_variables, target=cfg.target_variable
            )

        #  get the current_time_index and pixel from the __getitem__() call
        getitem_call = int(data["meta"]["index"])
        pixel, current_time_index = dl.dataset.lookup_table[getitem_call]

        # check that the returned data is valid
        #  TODO: wrap into function for getting the valid times!
        est_target_time = pd.to_datetime(
            np.array(data["meta"]["target_time"]).astype("datetime64[ns]")
        )[0]

        #  rounding error because of storing as float
        input_data_times = pd.to_datetime(stacked_ds.time.values)
        true_target_index = input_data_times.get_loc(est_target_time, method="nearest")
        true_target_time = input_data_times[true_target_index]

        assert current_time_index + cfg.horizon == true_target_index

        # :: RECREATE TARGET DATA ::
        all_expected_y = stacked_ds.sel(sample=pixel)["target"].values

        expected_y = stacked_ds.sel(sample=pixel, time=true_target_time)[
            cfg.target_variable
        ].values
        expected_y_index = (
            stacked_ds.sel(sample=pixel)
            .isel(time=true_target_index)[cfg.target_variable]
            .values
        )
        assert expected_y == expected_y_index
        assert np.isclose(y.flatten()[-1], expected_y)

        ## :: RECREATE INPUT DATA ::
        # max_input_ix should be the CURRENT TIME (+ 1 because of exclusive upper indexing)
        max_input_ix = int(true_target_index - cfg.horizon)
        assert max_input_ix == current_time_index
        max_input_time = input_data_times[max_input_ix]

        #  min_input_ix = the first input time
        min_input_ix = int(max_input_ix - cfg.seq_length) + 1
        min_input_time = input_data_times[min_input_ix]

        input_vars = (
            cfg.input_variables + ["autoregressive"]
            if cfg.autoregressive
            else cfg.input_variables
        )
        input_vars = (
            input_vars + ["sin_doy", "cos_doy"] if cfg.encode_doys else input_vars
        )

        # has x been drawn from the actual underlying data?
        all_expected_x = stacked_ds.sel(sample=pixel)["feature"].values
        _expected_x = all_expected_x[min_input_ix:max_input_ix]
        # assert x == _expected_x

        # assert all(
        #     np.isin(
        #         np.round(x.numpy().flatten(), 3).astype("float64"),
        #         np.round(all_expected_x.flatten(), 3).astype("float64"),
        #     )
        # )

        # get the exact expected input vector
        # NOTE: slice is NOT EXCLUSIVE UPPER therefore need to exclude the final
        expected_x_feature = (
            stacked_ds.sel(sample=pixel, time=slice(min_input_time, max_input_time))[
                input_vars
            ]
            .to_array()
            .values.T
        )

        x_feature = np.array(x)
        x_feature = x_feature.reshape(expected_x_feature.shape)

        assert np.allclose(x_feature, expected_x_feature)
Example no. 27
import socket
from pathlib import Path

import xarray as xr

from spatio_temporal.config import Config
from spatio_temporal.training.trainer import Trainer
from spatio_temporal.training.train_utils import _to_device


def get_save_dir() -> Path:
    if socket.gethostname() == "GPU_MachineLearning":
        save_dir = Path("/home/tommy/spatio_temporal/runs/")
    else:
        save_dir = Path("/Users/tommylees/Downloads/")
    return save_dir


if __name__ == "__main__":
    ds = xr.open_dataset(Path("data/ALL_dynamic_ds.nc"))
    # ds = ds.isel(station_id=slice(0, 10))
    # cfg = Config(Path("tests/testconfigs/config_runoff.yml"))
    cfg = Config(Path("configs/runoff.yml"))
    cfg._cfg["scheduler"] = "step"
    trainer = Trainer(cfg, ds)

    #  overfit on one epoch
    epochs = 100
    model = trainer.model
    optimizer = trainer.optimizer
    loss_fn = trainer.loss_fn
    dl = trainer.train_dl
    scheduler = trainer.scheduler

    losses = []
    data = next(iter(dl))
    data = _to_device(data, cfg.device)
    x = data["x_d"]
Example no. 28
    def test_trainer(self, tmp_path: Path):
        ds = create_linear_ds().isel(lat=slice(0, 5), lon=slice(0, 5))
        cfg = Config(Path("tests/testconfigs/test_config.yml"))
        cfg.run_dir = tmp_path
        trainer = Trainer(cfg=cfg, ds=ds)
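A small hedged follow-up: the other examples rely on the Trainer exposing a populated training dataloader (trainer.train_dl with a non-empty lookup_table), so this smoke test could assert that directly:

        # hypothetical extension: the Trainer should build a usable training dataloader
        assert trainer.train_dl is not None
        assert trainer.train_dl.dataset.lookup_table != {}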
Example no. 29
    # ds = get_pollution_data_beijing().to_xarray()

    ## india experiments
    # ds = xr.open_dataset("data/data_india_regions.nc").sortby("time")
    # ds = xr.open_dataset("data/data_india_full.nc").sortby("time")

    ## river level data
    # ds = xr.open_dataset("data/camels_river_level_data.nc")

    #  Run Training and Evaluation
    expt_class: Union[Trainer, Tester]
    if mode == "train":
        config_file = Path(args["config_file"])
        assert config_file.exists(), f"Expect config file at {config_file}"

        cfg = Config(cfg_path=config_file)

        # Load in data
        ds, static = load_data(cfg)

        # Train test split
        expt_class = trainer = Trainer(cfg, ds, static_data=static)
        tester = Tester(cfg, ds, static_data=static)

        if overfit_test:
            #  run test on training data to check for overfitting
            overfitting_tester = Tester(cfg, ds, subset="train", static_data=static)

        if baseline:
            print("Testing sklearn Linear Regression")
            train_dl = trainer.train_dl
Example no. 30
def create_and_assign_temp_run_path_to_config(cfg: Config,
                                              tmp_path: Path) -> None:
    # create run_dir
    (tmp_path / "runs").mkdir(exist_ok=True, parents=True)
    cfg.run_dir = tmp_path / "runs"