def test(self, tmp_path):
        self._initialise_data(tmp_path)

        # SETTINGS
        train_dates = [2000, 2010]
        target_var = "discharge_spec"
        x_variables = ["precipitation", "peti"]
        static_variables = ["pet_mean", "aridity", "p_seasonality"]
        seq_length = 10
        with_static = True
        concat_static = False
        basins = get_basins(tmp_path)
        with_basin_str = True

        # INITIALIZE
        engineer = RunoffEngineer(
            data_dir=tmp_path,
            basins=basins,
            train_dates=train_dates,
            with_basin_str=with_basin_str,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
            ignore_static_vars=None,
            seq_length=seq_length,
            with_static=with_static,
            concat_static=concat_static,
        )

        engineer.create_training_data()
        h5_file = engineer.out_file

        assert h5_file.exists()
        with h5py.File(h5_file, "r") as f:
            x = f["input_data"][:]
            y = f["target_data"][:]
            str_arr = f["sample_2_basin"][:]
            str_arr = [s.decode("ascii") for s in str_arr]
            q_stds = f["q_stds"][:]

        assert isinstance(x, np.ndarray)
        assert isinstance(y, np.ndarray)
        assert isinstance(str_arr, list)
        assert isinstance(q_stds, np.ndarray)

        assert len(np.unique(q_stds)) == 2
        assert len(np.unique(str_arr)) == 2
        assert x[0].shape == (seq_length, len(x_variables))
        assert len(x) == len(y)
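The final shape assertion implies that each basin's time series is windowed into overlapping blocks of seq_length timesteps. A minimal NumPy sketch of that windowing (an illustration of the idea, not RunoffEngineer's actual implementation; reshape_to_sequences is a hypothetical helper):

import numpy as np

def reshape_to_sequences(series: np.ndarray, seq_length: int) -> np.ndarray:
    # Stack overlapping windows: (n_times, n_features) -> (n_samples, seq_length, n_features)
    n_samples = series.shape[0] - seq_length + 1
    return np.stack([series[i : i + seq_length] for i in range(n_samples)])

dummy = np.random.rand(100, 2)  # e.g. precipitation and peti
windows = reshape_to_sequences(dummy, seq_length=10)
assert windows.shape == (91, 10, 2)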
Example #2
def train(
    data_dir: Path,
    basins: List[str],
    train_dates: List[int],
    with_basin_str: bool = True,
    target_var: str = "discharge_spec",
    x_variables: Optional[List[str]] = ["precipitation", "peti"],
    static_variables: Optional[List[str]] = None,
    ignore_static_vars: Optional[List[str]] = None,
    seq_length: int = 365,
    with_static: bool = True,
    concat_static: bool = False,
    seed: int = 10101,
    cache: bool = True,
    batch_size: int = 32,
    num_workers: int = 1,
    hidden_size: int = 256,
    initial_forget_gate_bias: int = 5,
    dropout: float = 0.4,
    use_mse: bool = True,
    learning_rate: float = 1e-3,
    epochs: int = 10,
):
    # Set seeds
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

    basins = get_basins(data_dir)

    # engineer the data for this training run
    _prepare_data(
        data_dir=data_dir,
        basins=basins,
        train_dates=train_dates,
        with_basin_str=with_basin_str,
        target_var=target_var,
        x_variables=x_variables,
        static_variables=static_variables,
        ignore_static_vars=ignore_static_vars,
        seq_length=seq_length,
        with_static=with_static,
        concat_static=concat_static,
    )

    # create dataloader
    data = CamelsH5(
        data_dir=data_dir,
        basins=basins,
        concat_static=concat_static,
        cache=cache,
        with_static=with_static,
        train_dates=train_dates,
    )

    # initialise key parameters of the Model
    input_size_stat = len(data.static_df.columns) if with_static else 0
    dynamic_size = len(data.x_variables)
    if with_static and concat_static:
        input_size_dyn = dynamic_size + input_size_stat
    else:
        input_size_dyn = dynamic_size

    loader = DataLoader(data,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=num_workers)

    model = Model(
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        hidden_size=hidden_size,
        initial_forget_bias=initial_forget_gate_bias,
        dropout=dropout,
        concat_static=concat_static,
        no_static=not with_static,  # inverse with_static
    ).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # define loss function
    if use_mse:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()  # type: ignore

    # reduce the learning rate after every 10 epochs
    learning_rates = {11: 5e-4, 21: 1e-4}

    for epoch in range(1, epochs + 1):
        # set new learning rate
        if epoch in learning_rates:
            for param_group in optimizer.param_groups:  # type: ignore
                param_group["lr"] = learning_rates[epoch]

        train_epoch(model, optimizer, loss_func, loader, epoch, use_mse)

        # save the model
        model_str = _get_model_str(with_static=with_static,
                                   concat_static=concat_static)
        model_path = data_dir / f"models/model_{model_str}_epoch{epoch}.pt"
        model_path.parent.mkdir(exist_ok=True, parents=True)

        model.model_path = model_path

        torch.save(model.state_dict(), str(model_path))

    return model
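NSELoss is referenced in train but not shown here. A sketch in the spirit of the basin-normalized NSE objective of Kratzert et al. (2019), assuming it weights squared errors by each basin's discharge standard deviation (the q_stds stored by the engineer); the class name and eps default are illustrative:

import torch
import torch.nn as nn

class NSELossSketch(nn.Module):
    # Squared error scaled per basin by 1 / (q_std + eps)^2, so wet and dry
    # basins contribute comparably to the gradient.
    def __init__(self, eps: float = 0.1):
        super().__init__()
        self.eps = eps

    def forward(self, y_hat: torch.Tensor, y: torch.Tensor,
                q_stds: torch.Tensor) -> torch.Tensor:
        weights = 1.0 / (q_stds + self.eps) ** 2
        return torch.mean(weights * (y_hat - y) ** 2)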
Example #3
def evaluate(
    data_dir: Path,
    model_path: Path,
    input_size_dyn: int,
    input_size_stat: int,
    val_dates: List[int],
    with_static: bool = True,
    static_variables: Optional[List[str]] = None,
    dropout: float = 0.4,
    concat_static: bool = False,
    hidden_size: int = 256,
    target_var: str = "discharge_spec",
    x_variables: Optional[List[str]] = ["precipitation", "peti"],
    seq_length: int = 365,
):
    """Evaluate the model

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user entered evaluation config

    """
    basins = get_basins(data_dir)

    # get static data (attributes) means/stds
    static_df = load_static_data(
        data_dir=data_dir,
        basins=basins,
        drop_lat_lon=True,
        static_variables=static_variables,
    )

    means = static_df.mean()
    stds = static_df.std()

    # create model
    model = Model(
        input_size_dyn=input_size_dyn,
        input_size_stat=input_size_stat,
        hidden_size=hidden_size,
        dropout=dropout,
        concat_static=concat_static,
        no_static=not with_static,
    ).to(DEVICE)

    # load trained model
    weight_file = model_path
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    val_dates = np.sort(val_dates)
    date_range = pd.date_range(start=f"{val_dates[0]}-01-01",
                               end=f"{val_dates[-1]}-12-31",
                               freq="D")
    results: Dict[str, pd.DataFrame] = {}

    with open(data_dir / "features/normalization_dict.pkl", "rb") as f:
        normalization_dict = pickle.load(f)

    for basin in tqdm.tqdm(basins):
        ds_test = CAMELSCSV(
            data_dir=data_dir,
            basin=basin,
            train_dates=val_dates,
            normalization_dict=normalization_dict,
            is_train=True,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
            seq_length=seq_length,
            with_static=with_static,
            concat_static=concat_static,
        )
        loader = DataLoader(ds_test,
                            batch_size=1024,
                            shuffle=False,
                            num_workers=1)

        preds, obs = evaluate_basin(model,
                                    loader,
                                    normalization_dict=normalization_dict)

        df = pd.DataFrame(
            data={"qobs": obs.flatten(), "qsim": preds.flatten()},
            index=date_range,
        )

        results[basin] = df

    save_eval_results(
        data_dir=data_dir,
        results=results,
        with_static=with_static,
        concat_static=concat_static,
    )
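Once save_eval_results has written the {basin: DataFrame} mapping, a Nash-Sutcliffe efficiency can be computed per basin from the qobs/qsim columns. A sketch of the standard metric (nash_sutcliffe is a hypothetical helper, not a function from this codebase):

import numpy as np
import pandas as pd

def nash_sutcliffe(df: pd.DataFrame) -> float:
    # NSE = 1 - SSE / variance of the observations; 1.0 is a perfect fit.
    obs = df["qobs"].to_numpy()
    sim = df["qsim"].to_numpy()
    return 1.0 - np.sum((sim - obs) ** 2) / np.sum((obs - obs.mean()) ** 2)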
    def test_(self, tmp_path):
        self._initialise_data(tmp_path)

        # SETTINGS
        with_basin_str = True
        train_dates = [2000]
        target_var = "discharge_spec"
        x_variables = ["precipitation", "peti"]
        static_variables = ["pet_mean", "aridity", "p_seasonality"]
        seq_length = 10
        with_static = True
        concat_static = False
        basins = get_basins(tmp_path)
        dropout = 0.4
        hidden_size = 256
        seed = 10101
        cache = True
        use_mse = True
        batch_size = 50
        num_workers = 1
        initial_forget_gate_bias = 5
        learning_rate = 1e-3
        epochs = 1

        model = train_model(
            data_dir=tmp_path,
            basins=basins,
            train_dates=train_dates,
            with_basin_str=with_basin_str,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
            ignore_static_vars=None,
            seq_length=seq_length,
            with_static=with_static,
            concat_static=concat_static,
            dropout=dropout,
            hidden_size=hidden_size,
            seed=seed,
            cache=cache,
            use_mse=use_mse,
            batch_size=batch_size,
            num_workers=num_workers,
            initial_forget_gate_bias=initial_forget_gate_bias,
            learning_rate=learning_rate,
            epochs=epochs,
        )

        input_size_dyn = model.input_size_dyn
        input_size_stat = model.input_size_stat
        model_path = model.model_path

        evaluate_model(
            data_dir=tmp_path,
            model_path=model_path,
            input_size_dyn=input_size_dyn,
            input_size_stat=input_size_stat,
            val_dates=train_dates,
            with_static=with_static,
            static_variables=static_variables,
            dropout=dropout,
            concat_static=concat_static,
            hidden_size=hidden_size,
            target_var=target_var,
            x_variables=x_variables,
            seq_length=seq_length,
        )

        # is the data directory correctly formatted?
        dirs = ["features", "models", "interim", "raw"]
        assert all(np.isin(dirs, [d.name for d in tmp_path.iterdir()]))

        # are the models / predictions saved properly?
        results_pkl = list((tmp_path / "models").glob("*.pkl"))[0]
        assert "ealstm_results.pkl" in results_pkl.name
        assert "ealstm" in [f.name for f in (tmp_path / "models").glob("*.pt")][0]

        # check that all basins are found as keys in results Dict
        with open(results_pkl, "rb") as f:
            results = pickle.load(f)
        assert all(np.isin(basins, list(results.keys())))
    def test(self, tmp_path):
        _copy_runoff_data_to_tmp_path(tmp_path)

        processor = CAMELSGBPreprocessor(tmp_path, open_shapefile=False)
        processor.preprocess()

        # SETTINGS
        train_dates = [2000, 2010]
        target_var = "discharge_spec"
        x_variables = ["precipitation", "peti"]
        static_variables = ["pet_mean", "aridity", "p_seasonality"]
        seq_length = 10
        with_static = True
        is_train = True
        concat_static = False

        # DERIVED Values
        n_times = len(
            pd.date_range(
                f"{train_dates[0]}-01-01", f"{train_dates[-1]}-12-31", freq="D"
            )
        )
        n_features = len(x_variables)
        n_stations = 2
        n_static_features = len(static_variables)

        normalization_dict = CalculateNormalizationParams(
            data_dir=tmp_path,
            train_dates=train_dates,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
        ).normalization_dict

        assert len(list(get_basins(tmp_path))) == n_stations

        for basin in get_basins(tmp_path):
            dataset = CAMELSCSV(
                data_dir=tmp_path,
                basin=basin,
                train_dates=train_dates,
                normalization_dict=normalization_dict,
                is_train=is_train,
                target_var=target_var,
                x_variables=x_variables,
                static_variables=static_variables,
                seq_length=seq_length,
                with_static=with_static,
                concat_static=concat_static,
            )
            x = dataset.x
            y = dataset.y
            static = dataset.attributes
            scaler = dataset.normalization_dict

            assert x.shape == (n_times, seq_length, n_features)
            assert y.shape == (n_times, 1)
            assert static.shape == (1, n_static_features)

            expected = [
                "static_means",
                "static_stds",
                "target_mean",
                "target_std",
                "dynamic_stds",
                "dynamic_means",
                "x_variables",
                "target_var",
                "static_variables",
            ]
            assert all(
                np.isin(list(scaler.keys()), expected)
            ), f"Expected: {expected} Got: {list(scaler.keys())}"
    def test(self, tmp_path, with_static, concat_static):
        self._initialise_data(tmp_path)

        # SETTINGS
        with_basin_str = True
        train_dates = [2000, 2002]
        target_var = "discharge_spec"
        x_variables = ["precipitation", "peti"]
        static_variables = ["pet_mean", "aridity", "p_seasonality"]
        seq_length = 5
        # with_static and concat_static come from the test parametrization
        basins = get_basins(tmp_path)

        # EXPECTED
        out_file = tmp_path / "features/features.h5"
        static_data_path = tmp_path / "interim/static/data.nc"
        n_variables = len(x_variables)
        n_static_features = len(static_variables)

        # INITIALIZE
        engineer = RunoffEngineer(
            data_dir=tmp_path,
            basins=basins,
            train_dates=train_dates,
            with_basin_str=with_basin_str,
            target_var=target_var,
            x_variables=x_variables,
            static_variables=static_variables,
            ignore_static_vars=None,
            seq_length=seq_length,
            with_static=with_static,
            concat_static=concat_static,
        )

        engineer.create_training_data()

        data = CamelsH5(
            data_dir=tmp_path,
            basins=basins,
            concat_static=concat_static,
            cache=True,
            with_static=with_static,
            train_dates=train_dates,
        )

        iterate = list(data)
        assert (
            len(iterate) == ((3 * 365) + 1) * 2
        ), "Should be 3 years (365 days each) + 1 leap day, for two basins"

        assert data.h5_file == out_file
        assert data.static_data_path == static_data_path

        for index in [0, -1]:
            x = data[index][0]
            q_stds = data[index][-2]
            y = data[index][-1]

            assert q_stds.numpy().shape == (1,)
            assert y.numpy().shape == (1,)

            if with_static and not concat_static:
                static = data[index][1]
                assert len(data[index]) == 4
                assert x.shape == (seq_length, n_variables)
                assert static.shape == (1, n_static_features)

            if with_static and concat_static:
                assert len(data[index]) == 3
                assert x.shape == (seq_length, n_variables + n_static_features)

            if not with_static:
                assert len(data[index]) == 3
                assert x.shape == (seq_length, n_variables)

        assert data.static_variables == static_variables
        assert data.target_var == target_var

        loader = DataLoader(data, batch_size=32, shuffle=True, num_workers=1)
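Given the tuple layouts asserted above, batches unpack differently per configuration. A sketch for the with_static and not concat_static branch, where each sample is (x, static, q_stds, y):

for x, static, q_stds, y in loader:
    # x: (batch, seq_length, n_variables)
    # static: (batch, 1, n_static_features)
    # q_stds, y: (batch, 1)
    break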