def engineer(data_path, experiment='one_month_forecast', process_static=True,
             pred_months=12, expected_length=None):
    """Run the feature engineer for test year 2018 with ``VHI`` as the target.

    Args:
        data_path: data folder handed to ``Engineer``.
        experiment: experiment name handed to ``Engineer``.
        process_static: forwarded to ``Engineer``.
        pred_months: forwarded to ``Engineer.engineer`` as ``pred_months``.
        expected_length: forwarded to ``Engineer.engineer`` as
            ``expected_length``. When left as ``None`` it falls back to
            ``pred_months``, which is the value that was always passed
            before this argument was honoured.
    """
    # Previously this argument was accepted but ignored; the fallback keeps
    # the old result for every caller that does not pass it explicitly.
    if expected_length is None:
        expected_length = pred_months

    feature_engineer = Engineer(
        data_path, experiment=experiment, process_static=process_static
    )
    feature_engineer.engineer(
        test_year=2018, target_variable='VHI',
        pred_months=pred_months, expected_length=expected_length,
    )
Beispiel #2
0
def engineer_static():
    """Engineer only the static features, choosing the data folder from the cwd."""
    # When the current working directory is itself named ml_drought the data
    # folder is ./data; from any other directory ../data is used instead.
    in_ml_drought = Path.cwd().name == 'ml_drought'
    data_path = Path('data') if in_ml_drought else Path('../data')

    Engineer.engineer_static_only(data_path)
Beispiel #3
0
def engineer(pred_months=3, target_var="VCI1M"):
    """Engineer one-month-forecast features with 2016-2018 as the test years.

    ``pred_months`` is passed on both as ``pred_months`` and as
    ``expected_length``; the ``Engineer`` is built with
    ``process_static=False``.
    """
    feature_engineer = Engineer(
        get_data_path(), experiment="one_month_forecast", process_static=False
    )
    held_out_years = list(range(2016, 2019))
    feature_engineer.engineer(
        test_year=held_out_years,
        target_variable=target_var,
        pred_months=pred_months,
        expected_length=pred_months,
    )
Beispiel #4
0
def eng_strato():
    """Run the ``strato`` experiment engineer for test year 2018, target ``u``."""
    # When the current working directory is itself named ml_drought the data
    # folder is ./data; from any other directory ../data is used instead.
    in_ml_drought = Path.cwd().name == 'ml_drought'
    data_path = Path('data') if in_ml_drought else Path('../data')

    strato_engineer = Engineer(data_path, experiment='strato')
    strato_engineer.engineer(test_year=2018, target_variable='u')
    def test_init(self, tmp_path):

        with pytest.raises(AssertionError) as e:
            Engineer(tmp_path)
            assert "does not exist. Has the preprocesser been run?" in str(e)

        (tmp_path / "interim").mkdir()

        Engineer(tmp_path)

        assert (tmp_path / "features").exists(), "Features directory not made!"
        assert (tmp_path / "features" / "one_month_forecast").exists(), "\
Beispiel #6
0
def engineer(experiment="one_month_forecast",
             process_static=True,
             pred_months=12):
    """Engineer features for ``VCI`` with 2011-2018 as the test years.

    ``pred_months`` is passed on both as ``pred_months`` and as
    ``expected_length``.
    """
    feature_engineer = Engineer(
        get_data_path(), experiment=experiment, process_static=process_static
    )
    held_out_years = list(range(2011, 2019))
    feature_engineer.engineer(
        test_year=held_out_years,
        target_variable="VCI",
        pred_months=pred_months,
        expected_length=pred_months,
    )
Beispiel #7
0
def engineer(experiment='one_month_forecast', process_static=True,
             pred_months=12):
    """Engineer features for ``VCI`` with 2018 as the test year.

    The data folder is chosen from the current working directory, and
    ``pred_months`` is passed on both as ``pred_months`` and as
    ``expected_length``.
    """
    # When the current working directory is itself named ml_drought the data
    # folder is ./data; from any other directory ../data is used instead.
    in_ml_drought = Path.cwd().name == 'ml_drought'
    data_path = Path('data') if in_ml_drought else Path('../data')

    feature_engineer = Engineer(
        data_path, experiment=experiment, process_static=process_static
    )
    feature_engineer.engineer(
        test_year=2018, target_variable='VCI',
        pred_months=pred_months, expected_length=pred_months,
    )
    def test_engineer(self, tmp_path):
        """Run the engineer end to end and check the files it writes.

        Checks, for every month of the 2001 test year and the 2000 training
        year, that ``x.nc`` holds both variables with ``expected_length``
        time steps and that ``y.nc`` holds a single time step without ``b``.
        Also checks that no 2001 folder ends up under ``train`` and that the
        pickled normalizing dict has mean 1 and std 0 for each key.
        """
        _setup(tmp_path)

        pred_months = expected_length = 11

        engineer = Engineer(tmp_path)
        engineer.engineer(
            test_year=2001,
            target_variable="a",
            pred_months=pred_months,
            expected_length=expected_length,
        )

        experiment_dir = tmp_path / "features/one_month_forecast"

        def check_folder(folder_path):
            # Open the files as context managers so they are closed again
            # once the checks for this folder are done.
            with xr.open_dataset(folder_path / "y.nc") as y:
                assert "b" not in set(
                    y.variables), "Got unexpected variables in test set"

                with xr.open_dataset(folder_path / "x.nc") as x:
                    for expected_var in ("a", "b"):
                        assert expected_var in set(
                            x.variables
                        ), "Missing variables in testing input dataset"
                    assert (
                        len(x.time.values) == expected_length
                    ), "Wrong number of months in the test x dataset"

                assert len(
                    y.time.values) == 1, "Wrong number of months in test y dataset"

        for month in range(1, 13):
            check_folder(experiment_dir / "test" / f"2001_{month}")
            check_folder(experiment_dir / "train" / f"2000_{month}")

        leaked_folders = list((experiment_dir / "train").glob("2001_*"))
        assert len(leaked_folders) == 0, "Test data in the training data!"

        norm_dict_path = experiment_dir / "normalizing_dict.pkl"
        assert norm_dict_path.exists(), "Normalizing dict not saved!"
        # The pickle was written by the engineer run above.
        with norm_dict_path.open("rb") as f:
            norm_dict = pickle.load(f)

        for key, stats in norm_dict.items():
            assert key in {"a", "b"}, "Unexpected key!"
            assert stats["mean"] == 1, "Mean incorrectly calculated!"
            assert stats["std"] == 0, "Std incorrectly calculated!"
    def test_get_preprocessed(self, tmp_path, monkeypatch):
        """Check ``_get_preprocessed_files`` returns the files made by ``_setup``."""
        # Only the file list is needed here; the variable names are dropped.
        expected_files, _ = _setup(tmp_path)

        def mock_init(self, data_folder):
            # Replacement constructor that sets only these two attributes.
            self.name = "dummy"
            self.interim_folder = data_folder / "interim"

        monkeypatch.setattr(Engineer, "__init__", mock_init)

        engineer = Engineer(tmp_path)
        files = engineer._get_preprocessed_files(static=False)

        assert set(expected_files) == set(
            files), "Did not retrieve expected files!"
    def test_static(self, tmp_path):
        """Check static processing writes ``features/static/data.nc`` with all variables."""
        _, expected_vars = _setup(tmp_path, add_times=False, static=True)
        engineer = Engineer(tmp_path, process_static=True)

        static_folder = tmp_path / "features/static"
        assert static_folder.exists(), "Static output folder does not exist!"

        engineer.process_static()

        output_file = static_folder / "data.nc"
        # This assertion is about the file, not the folder checked above.
        assert output_file.exists(), "Static output file does not exist!"

        # Context manager closes the dataset once the variables are checked.
        with xr.open_dataset(output_file) as static_data:
            for var in expected_vars:
                assert var in static_data.data_vars, (
                    f"Missing static variable: {var}"
                )
def engineer(
    pred_months=3,
    target_var="boku_VCI",
    process_static=False,
    global_means: bool = True,
    log_vars: Optional[List[str]] = None,
):
    """Engineer one-month-forecast features with 2016-2018 as the test years.

    ``pred_months`` is passed on both as ``pred_months`` and as
    ``expected_length``; ``global_means`` is forwarded unchanged.

    NOTE(review): ``log_vars`` is accepted but not used anywhere in this
    function.
    """
    feature_engineer = Engineer(
        get_data_path(),
        experiment="one_month_forecast",
        process_static=process_static,
    )
    held_out_years = list(range(2016, 2019))
    feature_engineer.engineer(
        test_year=held_out_years,
        target_variable=target_var,
        pred_months=pred_months,
        expected_length=pred_months,
        global_means=global_means,
    )
    def test_yearsplit(self, tmp_path):
        """Check the training split contains no time steps from 2001 onwards."""
        _setup(tmp_path)

        dataset, _, _ = _make_dataset(size=(2, 2))

        split_kwargs = dict(
            years=[2001],
            target_variable="VHI",
            pred_months=11,
            expected_length=11,
        )
        train = Engineer(tmp_path)._train_test_split(dataset, **split_kwargs)

        first_test_day = np.datetime64("2001-01-01")
        all_before_test_year = (train.time.values < first_test_day).all()
        assert all_before_test_year, \
            "Got years greater than the test year in the training set!"
    def test_join(self, tmp_path, monkeypatch):
        """Check ``_make_dataset`` yields exactly the variables made by ``_setup``."""
        # Only the variable names are needed here; the file list is dropped.
        _, expected_vars = _setup(tmp_path)

        def mock_init(self, data_folder):
            # Replacement constructor that sets only these two attributes.
            self.name = "dummy"
            self.interim_folder = data_folder / "interim"

        monkeypatch.setattr(Engineer, "__init__", mock_init)

        engineer = Engineer(tmp_path)
        joined_ds = engineer._make_dataset(static=False)

        # Everything that is not one of these names counts as an output variable.
        dims = ["lon", "lat", "time"]
        output_vars = [var for var in joined_ds.variables if var not in dims]

        assert set(output_vars) == set(
            expected_vars), "Did not retrieve all the expected variables!"
    def test_stratify(self, tmp_path):
        _setup(tmp_path)
        engineer = Engineer(tmp_path)
        ds_target, _, _ = _make_dataset(size=(20, 20))
        ds_predictor, _, _ = _make_dataset(size=(20, 20))
        ds_predictor = ds_predictor.rename({"VHI": "predictor"})
        ds = ds_predictor.merge(ds_target)

        xy_dict, max_train_date = engineer._stratify_xy(
            ds=ds,
            year=2001,
            target_variable="VHI",
            target_month=1,
            pred_months=4,
            expected_length=4,
        )

        assert (
            xy_dict["x"].time.size == 4), f'OneMonthForecast experiment `x`\
        should have 4 times Got: {xy_dict["x"].time.size}'

        assert (max_train_date == dt.datetime(2000, 12, 31).date()), f"\
Beispiel #15
0
def engineer_static():
    """Engineer only the static features under the path from ``get_data_path``."""
    data_path = get_data_path()
    Engineer.engineer_static_only(data_path)
Beispiel #16
0
 def engineer(self, engineer_args: Dict) -> None:
     """Build an ``Engineer`` from ``engineer_args`` and run it.

     ``engineer_args["init_args"]`` is modified in place: its
     ``data_folder`` entry is set to ``self.data`` before it is unpacked
     into the ``Engineer`` constructor. ``engineer_args["run_args"]`` is
     unpacked into ``Engineer.engineer``.
     """
     data_folder = self.data
     init_kwargs = engineer_args["init_args"]
     init_kwargs["data_folder"] = data_folder

     runner = Engineer(**init_kwargs)
     runner.engineer(**engineer_args["run_args"])
def run_training_period_experiments(pred_months: int = 3):
    """Engineer features and run the models for each train/test period experiment.

    For every selected ``Experiment`` this picks the train and test years,
    re-runs the engineer for those years, runs the models, writes an
    ``experiment.json`` summary and renames the ``features`` experiment
    directory.

    Args:
        pred_months: forwarded to the engineer as ``pred_months`` and also
            used as ``expected_length``.
    """
    expected_length = pred_months

    # Read the target data
    print("** Reading the target data **")
    data_dir = get_data_path()
    target_data = xr.open_dataset(data_dir / "interim" / "VCI_preprocessed" /
                                  "data_kenya.nc")
    # sort by the annual median (across pixels/time)
    print("** Sorting the target data **")
    sorted_years, _ = sort_by_median_target_variable(target_data)
    print(f"** sorted_years: {sorted_years} **")
    print(f"** min_year: {min(sorted_years)} max_year: {max(sorted_years)} **")

    # create all experiments
    # train_hilo(9), test_hilo(3), train_length(1)
    print("** Creating all experiments **")
    hilos = ["high", "med", "low"]
    train_lengths = [5, 10, 20]
    experiments = [
        Experiment(train_length=train_length,
                   train_hilo=train_hilo,
                   test_hilo=test_hilo)
        for train_hilo, test_hilo, train_length in itertools.product(
            hilos, hilos, train_lengths)
    ]

    # Loop-invariant switch for the verbose per-experiment printout below.
    debug = True

    print("** Running all experiments **")
    # NOTE(review): the first seven experiments are skipped by this slice;
    # confirm whether that is intentional or a leftover from a resumed run.
    for experiment in experiments[7:]:
        test_years, train_years = get_experiment_years(
            sorted_years,
            experiment.train_length,
            experiment.test_hilo,
            experiment.train_hilo,
            test_length=3,
        )

        if debug:
            print(
                "\n" + "-" * 10 + "\n",
                "train_length: " + str(experiment.train_length),
                "test_hilo: " + experiment.test_hilo,
                "train_hilo: " + experiment.train_hilo,
                "\ntrain_years:\n",
                train_years,
                "\n",
                "test_years:\n",
                test_years,
                "\n" + "-" * 10 + "\n",
            )

        # have to recreate each engineer for the experiment
        # TODO: definite inefficiency should this be in DataLoader?
        engineer = Engineer(
            get_data_path(),
            experiment="one_month_forecast",
            process_static=True,
            different_training_periods=True,
        )
        engineer.engineer_class.engineer(
            test_year=test_years,  # defined by experiment
            train_years=train_years,  # defined by experiment
            pred_months=pred_months,  # 3 by default
            expected_length=expected_length,  # == pred_month by default
            target_variable="VCI",
        )

        # TODO:
        # add extra years if selected the first year in timeseries (often not 12months)
        # e.g. 1981_11 is the first valid month in our dataset

        # Run the models
        always_ignore_vars = ["ndvi", "p84.162", "sp", "tp", "Eb"]
        ignore_vars = always_ignore_vars
        run_experiments(
            train_hilo=experiment.train_hilo,
            test_hilo=experiment.test_hilo,
            train_length=len(train_years),
            ignore_vars=ignore_vars,
            run_regression=False,
            all_models=False,
            static=True,
        )

        # save some key facts about the experiment to an experiment.json file
        expt_dict = dict(
            train_hilo=experiment.train_hilo,
            test_hilo=experiment.test_hilo,
            train_length=len(train_years),
            ignore_vars=ignore_vars,
            train_years=train_years,
            test_years=test_years,
        )
        # json.dump writes str, so the file must be opened in text mode;
        # binary mode ("wb") raises TypeError on the first write.
        with open(data_dir / "models/one_month_forecast/experiment.json",
                  "w") as fp:
            json.dump(expt_dict, fp, sort_keys=True, indent=4)

        # rename the features/one_month_forecast directory
        rename_experiment_dir(
            data_dir,
            train_hilo=experiment.train_hilo,
            test_hilo=experiment.test_hilo,
            train_length=len(train_years),
            dir_="features",
        )
Beispiel #18
0
import numpy as np
from collections import defaultdict
import calendar
from datetime import datetime, date
from pathlib import Path
import xarray as xr

from typing import cast, Dict, List, Optional, Union, Tuple
from typing import DefaultDict as DDict

from src.engineer import Engineer
from src.preprocess.base import BasePreProcessor

data_path = Path("/Volumes/Lees_Extend/data/ecmwf_sowc/data")
engineer = Engineer(data_path)
engineer.engineer(
    test_year=1990, target_variable="VHI", pred_months=3, expected_length=3
)

# wrong shapes!
# Open every preprocessed file and pick, per file, the first variable that
# is not one of that file's dimensions.
datasets = engineer._get_preprocessed_files()
ds_list = [xr.open_dataset(file_path) for file_path in datasets]
dims_list = [list(opened.dims) for opened in ds_list]
variable_list = [
    [name for name in opened.variables if name not in dims][0]
    for opened, dims in zip(ds_list, dims_list)
]
da_list = [opened[name] for opened, name in zip(ds_list, variable_list)]

# Joined dataset and the year list kept at module level for inspection
ds = engineer._make_dataset()
years = [1990]
Beispiel #19
0
engineer._stratify_training_data
engineer._train_test_split
engineer.stratify_xy
engineer.get_datetime
engineer._save
"""

import xarray as xr
import numpy as np
from pathlib import Path
from src.preprocess.base import BasePreProcessor

from src.engineer import Engineer

data_path = Path("/Volumes/Lees_Extend/data/ecmwf_sowc/data")
engineer = Engineer(data_path)
# The full engineer run is left disabled here:
# engineer.engineer(test_year=1994, target_variable='VHI')

# wrong shapes!
# Open every preprocessed file and pick, per file, the first variable that
# is not one of that file's dimensions.
datasets = engineer._get_preprocessed_files()
ds_list = [xr.open_dataset(file_path) for file_path in datasets]
dims_list = [list(opened.dims) for opened in ds_list]
variable_list = [
    [name for name in opened.variables if name not in dims][0]
    for opened, dims in zip(ds_list, dims_list)
]
da_list = [opened[name] for opened, name in zip(ds_list, variable_list)]

pp = BasePreProcessor(data_path)
# Keep the first three opened datasets under short names
c_ds, e_ds, v_ds = ds_list[0], ds_list[1], ds_list[2]