Example 1
    def test_mask(self, tmp_path):

        for i in range(5):
            (tmp_path /
             f"features/one_month_forecast/train/{i}").mkdir(parents=True)
            (tmp_path / f"features/one_month_forecast/train/{i}/x.nc").touch()
            (tmp_path / f"features/one_month_forecast/train/{i}/y.nc").touch()

        mask_train = [True, True, False, True, False]
        mask_val = [False, False, True, False, True]

        train_paths = DataLoader._load_datasets(
            tmp_path,
            mode="train",
            experiment="one_month_forecast",
            shuffle_data=True,
            mask=mask_train,
        )
        val_paths = DataLoader._load_datasets(
            tmp_path,
            mode="train",
            experiment="one_month_forecast",
            shuffle_data=True,
            mask=mask_val,
        )
        assert (len(set(train_paths).intersection(set(val_paths))) == 0
                ), "Got the same file in both train and val set!"
        assert len(train_paths) + len(val_paths) == 5, "Not all files loaded!"
Example 2
    def test_mask(self, tmp_path):

        for i in range(5):
            (tmp_path /
             f'features/one_month_forecast/train/{i}').mkdir(parents=True)
            (tmp_path / f'features/one_month_forecast/train/{i}/x.nc').touch()
            (tmp_path / f'features/one_month_forecast/train/{i}/y.nc').touch()

        mask_train = [True, True, False, True, False]
        mask_val = [False, False, True, False, True]

        train_paths = DataLoader._load_datasets(
            tmp_path,
            mode='train',
            experiment='one_month_forecast',
            shuffle_data=True,
            mask=mask_train)
        val_paths = DataLoader._load_datasets(tmp_path,
                                              mode='train',
                                              experiment='one_month_forecast',
                                              shuffle_data=True,
                                              mask=mask_val)
        assert len(set(train_paths).intersection(set(val_paths))) == 0, \
            'Got the same file in both train and val set!'
        assert len(train_paths) + len(val_paths) == 5, 'Not all files loaded!'
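
The train/val masks in these two examples are written out by hand as complementary boolean lists over the five folders. Example 4 below obtains them from a train_val_mask helper instead; the sketch that follows is only an assumption about how such a helper could behave (make_train_val_mask is a hypothetical stand-in, not the repository's function).

import random
from typing import List, Tuple


def make_train_val_mask(num_items: int, val_ratio: float) -> Tuple[List[bool], List[bool]]:
    """Return complementary (train, val) boolean masks over num_items entries.

    Hypothetical stand-in for the train_val_mask helper used in Example 4.
    """
    num_val = max(1, int(num_items * val_ratio))
    val_indices = set(random.sample(range(num_items), num_val))
    val_mask = [i in val_indices for i in range(num_items)]
    train_mask = [not flag for flag in val_mask]
    return train_mask, val_mask


train_mask, val_mask = make_train_val_mask(5, val_ratio=0.4)
assert all(t != v for t, v in zip(train_mask, val_mask))  # every folder lands in exactly one split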
Example 3
    def test_pred_months(self, tmp_path):
        for i in range(1, 13):
            (tmp_path /
             f"features/one_month_forecast/train/2018_{i}").mkdir(parents=True)
            (tmp_path /
             f"features/one_month_forecast/train/2018_{i}/x.nc").touch()
            (tmp_path /
             f"features/one_month_forecast/train/2018_{i}/y.nc").touch()

        pred_months = [4, 5, 6]

        train_paths = DataLoader._load_datasets(
            tmp_path,
            mode="train",
            shuffle_data=True,
            pred_months=pred_months,
            experiment="one_month_forecast",
        )

        assert len(train_paths) == len(
            pred_months
        ), f"Got {len(train_paths)} filepaths back, expected {len(pred_months)}"

        for return_file in train_paths:
            subfolder = return_file.parts[-1]
            month = int(str(subfolder)[5:])
            assert (month in pred_months
                    ), f"{month} not in {pred_months}, got {return_file}"
Example 4
model = GBDT(
    data_dir, include_pred_month=False,
    experiment='one_month_forecast', include_monthly_aggs=False
)
model.train()
model.evaluate(save_preds=True)

# ------------------------
# Into the weeds
# ------------------------

# ------------------------
# DATA LOADER
# ------------------------
len_mask = len(DataLoader._load_datasets(
    data_path, mode='train',
    shuffle_data=False, experiment='one_month_forecast'
))

# validation split = 10%
val_split = 0.1
train_mask, val_mask = train_val_mask(len_mask, val_split)

ignore_vars = []
train_dataloader = DataLoader(data_path=model.data_path,
                              batch_file_size=model.batch_size,
                              experiment=model.experiment,
                              shuffle_data=True, mode='train',
                              pred_months=model.pred_months,
                              mask=train_mask,
                              ignore_vars=model.ignore_vars,
                              monthly_aggs=model.include_monthly_aggs)

# l.dense_layers
layer_sizes = [100]
dense_layers = nn.ModuleList([
    LinearBlock(in_features=layer_sizes[i - 1],
                out_features=layer_sizes[i],
                dropout=dropout) for i in range(1, len(layer_sizes))
])

# final dense
# l.final_dense

final_dense = nn.Linear(in_features=layer_sizes[-1], out_features=1)

# DataLoader
len_mask = len(
    DataLoader._load_datasets(data_path, mode="train", shuffle_data=False))
train_mask, val_mask = train_val_mask(len_mask, 0.3)
batch_size = 256
# batch_size=5
train_dataloader = DataLoader(
    data_path=data_path,
    batch_file_size=batch_size,
    shuffle_data=True,
    mode="train",
    mask=train_mask,
    to_tensor=True,
)
val_dataloader = DataLoader(
    data_path=data_path,
    batch_file_size=batch_size,
    shuffle_data=True,
    mode="train",
    mask=val_mask,
    to_tensor=True,
)
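
The dense_layers snippet above assumes a LinearBlock module from the repository. A minimal sketch, assuming it composes Linear, ReLU, and Dropout (the real class in src.models may add batch norm or a different activation):

import torch
from torch import nn


class LinearBlock(nn.Module):
    """Hypothetical Linear -> ReLU -> Dropout block matching the call signature above."""

    def __init__(self, in_features: int, out_features: int, dropout: float = 0.25) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.dropout(self.relu(self.linear(x)))

Note that with layer_sizes = [100], range(1, len(layer_sizes)) is empty, so the nn.ModuleList built above contains no blocks; the model presumably prepends the input dimension to layer_sizes before constructing its hidden layers.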
Example 6
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.models import LinearRegression, LinearNetwork, Persistence
from src.models.data import DataLoader

data_path = Path("data")
l = LinearRegression(data_path)
l.train()

ln = LinearNetwork(layer_sizes=[100], data_folder=data_path)
ln.train(num_epochs=10)

# ------------------------------------------------------------------------------
# try and explain the LinearRegression model
# ------------------------------------------------------------------------------
test_arrays_loader = DataLoader(
    data_path=data_path, batch_file_size=1, shuffle_data=False, mode="test"
)
key, val = list(next(iter(test_arrays_loader)).items())[0]
explanations = l.explain(val.x)

# plot the SHAP explanations

# 1. mean spatial and temporal response
mean_expl = explanations.mean(axis=0).mean(axis=0)
x_vars = val.x_vars
df = pd.DataFrame(dict(variables=x_vars, values=mean_expl))

sns.barplot(x="variables", y="values", data=df)
fig = plt.gcf()
plt.title(f"{key} {val.y_var} mean SHAP Values for Linear Regression")
fig.savefig("scripts/mean_variable_importance_linear_regression.png", dpi=300)