def test_new_group_ids(test_data, kwargs):
    """Test for new group ids in dataset"""
    train_agency = test_data["agency"].iloc[0]
    train_dataset = TimeSeriesDataSet(
        test_data[lambda x: x.agency == train_agency],
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        categorical_encoders=dict(agency=NaNLabelEncoder(add_nan=True),
                                  sku=NaNLabelEncoder(add_nan=True)),
        **kwargs,
    )

    # test sampling from training dataset
    next(iter(train_dataset.to_dataloader()))

    # create test dataset with group ids that have not been observed before
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_data)

    # check that we can iterate through dataset without error
    for _ in iter(test_dataset.to_dataloader()):
        pass
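The test above works because both group-id encoders are built with add_nan=True, so categories never seen during fitting map to the reserved NaN class instead of raising. A minimal standalone sketch of that behaviour:

import numpy as np
from pytorch_forecasting.data import NaNLabelEncoder

# add_nan=True reserves index 0 for NaN/unknown categories
encoder = NaNLabelEncoder(add_nan=True, warn=False)
encoder.fit(np.array(["agency_a", "agency_b"]))

# a category not seen during fit is encoded as the NaN class (0)
assert encoder.transform(np.array(["agency_new"]))[0] == 0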
def test_NaNLabelEncoder_add():
    encoder = NaNLabelEncoder(add_nan=False)
    encoder.fit(np.array(["a", "b", "c"]))
    encoder2 = deepcopy(encoder)
    encoder2.fit(np.array(["d"]))
    assert encoder2.transform(np.array(["a"]))[0] == 0, "a must be encoded as 0"
    assert encoder2.transform(np.array(["d"]))[0] == 3, "d must be encoded as 3"
Example #3
def from_synthetic_ar_data(
    seasonality: float = 10.0,
    timesteps: int = 400,
    n_series: int = 100,
    max_encoder_length: int = 60,
    max_prediction_length: int = 20,
    batch_size: int = 4,
    num_workers: int = 0,
    **time_series_dataset_kwargs,
) -> TabularForecastingData:
    """Creates and loads a synthetic Auto-Regressive (AR) data set."""
    data = generate_ar_data(seasonality=seasonality,
                            timesteps=timesteps,
                            n_series=n_series,
                            seed=42)
    data["date"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(
        data.time_idx, "D")

    training_cutoff = data["time_idx"].max() - max_prediction_length

    return TabularForecastingData.from_data_frame(
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        # only unknown variable is "value" - and N-Beats can also not take any additional variables
        time_varying_unknown_reals=["value"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
        val_data_frame=data,
        batch_size=batch_size,
        num_workers=num_workers,
        **time_series_dataset_kwargs,
    )
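For reference, a minimal sketch of wiring this helper into the TabularForecaster used elsewhere in this listing (it mirrors the fast_dev_run example below):

datamodule = from_synthetic_ar_data(timesteps=200, n_series=10, batch_size=8)
model = TabularForecaster(
    datamodule.parameters,
    backbone="n_beats",
    backbone_kwargs={"widths": [32, 512], "backcast_loss_ratio": 0.1},
)
# fast_dev_run pushes a single batch through training and validation as a smoke test
trainer = flash.Trainer(max_epochs=1, fast_dev_run=True)
trainer.fit(model, datamodule=datamodule)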
Example #4
def test_testing_raises(sample_data):
    """Tests that ``NotImplementedError`` is raised when attempting to perform a test pass."""
    data, training_cutoff, max_prediction_length = sample_data
    datamodule = TabularForecastingData.from_data_frame(
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        time_varying_unknown_reals=["value"],
        max_encoder_length=60,
        max_prediction_length=max_prediction_length,
        train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
        test_data_frame=data,
        batch_size=4,
    )

    model = TabularForecaster(
        datamodule.parameters,
        backbone="n_beats",
        backbone_kwargs={
            "widths": [32, 512],
            "backcast_loss_ratio": 0.1
        },
    )
    trainer = flash.Trainer(max_epochs=1,
                            fast_dev_run=True,
                            gradient_clip_val=0.01)

    with pytest.raises(
        NotImplementedError,
        match="Backbones provided by PyTorch Forecasting don't support testing.",
    ):
        trainer.test(model, datamodule=datamodule)
Example #5
def test_fast_dev_run_smoke(sample_data):
    """Test that fast dev run works with the NBeats example data."""
    data, training_cutoff, max_prediction_length = sample_data
    datamodule = TabularForecastingData.from_data_frame(
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        time_varying_unknown_reals=["value"],
        max_encoder_length=60,
        max_prediction_length=max_prediction_length,
        train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
        val_data_frame=data,
        batch_size=4,
    )

    model = TabularForecaster(
        datamodule.parameters,
        backbone="n_beats",
        backbone_kwargs={
            "widths": [32, 512],
            "backcast_loss_ratio": 0.1
        },
    )

    trainer = flash.Trainer(max_epochs=1,
                            fast_dev_run=True,
                            gradient_clip_val=0.01)
    trainer.fit(model, datamodule=datamodule)

def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]

    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
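The fixture returns plain PyTorch dataloaders, so a batch can be inspected directly; a minimal sketch:

# each batch is an (x, y) pair; x is a dict of encoder/decoder tensors
# (see example_forward_input further down for the full set of keys)
loaders = dataloaders_fixed_window_without_covariates()
x, y = next(iter(loaders["train"]))
print(x["encoder_cont"].shape)  # (batch_size, encoder_length, n_features)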
    def save_time_series(self):
        """Downloads the preprocessing script and creates data in a format suited for temporal fusion models."""
        PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
        FILE_PATH = 'data/preprocess_data.py'
        DATA_PATH = 'data/data.csv'
        FEATURES = ['dose', 'time']
        GROUP_ID = 'series'
        
        # Data file already exists so we don't need to generate it
        if os.path.isfile(DATA_PATH):
            return
        
        # Preprocessing file already exists so we don't need to download it again
        if not os.path.isfile(FILE_PATH):
            wget.download(PREPROCESS_URL, FILE_PATH)
        
        os.system('python ' + FILE_PATH)
        
        dataset = pd.read_csv(DATA_PATH)

        n = dataset[GROUP_ID].astype(int).max()

        dataset['target'] = dataset['target'].astype(float)

        dataset['time_idx'] = dataset['time_idx'].astype(int)

        training = TimeSeriesDataSet(
            dataset[dataset[GROUP_ID].apply(lambda x: int(x) < int(n * 0.7))],
            time_idx='time_idx',
            target='target',
            group_ids=[GROUP_ID],
            min_encoder_length=20,  
            max_encoder_length=20,
            min_prediction_length=1,
            max_prediction_length=1,
            static_categoricals=[],
            static_reals=[],
            time_varying_known_categoricals=[],
            variable_groups={},
            time_varying_known_reals=['time_idx'],
            time_varying_unknown_categoricals=[],
            time_varying_unknown_reals=['target'] + FEATURES,
            add_relative_time_idx=True,
            add_target_scales=False,
            add_encoder_length=True,
            categorical_encoders={GROUP_ID: NaNLabelEncoder().fit(dataset.series)},
        )
        
        training.save(self.TIMESERIES_PATH)
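The saved dataset can later be restored with TimeSeriesDataSet.load; a minimal sketch (TIMESERIES_PATH stands in for the same path attribute used above):

# restore the dataset saved by save_time_series and build a dataloader from it
training = TimeSeriesDataSet.load(TIMESERIES_PATH)
train_dataloader = training.to_dataloader(train=True, batch_size=64, num_workers=0)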
    def _create_dataset(self, df, valid_p=0.2):
        df = df_utils.check_dataframe(df)
        df = self._handle_missing_data(df)
        df = df[["ds", "y"]]
        df["time_idx"] = range(df.shape[0])
        df["series"] = 0
        self.n_data = df.shape[0]
        self.set_auto_batch_epoch(self.n_data)

        training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

        training = TimeSeriesDataSet(
            df.iloc[:training_cutoff],
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"]),
            randomize_length=None,
            add_relative_time_idx=False,
            add_target_scales=False,
        )

        validation = TimeSeriesDataSet.from_dataset(
            training, df, min_prediction_idx=training_cutoff)
        train_dataloader = training.to_dataloader(train=True,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)
        val_dataloader = validation.to_dataloader(train=False,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)

        return training, train_dataloader, val_dataloader

        dict(
            target_normalizer=GroupNormalizer(
                groups=["agency", "sku"], log_scale=True, scale_by_group=True, log_zero_value=1.0
            )
        ),
        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
        dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1),
        dict(predict_mode=True),
        dict(add_target_scales=True),
        dict(add_encoder_length=True),
        dict(add_encoder_length=True),
        dict(add_relative_time_idx=True),
        dict(weight="volume"),
        dict(
            scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()),
            categorical_encoders=dict(month=NaNLabelEncoder()),
            time_varying_known_categoricals=["month"],
            time_varying_known_reals=["time_idx", "price_regular"],
        ),
        dict(dropout_categoricals=["month"], time_varying_known_categoricals=["month"]),
        dict(constant_fill_strategy=dict(volume=0.0), allow_missings=True),
    ],
)
def test_TimeSeriesDataSet(test_data, kwargs):

    defaults = dict(
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
Example #10
class TestTabularForecaster(TaskTester):

    task = TabularForecaster
    # TODO: Reduce number of required parameters
    task_kwargs = {
        "parameters": {
            "time_idx": "time_idx",
            "target": "value",
            "group_ids": ["series"],
            "weight": None,
            "max_encoder_length": 60,
            "min_encoder_length": 60,
            "min_prediction_idx": 0,
            "min_prediction_length": 20,
            "max_prediction_length": 20,
            "static_categoricals": [],
            "static_reals": [],
            "time_varying_known_categoricals": [],
            "time_varying_known_reals": [],
            "time_varying_unknown_categoricals": [],
            "time_varying_unknown_reals": ["value"],
            "variable_groups": {},
            "constant_fill_strategy": {},
            "allow_missing_timesteps": False,
            "lags": {},
            "add_relative_time_idx": False,
            "add_target_scales": False,
            "add_encoder_length": False,
            "target_normalizer": EncoderNormalizer(),
            "categorical_encoders": {
                "series": NaNLabelEncoder(),
                "__group_id__series": NaNLabelEncoder()
            },
            "scalers": {},
            "randomize_length": None,
            "predict_mode": False,
            "data_sample": {
                "series": {
                    0: 0
                },
                "time_idx": {
                    0: 0
                },
                "value": {
                    0: 0.0
                },
            },
        },
        "backbone": "n_beats",
        "backbone_kwargs": {
            "widths": [32, 512],
            "backcast_loss_ratio": 0.1
        },
    }
    cli_command = "tabular_forecasting"
    is_testing = _TABULAR_TESTING
    is_available = _TABULAR_AVAILABLE

    # TODO: Resolve JIT issues
    scriptable = False
    traceable = False

    @property
    def example_forward_input(self):
        return {
            "encoder_cat": torch.empty(2, 60, 0, dtype=torch.int64),
            "encoder_cont": torch.zeros(2, 60, 1),
            "encoder_target": torch.zeros(2, 60),
            "encoder_lengths": torch.tensor([60, 60]),
            "decoder_cat": torch.empty(2, 20, 0, dtype=torch.int64),
            "decoder_cont": torch.zeros(2, 20, 1),
            "decoder_target": torch.zeros(2, 20),
            "decoder_lengths": torch.tensor([20, 20]),
            "decoder_time_idx": torch.ones(2, 20).long(),
            "groups": torch.tensor([[0], [1]]),
            "target_scale": torch.zeros(2, 2),
        }

    def check_forward_output(self, output: Any):
        assert isinstance(output["prediction"], torch.Tensor)
        assert output["prediction"].shape == torch.Size([2, 20])
from pytorch_forecasting.data import NaNLabelEncoder  # noqa: E402
from pytorch_forecasting.data.examples import generate_ar_data  # noqa: E402

# Example based on this tutorial: https://pytorch-forecasting.readthedocs.io/en/latest/tutorials/ar.html
# 1. Create the DataModule
data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=100, seed=42)
data["date"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(data.time_idx, "D")

max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

datamodule = TabularForecastingData.from_data_frame(
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    # only unknown variable is "value" - and N-Beats can also not take any additional variables
    time_varying_unknown_reals=["value"],
    max_encoder_length=60,
    max_prediction_length=max_prediction_length,
    train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
    val_data_frame=data,
    batch_size=32,
)

# 2. Build the task
model = TabularForecaster(
    datamodule.parameters,
    backbone="n_beats",
    backbone_kwargs={
        "widths": [32, 512],
        "backcast_loss_ratio": 0.1
    },
)
Example #12
File: TFT.py  Project: NHQ/tempy
    static_categoricals=["symbol"],
    #static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    #time_varying_known_categoricals=["special_days", "month"],
    #variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=meta["knownReals"],
    #time_varying_unknown_categoricals=[],#meta["features"],
    time_varying_unknown_reals=meta["unknownReals"],
    #target_normalizer=NaNLabelEncoder(add_nan=True),
    target_normalizer=None,
    #GroupNormalizer(
    #    groups=["symbol"], transformation="softplus", center=False
    #),  # use softplus with beta=1.0 and normalize by group
    #add_relative_time_idx=True,
    #add_target_scales=True,
    add_encoder_length=True,
    categorical_encoders={"symbol": NaNLabelEncoder(add_nan=True)})
validation = TimeSeriesDataSet.from_dataset(training,
                                            data,
                                            predict=True,
                                            stop_randomization=True)
batch_size = 64
train_dataloader = training.to_dataloader(train=True,
                                          batch_size=batch_size,
                                          num_workers=2)
val_dataloader = validation.to_dataloader(train=False,
                                          batch_size=batch_size,
                                          num_workers=2)

# save datasets
#training.save("training.pkl")
#validation.save("validation.pkl")

            scale_by_group=True,
        )),
        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
        dict(randomize_length=True,
             min_encoder_length=2,
             min_prediction_length=1),
        dict(predict_mode=True),
        dict(add_target_scales=True),
        dict(add_encoder_length=True),
        dict(add_encoder_length=True),
        dict(add_relative_time_idx=True),
        dict(weight="volume"),
        dict(
            scalers=dict(time_idx=GroupNormalizer(),
                         price_regular=StandardScaler()),
            categorical_encoders=dict(month=NaNLabelEncoder()),
            time_varying_known_categoricals=["month"],
            time_varying_known_reals=["time_idx", "price_regular"],
        ),
        dict(categorical_encoders={"month": NaNLabelEncoder(add_nan=True)},
             time_varying_known_categoricals=["month"]),
        dict(constant_fill_strategy=dict(volume=0.0),
             allow_missing_timesteps=True),
        dict(target_normalizer=None),
    ],
)
def test_TimeSeriesDataSet(test_data, kwargs):

    defaults = dict(
        time_idx="time_idx",
        target="volume",
Example #14
max_encoder_length = 60
max_prediction_length = 20

df_train_nbeats = df_train.copy()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats["group"] = 0

df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={
        "group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])
    },
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)
nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training,
                                                   df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True,
                                                        batch_size=batch_size,
                                                        num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False,
                                                        batch_size=batch_size,
                                                        num_workers=0)
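These dataloaders can feed an NBeats model from pytorch_forecasting directly; a minimal training sketch (hyperparameters mirror the other examples in this listing, and a recent pytorch_lightning is assumed):

import pytorch_lightning as pl
from pytorch_forecasting import NBeats

# build NBeats from the training dataset so input/output sizes match
nbeats = NBeats.from_dataset(nbeats_training,
                             widths=[32, 512],
                             backcast_loss_ratio=0.1)
trainer = pl.Trainer(max_epochs=1, gradient_clip_val=0.01)
trainer.fit(nbeats,
            train_dataloaders=nbeats_train_dataloader,
            val_dataloaders=nbeats_val_dataloader)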
def test_NaNLabelEncoder(data, allow_nan):
    fit_data, transform_data = data
    encoder = NaNLabelEncoder(warn=False, add_nan=allow_nan)
    encoder.fit(fit_data)
    assert np.array_equal(
        encoder.inverse_transform(encoder.transform(fit_data)), fit_data
    ), "Inverse transform should reverse transform"
    if not allow_nan:
        with pytest.raises(KeyError):
            encoder.transform(transform_data)
    else:
        assert encoder.transform(transform_data)[0] == 0, "First value should be translated to 0 if nan"
        assert encoder.transform(transform_data)[-1] == 0, "Last value should be translated to 0 if nan"
        assert encoder.transform(fit_data)[0] > 0, "First value should not be 0 if not nan"
Example #16
         "fifa_u_17_world_cup",
         "football_gold_cup",
         "beer_capital",
         "music_fest",
     ]),
     time_varying_known_reals=[
         "time_idx", "price_regular", "price_actual", "discount",
         "discount_in_percent"
     ],
     time_varying_unknown_categoricals=[],
     time_varying_unknown_reals=[
         "volume", "log_volume", "industry_volume", "soda_volume",
         "avg_max_temp"
     ],
     constant_fill_strategy={"volume": 0},
     categorical_encoders={"sku": NaNLabelEncoder(add_nan=True)},
 ),
 dict(static_categoricals=["agency", "sku"]),
 dict(randomize_length=True, min_encoder_length=2),
 dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
 dict(target_normalizer=GroupNormalizer(transformation="log1p")),
 dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                        transformation="softplus",
                                        center=False)),
 dict(target="agency"),
 # test multiple targets
 dict(target=["industry_volume", "volume"]),
 dict(target=["agency", "volume"]),
 dict(target=["agency", "volume"],
      min_encoder_length=1,
      min_prediction_length=1),

    def predict(self, future_dataframe):
        """
        Predicts based on future_dataframe. Should only be called after make_future_dataframe.

        Args:
            future_dataframe: DataFrame from the make_future_dataframe function
        Returns:
            forecast dataframe
        """

        if self.fitted is False:
            log.warning("Model has not been fitted. Predictions will be random.")

        future_dataframe = future_dataframe.copy(deep=True)

        testing = TimeSeriesDataSet(
            future_dataframe,
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(future_dataframe.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_known_reals=["time_idx"],
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"], transformation="softplus", center=False),
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )

        new_raw_predictions, new_x = self.model.predict(testing, mode="raw", return_x=True)

        y_predicted = self.model.to_prediction(new_raw_predictions).detach().cpu()  # [0, : new_x["decoder_lengths"][0]]

        y_predicted = y_predicted.detach().numpy()

        def pad_with(vector, pad_width, iaxis, kwargs):
            pad_value = kwargs.get("padder", np.nan)
            vector[: pad_width[0]] = pad_value
            vector[-pad_width[1] :] = pad_value

        y_pred_padded = np.pad(y_predicted, self.prediction_length, pad_with)[
            self.prediction_length : -1, self.prediction_length : -self.prediction_length
        ]
        y_pred_padded = np.vstack([np.roll(y_pred_padded[:, i], i, axis=0) for i in range(y_pred_padded.shape[1])]).T

        result = pd.DataFrame(
            np.ones(shape=(len(future_dataframe), (2 + self.prediction_length))) * np.nan,
            columns=["ds", "y"] + [f"yhat{i}" for i in range(1, self.prediction_length + 1)],
        )
        result["ds"] = future_dataframe["ds"]

        result.loc[: len(future_dataframe) - (self.periods + 1), "y"] = (
            future_dataframe["y"].iloc[: len(future_dataframe) - (self.periods)].values
        )

        first_part = result.iloc[: self.context_length]
        second_part = result.iloc[self.context_length :]

        second_part.loc[:, [col for col in second_part.columns[2:]]] = y_pred_padded
        result = pd.concat([first_part, second_part])
        for i in range(1, self.prediction_length + 1):
            result[f"residual{i}"] = result[f"yhat{i}"] - result["y"]

        return result
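A hypothetical usage sketch for the wrapper above (the fit entry point and the surrounding class are assumptions; only make_future_dataframe and predict appear in this listing):

# m is an instance of the wrapper class this method belongs to
m.fit(df)                             # assumed training entry point
future = m.make_future_dataframe(df)  # helper referenced in the docstring above
forecast = m.predict(future)
print(forecast[["ds", "y", "yhat1"]].head())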