def test_from_dataset_equivalence(test_data):
    training = TimeSeriesDataSet(
        test_data[lambda x: x.time_idx < x.time_idx.max() - 1],
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular", "time_idx"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=3,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=0,
        randomize_length=None,
        add_encoder_length=True,
        add_relative_time_idx=True,
        add_target_scales=True,
    )
    validation1 = TimeSeriesDataSet.from_dataset(training, test_data, predict=True)
    validation2 = TimeSeriesDataSet.from_dataset(
        training,
        test_data[lambda x: x.time_idx > x.time_idx.min() + 2],
        predict=True,
    )
    # ensure validation1 and validation2 datasets are exactly the same despite different data inputs
    for v1, v2 in zip(
        iter(validation1.to_dataloader(train=False)),
        iter(validation2.to_dataloader(train=False)),
    ):
        for k in v1[0].keys():
            if isinstance(v1[0][k], (tuple, list)):
                assert len(v1[0][k]) == len(v2[0][k])
                for idx in range(len(v1[0][k])):
                    assert torch.isclose(v1[0][k][idx], v2[0][k][idx]).all()
            else:
                assert torch.isclose(v1[0][k], v2[0][k]).all()
        assert torch.isclose(v1[1][0], v2[1][0]).all()
Example 2
def test_new_group_ids(test_data, kwargs):
    """Test for new group ids in dataset"""
    train_agency = test_data["agency"].iloc[0]
    train_dataset = TimeSeriesDataSet(
        test_data[lambda x: x.agency == train_agency],
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        categorical_encoders=dict(agency=NaNLabelEncoder(add_nan=True),
                                  sku=NaNLabelEncoder(add_nan=True)),
        **kwargs,
    )

    # test sampling from training dataset
    next(iter(train_dataset.to_dataloader()))

    # create test dataset with group ids that have not been observed before
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_data)

    # check that we can iterate through dataset without error
    for _ in iter(test_dataset.to_dataloader()):
        pass
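
The `add_nan=True` encoders are what let `from_dataset` cope with group ids that were never observed during training: unseen categories are mapped to a reserved unknown index instead of raising an error. A minimal sketch of that behavior in isolation, assuming the `pytorch_forecasting.data.NaNLabelEncoder` fit/transform API; the agency names below are made up for illustration:

import pandas as pd
from pytorch_forecasting.data import NaNLabelEncoder

# fit the encoder on the categories observed during training
encoder = NaNLabelEncoder(add_nan=True)
encoder.fit(pd.Series(["Agency_01", "Agency_02"]))

# an unseen category ("Agency_99") is mapped to the reserved unknown/"nan"
# index instead of raising an error
print(encoder.transform(pd.Series(["Agency_01", "Agency_99"])))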
Example 3
    def create_dataset(self, df: pandas.DataFrame) -> Tuple[TimeSeriesDataSet, TimeSeriesDataSet]:
        data_spec = self.create_data_spec()

        preprocess_spec = dict(
            add_relative_time_idx=True,  # add as feature
            add_target_scales=True,  # add as feature
            add_encoder_length=True,  # add as feature
        )

        prediction_spec = self.create_prediction_spec()

        time_index_col = self.cfg.get("time_index")
        training_cutoff = df[time_index_col].max() - self.cfg.get("max_prediction_length")
        trainset = TimeSeriesDataSet(
            df[lambda x: x[time_index_col] <= training_cutoff],
            **data_spec,
            **preprocess_spec,
            **prediction_spec,
        )
        # create validation set (predict=True) which means to predict the
        # last max_prediction_length points in time for each series
        validset = TimeSeriesDataSet.from_dataset(
            trainset, df, predict=True, stop_randomization=True
        )
        return trainset, validset
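
`data_spec` and `prediction_spec` are plain dictionaries of `TimeSeriesDataSet` keyword arguments that get merged into the constructor call. A hypothetical sketch of what these helper methods could return for the agency/sku data used in the other examples; the column names and config keys are illustrative assumptions, not taken from the original class:

    def create_data_spec(self) -> dict:
        # hypothetical column mapping, mirroring the agency/sku examples above
        return dict(
            time_idx="time_idx",
            target="volume",
            group_ids=["agency", "sku"],
            time_varying_known_reals=["price_regular", "time_idx"],
            static_categoricals=["agency"],
        )

    def create_prediction_spec(self) -> dict:
        # hypothetical encoder/decoder window lengths read from the config
        return dict(
            max_encoder_length=self.cfg.get("max_encoder_length"),
            max_prediction_length=self.cfg.get("max_prediction_length"),
        )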
Example 4
def test_min_prediction_idx(test_dataset, test_data, min_prediction_idx):
    dataset = TimeSeriesDataSet.from_dataset(
        test_dataset,
        test_data,
        min_prediction_idx=min_prediction_idx,
        min_encoder_length=1,
        max_prediction_length=10,
    )

    # every decoder (prediction) time index must respect the configured lower bound
    for x, _ in iter(dataset.to_dataloader(num_workers=0, batch_size=1000)):
        assert x["decoder_time_idx"].min() >= min_prediction_idx
Example 5
def test_from_dataset(test_dataset, test_data):
    dataset = TimeSeriesDataSet.from_dataset(test_dataset, test_data)
    check_dataloader_output(dataset, next(iter(dataset.to_dataloader(num_workers=0))))

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    time_varying_unknown_reals=[
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], coerce_positive=1.0
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,  # add as feature
    add_target_scales=True,  # add as feature
    add_encoder_length=True,  # add as feature
)

# create validation set (predict=True) which means to predict the
# last max_prediction_length points in time for each series
validation = TimeSeriesDataSet.from_dataset(
    training, data, predict=True, stop_randomization=True
)

# create dataloaders for model
batch_size = 128
train_dataloader = training.to_dataloader(
    train=True, batch_size=batch_size, num_workers=0
)
val_dataloader = validation.to_dataloader(
    train=False, batch_size=batch_size * 10, num_workers=0
)

#%%
"""
Training the Temporal Fusion Transformer with PyTorch Lightning
"""