Example 1
def test_lagged_variables(test_data, kwargs):
    dataset = TimeSeriesDataSet(
        test_data.copy(),
        time_idx="time_idx",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=3,  # one more than max lag for validation
        time_varying_unknown_reals=["volume"],
        time_varying_unknown_categoricals=["agency"],
        lags={
            "volume": [1, 2],
            "agency": [1, 2]
        },
        add_encoder_length=False,
        **kwargs,
    )

    x_all, _ = next(iter(dataset.to_dataloader()))

    for name in ["volume", "agency"]:
        if name in dataset.reals:
            variables = dataset.reals
            x = x_all["encoder_cont"]
        else:
            variables = dataset.flat_categoricals
            x = x_all["encoder_cat"]
        target_idx = variables.index(name)
        for lag in [1, 2]:
            lag_idx = variables.index(f"{name}_lagged_by_{lag}")
            target = x[..., target_idx][:, 0]
            lagged_target = torch.roll(x[..., lag_idx], -lag, dims=1)[:, 0]
            assert torch.isclose(target, lagged_target).all(), "lagged target must be the same as non-lagged target"
Example 2
def test_from_dataset_equivalence(test_data):
    training = TimeSeriesDataSet(
        test_data[lambda x: x.time_idx < x.time_idx.max() - 1],
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular", "time_idx"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=3,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=0,
        randomize_length=None,
        add_encoder_length=True,
        add_relative_time_idx=True,
        add_target_scales=True,
    )
    validation1 = TimeSeriesDataSet.from_dataset(training, test_data, predict=True)
    validation2 = TimeSeriesDataSet.from_dataset(
        training,
        test_data[lambda x: x.time_idx > x.time_idx.min() + 2],
        predict=True,
    )
    # ensure validation1 and validation2 datasets are exactly the same despite different data inputs
    for v1, v2 in zip(iter(validation1.to_dataloader(train=False)), iter(validation2.to_dataloader(train=False))):
        for k in v1[0].keys():
            if isinstance(v1[0][k], (tuple, list)):
                assert len(v1[0][k]) == len(v2[0][k])
                for idx in range(len(v1[0][k])):
                    assert torch.isclose(v1[0][k][idx], v2[0][k][idx]).all()
            else:
                assert torch.isclose(v1[0][k], v2[0][k]).all()
        assert torch.isclose(v1[1][0], v2[1][0]).all()
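
Why the two validation sets match, in a minimal sketch (assuming the documented behavior of predict=True): from_dataset then keeps exactly one sample per series, ending at the last observed time step, so dropping rows that lie before the required encoder window leaves the drawn batches unchanged.

validation = TimeSeriesDataSet.from_dataset(training, test_data, predict=True)
print(len(validation))  # one sample per (agency, sku) series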
Example 3
def test_new_group_ids(test_data, kwargs):
    """Test for new group ids in dataset"""
    train_agency = test_data["agency"].iloc[0]
    train_dataset = TimeSeriesDataSet(
        test_data[lambda x: x.agency == train_agency],
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        categorical_encoders=dict(agency=NaNLabelEncoder(add_nan=True),
                                  sku=NaNLabelEncoder(add_nan=True)),
        **kwargs,
    )

    # test sampling from training dataset
    next(iter(train_dataset.to_dataloader()))

    # create test dataset with group ids that have not been observed before
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_data)

    # check that we can iterate through dataset without error
    for _ in iter(test_dataset.to_dataloader()):
        pass
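
A standalone sketch of the encoder behavior this test relies on, assuming NaNLabelEncoder from pytorch_forecasting.data: with add_nan=True, categories never seen during fitting are mapped to the reserved NaN class instead of raising an error.

import pandas as pd
from pytorch_forecasting.data import NaNLabelEncoder

encoder = NaNLabelEncoder(add_nan=True)
encoder.fit(pd.Series(["Agency_01", "Agency_02"]))
# "Agency_99" was not seen during fitting; it is encoded as the NaN class
print(encoder.transform(pd.Series(["Agency_01", "Agency_99"])))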
Example 4
    def create_dataset(self, df: pandas.DataFrame) -> Tuple[TimeSeriesDataSet, TimeSeriesDataSet]:
        data_spec = self.create_data_spec()

        preprocess_spec = dict(
            add_relative_time_idx=True,  # add as feature
            add_target_scales=True,  # add as feature
            add_encoder_length=True,  # add as feature
        )

        prediction_spec = self.create_prediction_spec()

        time_index_col = self.cfg.get("time_index")
        training_cutoff = df[time_index_col].max() - self.cfg.get("max_prediction_length")
        trainset = TimeSeriesDataSet(
            df[lambda x: x[time_index_col] <= training_cutoff],
            **data_spec,
            **preprocess_spec,
            **prediction_spec,
        )
        # create validation set (predict=True) which means to predict the
        # last max_prediction_length points in time for each series
        validset = TimeSeriesDataSet.from_dataset(
            trainset, df, predict=True, stop_randomization=True
        )
        return trainset, validset
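
A hypothetical usage of create_dataset (the instance name `pipeline` and the batch size are assumptions):

trainset, validset = pipeline.create_dataset(df)
train_loader = trainset.to_dataloader(train=True, batch_size=64, num_workers=0)
val_loader = validset.to_dataloader(train=False, batch_size=64, num_workers=0)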
Example 5
def test_categorical_target(test_data):
    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="agency",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
    )
    _, y = next(iter(dataset.to_dataloader()))
    assert y[0].dtype is torch.long, "target must be of type long"
Example 6
def test_encoder_normalizer_for_covariates(test_data):
    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        time_varying_known_reals=["price_regular"],
        scalers={"price_regular": EncoderNormalizer()},
    )
    next(iter(dataset.to_dataloader()))
Example 7
def test_multitarget(test_data, kwargs):
    dataset = TimeSeriesDataSet(
        test_data.assign(volume1=lambda x: x.volume),
        time_idx="time_idx",
        target=["volume", "volume1"],
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        time_varying_known_reals=["price_regular"],
        scalers={"price_regular": EncoderNormalizer()},
        **kwargs,
    )
    next(iter(dataset.to_dataloader()))
Example 8
def test_min_prediction_idx(test_dataset, test_data, min_prediction_idx):
    dataset = TimeSeriesDataSet.from_dataset(
        test_dataset,
        test_data,
        min_prediction_idx=min_prediction_idx,
        min_encoder_length=1,
        max_prediction_length=10)

    for x, _ in iter(dataset.to_dataloader(num_workers=0, batch_size=1000)):
        assert x["decoder_time_idx"].min() >= min_prediction_idx
Example 9
def test_TimeSeriesDataSet(test_data, kwargs):

    defaults = dict(
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
    )
    defaults.update(kwargs)
    kwargs = defaults

    if kwargs.get("allow_missings", False):
        np.random.seed(2)
        test_data = test_data.sample(frac=0.5)

    # create dataset and sample from it
    dataset = TimeSeriesDataSet(test_data, **kwargs)
    check_dataloader_output(dataset, next(iter(dataset.to_dataloader(num_workers=0))))
Example 10
def test_timeseries_columns_naming(test_data):
    with pytest.raises(ValueError):
        TimeSeriesDataSet(
            test_data.rename(columns=dict(agency="agency.2")),
            time_idx="time_idx",
            target="volume",
            group_ids=["agency.2", "sku"],
            max_encoder_length=5,
            max_prediction_length=2,
            min_prediction_length=1,
            min_encoder_length=1,
        )
Example 11
def test_raise_short_encoder_length(test_data):
    with pytest.warns(UserWarning):
        test_data = test_data[lambda x: ~((x.agency == "Agency_22") & (x.sku == "SKU_01") & (x.time_idx > 3))]
        TimeSeriesDataSet(
            test_data,
            time_idx="time_idx",
            target="volume",
            group_ids=["agency", "sku"],
            max_encoder_length=5,
            max_prediction_length=2,
            min_prediction_length=1,
            min_encoder_length=5,
        )
Example 12
def test_check_nas(test_data):
    data = test_data.copy()
    data.loc[0, "volume"] = np.nan
    with pytest.raises(ValueError, match=r"1 \(.*infinite"):
        TimeSeriesDataSet(
            data,
            time_idx="time_idx",
            target=["volume"],
            group_ids=["agency", "sku"],
            max_encoder_length=5,
            max_prediction_length=2,
            min_prediction_length=1,
            min_encoder_length=1,
        )
Example 13
    def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs) -> LightningModule:
        """
        Create model from dataset, i.e. save dataset parameters in model

        This function should be called as ``super().from_dataset()`` in derived models that implement it

        Args:
            dataset (TimeSeriesDataSet): timeseries dataset

        Returns:
            BaseModel: Model that can be trained
        """
        if "output_transformer" not in kwargs:
            kwargs["output_transformer"] = dataset.target_normalizer
        net = cls(**kwargs)
        net.dataset_parameters = dataset.get_parameters()
        return net
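
A sketch of how a derived model would typically extend this hook (MyModel and its output_size argument are illustrative assumptions, not part of the library):

class MyModel(BaseModel):
    @classmethod
    def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs):
        # derive hyperparameters from the dataset before delegating upwards
        kwargs.setdefault("output_size", 7)
        return super().from_dataset(dataset, **kwargs)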
Example 14
    def predict_dependency(
        self,
        data: Union[DataLoader, pd.DataFrame, TimeSeriesDataSet],
        variable: str,
        values: Iterable,
        mode: str = "dataframe",
        target="decoder",
        show_progress_bar: bool = False,
        **kwargs,
    ) -> Union[np.ndarray, torch.Tensor, pd.Series, pd.DataFrame]:
        """
        Predict partial dependency.


        Args:
            data (Union[DataLoader, pd.DataFrame, TimeSeriesDataSet]): data
            variable (str): variable to modify
            values (Iterable): array of values to probe
            mode (str, optional): Output mode. Defaults to "dataframe". Either

                * "series": values are average prediction and index are probed values
                * "dataframe": columns are as obtained by the `dataset.x_to_index()` method,
                    prediction (which is the mean prediction over the time horizon),
                    normalized_prediction (which are predictions devided by the prediction for the first probed value)
                    the variable name for the probed values
                * "raw": outputs a tensor of shape len(values) x prediction_shape

            target: Defines which values are overwritten for making a prediction.
                Same as in :py:meth:`~pytorch_forecasting.data.timeseries.TimeSeriesDataSet.set_overwrite_values`.
                Defaults to "decoder".
            show_progress_bar: whether to show a progress bar. Defaults to False.
            **kwargs: additional kwargs to :py:meth:`~predict` method

        Returns:
            Union[np.ndarray, torch.Tensor, pd.Series, pd.DataFrame]: output
        """
        values = np.asarray(values)
        if isinstance(data, pd.DataFrame):  # convert dataframe to dataset
            data = TimeSeriesDataSet.from_parameters(self.dataset_parameters,
                                                     data,
                                                     predict=True)
        elif isinstance(data, DataLoader):
            data = data.dataset

        results = []
        progress_bar = tqdm(desc="Predict",
                            unit=" batches",
                            total=len(values),
                            disable=not show_progress_bar)
        for idx, value in enumerate(values):
            # set values
            data.set_overwrite_values(variable=variable,
                                      values=value,
                                      target=target)
            # predict
            kwargs.setdefault("mode", "prediction")

            if idx == 0 and mode == "dataframe":  # need index for returning as dataframe
                res, index = self.predict(data, return_index=True, **kwargs)
                results.append(res)
            else:
                results.append(self.predict(data, **kwargs))
            # increment progress
            progress_bar.update()

        data.reset_overwrite_values(
        )  # reset overwrite values to avoid side-effect

        # results to one tensor
        results = torch.stack(results, dim=0)

        # convert results to requested output format
        if mode == "series":
            results = results[:, ~torch.isnan(results[0])].mean(
                1)  # average samples and prediction horizon
            results = pd.Series(results, index=values)

        elif mode == "dataframe":
            # take mean over time
            is_nan = torch.isnan(results)
            results[is_nan] = 0
            results = results.sum(-1) / (~is_nan).float().sum(-1)

            # create dataframe
            dependencies = (index.iloc[np.tile(np.arange(
                len(index)), len(values))].reset_index(drop=True).assign(
                    prediction=results.flatten()))
            dependencies[variable] = values.repeat(len(data))
            first_prediction = dependencies.groupby(
                data.group_ids, observed=True).prediction.transform("first")
            dependencies["normalized_prediction"] = dependencies[
                "prediction"] / first_prediction
            dependencies["id"] = dependencies.groupby(data.group_ids,
                                                      observed=True).ngroup()
            results = dependencies

        elif mode == "raw":
            pass

        else:
            raise ValueError(
                f"mode {mode} is unknown - see documentation for available modes"
            )

        return results
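
A hedged usage sketch, assuming a trained model `model` and a fitted dataset `validation`: probe how predictions depend on price_regular. The "dataframe" mode yields the prediction and normalized_prediction columns described above.

dependency = model.predict_dependency(
    validation.to_dataloader(train=False, batch_size=64),
    variable="price_regular",
    values=np.linspace(0.0, 2.0, num=10),
    mode="dataframe",
)
print(dependency.groupby("price_regular").normalized_prediction.mean())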
Example 15
    def predict(
        self,
        data: Union[DataLoader, pd.DataFrame, TimeSeriesDataSet],
        mode: Union[str, Tuple[str, str]] = "prediction",
        return_index: bool = False,
        return_decoder_lengths: bool = False,
        batch_size: int = 64,
        num_workers: int = 0,
        fast_dev_run: bool = False,
        show_progress_bar: bool = False,
        return_x: bool = False,
    ):
        """
        predict dataloader

        Args:
            dataloader: dataloader, dataframe or dataset
            mode: one of "prediction", "quantiles" or "raw", or tuple ``("raw", output_name)`` where output_name is
                a name in the dictionary returned by ``forward()``
            return_index: if to return the prediction index
            return_decoder_lengths: if to return decoder_lengths
            batch_size: batch size for dataloader - only used if data is not a dataloader is passed
            num_workers: number of workers for dataloader - only used if data is not a dataloader is passed
            fast_dev_run: if to only return results of first batch
            show_progress_bar: if to show progress bar. Defaults to False.
            return_x: if to return network inputs

        Returns:
            output, x, index, decoder_lengths: some elements might not be present depending on what is configured
                to be returned
        """
        # convert to dataloader
        if isinstance(data, pd.DataFrame):
            data = TimeSeriesDataSet.from_parameters(self.dataset_parameters,
                                                     data,
                                                     predict=True)
        if isinstance(data, TimeSeriesDataSet):
            dataloader = data.to_dataloader(batch_size=batch_size,
                                            train=False,
                                            num_workers=num_workers)
        else:
            dataloader = data

        # ensure passed dataloader is correct
        assert isinstance(
            dataloader.dataset, TimeSeriesDataSet
        ), "dataset behind dataloader mut be TimeSeriesDataSet"

        # prepare model
        self.eval()  # no dropout, etc. no gradients

        # run predictions
        output = []
        decoder_lengths = []
        x_list = []
        index = []
        progress_bar = tqdm(desc="Predict",
                            unit=" batches",
                            total=len(dataloader),
                            disable=not show_progress_bar)
        with torch.no_grad():
            for x, _ in dataloader:
                # move data to appropriate device
                for name in x.keys():
                    if x[name].device != self.device:
                        # .to() is not in-place for tensors - reassign
                        x[name] = x[name].to(self.device)

                # make prediction
                out = self(x)  # raw output is dictionary
                out["prediction"] = self.transform_output(out)

                lengths = x["decoder_lengths"]
                if return_decoder_lengths:
                    decoder_lengths.append(lengths)
                nan_mask = self._get_mask(out["prediction"].size(1), lengths)
                if isinstance(mode, (tuple, list)):
                    if mode[0] == "raw":
                        out = out[mode[1]]
                    else:
                        raise ValueError(
                            f"If a tuple is specified, the first element must be 'raw' - got {mode[0]} instead"
                        )
                elif mode == "prediction":
                    out = self.loss.to_prediction(out["prediction"])
                    # mask non-predictions
                    out = out.masked_fill(nan_mask, torch.tensor(float("nan")))
                elif mode == "quantiles":
                    out = self.loss.to_quantiles(out["prediction"])
                    # mask non-predictions
                    out = out.masked_fill(nan_mask.unsqueeze(-1),
                                          torch.tensor(float("nan")))
                elif mode == "raw":
                    pass
                else:
                    raise ValueError(
                        f"Unknown mode {mode} - see docs for valid arguments")

                output.append(out)
                if return_x:
                    x_list.append(x)
                if return_index:
                    index.append(dataloader.dataset.x_to_index(x))
                progress_bar.update()
                if fast_dev_run:
                    break

        # concatenate
        if isinstance(mode, (tuple, list)) or mode != "raw":
            output = torch.cat(output, dim=0)
        elif mode == "raw":
            output_cat = {}
            for name in output[0].keys():
                output_cat[name] = torch.cat([out[name] for out in output],
                                             dim=0)
            output = output_cat

        # generate output
        if return_x or return_index or return_decoder_lengths:
            output = [output]
        if return_x:
            x_cat = {}
            for name in x_list[0].keys():
                x_cat[name] = torch.cat([x[name] for x in x_list], dim=0)
            output.append(x_cat)
        if return_index:
            output.append(pd.concat(index, axis=0, ignore_index=True))
        if return_decoder_lengths:
            output.append(torch.cat(decoder_lengths, dim=0))
        return output
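
A hedged usage sketch, assuming a trained model and a validation dataloader; elements after the first are only returned when the corresponding return_* flag is set, in the order output, x, index, decoder_lengths.

predictions, x, index = model.predict(
    val_dataloader, mode="prediction", return_x=True, return_index=True
)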
Example 16
training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    min_encoder_length=0,  # allow predictions without history
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    time_varying_known_categoricals=["special_days", "month"],
    # group of categorical variables can be treated as
    # one variable --> special days' list
    variable_groups={"special_days": special_days},
    time_varying_known_reals=[
        "time_idx", "price_regular", "discount_in_percent"
    ],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], coerce_positive=1.0
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,  # add as feature
    add_target_scales=True,  # add as feature
    add_encoder_length=True,  # add as feature
)
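
A minimal follow-up sketch in the same tutorial pattern (the full `data` frame and the batch size are assumptions): build the matching validation set and the dataloaders.

validation = TimeSeriesDataSet.from_dataset(
    training, data, predict=True, stop_randomization=True
)
train_dataloader = training.to_dataloader(train=True, batch_size=64, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=64, num_workers=0)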
Example 17
def test_from_dataset(test_dataset, test_data):
    dataset = TimeSeriesDataSet.from_dataset(test_dataset, test_data)
    check_dataloader_output(dataset, next(iter(dataset.to_dataloader(num_workers=0))))
Example 18
    max_prediction_length = 24
    max_encoder_length = 168
    val_size = 9792

    training_cutoff = data["time_idx"].max() - val_size
    training_data = data[lambda x: x.time_idx <= training_cutoff]

    training = TimeSeriesDataSet(
        training_data,
        time_idx="time_idx",
        target="arrivals",
        group_ids=["series"],
        min_encoder_length=max_encoder_length,
        max_encoder_length=max_encoder_length,
        min_prediction_length=max_prediction_length,
        max_prediction_length=max_prediction_length,
        static_categoricals=["series"],
        time_varying_known_categoricals=time_varying_known_categoricals,
        time_varying_unknown_categoricals=time_varying_unknown_categoricals,
        time_varying_unknown_reals=time_varying_unknown_reals,
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )

    validation = TimeSeriesDataSet.from_dataset(training,
                                                data,
                                                predict=False,
                                                stop_randomization=True)

    # create dataloaders for model
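    # the original snippet is truncated here; a plausible continuation
    # (the batch size is an assumption):
    batch_size = 128
    train_dataloader = training.to_dataloader(
        train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(
        train=False, batch_size=batch_size, num_workers=0)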