def test_lagged_variables(test_data, kwargs):
    dataset = TimeSeriesDataSet(
        test_data.copy(),
        time_idx="time_idx",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=3,  # one more than max lag for validation
        time_varying_unknown_reals=["volume"],
        time_varying_unknown_categoricals=["agency"],
        lags={"volume": [1, 2], "agency": [1, 2]},
        add_encoder_length=False,
        **kwargs,
    )
    x_all, _ = next(iter(dataset.to_dataloader()))

    for name in ["volume", "agency"]:
        if name in dataset.reals:
            vars = dataset.reals
            x = x_all["encoder_cont"]
        else:
            vars = dataset.flat_categoricals
            x = x_all["encoder_cat"]
        target_idx = vars.index(name)
        for lag in [1, 2]:
            lag_idx = vars.index(f"{name}_lagged_by_{lag}")
            target = x[..., target_idx][:, 0]
            lagged_target = torch.roll(x[..., lag_idx], -lag, dims=1)[:, 0]
            assert torch.isclose(target, lagged_target).all(), "lagged target must be the same as non-lagged target"
def test_from_dataset_equivalence(test_data):
    training = TimeSeriesDataSet(
        test_data[lambda x: x.time_idx < x.time_idx.max() - 1],
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular", "time_idx"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=3,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=0,
        randomize_length=None,
        add_encoder_length=True,
        add_relative_time_idx=True,
        add_target_scales=True,
    )
    validation1 = TimeSeriesDataSet.from_dataset(training, test_data, predict=True)
    validation2 = TimeSeriesDataSet.from_dataset(
        training,
        test_data[lambda x: x.time_idx > x.time_idx.min() + 2],
        predict=True,
    )
    # ensure validation1 and validation2 datasets are exactly the same despite different data inputs
    for v1, v2 in zip(
        iter(validation1.to_dataloader(train=False)),
        iter(validation2.to_dataloader(train=False)),
    ):
        for k in v1[0].keys():
            if isinstance(v1[0][k], (tuple, list)):
                assert len(v1[0][k]) == len(v2[0][k])
                for idx in range(len(v1[0][k])):
                    assert torch.isclose(v1[0][k][idx], v2[0][k][idx]).all()
            else:
                assert torch.isclose(v1[0][k], v2[0][k]).all()
        assert torch.isclose(v1[1][0], v2[1][0]).all()
def test_new_group_ids(test_data, kwargs):
    """Test for new group ids in dataset"""
    train_agency = test_data["agency"].iloc[0]
    train_dataset = TimeSeriesDataSet(
        test_data[lambda x: x.agency == train_agency],
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        categorical_encoders=dict(agency=NaNLabelEncoder(add_nan=True), sku=NaNLabelEncoder(add_nan=True)),
        **kwargs,
    )
    # test sampling from training dataset
    next(iter(train_dataset.to_dataloader()))

    # create test dataset with group ids that have not been observed before
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_data)

    # check that we can iterate through dataset without error
    for _ in iter(test_dataset.to_dataloader()):
        pass
def create_dataset(self, df: pandas.DataFrame) -> Tuple[TimeSeriesDataSet, TimeSeriesDataSet]:
    data_spec = self.create_data_spec()
    preprocess_spec = dict(
        add_relative_time_idx=True,  # add as feature
        add_target_scales=True,  # add as feature
        add_encoder_length=True,  # add as feature
    )
    prediction_spec = self.create_prediction_spec()

    time_index_col = self.cfg.get("time_index")
    training_cutoff = df[time_index_col].max() - self.cfg.get("max_prediction_length")
    trainset = TimeSeriesDataSet(
        df[lambda x: x.time_idx <= training_cutoff],
        **data_spec,
        **preprocess_spec,
        **prediction_spec,
    )
    # create validation set (predict=True), i.e. predict the last
    # max_prediction_length points in time for each series
    validset = TimeSeriesDataSet.from_dataset(trainset, df, predict=True, stop_randomization=True)
    return trainset, validset
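# For illustration only: a minimal sketch of what the spec helpers used above
# might return. The exact keys depend on the surrounding class and its config;
# the column names ("volume", "agency", "sku") and length settings below are
# assumptions, not part of the original code.
def create_data_spec(self) -> dict:
    return dict(
        time_idx=self.cfg.get("time_index"),  # hypothetical config key
        target="volume",
        group_ids=["agency", "sku"],
    )

def create_prediction_spec(self) -> dict:
    return dict(
        max_encoder_length=self.cfg.get("max_encoder_length"),
        max_prediction_length=self.cfg.get("max_prediction_length"),
        min_prediction_length=1,
        min_encoder_length=1,
    )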
def test_categorical_target(test_data):
    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="agency",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
    )
    _, y = next(iter(dataset.to_dataloader()))
    assert y[0].dtype is torch.long, "target must be of type long"
def test_encoder_normalizer_for_covariates(test_data):
    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        time_varying_known_reals=["price_regular"],
        scalers={"price_regular": EncoderNormalizer()},
    )
    next(iter(dataset.to_dataloader()))
def test_multitarget(test_data, kwargs):
    dataset = TimeSeriesDataSet(
        test_data.assign(volume1=lambda x: x.volume),
        time_idx="time_idx",
        target=["volume", "volume1"],
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        time_varying_known_reals=["price_regular"],
        scalers={"price_regular": EncoderNormalizer()},
        **kwargs,
    )
    next(iter(dataset.to_dataloader()))
def test_min_prediction_idx(test_dataset, test_data, min_prediction_idx):
    dataset = TimeSeriesDataSet.from_dataset(
        test_dataset,
        test_data,
        min_prediction_idx=min_prediction_idx,
        min_encoder_length=1,
        max_prediction_length=10,
    )
    for x, _ in iter(dataset.to_dataloader(num_workers=0, batch_size=1000)):
        assert x["decoder_time_idx"].min() >= min_prediction_idx
def test_TimeSeriesDataSet(test_data, kwargs):
    defaults = dict(
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
    )
    defaults.update(kwargs)
    kwargs = defaults

    if kwargs.get("allow_missings", False):
        np.random.seed(2)
        test_data = test_data.sample(frac=0.5)

    # create dataset and sample from it
    dataset = TimeSeriesDataSet(test_data, **kwargs)
    check_dataloader_output(dataset, next(iter(dataset.to_dataloader(num_workers=0))))
def test_timeseries_columns_naming(test_data):
    with pytest.raises(ValueError):
        TimeSeriesDataSet(
            test_data.rename(columns=dict(agency="agency.2")),
            time_idx="time_idx",
            target="volume",
            group_ids=["agency.2", "sku"],
            max_encoder_length=5,
            max_prediction_length=2,
            min_prediction_length=1,
            min_encoder_length=1,
        )
def test_raise_short_encoder_length(test_data):
    with pytest.warns(UserWarning):
        test_data = test_data[lambda x: ~((x.agency == "Agency_22") & (x.sku == "SKU_01") & (x.time_idx > 3))]
        TimeSeriesDataSet(
            test_data,
            time_idx="time_idx",
            target="volume",
            group_ids=["agency", "sku"],
            max_encoder_length=5,
            max_prediction_length=2,
            min_prediction_length=1,
            min_encoder_length=5,
        )
def test_check_nas(test_data):
    data = test_data.copy()
    data.loc[0, "volume"] = np.nan
    with pytest.raises(ValueError, match=r"1 \(.*infinite"):
        TimeSeriesDataSet(
            data,
            time_idx="time_idx",
            target=["volume"],
            group_ids=["agency", "sku"],
            max_encoder_length=5,
            max_prediction_length=2,
            min_prediction_length=1,
            min_encoder_length=1,
        )
def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs) -> LightningModule:
    """
    Create model from dataset, i.e. save dataset parameters in model.

    This function should be called as ``super().from_dataset()`` in derived models that implement it.

    Args:
        dataset (TimeSeriesDataSet): timeseries dataset

    Returns:
        BaseModel: Model that can be trained
    """
    if "output_transformer" not in kwargs:
        kwargs["output_transformer"] = dataset.target_normalizer
    net = cls(**kwargs)
    net.dataset_parameters = dataset.get_parameters()
    return net
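# Usage sketch (an assumption, not from the original source): a derived model
# would typically infer its own hyperparameters from the dataset and then
# delegate to this base implementation. "MyModel" and "input_size" are
# hypothetical names for illustration.
class MyModel(BaseModel):
    @classmethod
    def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs):
        # e.g. derive network input size from the dataset before construction
        kwargs.setdefault("input_size", len(dataset.reals))
        return super().from_dataset(dataset, **kwargs)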
def predict_dependency(
    self,
    data: Union[DataLoader, pd.DataFrame, TimeSeriesDataSet],
    variable: str,
    values: Iterable,
    mode: str = "dataframe",
    target="decoder",
    show_progress_bar: bool = False,
    **kwargs,
) -> Union[np.ndarray, torch.Tensor, pd.Series, pd.DataFrame]:
    """
    Predict partial dependency.

    Args:
        data (Union[DataLoader, pd.DataFrame, TimeSeriesDataSet]): data
        variable (str): variable which to modify
        values (Iterable): array of values to probe
        mode (str, optional): Output mode. Defaults to "dataframe". Either

            * "series": values are average prediction and index are probed values
            * "dataframe": columns are as obtained by the ``dataset.x_to_index()`` method,
              prediction (which is the mean prediction over the time horizon),
              normalized_prediction (which are predictions divided by the prediction for the first
              probed value) and the variable name for the probed values
            * "raw": outputs a tensor of shape len(values) x prediction_shape

        target: Defines which values are overwritten for making a prediction.
            Same as in :py:meth:`~pytorch_forecasting.data.timeseries.TimeSeriesDataSet.set_overwrite_values`.
            Defaults to "decoder".
        show_progress_bar: if to show progress bar. Defaults to False.
        **kwargs: additional kwargs to :py:meth:`~predict` method

    Returns:
        Union[np.ndarray, torch.Tensor, pd.Series, pd.DataFrame]: output
    """
    values = np.asarray(values)
    if isinstance(data, pd.DataFrame):  # convert dataframe to dataset
        data = TimeSeriesDataSet.from_parameters(self.dataset_parameters, data, predict=True)
    elif isinstance(data, DataLoader):
        data = data.dataset

    results = []
    progress_bar = tqdm(desc="Predict", unit=" batches", total=len(values), disable=not show_progress_bar)
    for idx, value in enumerate(values):
        # set values
        data.set_overwrite_values(variable=variable, values=value, target=target)
        # predict
        kwargs.setdefault("mode", "prediction")
        if idx == 0 and mode == "dataframe":  # need index for returning as dataframe
            res, index = self.predict(data, return_index=True, **kwargs)
            results.append(res)
        else:
            results.append(self.predict(data, **kwargs))
        # increment progress
        progress_bar.update()

    data.reset_overwrite_values()  # reset overwrite values to avoid side effects

    # stack results into one tensor
    results = torch.stack(results, dim=0)

    # convert results to requested output format
    if mode == "series":
        results = results[:, ~torch.isnan(results[0])].mean(1)  # average samples and prediction horizon
        results = pd.Series(results, index=values)

    elif mode == "dataframe":
        # take mean over time
        is_nan = torch.isnan(results)
        results[is_nan] = 0
        results = results.sum(-1) / (~is_nan).float().sum(-1)

        # create dataframe
        dependencies = (
            index.iloc[np.tile(np.arange(len(index)), len(values))]
            .reset_index(drop=True)
            .assign(prediction=results.flatten())
        )
        dependencies[variable] = values.repeat(len(data))
        first_prediction = dependencies.groupby(data.group_ids, observed=True).prediction.transform("first")
        dependencies["normalized_prediction"] = dependencies["prediction"] / first_prediction
        dependencies["id"] = dependencies.groupby(data.group_ids, observed=True).ngroup()
        results = dependencies

    elif mode == "raw":
        pass

    else:
        raise ValueError(f"mode {mode} is unknown - see documentation for available modes")

    return results
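# Usage sketch (an assumption, not from the original source): probe how
# predictions react to a covariate. "model", "validation" and the probed
# value range are placeholders for a trained model and its validation dataset.
dependency = model.predict_dependency(
    validation,
    variable="price_regular",
    values=np.linspace(0.5, 1.5, num=10),
    mode="dataframe",
)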
def predict(
    self,
    data: Union[DataLoader, pd.DataFrame, TimeSeriesDataSet],
    mode: Union[str, Tuple[str, str]] = "prediction",
    return_index: bool = False,
    return_decoder_lengths: bool = False,
    batch_size: int = 64,
    num_workers: int = 0,
    fast_dev_run: bool = False,
    show_progress_bar: bool = False,
    return_x: bool = False,
):
    """
    Run predictions for a dataloader, dataframe or dataset.

    Args:
        data: dataloader, dataframe or dataset
        mode: one of "prediction", "quantiles" or "raw", or tuple ``("raw", output_name)`` where
            output_name is a name in the dictionary returned by ``forward()``
        return_index: if to return the prediction index
        return_decoder_lengths: if to return decoder_lengths
        batch_size: batch size for dataloader - only used if data is not a dataloader
        num_workers: number of workers for dataloader - only used if data is not a dataloader
        fast_dev_run: if to only return results of first batch
        show_progress_bar: if to show progress bar. Defaults to False.
        return_x: if to return network inputs

    Returns:
        output, x, index, decoder_lengths: some elements might not be present depending on what is
            configured to be returned
    """
    # convert to dataloader
    if isinstance(data, pd.DataFrame):
        data = TimeSeriesDataSet.from_parameters(self.dataset_parameters, data, predict=True)
    if isinstance(data, TimeSeriesDataSet):
        dataloader = data.to_dataloader(batch_size=batch_size, train=False, num_workers=num_workers)
    else:
        dataloader = data

    # ensure passed dataloader is correct
    assert isinstance(dataloader.dataset, TimeSeriesDataSet), "dataset behind dataloader must be TimeSeriesDataSet"

    # prepare model
    self.eval()  # no dropout, etc. no gradients

    # run predictions
    output = []
    decoder_lengths = []
    x_list = []
    index = []
    progress_bar = tqdm(desc="Predict", unit=" batches", total=len(dataloader), disable=not show_progress_bar)
    with torch.no_grad():
        for x, _ in dataloader:
            # move data to appropriate device (.to() is not in-place, so reassign)
            for name in x.keys():
                if x[name].device != self.device:
                    x[name] = x[name].to(self.device)

            # make prediction
            out = self(x)  # raw output is dictionary
            out["prediction"] = self.transform_output(out)

            lengths = x["decoder_lengths"]
            if return_decoder_lengths:
                decoder_lengths.append(lengths)
            nan_mask = self._get_mask(out["prediction"].size(1), lengths)
            if isinstance(mode, (tuple, list)):
                if mode[0] == "raw":
                    out = out[mode[1]]
                else:
                    raise ValueError(f"If a tuple is specified, the first element must be 'raw' - got {mode[0]} instead")
            elif mode == "prediction":
                out = self.loss.to_prediction(out["prediction"])
                # mask non-predictions
                out = out.masked_fill(nan_mask, torch.tensor(float("nan")))
            elif mode == "quantiles":
                out = self.loss.to_quantiles(out["prediction"])
                # mask non-predictions
                out = out.masked_fill(nan_mask.unsqueeze(-1), torch.tensor(float("nan")))
            elif mode == "raw":
                pass
            else:
                raise ValueError(f"Unknown mode {mode} - see docs for valid arguments")

            output.append(out)
            if return_x:
                x_list.append(x)
            if return_index:
                index.append(dataloader.dataset.x_to_index(x))
            progress_bar.update()
            if fast_dev_run:
                break

    # concatenate over batches
    if isinstance(mode, (tuple, list)) or mode != "raw":
        output = torch.cat(output, dim=0)
    elif mode == "raw":
        output_cat = {}
        for name in output[0].keys():
            output_cat[name] = torch.cat([out[name] for out in output], dim=0)
        output = output_cat

    # generate output
    if return_x or return_index or return_decoder_lengths:
        output = [output]
    if return_x:
        x_cat = {}
        for name in x_list[0].keys():
            x_cat[name] = torch.cat([x[name] for x in x_list], dim=0)
        output.append(x_cat)
    if return_index:
        output.append(pd.concat(index, axis=0, ignore_index=True))
    if return_decoder_lengths:
        output.append(torch.cat(decoder_lengths, dim=0))
    return output
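# Usage sketch (an assumption, not from the original source): "model" and
# "validation" stand for a trained model and its validation dataset. With
# return_x and return_index enabled, the outputs come back in that order.
predictions, x, index = model.predict(
    validation,
    mode="prediction",
    return_x=True,
    return_index=True,
)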
training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    min_encoder_length=0,  # allow predictions without history
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    time_varying_known_categoricals=["special_days", "month"],
    # a group of categorical variables can be treated as one variable --> "special days" list
    variable_groups={"special_days": special_days},
    time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], coerce_positive=1.0
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,  # add as feature
    add_target_scales=True,  # add as feature
    add_encoder_length=True,  # add as feature
)
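# For reference (an assumption, not shown in the snippet above): with the
# Stallion demand data, "special_days" is a list of binary indicator columns
# that variable_groups merges into a single categorical, e.g.:
special_days = [
    "easter_day",
    "good_friday",
    "new_year",
    "christmas",
    "labor_day",
    "independence_day",
    "revolution_day_memorial",
    "regional_games",
    "fifa_u_17_world_cup",
    "football_gold_cup",
    "beer_capital",
    "music_fest",
]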
def test_from_dataset(test_dataset, test_data):
    dataset = TimeSeriesDataSet.from_dataset(test_dataset, test_data)
    check_dataloader_output(dataset, next(iter(dataset.to_dataloader(num_workers=0))))
max_prediction_length = 24
max_encoder_length = 168
val_size = 9792

training_cutoff = data["time_idx"].max() - val_size
training_data = data[lambda x: x.time_idx <= training_cutoff]

training = TimeSeriesDataSet(
    training_data,
    time_idx="time_idx",
    target="arrivals",
    group_ids=["series"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    static_categoricals=["series"],
    time_varying_known_categoricals=time_varying_known_categoricals,
    time_varying_unknown_categoricals=time_varying_unknown_categoricals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

validation = TimeSeriesDataSet.from_dataset(training, data, predict=False, stop_randomization=True)

# create dataloaders for model
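# A minimal sketch of the dataloader step referenced above; the batch_size
# and num_workers values are assumptions, not from the original snippet.
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)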