def test_new_group_ids(test_data, kwargs):
    """Test for new group ids in dataset."""
    train_agency = test_data["agency"].iloc[0]
    train_dataset = TimeSeriesDataSet(
        test_data[lambda x: x.agency == train_agency],
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        categorical_encoders=dict(agency=NaNLabelEncoder(add_nan=True), sku=NaNLabelEncoder(add_nan=True)),
        **kwargs,
    )

    # test sampling from training dataset
    next(iter(train_dataset.to_dataloader()))

    # create test dataset with group ids that have not been observed before
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_data)

    # check that we can iterate through dataset without error
    for _ in iter(test_dataset.to_dataloader()):
        pass
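A minimal sketch of why the unseen group ids above do not break the derived test dataset, assuming pytorch_forecasting's NaNLabelEncoder; the agency labels here are made up for illustration. With add_nan=True, index 0 is reserved for NaN/unknown values, so categories that never appeared during fit fall back to 0 instead of raising a KeyError.

import numpy as np
from pytorch_forecasting.data import NaNLabelEncoder

encoder = NaNLabelEncoder(add_nan=True, warn=False)
encoder.fit(np.array(["agency_1", "agency_2"]))  # hypothetical labels, not from the test data

encoder.transform(np.array(["agency_1"]))[0]  # expected 1: known classes start after the reserved 0
encoder.transform(np.array(["agency_3"]))[0]  # expected 0: an unseen category maps to the NaN/unknown bucket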
def test_NaNLabelEncoder_add():
    encoder = NaNLabelEncoder(add_nan=False)
    encoder.fit(np.array(["a", "b", "c"]))
    encoder2 = deepcopy(encoder)
    encoder2.fit(np.array(["d"]))
    assert encoder2.transform(np.array(["a"]))[0] == 0, "a must be encoded as 0"
    assert encoder2.transform(np.array(["d"]))[0] == 3, "d must be encoded as 3"
def from_synthetic_ar_data(
    seasonality: float = 10.0,
    timesteps: int = 400,
    n_series: int = 100,
    max_encoder_length: int = 60,
    max_prediction_length: int = 20,
    batch_size: int = 4,
    num_workers: int = 0,
    **time_series_dataset_kwargs,
) -> TabularForecastingData:
    """Creates and loads a synthetic Auto-Regressive (AR) data set."""
    data = generate_ar_data(seasonality=seasonality, timesteps=timesteps, n_series=n_series, seed=42)
    data["date"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(data.time_idx, "D")

    training_cutoff = data["time_idx"].max() - max_prediction_length

    return TabularForecastingData.from_data_frame(
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        # only unknown variable is "value" - and N-Beats can also not take any additional variables
        time_varying_unknown_reals=["value"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
        val_data_frame=data,
        batch_size=batch_size,
        num_workers=num_workers,
        **time_series_dataset_kwargs,
    )
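A hedged usage sketch of the helper above, wired up the same way as the Flash N-Beats snippets later in this section; the flash import paths, trainer settings, and backbone kwargs are taken from those snippets and may differ between Flash versions.

import flash
from flash.tabular.forecasting import TabularForecaster

datamodule = from_synthetic_ar_data(max_encoder_length=60, max_prediction_length=20, batch_size=32)
model = TabularForecaster(
    datamodule.parameters,
    backbone="n_beats",
    backbone_kwargs={"widths": [32, 512], "backcast_loss_ratio": 0.1},
)
trainer = flash.Trainer(max_epochs=1, fast_dev_run=True, gradient_clip_val=0.01)
trainer.fit(model, datamodule=datamodule)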
def test_testing_raises(sample_data):
    """Tests that ``NotImplementedError`` is raised when attempting to perform a test pass."""
    data, training_cutoff, max_prediction_length = sample_data
    datamodule = TabularForecastingData.from_data_frame(
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        time_varying_unknown_reals=["value"],
        max_encoder_length=60,
        max_prediction_length=max_prediction_length,
        train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
        test_data_frame=data,
        batch_size=4,
    )

    model = TabularForecaster(
        datamodule.parameters,
        backbone="n_beats",
        backbone_kwargs={"widths": [32, 512], "backcast_loss_ratio": 0.1},
    )
    trainer = flash.Trainer(max_epochs=1, fast_dev_run=True, gradient_clip_val=0.01)

    with pytest.raises(
        NotImplementedError,
        match="Backbones provided by PyTorch Forecasting don't support testing.",
    ):
        trainer.test(model, datamodule=datamodule)
def test_fast_dev_run_smoke(sample_data):
    """Test that fast dev run works with the NBeats example data."""
    data, training_cutoff, max_prediction_length = sample_data
    datamodule = TabularForecastingData.from_data_frame(
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        time_varying_unknown_reals=["value"],
        max_encoder_length=60,
        max_prediction_length=max_prediction_length,
        train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
        val_data_frame=data,
        batch_size=4,
    )

    model = TabularForecaster(
        datamodule.parameters,
        backbone="n_beats",
        backbone_kwargs={"widths": [32, 512], "backcast_loss_ratio": 0.1},
    )
    trainer = flash.Trainer(max_epochs=1, fast_dev_run=True, gradient_clip_val=0.01)
    trainer.fit(model, datamodule=datamodule)
def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]

    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
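A rough sketch of what a batch from these dataloaders looks like; the key names mirror the example_forward_input dict in the TestTabularForecaster snippet further down, but the exact structure of y is an assumption about the pytorch-forecasting version in use.

loaders = dataloaders_fixed_window_without_covariates()
x, y = next(iter(loaders["train"]))

# x is a dict of tensors such as encoder_cont, encoder_lengths, decoder_target, groups, target_scale
print(sorted(x.keys()))
# y is typically a (target, weight) tuple; weight is None unless a weight column was configured
target, weight = y
print(target.shape)  # (batch_size, max_prediction_length)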
def save_time_series(self):
    """Downloads the preprocessing script and creates data in a format suited for temporal fusion."""
    PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
    FILE_PATH = 'data/preprocess_data.py'
    DATA_PATH = 'data/data.csv'
    FEATURES = ['dose', 'time']
    GROUP_ID = 'series'

    # Data file already exists, so we don't need to generate it
    if os.path.isfile(DATA_PATH):
        return

    # Download the preprocessing script only if it is not present yet
    if not os.path.isfile(FILE_PATH):
        wget.download(PREPROCESS_URL, FILE_PATH)
    # Run the script to generate the data file
    os.system('python ' + FILE_PATH)

    dataset = pd.read_csv(DATA_PATH)
    n = dataset[GROUP_ID].astype(int).max()
    dataset['target'] = dataset['target'].astype(float)
    dataset['time_idx'] = dataset['time_idx'].astype(int)

    training = TimeSeriesDataSet(
        dataset[dataset[GROUP_ID].apply(lambda x: int(x) < int(n * 0.7))],
        time_idx='time_idx',
        target='target',
        group_ids=[GROUP_ID],
        min_encoder_length=20,
        max_encoder_length=20,
        min_prediction_length=1,
        max_prediction_length=1,
        static_categoricals=[],
        static_reals=[],
        time_varying_known_categoricals=[],
        variable_groups={},
        time_varying_known_reals=['time_idx'],
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=['target'] + FEATURES,
        add_relative_time_idx=True,
        add_target_scales=False,
        add_encoder_length=True,
        categorical_encoders={GROUP_ID: NaNLabelEncoder().fit(dataset.series)},
    )

    training.save(self.TIMESERIES_PATH)
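A brief sketch of reading the serialized dataset back, assuming TimeSeriesDataSet.load mirrors the save call above; the file name stands in for self.TIMESERIES_PATH and the batch size is illustrative.

from pytorch_forecasting import TimeSeriesDataSet

training = TimeSeriesDataSet.load("data/timeseries_dataset.pt")  # stand-in for self.TIMESERIES_PATH
train_dataloader = training.to_dataloader(train=True, batch_size=64, num_workers=0)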
def _create_dataset(self, df, valid_p=0.2):
    df = df_utils.check_dataframe(df)
    df = self._handle_missing_data(df)
    df = df[["ds", "y"]]
    df["time_idx"] = range(df.shape[0])
    df["series"] = 0
    self.n_data = df.shape[0]
    self.set_auto_batch_epoch(self.n_data)

    training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

    training = TimeSeriesDataSet(
        df.iloc[:training_cutoff],
        time_idx="time_idx",
        target="y",
        categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
        group_ids=["series"],
        min_encoder_length=self.context_length,
        max_encoder_length=self.context_length,
        max_prediction_length=self.prediction_length,
        min_prediction_length=self.prediction_length,
        time_varying_unknown_reals=["y"],
        target_normalizer=GroupNormalizer(groups=["series"]),
        randomize_length=None,
        add_relative_time_idx=False,
        add_target_scales=False,
    )

    validation = TimeSeriesDataSet.from_dataset(training, df, min_prediction_idx=training_cutoff)
    train_dataloader = training.to_dataloader(train=True, batch_size=self.batch_size, num_workers=self.num_workers)
    val_dataloader = validation.to_dataloader(train=False, batch_size=self.batch_size, num_workers=self.num_workers)
    return training, train_dataloader, val_dataloader
        dict(
            target_normalizer=GroupNormalizer(
                groups=["agency", "sku"], log_scale=True, scale_by_group=True, log_zero_value=1.0
            )
        ),
        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
        dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1),
        dict(predict_mode=True),
        dict(add_target_scales=True),
        dict(add_encoder_length=True),
        dict(add_encoder_length=True),
        dict(add_relative_time_idx=True),
        dict(weight="volume"),
        dict(
            scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()),
            categorical_encoders=dict(month=NaNLabelEncoder()),
            time_varying_known_categoricals=["month"],
            time_varying_known_reals=["time_idx", "price_regular"],
        ),
        dict(dropout_categoricals=["month"], time_varying_known_categoricals=["month"]),
        dict(constant_fill_strategy=dict(volume=0.0), allow_missings=True),
    ],
)
def test_TimeSeriesDataSet(test_data, kwargs):
    defaults = dict(
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
class TestTabularForecaster(TaskTester):
    task = TabularForecaster

    # TODO: Reduce number of required parameters
    task_kwargs = {
        "parameters": {
            "time_idx": "time_idx",
            "target": "value",
            "group_ids": ["series"],
            "weight": None,
            "max_encoder_length": 60,
            "min_encoder_length": 60,
            "min_prediction_idx": 0,
            "min_prediction_length": 20,
            "max_prediction_length": 20,
            "static_categoricals": [],
            "static_reals": [],
            "time_varying_known_categoricals": [],
            "time_varying_known_reals": [],
            "time_varying_unknown_categoricals": [],
            "time_varying_unknown_reals": ["value"],
            "variable_groups": {},
            "constant_fill_strategy": {},
            "allow_missing_timesteps": False,
            "lags": {},
            "add_relative_time_idx": False,
            "add_target_scales": False,
            "add_encoder_length": False,
            "target_normalizer": EncoderNormalizer(),
            "categorical_encoders": {
                "series": NaNLabelEncoder(),
                "__group_id__series": NaNLabelEncoder(),
            },
            "scalers": {},
            "randomize_length": None,
            "predict_mode": False,
            "data_sample": {
                "series": {0: 0},
                "time_idx": {0: 0},
                "value": {0: 0.0},
            },
        },
        "backbone": "n_beats",
        "backbone_kwargs": {"widths": [32, 512], "backcast_loss_ratio": 0.1},
    }
    cli_command = "tabular_forecasting"
    is_testing = _TABULAR_TESTING
    is_available = _TABULAR_AVAILABLE

    # TODO: Resolve JIT issues
    scriptable = False
    traceable = False

    @property
    def example_forward_input(self):
        return {
            "encoder_cat": torch.empty(2, 60, 0, dtype=torch.int64),
            "encoder_cont": torch.zeros(2, 60, 1),
            "encoder_target": torch.zeros(2, 60),
            "encoder_lengths": torch.tensor([60, 60]),
            "decoder_cat": torch.empty(2, 20, 0, dtype=torch.int64),
            "decoder_cont": torch.zeros(2, 20, 1),
            "decoder_target": torch.zeros(2, 20),
            "decoder_lengths": torch.tensor([20, 20]),
            "decoder_time_idx": torch.ones(2, 20).long(),
            "groups": torch.tensor([[0], [1]]),
            "target_scale": torch.zeros(2, 2),
        }

    def check_forward_output(self, output: Any):
        assert isinstance(output["prediction"], torch.Tensor)
        assert output["prediction"].shape == torch.Size([2, 20])
from pytorch_forecasting.data import NaNLabelEncoder  # noqa: E402
from pytorch_forecasting.data.examples import generate_ar_data  # noqa: E402

# Example based on this tutorial: https://pytorch-forecasting.readthedocs.io/en/latest/tutorials/ar.html
# 1. Create the DataModule
data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=100, seed=42)
data["date"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(data.time_idx, "D")

max_prediction_length = 20
training_cutoff = data["time_idx"].max() - max_prediction_length

datamodule = TabularForecastingData.from_data_frame(
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    # only unknown variable is "value" - and N-Beats can also not take any additional variables
    time_varying_unknown_reals=["value"],
    max_encoder_length=60,
    max_prediction_length=max_prediction_length,
    train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
    val_data_frame=data,
    batch_size=32,
)

# 2. Build the task
model = TabularForecaster(
    datamodule.parameters,
    backbone="n_beats",
    backbone_kwargs={
static_categoricals=["symbol"], #static_reals=["avg_population_2017", "avg_yearly_household_income_2017"], #time_varying_known_categoricals=["special_days", "month"], #variable_groups={"special_days": special_days}, # group of categorical variables can be treated as one variable time_varying_known_reals=meta["knownReals"], #time_varying_unknown_categoricals=[],#meta["features"], time_varying_unknown_reals=meta["unknownReals"], #target_normalizer=NaNLabelEncoder(add_nan=True), target_normalizer=None, #GroupNormalizer( # groups=["symbol"], transformation="softplus", center=False #), # use softplus with beta=1.0 and normalize by group #add_relative_time_idx=True, #add_target_scales=True, add_encoder_length=True, categorical_encoders={"symbol": NaNLabelEncoder(add_nan=True)}) validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True) batch_size = 64 train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2) val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2) # save datasets #training.save("training.pkl") #validation.save("validation.pkl")
            scale_by_group=True,
        )),
        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
        dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1),
        dict(predict_mode=True),
        dict(add_target_scales=True),
        dict(add_encoder_length=True),
        dict(add_encoder_length=True),
        dict(add_relative_time_idx=True),
        dict(weight="volume"),
        dict(
            scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()),
            categorical_encoders=dict(month=NaNLabelEncoder()),
            time_varying_known_categoricals=["month"],
            time_varying_known_reals=["time_idx", "price_regular"],
        ),
        dict(categorical_encoders={"month": NaNLabelEncoder(add_nan=True)}, time_varying_known_categoricals=["month"]),
        dict(constant_fill_strategy=dict(volume=0.0), allow_missing_timesteps=True),
        dict(target_normalizer=None),
    ],
)
def test_TimeSeriesDataSet(test_data, kwargs):
    defaults = dict(
        time_idx="time_idx",
        target="volume",
max_encoder_length = 60
max_prediction_length = 20

df_train_nbeats = df_train.copy()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats["group"] = 0
df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={"group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])},
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)
nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training, df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)
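A hedged sketch of how dataloaders like these are commonly consumed with pytorch-forecasting's own NBeats model rather than the Flash wrapper; the learning rate, widths, and trainer settings are illustrative, not taken from the original script, and the trainer.fit argument names vary across pytorch-lightning versions.

import pytorch_lightning as pl
from pytorch_forecasting.models import NBeats

# build an NBeats model whose input/output sizes match the dataset definition above
nbeats_model = NBeats.from_dataset(nbeats_training, learning_rate=1e-2, widths=[32, 512])
trainer = pl.Trainer(max_epochs=3, gradient_clip_val=0.01)
trainer.fit(nbeats_model, nbeats_train_dataloader, nbeats_val_dataloader)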
def test_NaNLabelEncoder(data, allow_nan):
    fit_data, transform_data = data
    encoder = NaNLabelEncoder(warn=False, add_nan=allow_nan)
    encoder.fit(fit_data)
    assert np.array_equal(
        encoder.inverse_transform(encoder.transform(fit_data)), fit_data
    ), "Inverse transform should reverse transform"
    if not allow_nan:
        with pytest.raises(KeyError):
            encoder.transform(transform_data)
    else:
        assert encoder.transform(transform_data)[0] == 0, "First value should be translated to 0 if nan"
        assert encoder.transform(transform_data)[-1] == 0, "Last value should be translated to 0 if nan"
        assert encoder.transform(fit_data)[0] > 0, "First value should not be 0 if not nan"
"fifa_u_17_world_cup", "football_gold_cup", "beer_capital", "music_fest", ]), time_varying_known_reals=[ "time_idx", "price_regular", "price_actual", "discount", "discount_in_percent" ], time_varying_unknown_categoricals=[], time_varying_unknown_reals=[ "volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp" ], constant_fill_strategy={"volume": 0}, categorical_encoders={"sku": NaNLabelEncoder(add_nan=True)}, ), dict(static_categoricals=["agency", "sku"]), dict(randomize_length=True, min_encoder_length=2), dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2), dict(target_normalizer=GroupNormalizer(transformation="log1p")), dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"], transformation="softplus", center=False)), dict(target="agency"), # test multiple targets dict(target=["industry_volume", "volume"]), dict(target=["agency", "volume"]), dict(target=["agency", "volume"], min_encoder_length=1, min_prediction_length=1),
def predict(self, future_dataframe):
    """
    Predicts based on the future_dataframe.

    Should be called only after make_future_dataframe is called.

    Args:
        future_dataframe: DataFrame from the make_future_dataframe function

    Returns:
        forecast dataframe
    """
    if self.fitted is False:
        log.warning("Model has not been fitted. Predictions will be random.")
    future_dataframe = future_dataframe.copy(deep=True)
    testing = TimeSeriesDataSet(
        future_dataframe,
        time_idx="time_idx",
        target="y",
        categorical_encoders={"series": NaNLabelEncoder().fit(future_dataframe.series)},
        group_ids=["series"],
        min_encoder_length=self.context_length,
        max_encoder_length=self.context_length,
        max_prediction_length=self.prediction_length,
        min_prediction_length=self.prediction_length,
        time_varying_known_reals=["time_idx"],
        time_varying_unknown_reals=["y"],
        target_normalizer=GroupNormalizer(groups=["series"], transformation="softplus", center=False),
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )

    new_raw_predictions, new_x = self.model.predict(testing, mode="raw", return_x=True)
    y_predicted = self.model.to_prediction(new_raw_predictions).detach().cpu()  # [0, : new_x["decoder_lengths"][0]]
    y_predicted = y_predicted.detach().numpy()

    def pad_with(vector, pad_width, iaxis, kwargs):
        pad_value = kwargs.get("padder", np.nan)
        vector[: pad_width[0]] = pad_value
        vector[-pad_width[1] :] = pad_value

    y_pred_padded = np.pad(y_predicted, self.prediction_length, pad_with)[
        self.prediction_length : -1, self.prediction_length : -self.prediction_length
    ]
    y_pred_padded = np.vstack(
        [np.roll(y_pred_padded[:, i], i, axis=0) for i in range(y_pred_padded.shape[1])]
    ).T

    result = pd.DataFrame(
        np.ones(shape=(len(future_dataframe), (2 + self.prediction_length))) * np.nan,
        columns=["ds", "y"] + [f"yhat{i}" for i in range(1, self.prediction_length + 1)],
    )
    result["ds"] = future_dataframe["ds"]
    result.loc[: len(future_dataframe) - (self.periods + 1), "y"] = (
        future_dataframe["y"].iloc[: len(future_dataframe) - (self.periods)].values
    )
    first_part = result.iloc[: self.context_length]
    second_part = result.iloc[self.context_length :]
    second_part.loc[:, [col for col in second_part.columns[2:]]] = y_pred_padded
    result = pd.concat([first_part, second_part])
    for i in range(1, self.prediction_length + 1):
        result[f"residual{i}"] = result[f"yhat{i}"] - result["y"]
    return result
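A heavily hedged sketch of the call sequence the docstring describes; the wrapper instance name m and the make_future_dataframe signature are assumptions based on the Prophet-style API the docstring refers to, while the ds/y/yhat1 column names come from the result frame built inside predict.

# m is assumed to be an already-fitted instance of the wrapper class that defines predict()
future = m.make_future_dataframe(df)  # signature is an assumption
forecast = m.predict(future)
print(forecast[["ds", "y", "yhat1"]].head())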