def test_predict_dependency(model, dataloaders_with_covariates, data_with_covariates, kwargs):
    train_dataset = dataloaders_with_covariates["train"].dataset
    dataset = TimeSeriesDataSet.from_dataset(
        train_dataset,
        data_with_covariates[lambda x: x.agency == data_with_covariates.agency.iloc[0]],
        predict=True,
    )
    model.predict_dependency(dataset, variable="discount", values=[0.1, 0.0], **kwargs)
    model.predict_dependency(dataset, variable="agency", values=data_with_covariates.agency.unique()[:2], **kwargs)
def make_dataloaders(data_with_covariates, **kwargs):
    training_cutoff = "2016-09-01"
    max_encoder_length = 4
    max_prediction_length = 3

    kwargs.setdefault("target", "volume")
    kwargs.setdefault("group_ids", ["agency", "sku"])
    kwargs.setdefault("add_relative_time_idx", True)
    kwargs.setdefault("time_varying_unknown_reals", ["volume"])

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff].copy(),
        time_idx="time_idx",
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        **kwargs,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates.copy(), min_prediction_idx=training.index.time.max() + 1
    )
    train_dataloader = training.to_dataloader(train=True, batch_size=2, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=2, num_workers=0)
    test_dataloader = validation.to_dataloader(train=False, batch_size=1, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader, test=test_dataloader)
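# Hedged usage sketch (added; `data_with_covariates` is assumed to be a pandas
# DataFrame with the columns referenced above). Batches from to_dataloader()
# are (x, y) pairs in which x is a dict of named tensors.
loaders = make_dataloaders(data_with_covariates, target="volume")
x, y = next(iter(loaders["train"]))
print(x["encoder_cont"].shape)  # (batch size, encoder length, number of reals)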
def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6
    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        # weight="weight",
        group_ids=["agency", "sku"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        **params,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates, min_prediction_idx=training.index.time.max() + 1
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
def dataloaders_with_covariates(data_with_covariates):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"], coerce_positive=False),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates, min_prediction_idx=training.index.time.max() + 1
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
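# Compatibility note (added, an assumption about library versions): `coerce_positive`
# is the older GroupNormalizer argument; later snippets in this collection pass
# `transformation=` instead, e.g. GroupNormalizer(groups=["agency", "sku"],
# transformation="softplus") for a target coerced to be positive.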
def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]
    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
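# Hedged sketch (added): these loaders carry a single target and no covariates,
# so they suit a univariate model; the NBeats choice below is an assumption,
# not part of the original fixture.
dataloaders = dataloaders_fixed_window_without_covariates()
net = NBeats.from_dataset(dataloaders["train"].dataset)  # the dataset is recoverable from the loader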
def test_model(model_path):
    """Tests results of given model on dataset."""
    DATA_PATH = 'data/data.csv'
    # PREPROCESS_URL is assumed to be a module-level constant defined elsewhere
    if not os.path.isfile(DATA_PATH):
        wget.download(PREPROCESS_URL, DATA_PATH)
    dataset = pd.read_csv(DATA_PATH)
    dataset['target'] = dataset['target'].astype(float)
    dataset['time_idx'] = dataset['time_idx'].astype(int)

    time_series = TimeSeriesDataSet.load('models/dataset_time_set')
    validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
    all_dataloader = validation.to_dataloader(train=False, num_workers=0)

    model = TemporalFusionTransformer.load_from_checkpoint(model_path)
    actuals = torch.cat([y[0] for (x, y) in iter(all_dataloader)])
    predictions = model.predict(all_dataloader)

    res = (actuals - predictions).abs() / actuals
    print(f'test mape is {res.mean()}')
    print(f'max mape {res.max()}')  # tensor .max(); Python's max() is ambiguous on multi-element tensors
    print(f'max 99 mape {np.quantile(res, .99)}')
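# Note on the metric above (added): MAPE divides by the actuals, so zero or
# near-zero targets inflate it arbitrarily. A hedged alternative sketch on the
# same tensors is the symmetric MAPE:
def smape(actuals, predictions):
    # bounded in [0, 2]; defined whenever actuals and predictions are not both zero
    return ((actuals - predictions).abs() / ((actuals.abs() + predictions.abs()) / 2)).mean()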
def transform_data(self, data, past_lags, index_label, target_label, train_val_split):
    self.past_lags = past_lags
    self.oldest_lag = int(max(self.past_lags)) + 1
    self.index_label = index_label
    self.target_label = target_label

    # external train and validation sets
    X = data[[index_label]]
    y = data[[target_label]]
    self.training = (
        X.loc[:int(len(data) * train_val_split)],
        y.loc[:int(len(data) * train_val_split)],
    )
    self.validation = (
        X.loc[int(len(data) * train_val_split):],
        y.loc[int(len(data) * train_val_split):],
    )

    # internal train and validation sets; they use dataloaders to optimize the
    # training routine. The time index consists of epoch values.
    # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    data["time_idx"] = data.index
    data["group_id"] = "series"

    max_prediction_length = self.oldest_lag
    max_encoder_length = self.oldest_lag
    # training_cutoff = data["time_idx"].max() - max_prediction_length

    self.intern_training = TimeSeriesDataSet(
        data[:int(len(data) * train_val_split)],
        time_idx="time_idx",
        group_ids=["group_id"],
        target=self.target_label,
        min_encoder_length=0,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_categoricals=["group_id"],
        # time_varying_unknown_reals=[self.target_label],
        # the docs say that max_lag < max_encoder_length
        # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        # allow_missings=True
    )

    # create the validation set; predict=True means predicting the last
    # max_prediction_length points in time for each series
    self._intern_validation = TimeSeriesDataSet.from_dataset(
        self.intern_training, data, predict=True, stop_randomization=True
    )

    # store the last input to use as encoder data for the next predictions
    self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()
def predict(self, data):
    """
    Transforms data and predicts output based on the trained model.

    Parameters: self, list of protocols
    Return: list of results for each protocol based on the trained model
    """
    print(data)  # debug output
    self.save_time_series()
    dataset = self.prepare_data(data)
    time_series = TimeSeriesDataSet.load(self.TIMESERIES_PATH)
    validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
    val_dataloader = validation.to_dataloader(train=False, num_workers=0)

    res = self.model.predict(val_dataloader)
    # print("wynik", res)
    res = np.array([int(x) for x in res])
    return res
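# Hedged note (added, an assumption about intent): int() truncates toward zero,
# so a model output of 3.9 becomes 3. If the targets are counts, rounding first
# may be closer to what is wanted:
# res = np.array([int(round(float(x))) for x in res])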
def _create_dataset(self, df, valid_p=0.2):
    df = df_utils.check_dataframe(df)
    df = self._handle_missing_data(df)
    df = df[["ds", "y"]]
    df["time_idx"] = range(df.shape[0])
    df["series"] = 0
    self.n_data = df.shape[0]
    self.set_auto_batch_epoch(self.n_data)

    training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

    training = TimeSeriesDataSet(
        df.iloc[:training_cutoff],
        time_idx="time_idx",
        target="y",
        categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
        group_ids=["series"],
        min_encoder_length=self.context_length,
        max_encoder_length=self.context_length,
        max_prediction_length=self.prediction_length,
        min_prediction_length=self.prediction_length,
        time_varying_unknown_reals=["y"],
        target_normalizer=GroupNormalizer(groups=["series"]),
        randomize_length=None,
        add_relative_time_idx=False,
        add_target_scales=False,
    )

    validation = TimeSeriesDataSet.from_dataset(training, df, min_prediction_idx=training_cutoff)
    train_dataloader = training.to_dataloader(
        train=True, batch_size=self.batch_size, num_workers=self.num_workers
    )
    val_dataloader = validation.to_dataloader(
        train=False, batch_size=self.batch_size, num_workers=self.num_workers
    )

    return training, train_dataloader, val_dataloader
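# Hedged usage sketch (added; the model class is an assumption, the wrapper may
# build a different network from the returned dataset):
# training, train_loader, val_loader = self._create_dataset(df, valid_p=0.2)
# model = NBeats.from_dataset(training)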
time_idx="time_idx", target="value", categorical_encoders={"series": NaNLabelEncoder().fit(data.series)}, group_ids=["series"], min_encoder_length=context_length, max_encoder_length=context_length, max_prediction_length=prediction_length, min_prediction_length=prediction_length, time_varying_unknown_reals=["value"], randomize_length=None, add_relative_time_idx=False, add_target_scales=False, ) validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff) batch_size = 128 train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2) val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2) early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min") trainer = pl.Trainer(
"industry_volume", "soda_volume", "avg_max_temp", "avg_volume_by_agency", "avg_volume_by_sku", ], target_normalizer=GroupNormalizer( groups=["agency", "sku"], transformation="softplus", center=False ), # use softplus with beta=1.0 and normalize by group add_relative_time_idx=True, add_target_scales=True, add_encoder_length=True, ) validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True) batch_size = 64 train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0) val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0) # save datasets training.save("t raining.pkl") validation.save("validation.pkl") early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min") lr_logger = LearningRateMonitor() trainer = pl.Trainer( max_epochs=100, gpus=0,
static_categoricals=["static"], min_encoder_length=max_encoder_length, max_encoder_length=max_encoder_length, min_prediction_length=max_prediction_length, max_prediction_length=max_prediction_length, time_varying_unknown_reals=["value"], time_varying_known_reals=["time_idx"], target_normalizer=GroupNormalizer(groups=["series"]), add_relative_time_idx=False, add_target_scales=True, randomize_length=None, ) validation = TimeSeriesDataSet.from_dataset( training, data[lambda x: x.series.isin(validation)], # predict=True, stop_randomization=True, ) batch_size = 64 train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0) val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0) # save datasets training.save("training.pkl") validation.save("validation.pkl") early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=5, verbose=False, mode="min") lr_logger = LearningRateMonitor() trainer = pl.Trainer( max_epochs=10,
df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={"group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])},
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)
nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training, df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

net = NBeats.from_dataset(
    nbeats_training,
    learning_rate=3e-2,
    weight_decay=1e-2,
    widths=[32, 512],