Example #1
def test_predict_dependency(model, dataloaders_with_covariates, data_with_covariates, kwargs):
    train_dataset = dataloaders_with_covariates["train"].dataset
    dataset = TimeSeriesDataSet.from_dataset(
        train_dataset, data_with_covariates[lambda x: x.agency == data_with_covariates.agency.iloc[0]], predict=True
    )
    model.predict_dependency(dataset, variable="discount", values=[0.1, 0.0], **kwargs)
    model.predict_dependency(dataset, variable="agency", values=data_with_covariates.agency.unique()[:2], **kwargs)
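Note that `predict_dependency` re-runs prediction on the same samples once per supplied value, overriding the chosen variable each time, which gives a partial-dependency style view of how the forecast reacts to that variable; the fixture exercises it both for a continuous covariate (`discount`) and a categorical one (`agency`).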
Example #2
def make_dataloaders(data_with_covariates, **kwargs):
    training_cutoff = "2016-09-01"
    max_encoder_length = 4
    max_prediction_length = 3

    kwargs.setdefault("target", "volume")
    kwargs.setdefault("group_ids", ["agency", "sku"])
    kwargs.setdefault("add_relative_time_idx", True)
    kwargs.setdefault("time_varying_unknown_reals", ["volume"])

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff].copy(),
        time_idx="time_idx",
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        **kwargs,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates.copy(),
        min_prediction_idx=training.index.time.max() + 1)
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=2,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=2,
                                              num_workers=0)
    test_dataloader = validation.to_dataloader(train=False,
                                               batch_size=1,
                                               num_workers=0)

    return dict(train=train_dataloader,
                val=val_dataloader,
                test=test_dataloader)
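For context, the returned dict could be consumed roughly as in the sketch below; the choice of `TemporalFusionTransformer`, the learning rate, and the trainer settings are illustrative assumptions rather than part of the fixture.

import pytorch_lightning as pl
from pytorch_forecasting import TemporalFusionTransformer

# minimal sketch, assuming data_with_covariates is available as in the fixture
dataloaders = make_dataloaders(data_with_covariates)
training = dataloaders["train"].dataset  # the underlying TimeSeriesDataSet
model = TemporalFusionTransformer.from_dataset(training, learning_rate=0.03)  # hyperparameters assumed
trainer = pl.Trainer(max_epochs=1, gpus=0)
trainer.fit(model, dataloaders["train"], dataloaders["val"])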
Example #3
def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        # weight="weight",
        group_ids=["agency", "sku"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        **params  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=training.index.time.max() + 1)
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
Example #4
def dataloaders_with_covariates(data_with_covariates):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                          coerce_positive=False),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=training.index.time.max() + 1)
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
Example #5
def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]

    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True,
                                              batch_size=batch_size,
                                              num_workers=0)
    val_dataloader = validation.to_dataloader(train=False,
                                              batch_size=batch_size,
                                              num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
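Since this dataset uses fixed encoder/decoder windows and only the target as an unknown real, it matches the constraints of NBeats (also used in the last example); a minimal sketch, with assumed hyperparameters and trainer settings:

import pytorch_lightning as pl
from pytorch_forecasting import NBeats

dataloaders = dataloaders_fixed_window_without_covariates()
net = NBeats.from_dataset(dataloaders["train"].dataset,
                          learning_rate=3e-2, weight_decay=1e-2)  # values assumed
trainer = pl.Trainer(max_epochs=1, gpus=0)
trainer.fit(net, dataloaders["train"], dataloaders["val"])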
Example #6
    def test_model(model_path):
        """ Tests results of given model on dataset """
        DATA_PATH = 'data/data.csv'
        if not os.path.isfile(FILE_PATH):
            wget.download(PREPROCESS_URL, FILE_PATH)
        
        dataset = pd.read_csv(DATA_PATH)

        dataset['target'] = dataset['target'].astype(float)
        dataset['time_idx'] = dataset['time_idx'].astype(int)
        
        time_series = TimeSeriesDataSet.load('models/dataset_time_set')
        validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
        
        all_dataloader = validation.to_dataloader(train=False, num_workers=0)
        model = TemporalFusionTransformer.load_from_checkpoint(model_path)

        actuals = torch.cat([y[0] for (x, y) in iter(all_dataloader)])
        predictions = model.predict(all_dataloader)

        print(f'test mape is {((actuals - predictions).abs() / actuals).mean()}')

        print(f'max mape {((actuals - predictions).abs() / actuals).max()}')

        res = (actuals - predictions).abs() / actuals
        print(f'99th percentile mape {np.quantile(res, .99)}')
#         print("result", res)
        res = np.array([int(x) for x in res])
Example #7
    def transform_data(self, data, past_lags, index_label, target_label,
                       train_val_split):

        self.past_lags = past_lags
        self.oldest_lag = int(max(self.past_lags)) + 1
        self.index_label = index_label
        self.target_label = target_label

        # External train and validation sets
        X = data[[index_label]]
        y = data[[target_label]]

        self.training = (X.loc[:int(len(data) * train_val_split)],
                         y.loc[:int(len(data) * train_val_split)])
        self.validation = (X.loc[int(len(data) * train_val_split):],
                           y.loc[int(len(data) * train_val_split):])

        # internal train and validation sets; these use dataloaders to optimize the training routine
        # the time index could alternatively be epoch seconds, as in the commented-out line below:
        # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        data["time_idx"] = data.index
        data['group_id'] = 'series'

        max_prediction_length = self.oldest_lag
        max_encoder_length = self.oldest_lag
        # training_cutoff = data["time_idx"].max() - max_prediction_length

        self.intern_training = TimeSeriesDataSet(
            data[:int(len(data) * train_val_split)],
            time_idx="time_idx",
            group_ids=["group_id"],
            target=self.target_label,
            min_encoder_length=0,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["group_id"],
            # time_varying_unknown_reals=[self.target_label],
            # the docs say that max_lag < max_encoder_length
            # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            # allow_missings=True
        )

        # create validation set (predict=True) which means to predict the last max_prediction_length points in time
        # for each series
        self._intern_validation = TimeSeriesDataSet.from_dataset(
            self.intern_training, data, predict=True, stop_randomization=True)

        # store the last input to use as encoder data to next predictions
        self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()
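The stored datasets would then be turned into dataloaders inside the training routine, along the lines of the hypothetical companion method sketched below (its name, batch size, and worker count are assumptions):

    # minimal sketch of a hypothetical companion method
    def _build_dataloaders(self):
        train_dataloader = self.intern_training.to_dataloader(
            train=True, batch_size=32, num_workers=0)
        val_dataloader = self._intern_validation.to_dataloader(
            train=False, batch_size=32, num_workers=0)
        return train_dataloader, val_dataloader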
Example #8
    def predict(self, data):
        """ Transforms data and predicts output based on train model 
        
            Parameters: self, list of protocols
            
            Return: list of results for each protocol based on train model
        """
        print(data)
        self.save_time_series()
        dataset = self.prepare_data(data)
        
        time_series = TimeSeriesDataSet.load(self.TIMESERIES_PATH)
        validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
        
        val_dataloader = validation.to_dataloader(train=False, num_workers=0)
    
        res = self.model.predict(val_dataloader)
#         print("result", res)
        res = np.array([int(x) for x in res])
        
        return res
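The dataset loaded from `self.TIMESERIES_PATH` is assumed to have been written earlier with `TimeSeriesDataSet.save()` (as in the `training.save("training.pkl")` calls further below), so that `from_dataset` re-applies exactly the same encoders and normalizers to the freshly prepared rows.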
Example #9
    def _create_dataset(self, df, valid_p=0.2):
        df = df_utils.check_dataframe(df)
        df = self._handle_missing_data(df)
        df = df[["ds", "y"]]
        df["time_idx"] = range(df.shape[0])
        df["series"] = 0
        self.n_data = df.shape[0]
        self.set_auto_batch_epoch(self.n_data)

        training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

        training = TimeSeriesDataSet(
            df.iloc[:training_cutoff],
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"]),
            randomize_length=None,
            add_relative_time_idx=False,
            add_target_scales=False,
        )

        validation = TimeSeriesDataSet.from_dataset(
            training, df, min_prediction_idx=training_cutoff)
        train_dataloader = training.to_dataloader(train=True,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)
        val_dataloader = validation.to_dataloader(train=False,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)

        return training, train_dataloader, val_dataloader
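Passing `min_prediction_idx=training_cutoff` when deriving the validation set forces every validation sample to start its prediction window at or after the cutoff, so the model is never validated on decoder targets it already saw during training; the earlier fixtures achieve the same thing with `training.index.time.max() + 1`.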
Example #10
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    min_encoder_length=context_length,
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
    min_prediction_length=prediction_length,
    time_varying_unknown_reals=["value"],
    randomize_length=None,
    add_relative_time_idx=False,
    add_target_scales=False,
)

validation = TimeSeriesDataSet.from_dataset(training,
                                            data,
                                            min_prediction_idx=training_cutoff)
batch_size = 128
train_dataloader = training.to_dataloader(train=True,
                                          batch_size=batch_size,
                                          num_workers=2)
val_dataloader = validation.to_dataloader(train=False,
                                          batch_size=batch_size,
                                          num_workers=2)

early_stop_callback = EarlyStopping(monitor="val_loss",
                                    min_delta=1e-4,
                                    patience=10,
                                    verbose=False,
                                    mode="min")
trainer = pl.Trainer(
Example #11
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], transformation="softplus", center=False
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)


validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)


# save datasets
training.save("t raining.pkl")
validation.save("validation.pkl")

early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=100,
    gpus=0,
Example #12
    static_categoricals=["static"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
    target_normalizer=GroupNormalizer(groups=["series"]),
    add_relative_time_idx=False,
    add_target_scales=True,
    randomize_length=None,
)

validation = TimeSeriesDataSet.from_dataset(
    training,
    data[lambda x: x.series.isin(validation)],
    # predict=True,
    stop_randomization=True,
)
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

# save datasets
training.save("training.pkl")
validation.save("validation.pkl")

early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=5, verbose=False, mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=10,
Example #13
df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={
        "group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])
    },
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)
nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training,
                                                   df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True,
                                                        batch_size=batch_size,
                                                        num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False,
                                                        batch_size=batch_size,
                                                        num_workers=0)

net = NBeats.from_dataset(
    nbeats_training,
    learning_rate=3e-2,
    weight_decay=1e-2,
    widths=[32, 512],