Example #1
    def crossvalidation_split_df(self, df, freq, k=5, fold_pct=0.1, fold_overlap_pct=0.5):
        """Splits timeseries data in k folds for crossvalidation.

        Args:
            df (pd.DataFrame): data
            freq (str):Data step sizes. Frequency of data recording,
                Any valid frequency for pd.date_range, such as '5min', 'D' or 'MS'
            k: number of CV folds
            fold_pct: percentage of overall samples to be in each fold
            fold_overlap_pct: percentage of overlap between the validation folds.

        Returns:
            list of k tuples [(df_train, df_val), ...] where:
                df_train (pd.DataFrame):  training data
                df_val (pd.DataFrame): validation data
        """
        df = df.copy(deep=True)
        df = df_utils.check_dataframe(df, check_y=False)
        df = self._handle_missing_data(df, freq=freq, predicting=False)
        folds = df_utils.crossvalidation_split_df(
            df,
            n_lags=self.n_lags,
            n_forecasts=self.n_forecasts,
            k=k,
            fold_pct=fold_pct,
            fold_overlap_pct=fold_overlap_pct,
        )
        return folds
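A minimal usage sketch for the k-fold split (the file name and hyperparameters are hypothetical; a fresh model is fitted per fold to avoid re-fitting warnings):

import pandas as pd
from neuralprophet import NeuralProphet

df = pd.read_csv("data.csv")  # hypothetical file with 'ds' and 'y' columns
folds = NeuralProphet(n_lags=10, n_forecasts=3).crossvalidation_split_df(df, freq="D", k=5)
fold_metrics = []
for df_train, df_val in folds:
    m = NeuralProphet(n_lags=10, n_forecasts=3)  # fresh model per fold
    m.fit(df_train, freq="D")
    fold_metrics.append(m.test(df_val))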
Example #2
    def test(self, df):
        """Evaluate model on holdout data.

        Args:
            df (pd.DataFrame): containing columns 'ds', 'y' with holdout data
        Returns:
            df with evaluation metrics
        """
        if self.fitted is False:
            log.warning("Model has not been fitted. Test results will be random.")
        df = df_utils.check_dataframe(df, check_y=True)
        df = self._handle_missing_data(df, freq=self.data_freq)
        loader = self._init_val_loader(df)
        val_metrics_df = self._evaluate(loader)
        return val_metrics_df
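In use, test pairs with split_df and fit (a sketch; the file name is hypothetical and a daily frequency is assumed):

import pandas as pd
from neuralprophet import NeuralProphet

df = pd.read_csv("data.csv")  # hypothetical file with 'ds' and 'y' columns
m = NeuralProphet(n_lags=10, n_forecasts=3)
df_train, df_test = m.split_df(df, freq="D", valid_p=0.1)  # hold out the last ~10%
m.fit(df_train, freq="D")
val_metrics = m.test(df_test)  # evaluation metrics on the holdout set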
Example #3
    def _hyperparameter_optimization(self, df, freq, epochs=None, validate_each_epoch=True, valid_p=0.2):
        self.data_freq = freq
        if epochs is not None:
            default_epochs = self.config_train.epochs
            self.config_train.epochs = epochs
        if self.fitted is True:
            log.warning("Model has already been fitted. Re-fitting will produce different results.")
        df = df_utils.check_dataframe(df, check_y=True)
        df = self._handle_missing_data(df, freq=self.data_freq)
        if not validate_each_epoch:
            # the loaders and model are only built from a train/validation split
            raise ValueError("_hyperparameter_optimization requires validate_each_epoch=True")
        df_train, df_val = df_utils.split_df(df, n_lags=self.n_lags, n_forecasts=self.n_forecasts, valid_p=valid_p)
        tr_loader, val_loader, model = self._train(df_train, df_val, hyperparameter_optim=True)
        return tr_loader, val_loader, model
Example #4
    def _create_dataset(self, df, valid_p=0.2):
        df = df_utils.check_dataframe(df)
        df = self._handle_missing_data(df)
        df = df[["ds", "y"]]
        df["time_idx"] = range(df.shape[0])
        df["series"] = 0
        self.n_data = df.shape[0]
        self.set_auto_batch_epoch(self.n_data)

        training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

        training = TimeSeriesDataSet(
            df.iloc[:training_cutoff],
            time_idx="time_idx",
            target="y",
            categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
            group_ids=["series"],
            min_encoder_length=self.context_length,
            max_encoder_length=self.context_length,
            max_prediction_length=self.prediction_length,
            min_prediction_length=self.prediction_length,
            time_varying_unknown_reals=["y"],
            target_normalizer=GroupNormalizer(groups=["series"]),
            randomize_length=None,
            add_relative_time_idx=False,
            add_target_scales=False,
        )

        validation = TimeSeriesDataSet.from_dataset(
            training, df, min_prediction_idx=training_cutoff)
        train_dataloader = training.to_dataloader(train=True,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)
        val_dataloader = validation.to_dataloader(train=False,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers)

        return training, train_dataloader, val_dataloader
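For reference, the same pytorch-forecasting pattern as a self-contained sketch on a synthetic series (the encoder/decoder lengths are illustrative stand-ins for context_length and prediction_length):

import numpy as np
import pandas as pd
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer, NaNLabelEncoder

# synthetic univariate series, laid out like the df built above
df = pd.DataFrame({"y": np.sin(np.arange(300) / 10.0)})
df["time_idx"] = range(len(df))
df["series"] = 0

training_cutoff = len(df) - int(0.2 * len(df))  # hold out the last 20%, as valid_p=0.2 does
training = TimeSeriesDataSet(
    df.iloc[:training_cutoff],
    time_idx="time_idx",
    target="y",
    categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
    group_ids=["series"],
    min_encoder_length=30,
    max_encoder_length=30,
    min_prediction_length=5,
    max_prediction_length=5,
    time_varying_unknown_reals=["y"],
    target_normalizer=GroupNormalizer(groups=["series"]),
)
validation = TimeSeriesDataSet.from_dataset(training, df, min_prediction_idx=training_cutoff)
train_loader = training.to_dataloader(train=True, batch_size=32)
val_loader = validation.to_dataloader(train=False, batch_size=32)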
Example #5
    def test_train_eval_test(self):
        log.info("testing: Train Eval Test")
        m = NeuralProphet(
            n_lags=10,
            n_forecasts=3,
            ar_sparsity=0.1,
            epochs=3,
            batch_size=32,
        )
        df = pd.read_csv(PEYTON_FILE, nrows=95)
        df = df_utils.check_dataframe(df, check_y=False)
        df = m._handle_missing_data(df, freq="D", predicting=False)
        df_train, df_test = m.split_df(df, freq="D", valid_p=0.1)
        metrics = m.fit(df_train,
                        freq="D",
                        validate_each_epoch=True,
                        valid_p=0.1)
        metrics = m.fit(df_train, freq="D")
        val_metrics = m.test(df_test)
        log.debug("Metrics: train/eval: \n {}".format(
            metrics.to_string(float_format=lambda x: "{:6.3f}".format(x))))
        log.debug("Metrics: test: \n {}".format(
            val_metrics.to_string(float_format=lambda x: "{:6.3f}".format(x))))
Beispiel #6
0
    def fit(
        self, df, freq, epochs=None, validate_each_epoch=False, valid_p=0.2, progress_bar=True, plot_live_loss=False
    ):
        """Train, and potentially evaluate model.

        Args:
            df (pd.DataFrame): containing columns 'ds', 'y' with all data
            freq (str): Data step size. Frequency of data recording;
                any valid frequency for pd.date_range, such as '5min', 'D' or 'MS'
            epochs (int): number of epochs to train.
                default: if not specified, uses self.epochs
            validate_each_epoch (bool): whether to evaluate performance after each training epoch
            valid_p (float): fraction of data to hold out from training for model evaluation
            progress_bar (bool): display updating progress bar (tqdm)
            plot_live_loss (bool): plot live training loss;
                requires the [live] extra or the livelossplot package to be installed
        Returns:
            metrics with training and potentially evaluation metrics
        """
        self.data_freq = freq
        if epochs is not None:
            default_epochs = self.config_train.epochs
            self.config_train.epochs = epochs
        if self.fitted is True:
            log.warning("Model has already been fitted. Re-fitting will produce different results.")
        df = df_utils.check_dataframe(df, check_y=True)
        df = self._handle_missing_data(df, freq=self.data_freq)
        if validate_each_epoch:
            df_train, df_val = df_utils.split_df(df, n_lags=self.n_lags, n_forecasts=self.n_forecasts, valid_p=valid_p)
            metrics_df = self._train(df_train, df_val, progress_bar=progress_bar, plot_live_loss=plot_live_loss)
        else:
            metrics_df = self._train(df, progress_bar=progress_bar, plot_live_loss=plot_live_loss)
        if epochs is not None:
            self.config_train.epochs = default_epochs
        self.fitted = True

        return metrics_df
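A brief usage sketch (the synthetic data is illustrative; the column names of the returned metrics DataFrame depend on the configured metrics, so they are not asserted here):

import pandas as pd
from neuralprophet import NeuralProphet

df = pd.DataFrame({"ds": pd.date_range("2020-01-01", periods=365, freq="D"), "y": range(365)})
m = NeuralProphet(epochs=10)
metrics = m.fit(df, freq="D", validate_each_epoch=True, valid_p=0.2)
# one row per epoch: training metrics plus validation metrics on the held-out fraction
print(metrics.tail())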
Example #7
    def test_time_dataset(self):
        # manually load any file that stores a time series, for example:
        df_in = pd.read_csv(AIR_FILE, index_col=False)
        log.debug("Infile shape: {}".format(df_in.shape))

        n_lags = 3
        n_forecasts = 1
        valid_p = 0.2
        df_train, df_val = df_utils.split_df(df_in, n_lags, n_forecasts,
                                             valid_p)

        # create a tabularized dataset from time series
        df = df_utils.check_dataframe(df_train)
        data_params = df_utils.init_data_params(df, normalize="minmax")
        df = df_utils.normalize(df, data_params)
        inputs, targets = time_dataset.tabularize_univariate_datetime(
            df,
            n_lags=n_lags,
            n_forecasts=n_forecasts,
        )
        log.debug("tabularized inputs: {}".format("; ".join([
            "{}: {}".format(inp, values.shape)
            for inp, values in inputs.items()
        ])))
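        # sanity check (a hedged addition): sliding-window arithmetic says a window of
        # n_lags inputs plus n_forecasts targets fits len(df) - (n_lags + n_forecasts) + 1
        # times into the series, matching the sample accounting in the split test below
        expected_samples = len(df) - n_lags - n_forecasts + 1
        log.debug("expected samples: {}, actual: {}".format(expected_samples, targets.shape[0]))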
Example #8
        def check_split(df_in,
                        df_len_expected,
                        n_lags,
                        n_forecasts,
                        freq,
                        p=0.1):
            m = NeuralProphet(
                n_lags=n_lags,
                n_forecasts=n_forecasts,
            )
            df_in = df_utils.check_dataframe(df_in, check_y=False)
            df_in = m._handle_missing_data(df_in, freq=freq, predicting=False)
            assert df_len_expected == len(df_in)

            total_samples = len(df_in) - n_lags - 2 * n_forecasts + 2
            df_train, df_test = m.split_df(df_in, freq=freq, valid_p=p)
            n_train = len(df_train) - n_lags - n_forecasts + 1
            n_test = len(df_test) - n_lags - n_forecasts + 1
            assert total_samples == n_train + n_test

            n_test_expected = max(1, int(total_samples * p))
            n_train_expected = total_samples - n_test_expected
            assert n_train == n_train_expected
            assert n_test == n_test_expected
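        # worked example with hypothetical numbers: len(df_in) = 100, n_lags = 10,
        # n_forecasts = 3, p = 0.1 gives total_samples = 100 - 10 - 2*3 + 2 = 86,
        # n_test = max(1, int(86 * 0.1)) = 8, and n_train = 86 - 8 = 78
        df = pd.DataFrame({"ds": pd.date_range("2020-01-01", periods=100, freq="D"), "y": range(100)})
        check_split(df_in=df, df_len_expected=100, n_lags=10, n_forecasts=3, freq="D", p=0.1)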
Example #9
    def split_df(self, df, freq, valid_p=0.2):
        """Splits timeseries df into train and validation sets.

        Prevents overbleed of targets. Overbleed of inputs can be configured.
        Also performs basic data checks and fills in missing data.

        Args:
            df (pd.DataFrame): data
            freq (str): Data step size. Frequency of data recording;
                any valid frequency for pd.date_range, such as '5min', 'D' or 'MS'
            valid_p (float): fraction of data to use for holdout validation set
                Targets will still never be shared.

        Returns:
            df_train (pd.DataFrame): training data
            df_val (pd.DataFrame): validation data
        """
        df = df.copy(deep=True)
        df = df_utils.check_dataframe(df, check_y=False)
        df = self._handle_missing_data(df, freq=freq, predicting=False)
        df_train, df_val = df_utils.split_df(
            df, n_lags=self.n_lags, n_forecasts=self.n_forecasts, valid_p=valid_p, inputs_overbleed=True,
        )
        return df_train, df_val
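Typical usage of the public split (a sketch; df is assumed to hold the 'ds'/'y' history and a daily frequency is assumed):

from neuralprophet import NeuralProphet

m = NeuralProphet(n_lags=10, n_forecasts=3)
df_train, df_val = m.split_df(df, freq="D", valid_p=0.2)
# inputs may overbleed across the split boundary, but target timestamps are never shared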
Example #10
    def make_future_dataframe(self, df, periods=0, n_historic_predictions=0):
        """
        Creates a dataframe for prediction
        Args:
            df (pd.DataFrame): history to base the forecast on
            periods: number of future periods to forecast
            n_historic_predictions: number of historic predictions to include in the forecast

        Returns:
            future_dataframe: DataFrame, used further for prediction
        """

        if isinstance(n_historic_predictions, bool):
            if n_historic_predictions:
                n_historic_predictions = len(df) - self.context_length
            else:
                n_historic_predictions = 0
        elif not isinstance(n_historic_predictions, int):
            log.error("non-integer value for n_historic_predictions set to zero.")
            n_historic_predictions = 0
        if periods == 0 and n_historic_predictions == 0:
            raise ValueError("Set either history or future to contain more than zero values.")

        if len(df) < self.context_length:
            raise ValueError("Insufficient data for a prediction")
        elif len(df) < self.context_length + n_historic_predictions:
            log.warning(
                "Insufficient data for {} historic forecasts, reduced to {}.".format(
                    n_historic_predictions, len(df) - self.context_length
                )
            )
            n_historic_predictions = len(df) - self.context_length

        if periods > 0 and periods != self.prediction_length:
            periods = self.prediction_length
            log.warning(
                "Number of forecast steps is defined by n_forecasts. " "Adjusted to {}.".format(self.prediction_length)
            )

        self.periods = periods

        self.n_historic_predictions = n_historic_predictions

        df = df.copy(deep=True)

        df = df_utils.check_dataframe(df)
        df = self._handle_missing_data(df)
        df = df[["ds", "y"]]
        df["time_idx"] = range(df.shape[0])
        df["series"] = 0
        self.n_data = df.shape[0]

        encoder_data = df[lambda x: x.time_idx > x.time_idx.max() - (self.context_length + n_historic_predictions)]
        if periods != 0:
            last_data = df[lambda x: x.time_idx == x.time_idx.max()]
            # repeat the last known row `periods` times; time_idx and ds are rewritten below
            decoder_data = pd.concat([last_data] * periods, ignore_index=True)
            decoder_data["time_idx"] = range(
                decoder_data["time_idx"].iloc[0] + 1, decoder_data["time_idx"].iloc[0] + periods + 1
            )
            decoder_data["ds"] = pd.date_range(
                start=encoder_data["ds"].iloc[-1], periods=periods + 1, freq=self.freq
            )[1:]
            future_dataframe = pd.concat([encoder_data, decoder_data], ignore_index=True)
        else:
            future_dataframe = encoder_data
        return future_dataframe
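The length of the returned frame follows from the code above: the encoder part keeps the last context_length + n_historic_predictions rows and the decoder appends periods rows. A sketch with hypothetical sizes (`model` is an assumed instance of this wrapper, configured with context_length=60 and prediction_length=12):

future = model.make_future_dataframe(df, periods=12, n_historic_predictions=24)
# encoder rows: 60 + 24; decoder rows: 12 (periods is forced to prediction_length)
assert len(future) == 60 + 24 + 12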
Example #11
    def make_future_dataframe(self, df, periods=None, n_historic_predictions=0):
        df = df.copy(deep=True)

        n_lags = 0 if self.n_lags is None else self.n_lags
        if periods is None:
            periods = 1 if n_lags == 0 else self.n_forecasts
        else:
            assert periods >= 0

        if isinstance(n_historic_predictions, bool):
            if n_historic_predictions:
                n_historic_predictions = len(df) - n_lags
            else:
                n_historic_predictions = 0
        elif not isinstance(n_historic_predictions, int):
            log.error("non-integer value for n_historic_predictions set to zero.")
            n_historic_predictions = 0

        if periods == 0 and n_historic_predictions == 0:
            raise ValueError("Set either history or future to contain more than zero values.")

        last_date = pd.to_datetime(df["ds"].copy(deep=True)).sort_values().max()

        if len(df) < n_lags:
            raise ValueError("Insufficient data for a prediction")
        elif len(df) < n_lags + n_historic_predictions:
            log.warning(
                "Insufficient data for {} historic forecasts, reduced to {}.".format(
                    n_historic_predictions, len(df) - n_lags
                )
            )
            n_historic_predictions = len(df) - n_lags
        if (n_historic_predictions + n_lags) == 0:
            df = pd.DataFrame(columns=df.columns)
        else:
            df = df[-(n_lags + n_historic_predictions) :]

        if len(df) > 0:
            if len(df.columns) == 1 and "ds" in df:
                assert n_lags == 0
                df = df_utils.check_dataframe(df, check_y=False)
            else:
                df = df_utils.check_dataframe(df, check_y=n_lags > 0)
                df = self._handle_missing_data(df, freq=self.data_freq, predicting=True)
            df = df_utils.normalize(df, self.data_params)

        # future data
        # check for external events known in future

        if n_lags > 0:
            if periods > 0 and periods != self.n_forecasts:
                periods = self.n_forecasts
                log.warning(
                    "Number of forecast steps is defined by n_forecasts. " "Adjusted to {}.".format(self.n_forecasts)
                )

        if periods > 0:
            future_df = df_utils.make_future_df(
                df_columns=df.columns, last_date=last_date, periods=periods, freq=self.data_freq,
            )
            future_df = df_utils.normalize(future_df, self.data_params)
            if len(df) > 0:
                # DataFrame.append was removed in pandas 2.0; use concat instead
                df = pd.concat([df, future_df])
            else:
                df = future_df
        df.reset_index(drop=True, inplace=True)
        return df
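End-to-end, this pairs with predict (a sketch using the public NeuralProphet API; df is assumed to hold the fitted 'ds'/'y' history):

from neuralprophet import NeuralProphet

m = NeuralProphet(n_lags=10, n_forecasts=3)
m.fit(df, freq="D")
future = m.make_future_dataframe(df, periods=3, n_historic_predictions=30)
forecast = m.predict(future)  # n_forecasts future steps plus the last 30 in-sample rows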