Example #1
    def __init__(self, models: Union[List[ForecastingModel],
                                     List[GlobalForecastingModel]]):
        raise_if_not(
            isinstance(models, list) and models,
            "Cannot instantiate EnsembleModel with an empty list of models",
            logger,
        )

        is_local_ensemble = all(
            isinstance(model, ForecastingModel)
            and not isinstance(model, GlobalForecastingModel)
            for model in models)
        self.is_global_ensemble = all(
            isinstance(model, GlobalForecastingModel) for model in models)

        raise_if_not(
            is_local_ensemble or self.is_global_ensemble,
            "All models must either be GlobalForecastingModel instances, or none of them should be.",
            logger,
        )

        raise_if(
            any([m._fit_called for m in models]),
            "Cannot instantiate EnsembleModel with trained/fitted models. "
            "Consider resetting all models with `my_model.untrained_model()`",
            logger,
        )

        super().__init__()
        self.models = models
        self.is_single_series = None
Example #2
def fill_missing_values(series: TimeSeries, fill: Union[str, float] = 'auto', **interpolate_kwargs) -> TimeSeries:
    """
    Fills missing values in the provided time series

    Parameters
    ----------
    series
        The time series for which to fill missing values
    fill
        The value used to replace the missing values.
        If set to 'auto', will auto-fill missing values using the `pandas.DataFrame.interpolate()` method.
    interpolate_kwargs
        Keyword arguments for `pandas.DataFrame.interpolate()`, only used when `fill` is set to 'auto'.
        See `the documentation
        <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html>`_
        for the list of supported parameters.

    Returns
    -------
    TimeSeries
        A new TimeSeries with all missing values filled according to the rules above.
    """
    raise_if_not(isinstance(fill, str) or isinstance(fill, float),
                 "`fill` should either be a string or a float",
                 logger)
    raise_if(isinstance(fill, str) and fill != 'auto',
             "invalid string for `fill`: can only be set to 'auto'",
             logger)

    if fill == 'auto':
        return _auto_fill(series, **interpolate_kwargs)
    return _const_fill(series, fill)
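
A quick usage sketch (hedged: assuming `fill_missing_values` is exposed under `darts.utils.missing_values`, as in recent darts releases):

import numpy as np
from darts import TimeSeries
from darts.utils.missing_values import fill_missing_values

values = np.arange(10, dtype=float)
values[3:5] = np.nan
series = TimeSeries.from_values(values)

filled_auto = fill_missing_values(series)             # interpolates the NaN gap
filled_const = fill_missing_values(series, fill=0.0)  # replaces NaNs with 0.0
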
Example #3
    def __init__(self,
                 fill: Union[str, float] = 'auto',
                 name: str = "MissingValuesFiller",
                 n_jobs: int = 1,
                 verbose: bool = False):
        """
        Data transformer to fill missing values from a (sequence of) TimeSeries

        Parameters
        ----------
        fill
            The value used to replace the missing values.
            If set to 'auto', will auto-fill missing values using the `pandas.DataFrame.interpolate()` method.
        name
            A specific name for the transformer
        n_jobs
            The number of jobs to run in parallel. Parallel jobs are created only when a `Sequence[TimeSeries]` is
            passed as input to a method, parallelising operations regarding different `TimeSeries`. Defaults to `1`
            (sequential). Setting the parameter to `-1` means using all the available processors.
            Note: for a small amount of data, the parallelisation overhead could end up increasing the total
            required amount of time.
        verbose
            Optionally, whether to print operations progress
        """
        raise_if_not(
            isinstance(fill, str) or isinstance(fill, float),
            "`fill` should either be a string or a float", logger)
        raise_if(
            isinstance(fill, str) and fill != 'auto',
            "invalid string for `fill`: can only be set to 'auto'", logger)

        super().__init__(name=name, n_jobs=n_jobs, verbose=verbose)
        self._fill = fill
Example #4
    def _supports_range_index(self) -> bool:
        raise_if(
            self.trend and self.trend != "c",
            "'trend' is set and different from 'c'. Range indexing is not supported in that case.",
            logger,
        )
        return True
Example #5
    def ts_fit(series: TimeSeries,
               lmbda: Optional[Union[float, Sequence[float]]], method, *args,
               **kwargs) -> Union[Sequence[float], pd.core.series.Series]:
        component_mask = kwargs.get("component_mask", None)

        if lmbda is None:
            # Compute optimal lmbda for each dimension of the time series. In this case, the return type is
            # an ndarray and not a Sequence
            vals = BoxCox._reshape_in(series, component_mask=component_mask)
            lmbda = np.apply_along_axis(boxcox_normmax,
                                        axis=0,
                                        arr=vals,
                                        method=method)

        elif isinstance(lmbda, Sequence):
            raise_if(
                len(lmbda) != series.width,
                "lmbda should have one value per dimension (ie. column or variable) of the time series",
                logger,
            )
        else:
            # Replicate lmbda to match dimensions of the time series
            lmbda = [lmbda] * series.width

        return lmbda
Example #6
    def _build_train_dataset(
        self,
        target: Sequence[TimeSeries],
        past_covariates: Optional[Sequence[TimeSeries]],
        future_covariates: Optional[Sequence[TimeSeries]],
        max_samples_per_ts: Optional[int],
    ) -> MixedCovariatesSequentialDataset:

        raise_if(
            future_covariates is None and not self.add_relative_index,
            "TFTModel requires future covariates. The model applies multi-head attention queries on future "
            "inputs. Consider specifying a future encoder with `add_encoders` or setting `add_relative_index` "
            "to `True` at model creation (read TFT model docs for more information). "
            "These will automatically generate `future_covariates` from indexes.",
            logger,
        )

        return MixedCovariatesSequentialDataset(
            target_series=target,
            past_covariates=past_covariates,
            future_covariates=future_covariates,
            input_chunk_length=self.input_chunk_length,
            output_chunk_length=self.output_chunk_length,
            max_samples_per_ts=max_samples_per_ts,
        )
Example #7
    def __init__(self,
                 forecasting_models: List[ForecastingModel],
                 regression_train_n_points: int,
                 regression_model=None):
        """
        Class for ensemble models using a regression model for ensembling individual models' predictions.
        The provided regression model must implement fit() and predict() methods
        (e.g. scikit-learn regression models). Note that here the regression model is used to learn how to
        best ensemble the individual forecasting models' forecasts. It is not the same usage of regression
        as in `RegressionModel`, where the regression model is used to produce forecasts based on the
        lagged series.

        Parameters
        ----------
        forecasting_models
            List of forecasting models whose predictions to ensemble
        regression_train_n_points
            The number of points to use to train the regression model
        regression_model
            Any regression model with predict() and fit() methods (e.g. from scikit-learn)
            Default: `darts.models.LinearRegressionModel(fit_intercept=False)`
        """
        super().__init__(forecasting_models)
        if regression_model is None:
            regression_model = LinearRegressionModel(lags_exog=0,
                                                     fit_intercept=False)

        regression_model = RegressionModel(lags_exog=0, model=regression_model)
        raise_if(
            regression_model.lags is not None
            and regression_model.lags_exog != [0],
            ("`lags` of regression model must be `None` and `lags_exog` must be [0]. Given: {} and {}"
             .format(regression_model.lags, regression_model.lags_exog)))
        self.regression_model = regression_model
        self.train_n_points = regression_train_n_points
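
The snippet above stems from an older darts version; for orientation, a hedged usage sketch against this constructor (model classes assumed available in `darts.models`):

from darts.models import ExponentialSmoothing, NaiveSeasonal, RegressionEnsembleModel

ensemble = RegressionEnsembleModel(
    forecasting_models=[NaiveSeasonal(K=12), ExponentialSmoothing()],
    regression_train_n_points=24,  # tail points reserved for fitting the ensembling regressor
)
# ensemble.fit(train_series)
# forecast = ensemble.predict(12)
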
Example #8
    def fit(self, series: TimeSeries):
        super().fit(series)
        series._assert_univariate()
        series = self.training_series

        if self.version == "tsb":
            self.forecast_val = self.method(
                series.values(copy=False),
                h=1,
                future_xreg=None,
                alpha_d=self.alpha_d,
                alpha_p=self.alpha_p,
            )
        elif self.version == "sba":
            try:
                self.forecast_val = self.method(series.values(copy=False),
                                                h=1,
                                                future_xreg=None)
            except errors.TypingError:
                raise_if(
                    True,
                    '"sba" version is not supported with this version of statsforecast.',
                )

        else:
            self.forecast_val = self.method(series.values(copy=False),
                                            h=1,
                                            future_xreg=None)
        return self
Example #9
def _generate_index(
    start: Optional[Union[pd.Timestamp, int]] = None,
    end: Optional[Union[pd.Timestamp, int]] = None,
    length: Optional[int] = None,
    freq: str = "D",
    name: str = None,
) -> Union[pd.DatetimeIndex, pd.RangeIndex]:
    """Returns an index with a given start point and length. Either a pandas DatetimeIndex with given frequency
    or a pandas RangeIndex. The index starts at

    Parameters
    ----------
    start
        The start of the returned index. If a pandas Timestamp is passed, the index will be a pandas
        DatetimeIndex. If an integer is passed, the index will be a pandas RangeIndex. Works only with
        either `length` or `end`.
    end
        Optionally, the end of the returned index. Works only with either `start` or `length`. If `start` is
        set, `end` must be of same type as `start`. Else, it can be either a pandas Timestamp or an integer.
    length
        Optionally, the length of the returned index. Works only with either `start` or `end`.
    freq
        The time difference between two adjacent entries in the returned index. Only effective if `start` is a
        pandas Timestamp. A DateOffset alias is expected; see
        `docs <https://pandas.pydata.org/pandas-docs/stable/user_guide/TimeSeries.html#dateoffset-objects>`_.
        The freq is optional for generating an integer index.
    """
    constructors = [
        arg_name for arg, arg_name in zip([start, end, length],
                                          ["start", "end", "length"])
        if arg is not None
    ]
    raise_if(
        len(constructors) != 2,
        "index can only be generated with exactly two of the following parameters: [`start`, `end`, `length`]. "
        f"Observed parameters: {constructors}. For generating an index with `end` and `length` consider setting "
        f"`start` to None.",
        logger,
    )
    raise_if(
        end is not None and start is not None and type(start) != type(end),
        "index generation with `start` and `end` requires equal object types of `start` and `end`",
        logger,
    )

    if isinstance(start, pd.Timestamp) or isinstance(end, pd.Timestamp):
        index = pd.date_range(start=start,
                              end=end,
                              periods=length,
                              freq=freq,
                              name=name)
    else:  # int
        index = pd.RangeIndex(
            start=start if start is not None else end - length + 1,
            stop=end + 1 if end is not None else start + length,
            step=1,
            name=name,
        )
    return index
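
A small illustration (hedged; `_generate_index` is a private helper) of the exactly-two-of-three rule:

import pandas as pd

idx_dt = _generate_index(start=pd.Timestamp("2000-01-01"), length=5, freq="D")
# -> DatetimeIndex(['2000-01-01', ..., '2000-01-05'], freq='D')
idx_rng = _generate_index(end=9, length=5)
# -> RangeIndex(start=5, stop=10, step=1)
# _generate_index(start=0, end=9, length=5) raises: three parameters were given.
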
Example #10
    def __init__(self,
                 version: str = "classic",
                 alpha_d: float = None,
                 alpha_p: float = None):
        """An implementation of the `Croston method
        <https://otexts.com/fpp3/counts.html>`_ for intermittent
        count series.

        Relying on the implementation of `Statsforecasts package
        <https://github.com/Nixtla/statsforecast>`_.

        Parameters
        ----------
        version
            - "classic" corresponds to classic Croston.
            - "optimized" corresponds to optimized classic Croston, which searches
              for the optimal ``alpha`` smoothing parameter and can take longer
              to run. Otherwise, a fixed value of ``alpha=0.1`` is used.
            - "sba" corresponds to the adjustment of the Croston method known as
              the Syntetos-Boylan Approximation [1]_.
            - "tsb" corresponds to the adjustment of the Croston method proposed by
              Teunter, Syntetos and Babai [2]_. In this case, `alpha_d` and `alpha_p` must
              be set.
        alpha_d
            For the "tsb" version, the alpha smoothing parameter to apply on demand.
        alpha_p
            For the "tsb" version, the alpha smoothing parameter to apply on probability.

        References
        ----------
        .. [1] Aris A. Syntetos and John E. Boylan. The accuracy of intermittent demand estimates.
               International Journal of Forecasting, 21(2):303 – 314, 2005.
        .. [2] Ruud H. Teunter, Aris A. Syntetos, and M. Zied Babai.
               Intermittent demand: Linking forecasting to inventory obsolescence.
               European Journal of Operational Research, 214(3):606 – 615, 2011.
        """
        super().__init__()
        raise_if_not(
            version.lower() in ["classic", "optimized", "sba", "tsb"],
            'The provided "version" parameter must be set to "classic", "optimized", "sba" or "tsb".',
        )

        if version == "classic":
            self.method = croston_classic
        elif version == "optimized":
            self.method = croston_optimized
        elif version == "sba":
            self.method = croston_sba
        else:
            raise_if(
                alpha_d is None or alpha_p is None,
                'alpha_d and alpha_p must be specified when using "tsb".',
            )
            self.method = croston_tsb
            self.alpha_d = alpha_d
            self.alpha_p = alpha_p

        self.version = version
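
A minimal usage sketch, assuming the model is exposed as `darts.models.Croston`:

from darts.models import Croston

# "tsb" requires both smoothing parameters; "classic", "optimized" and "sba" do not.
model = Croston(version="tsb", alpha_d=0.2, alpha_p=0.2)
# model.fit(intermittent_series)   # univariate series with many zero demands
# forecast = model.predict(6)
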
Example #11
    def _supports_range_index(self) -> bool:
        """Prophet does not support integer range index."""
        raise_if(
            True,
            "Prophet does not support integer range index. The index of the TimeSeries must be of type "
            "pandas.DatetimeIndex",
            logger,
        )
        return False
Example #12
def remove_seasonality(
    ts: TimeSeries,
    freq: int = None,
    model: SeasonalityMode = SeasonalityMode.MULTIPLICATIVE,
    method: str = "naive",
    **kwargs,
) -> TimeSeries:
    """
    Adjusts the TimeSeries `ts` for a seasonality of order `freq` using the `model` decomposition.

    Parameters
    ----------
    ts
        The TimeSeries to adjust.
    freq
        The seasonality period to use.
    model
        The type of decomposition to use.
        Must be a `from darts import SeasonalityMode` Enum member.
        Either SeasonalityMode.MULTIPLICATIVE or SeasonalityMode.ADDITIVE.
        Defaults to SeasonalityMode.MULTIPLICATIVE.
    method
        The method to be used to decompose the series.
        - "naive" : Seasonal decomposition using moving averages [1]_.
        - "STL" : Season-Trend decomposition using LOESS [2]_. Only compatible with ``ADDITIVE`` model type.
        Defaults to "naive"
    kwargs
        Other keyword arguments are passed down to the decomposition method.

    Returns
    -------
    TimeSeries
        A new TimeSeries instance that corresponds to the seasonality-adjusted 'ts'.

    References
    ----------
    .. [1] https://www.statsmodels.org/devel/generated/statsmodels.tsa.seasonal.seasonal_decompose.html
    .. [2] https://www.statsmodels.org/devel/generated/statsmodels.tsa.seasonal.STL.html
    """
    ts._assert_univariate()
    raise_if_not(
        model is not SeasonalityMode.NONE,
        "The model must be either MULTIPLICATIVE or ADDITIVE.",
    )
    raise_if(
        model not in [SeasonalityMode.ADDITIVE, ModelMode.ADDITIVE]
        and method == "STL",
        f"Only ADDITIVE seasonality is compatible with the STL method. Current model is {model}.",
        logger,
    )

    _, seasonality = extract_trend_and_seasonality(ts, freq, model, method,
                                                   **kwargs)
    new_ts = remove_from_series(ts, seasonality, model)
    return new_ts
Example #13
    def fit(self,
            training_series: TimeSeries,
            target_series: Optional[TimeSeries] = None) -> None:
        super().fit(training_series, target_series)

        # spare train_n_points points to serve as regression target
        raise_if(
            len(self.training_series) <= self.regression_model.train_n_points,
            "regression_train_n_points parameter too big (greater than or equal"
            " to the number of points in training_series)", logger)

        n_points = self.regression_model.train_n_points
        forecast_training = self.training_series[:-n_points]
        forecast_target = self.target_series[:-n_points]
        regression_target = self.target_series[-n_points:]

        # fit the forecasting models
        for model in self.models:
            if isinstance(model, UnivariateForecastingModel):
                model.fit(forecast_training)
            else:
                model.fit(forecast_training, forecast_target)

        # predict train_n_points points for each model
        predictions = []
        for model in self.models:
            predictions.append(
                model.predict(self.regression_model.train_n_points))

        # train the regression model on the individual models' predictions
        self.regression_model.fit(train_features=predictions,
                                  train_target=regression_target)

        # prepare the forecasting models for further predicting by fitting
        # them with the entire data

        # Some models (incl. Neural-Network based models) may need to be 'reset'
        # to allow being retrained from scratch
        self.models = [
            model.untrained_model()
            if hasattr(model, 'untrained_model') else model
            for model in self.models
        ]

        # fit the forecasting models
        for model in self.models:
            if isinstance(model, UnivariateForecastingModel):
                model.fit(self.training_series)
            else:
                model.fit(self.training_series, self.target_series)
Example #14
    def fit(self,
            data: TimeSeries,
            lmbda: Optional[Union[float, Sequence[float]]] = None,
            optim_method='mle') -> 'BoxCox':
        """
        Sets the `lmbda` parameter value.

        Parameters
        ----------
        data
            The time series to fit on
        lmbda
            If None is given, will automatically find an optimal value of lmbda (for each dimension
            of the time series) using `scipy.stats.boxcox_normmax` with `method=optim_method`.
            If a single float is given, the same lmbda value will be used for all dimensions of the series.
            Also allows to specify a different lmbda value for each dimension of the time series by passing
            a sequence of values.
        optim_method
            Specifies which method to use to find an optimal value for the lmbda parameter.
            Either 'mle' or 'pearsonr'.

        Returns
        -------
            Fitted transformer (self)
        """
        super().fit(data)

        raise_if(
            not isinstance(optim_method, str)
            or optim_method not in ['mle', 'pearsonr'],
            "optim_method parameter must be either 'mle' or 'pearsonr'",
            logger)

        if lmbda is None:
            # Compute optimal lmbda for each dimension of the time series
            lmbda = data._df.apply(boxcox_normmax, method=optim_method)
        elif isinstance(lmbda, Sequence):
            raise_if(
                len(lmbda) != data.width,
                "lmbda should have one value per dimension (ie. column or variable) of the time series",
                logger)
        else:
            # Replicate lmbda to match dimensions of the time series
            lmbda = [lmbda] * data.width

        self._lmbda = lmbda

        return self
Example #15
    def _fit_iterator(
        self, series: Sequence[TimeSeries]
    ) -> Iterator[Tuple[TimeSeries, Optional[Union[Sequence[float], float]]]]:

        if isinstance(self._lmbda, Sequence) and isinstance(
                self._lmbda[0], Sequence):
            # CASE 0: Sequence[Sequence[float]]
            raise_if(
                len(self._lmbda) != len(series),
                "with multiple time series the number of lmbda sequences must "
                "equal the number of time series", logger)
            return zip(series, self._lmbda)
        else:
            # CASE 1: Sequence[float], float, None. Replicating the same value for each TS
            lmbda_gen = (self._lmbda for _ in range(len(series)))
            return zip(series, lmbda_gen)
Example #16
    def ts_fit(series: TimeSeries, lmbda: Optional[Union[float,
                                                         Sequence[float]]],
               method) -> Union[Sequence[float], pd.core.series.Series]:
        if lmbda is None:
            # Compute optimal lmbda for each dimension of the time series. In this case, the return type is
            # a pd.core.series.Series, which does not inherit from collections.abc.Sequence
            lmbda = series._df.apply(boxcox_normmax, method=method)
        elif isinstance(lmbda, Sequence):
            raise_if(
                len(lmbda) != series.width,
                "lmbda should have one value per dimension (ie. column or variable) of the time series",
                logger)
        else:
            # Replicate lmbda to match dimensions of the time series
            lmbda = [lmbda] * series.width

        return lmbda
Example #17
    def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        # by default our models are initialized as float32. For other dtypes, we need to cast to the correct precision
        # before parameters are loaded by PyTorch-Lightning
        dtype = checkpoint["model_dtype"]
        if dtype == torch.float16:
            self.half()
        elif dtype == torch.float32:
            self.float()
        elif dtype == torch.float64:
            self.double()
        else:
            raise_if(
                True,
                f"Trying to load dtype {dtype}. Loading for this type is not implemented yet. Please report this "
                f"issue on https://github.com/unit8co/darts",
                logger,
            )
Example #18
    def __init__(self,
                 name: str = "BoxCox",
                 lmbda: Optional[Union[float, Sequence[float],
                                       Sequence[Sequence[float]]]] = None,
                 optim_method='mle',
                 n_jobs: int = 1,
                 verbose: bool = False):
        """
        Box-Cox data transformer.
        See https://otexts.com/fpp2/transformations.html#mathematical-transformations for more information.

        Parameters
        ----------
        name
            A specific name for the transformer
        lmbda
            If None is given, will automatically find an optimal value of lmbda (for each dimension
            of the time series, for each time series) using `scipy.stats.boxcox_normmax` with `method=optim_method`.
            If a single float is given, the same lmbda value will be used for all dimensions of the series, for all
            the series.
            Also allows to specify a different lmbda value for each dimension of the time series by passing
            a sequence of values (or a sequence of sequence of values in case of multiple time series).
        optim_method
            Specifies which method to use to find an optimal value for the lmbda parameter.
            Either 'mle' or 'pearsonr'. Ignored if `lmbda` is not None.
        n_jobs
            The number of jobs to run in parallel. Parallel jobs are created only when a `Sequence[TimeSeries]` is
            passed as input to a method, parallelising operations regarding different `TimeSeries`. Defaults to `1`
            (sequential). Setting the parameter to `-1` means using all the available processors.
            Note: for a small amount of data, the parallelisation overhead could end up increasing the total
            required amount of time.
        verbose
            Optionally, whether to print operations progress
        """

        super().__init__(name=name, n_jobs=n_jobs, verbose=verbose)

        raise_if(
            not isinstance(optim_method, str)
            or optim_method not in ['mle', 'pearsonr'],
            "optim_method parameter must be either 'mle' or 'pearsonr'",
            logger)

        self._lmbda = lmbda
        self._optim_method = optim_method
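
A minimal sketch, assuming a strictly positive `series` is already defined and the transformer is exposed under `darts.dataprocessing.transformers`:

from darts.dataprocessing.transformers import BoxCox

transformer = BoxCox()  # lmbda=None: an optimal value is found per component
transformed = transformer.fit_transform(series)
restored = transformer.inverse_transform(transformed)
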
Example #19
def remove_trend(
    ts: TimeSeries,
    model: ModelMode = ModelMode.MULTIPLICATIVE,
    method: str = "naive",
    **kwargs,
) -> TimeSeries:
    """
    Adjusts the TimeSeries `ts` for a trend using the `model` decomposition.

    Parameters
    ----------
    ts
        The TimeSeries to adjust.
    model
        The type of decomposition to use.
        Must be a `from darts import ModelMode` Enum member.
        Either ModelMode.MULTIPLICATIVE or ModelMode.ADDITIVE.
        Defaults to ModelMode.MULTIPLICATIVE.
    method
        The method to be used to decompose the series.
        - "naive" : Seasonal decomposition using moving averages [1]_.
        - "STL" : Season-Trend decomposition using LOESS [2]_. Only compatible with ``ADDITIVE`` model type.
        Defaults to "naive"
    kwargs
        Other keyword arguments are passed down to the decomposition method.
    Returns
    -------
    TimeSeries
        A new TimeSeries instance that corresponds to the trend-adjusted 'ts'.
    """

    ts._assert_univariate()

    raise_if(
        model not in [SeasonalityMode.ADDITIVE, ModelMode.ADDITIVE]
        and method == "STL",
        f"Only ADDITIVE seasonality is compatible with the STL method. Current model is {model}.",
        logger,
    )
    trend, _ = extract_trend_and_seasonality(ts,
                                             model=model,
                                             method=method,
                                             **kwargs)
    new_ts = remove_from_series(ts, trend, model)
    return new_ts
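
A hedged sketch of detrending with the "naive" method, assuming `remove_trend` lives in `darts.utils.statistics` and `series` is a univariate TimeSeries:

from darts import ModelMode
from darts.utils.statistics import remove_trend

detrended = remove_trend(series, model=ModelMode.ADDITIVE, method="naive")
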
Example #20
    def __init__(self, n: int, m: int, ranges: np.ndarray = None):
        """
        Parameters
        ----------
        n
            The width of the window, must be equal to the length of series1
        m
            The height of the window, must be equal to the length of series2
        ranges
            Ranges of active cells within a column [[start_column0, end_column0], ...]
            with shape (n, 2) and where start >= 0 and end <= m.
        """

        self.n = n
        self.m = m

        if ranges is not None:
            raise_if_not(
                ranges.shape == (n, 2),
                f"Expects a 2d array with [start, end] for each column and shape = ({n}, 2)",
            )

            ranges = np.insert(ranges, 0, [0, 1], axis=0)
            start = ranges[:, 0]
            end = ranges[:, 1]

            raise_if(np.any(start < 0), "Start must be >= 0")
            raise_if(np.any(end > m), "End must be <= m")

            diff = np.maximum(end - start, 0)
            self.length = np.sum(diff)

            ranges[1:] += 1
            ranges = ranges.flatten()
        else:
            ranges = np.zeros((n + 1) * 2, dtype=int)
            ranges[0::2] = self.m  # start
            ranges[1::2] = 0  # end
            ranges = array.array("i", ranges)

            ranges[0] = 0
            ranges[1] = 1
            self.length = 1

        self.column_ranges = array.array("i", ranges)
Example #21
    def __init__(self,
                 forecasting_models: List[ForecastingModel],
                 regression_train_n_points: int,
                 regression_model=LinearRegression(n_jobs=-1,
                                                   fit_intercept=False)):
        """
        Class for ensemble models using a regression model for ensembling individual models' predictions.
        The provided regression model must implement fit() and predict() methods
        (e.g. scikit-learn regression models).

        Parameters
        ----------
        forecasting_models
            List of forecasting models whose predictions to ensemble
        regression_train_n_points
            The number of points to use to train the regression model
        regression_model
            Any regression model with predict() and fit() methods (e.g. from scikit-learn)
            Default: `sklearn.linear_model.LinearRegression(n_jobs=-1, fit_intercept=False)`
        """
        super().__init__(forecasting_models)

        # wrap provided regression_model in a StandardRegressionModel (if not already the case)
        if isinstance(regression_model, StandardRegressionModel):
            # raise exception if train_n_points value is ambiguous
            model_train_n_points = regression_model.train_n_points
            raise_if(
                model_train_n_points is not None
                and regression_train_n_points != model_train_n_points,
                "Provided StandardRegressionModel.train_n_points parameter doesn't match specified"
                " regression_train_n_points parameter.", logger)

            # if it was None, set regression_model.train_n_points to regression_train_n_points
            regression_model.train_n_points = regression_train_n_points
        else:
            regression_model = StandardRegressionModel(
                regression_train_n_points, regression_model)

        self.regression_model = regression_model
Example #22
    def __init__(self,
                 fill: Union[str, float] = 'auto',
                 name: str = "MissingValuesFiller"):
        """
        Data transformer to fill missing values from time series

        Parameters
        ----------
        fill
            The value used to replace the missing values.
            If set to 'auto', will auto-fill missing values using the `pandas.DataFrame.interpolate()` method.
        name
            A specific name for the transformer
        """
        raise_if_not(isinstance(fill, str) or isinstance(fill, float),
                     "`fill` should either be a string or a float",
                     logger)
        raise_if(isinstance(fill, str) and fill != 'auto',
                 "invalid string for `fill`: can only be set to 'auto'",
                 logger)

        super().__init__(name)
        self._fill = fill
Example #23
    def fit(self, series: TimeSeries) -> None:
        super().fit(series)

        # spare train_n_points points to serve as regression target
        raise_if(
            len(self.training_series) <= self.train_n_points,
            "regression_train_n_points parameter too big (must be smaller or equal"
            + " to the number of points in training_series)", logger)
        forecast_training = self.training_series[:-self.train_n_points]
        regression_target = self.training_series[-self.train_n_points:]

        # fit the forecasting models
        for model in self.models:
            model.fit(forecast_training)

        # predict train_n_points points for each model
        predictions = self.models[0].predict(self.train_n_points)
        for model in self.models[1:]:
            predictions = predictions.stack(model.predict(self.train_n_points))

        # train the regression model on the individual models' predictions
        self.regression_model.fit(series=regression_target, exog=predictions)

        # prepare the forecasting models for further predicting by fitting
        # them with the entire data

        # Some models (incl. Neural-Network based models) may need to be 'reset'
        # to allow being retrained from scratch
        self.models = [
            model.untrained_model()
            if hasattr(model, 'untrained_model') else model
            for model in self.models
        ]

        # fit the forecasting models
        for model in self.models:
            model.fit(self.training_series)
Example #24
    def fit(
        self,
        series: Union[TimeSeries, Sequence[TimeSeries]],
        past_covariates: Optional[Union[TimeSeries,
                                        Sequence[TimeSeries]]] = None,
        future_covariates: Optional[Union[TimeSeries,
                                          Sequence[TimeSeries]]] = None,
    ):
        """
        Fits the model on the provided series.
        Note that `EnsembleModel.fit()` does NOT call `fit()` on each of its constituent forecasting models.
        It is left to classes inheriting from EnsembleModel to do so appropriately when overriding `fit()`
        """
        raise_if(
            not self.is_global_ensemble and not isinstance(series, TimeSeries),
            "The models are not GlobalForecastingModel's and do not support training on multiple series.",
            logger,
        )
        raise_if(
            not self.is_global_ensemble and past_covariates is not None,
            "The models are not GlobalForecastingModel's and do not support past covariates.",
            logger,
        )

        self.is_single_series = isinstance(series, TimeSeries)

        # check that if the target is a single series, then the covariates are as well, and vice versa
        error = False

        if past_covariates is not None:
            error = self.is_single_series != isinstance(
                past_covariates, TimeSeries)

        if future_covariates is not None:
            # keep any mismatch already detected on past_covariates
            error = error or self.is_single_series != isinstance(
                future_covariates, TimeSeries)

        raise_if(
            error,
            "Both series and covariates have to be either univariate or multivariate.",
            logger,
        )

        super().fit(series, past_covariates, future_covariates)

        return self
Example #25
def _extend_time_index_until(
    time_index: Union[pd.DatetimeIndex, pd.RangeIndex],
    until: Optional[Union[int, str, pd.Timestamp]],
    add_length: int,
) -> pd.DatetimeIndex:

    if not add_length and not until:
        return time_index

    raise_if(
        bool(add_length) and bool(until),
        "set only one of add_length and until")

    end = time_index[-1]
    freq = time_index.freq

    if add_length:
        raise_if_not(
            add_length >= 0,
            f"Expected add_length, by which to extend the time series by, "
            f"to be positive, got {add_length}",
        )

        try:
            end += add_length * freq
        except pd.errors.OutOfBoundsDatetime:
            raise_log(
                ValueError(
                    f"the add operation between {end} and {add_length * freq} will overflow"
                ),
                logger,
            )
    else:
        datetime_index = isinstance(time_index, pd.DatetimeIndex)

        if datetime_index:
            raise_if_not(
                isinstance(until, (str, pd.Timestamp)),
                "Expected valid timestamp for TimeSeries, "
                "indexed by DatetimeIndex, "
                f"for parameter until, got {type(end)}",
                logger,
            )
        else:
            raise_if_not(
                isinstance(until, int),
                "Expected integer for TimeSeries, indexed by RangeIndex, "
                f"for parameter until, got {type(end)}",
                logger,
            )

        timestamp = pd.Timestamp(until) if datetime_index else until

        raise_if_not(
            timestamp > end,
            f"Expected until, {timestamp} to lie past end of time index {end}",
        )

        ahead = timestamp - end
        raise_if_not(
            (ahead % freq) == pd.Timedelta(0),
            f"End date must correspond with frequency {freq} of the time axis",
            logger,
        )

        end = timestamp

    new_time_index = pd.date_range(start=time_index[0], end=end, freq=freq)
    return new_time_index
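
An illustration (hedged; this is a private helper) of the two mutually exclusive extension modes:

import pandas as pd

idx = pd.date_range("2020-01-01", periods=3, freq="D")
_extend_time_index_until(idx, until=None, add_length=2)
# -> DatetimeIndex(['2020-01-01', ..., '2020-01-05'], freq='D')
_extend_time_index_until(idx, until="2020-01-05", add_length=0)
# -> same index; setting both `until` and `add_length` raises instead
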
Example #26
    def _process_input_encoders(self, params: Dict) -> Tuple[List, List]:
        """Processes input and returns two lists of tuples `(encoder_id, attribute)` from relevant encoder
        parameters at model creation.

        Parameters
        ----------
        params
            The `add_encoders` dict used at model creation. Must follow this convention:
                `{encoder keyword: {temporal keyword: List[attributes]}}`

            Tuples of `(encoder_id, attribute)` are extracted from `add_encoders` to instantiate the `SingleEncoder`
            objects:

            * The `encoder_id` is extracted as follows:
                str(encoder_kw) + str(temporal_kw) -> 'cyclic' + 'past' -> `encoder_id` = 'cyclic_past'
                The `encoder_id` is used to map the parameters with the corresponding `SingleEncoder` objects.
            * The `attribute` is extracted from the values given under `temporal_kw`:
                `attribute` = 'month'
                ...
                The `attribute` tells the `SingleEncoder` which attribute of the index to encode

        Raises
        ------
        ValueError
            1) if the outermost key is other than (`past`, `future`, `absolute`)
            2) if the innermost values are other than type `str` or `Sequence`
        """

        if not params:
            return [], []

        # check input for invalid encoder types
        invalid_encoders = [
            enc for enc in params if enc not in ENCODER_KEYS + TRANSFORMER_KEYS
        ]
        raise_if(
            len(invalid_encoders) > 0,
            f"Encountered invalid encoder types `{invalid_encoders}` in `add_encoders` parameter at model "
            f"creation. Supported encoder types are: `{ENCODER_KEYS + TRANSFORMER_KEYS}`.",
            logger,
        )

        encoders = {
            enc: params.get(enc, None)
            for enc in ENCODER_KEYS if params.get(enc, None)
        }

        # check input for invalid temporal types
        invalid_time_params = list()
        for encoder, t_types in encoders.items():
            invalid_time_params += [
                t_type for t_type in t_types.keys()
                if t_type not in VALID_TIME_PARAMS
            ]

        raise_if(
            len(invalid_time_params) > 0,
            f"Encountered invalid temporal types `{invalid_time_params}` in `add_encoders` parameter at model "
            f"creation. Supported temporal types are: `{VALID_TIME_PARAMS}`.",
            logger,
        )

        # convert into tuples of (encoder string identifier, encoder attribute)
        past_encoders, future_encoders = list(), list()
        for enc, enc_params in encoders.items():
            for enc_time, enc_attr in enc_params.items():
                raise_if_not(
                    isinstance(enc_attr, VALID_ENCODER_DTYPES),
                    f"Encountered value `{enc_attr}` of invalid type `{type(enc_attr)}` for encoder "
                    f"`{enc}` in `add_encoders` at model creation. Supported data types are: "
                    f"`{VALID_ENCODER_DTYPES}`.",
                    logger,
                )
                attrs = [enc_attr] if isinstance(enc_attr, str) else enc_attr
                for attr in attrs:
                    encoder_id = "_".join([enc, enc_time])
                    if enc_time == PAST:
                        past_encoders.append((encoder_id, attr))
                    else:
                        future_encoders.append((encoder_id, attr))

        for temp_enc, takes_temp, temp in [
            (past_encoders, self.takes_past_covariates, "past"),
            (future_encoders, self.takes_future_covariates, "future"),
        ]:
            if temp_enc and not takes_temp:
                logger.warning(
                    f"Specified {temp} encoders in `add_encoders` at model creation but model does not "
                    f"accept {temp} covariates. {temp} encoders will be ignored."
                )

        past_encoders = past_encoders if self.takes_past_covariates else []
        future_encoders = future_encoders if self.takes_future_covariates else []
        return past_encoders, future_encoders
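
For orientation, a hedged example of the `add_encoders` convention described in the docstring above ('cyclic' and 'datetime_attribute' assumed to be valid encoder keywords):

add_encoders = {
    "cyclic": {"past": ["month"]},
    # -> ('cyclic_past', 'month')
    "datetime_attribute": {"future": ["hour", "dayofweek"]},
    # -> ('datetime_attribute_future', 'hour'), ('datetime_attribute_future', 'dayofweek')
}
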
Example #27
def datetime_attribute_timeseries(
    time_index: Union[pd.DatetimeIndex, TimeSeries],
    attribute: str,
    one_hot: bool = False,
    cyclic: bool = False,
    until: Optional[Union[int, str, pd.Timestamp]] = None,
    add_length: int = 0,
    dtype=np.float64,
) -> TimeSeries:
    """
    Returns a new TimeSeries with index `time_index` and one or more dimensions containing
    (optionally one-hot encoded or cyclic encoded) pd.DatetimeIndex attribute information derived from the index.


    Parameters
    ----------
    time_index
        Either a `pd.DatetimeIndex` which will serve as the basis of the new column(s), or
        a `TimeSeries` whose time axis will serve this purpose.
    attribute
        An attribute of `pd.DatetimeIndex`, or `week` / `weekofyear` / `week_of_year` - e.g. "month", "weekday", "day",
        "hour", "minute", "second". See all available attributes in
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html#pandas.DatetimeIndex.
    one_hot
        Boolean value indicating whether to add the specified attribute as a one hot encoding
        (results in more columns).
    cyclic
        Boolean value indicating whether to add the specified attribute as a cyclic encoding.
        Alternative to one_hot encoding, enable only one of the two.
        (adds 2 columns, corresponding to sin and cos transformation)
    until
        Extend the time_index up to this point: a timestamp for datetime-indexed series,
        an int for range-indexed series. Should match or exceed the forecasting window.
    add_length
        Extend the time_index by `add_length` entries. Should match or exceed the forecasting window.
        Set only one of `until` and `add_length`.
    dtype
        The desired NumPy dtype (np.float32 or np.float64) for the resulting series

    Returns
    -------
    TimeSeries
        New datetime attribute TimeSeries instance.
    """

    if isinstance(time_index, TimeSeries):
        time_index = time_index.time_index

    time_index = _extend_time_index_until(time_index, until, add_length)

    raise_if_not(
        hasattr(pd.DatetimeIndex, attribute)
        or (attribute in ["week", "weekofyear", "week_of_year"]),
        f"attribute `{attribute}` needs to be an attribute of pd.DatetimeIndex. "
        "See all available attributes in "
        "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html#pandas.DatetimeIndex",
        logger,
    )

    raise_if(one_hot and cyclic, "set only one of one_hot or cyclic to true",
             logger)

    num_values_dict = {
        "month": 12,
        "day": 31,
        "weekday": 7,
        "dayofweek": 7,
        "day_of_week": 7,
        "hour": 24,
        "minute": 60,
        "second": 60,
        "microsecond": 1000000,
        "nanosecond": 1000,
        "quarter": 4,
        "dayofyear": 365,
        "day_of_year": 365,
        "week": 52,
        "weekofyear": 52,
        "week_of_year": 52,
    }

    if attribute not in ["week", "weekofyear", "week_of_year"]:
        values = getattr(time_index, attribute)
    else:
        values = (time_index.isocalendar().set_index("week").index.astype(
            "int64").rename("time"))

    if one_hot or cyclic:
        raise_if_not(
            attribute in num_values_dict,
            f"Given datetime attribute `{attribute}` not supported with one-hot or cyclical encoding. "
            f"Supported datetime attribute: {list(num_values_dict.keys())}",
            logger,
        )

    if one_hot:
        values_df = pd.get_dummies(values)
        # fill missing columns (in case not all values appear in time_index)
        for i in range(1, num_values_dict[attribute] + 1):
            if not (i in values_df.columns):
                values_df[i] = 0
        values_df = values_df[range(1, num_values_dict[attribute] + 1)]

        values_df.columns = [
            attribute + "_" + str(column_name)
            for column_name in values_df.columns
        ]

    else:
        if cyclic:
            if attribute == "day":
                periods = [
                    time_index[i].days_in_month for i in time_index.month
                ]
                freq = 2 * np.pi * np.reciprocal(periods)
            else:
                period = num_values_dict[attribute]
                freq = 2 * np.pi / period

            values_df = pd.DataFrame({
                attribute + "_sin": np.sin(freq * values),
                attribute + "_cos": np.cos(freq * values),
            })
        else:
            values_df = pd.DataFrame({attribute: values})

    values_df.index = time_index

    return TimeSeries.from_dataframe(values_df).astype(dtype)
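
A usage sketch (hedged import path `darts.utils.timeseries_generation`):

import pandas as pd
from darts.utils.timeseries_generation import datetime_attribute_timeseries

index = pd.date_range("2021-01-01", periods=48, freq="H")
month_series = datetime_attribute_timeseries(index, attribute="month")
hour_cyclic = datetime_attribute_timeseries(index, attribute="hour", cyclic=True)
# hour_cyclic has two components, 'hour_sin' and 'hour_cos'
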
Example #28
def plot_acf(
    ts: TimeSeries,
    m: Optional[int] = None,
    max_lag: int = 24,
    alpha: float = 0.05,
    bartlett_confint: bool = True,
    fig_size: Tuple[int, int] = (10, 5),
    axis: Optional[plt.axis] = None,
) -> None:
    """
    Plots the ACF of `ts`, highlighting it at lag `m`, with corresponding significance interval.
    Uses :func:`statsmodels.tsa.stattools.acf` [1]_

    Parameters
    ----------
    ts
        The TimeSeries whose ACF should be plotted.
    m
        Optionally, a time lag to highlight on the plot.
    max_lag
        The maximal lag order to consider.
    alpha
        The confidence interval to display.
    bartlett_confint
        The boolean value indicating whether the confidence interval should be
        calculated using Bartlett's formula. If set to True, the confidence interval
        can be used in the model identification stage for fitting ARIMA models.
        If set to False, the confidence interval can be used to test for randomness
        (i.e. there is no time dependence in the data) of the data.
    fig_size
        The size of the figure to be displayed.
    axis
        Optionally, an axis object to plot the ACF on.

    References
    ----------
    .. [1] https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.acf.html
    """

    ts._assert_univariate()
    raise_if(
        max_lag is None or not (1 <= max_lag < len(ts)),
        "max_lag must be greater than or equal to 1 and less than len(ts).",
    )
    raise_if(
        m is not None and not (0 <= m <= max_lag),
        "m must be greater than or equal to 0 and less than or equal to max_lag.",
    )
    raise_if(
        alpha is None or not (0 < alpha < 1),
        "alpha must be greater than 0 and less than 1.",
    )

    r, confint = acf(
        ts.values(),
        nlags=max_lag,
        fft=False,
        alpha=alpha,
        bartlett_confint=bartlett_confint,
    )

    if axis is None:
        plt.figure(figsize=fig_size)
        axis = plt

    for i in range(len(r)):
        axis.plot(
            (i, i),
            (0, r[i]),
            color=("#b512b8" if m is not None and i == m else "black"),
            lw=(1 if m is not None and i == m else 0.5),
        )

    # Adjusts the upper band of the confidence interval to center it on the x axis.
    upp_band = [confint[lag][1] - r[lag] for lag in range(1, max_lag + 1)]

    axis.fill_between(
        np.arange(1, max_lag + 1),
        upp_band,
        [-x for x in upp_band],
        color="#003DFD",
        alpha=0.25,
    )
    axis.plot((0, max_lag + 1), (0, 0), color="black")
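
A usage sketch, assuming `plot_acf` is exposed under `darts.utils.statistics` and using the bundled AirPassengers dataset:

from darts.datasets import AirPassengersDataset
from darts.utils.statistics import plot_acf

series = AirPassengersDataset().load()
plot_acf(series, m=12, max_lag=36)  # highlight the yearly lag
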
Example #29
    def __init__(
        self,
        fill: Union[str, float] = "auto",
        name: str = "MissingValuesFiller",
        n_jobs: int = 1,
        verbose: bool = False,
    ):
        """Data transformer to fill missing values from a (sequence of) deterministic ``TimeSeries``.

        Parameters
        ----------
        fill
            The value used to replace the missing values.
            If set to 'auto', will auto-fill missing values using the :func:`pd.DataFrame.interpolate()` method.
        name
            A specific name for the transformer
        n_jobs
            The number of jobs to run in parallel. Parallel jobs are created only when a ``Sequence[TimeSeries]`` is
            passed as input to a method, parallelising operations regarding different ``TimeSeries``. Defaults to `1`
            (sequential). Setting the parameter to `-1` means using all the available processors.
            Note: for a small amount of data, the parallelisation overhead could end up increasing the total
            required amount of time.
        verbose
            Optionally, whether to print operations progress

        Examples
        --------
        >>> import numpy as np
        >>> from darts import TimeSeries
        >>> from darts.dataprocessing.transformers import MissingValuesFiller
        >>> values = np.arange(start=0, stop=1, step=0.1)
        >>> values[5:8] = np.nan
        >>> series = TimeSeries.from_values(values)
        >>> transformer = MissingValuesFiller()
        >>> series_filled = transformer.transform(series)
        >>> print(series_filled)
        <TimeSeries (DataArray) (time: 10, component: 1, sample: 1)>
        array([[[0. ]],
            [[0.1]],
            [[0.2]],
            [[0.3]],
            [[0.4]],
            [[0.5]],
            [[0.6]],
            [[0.7]],
            [[0.8]],
            [[0.9]]])
        Coordinates:
        * time       (time) int64 0 1 2 3 4 5 6 7 8 9
        * component  (component) object '0'
        Dimensions without coordinates: sample
        """
        raise_if_not(
            isinstance(fill, str) or isinstance(fill, float),
            "`fill` should either be a string or a float",
            logger,
        )
        raise_if(
            isinstance(fill, str) and fill != "auto",
            "invalid string for `fill`: can only be set to 'auto'",
            logger,
        )

        super().__init__(name=name, n_jobs=n_jobs, verbose=verbose)
        self._fill = fill
Example #30
def plot_pacf(
    ts: TimeSeries,
    m: Optional[int] = None,
    max_lag: int = 24,
    method: str = "ywadjusted",
    alpha: float = 0.05,
    fig_size: Tuple[int, int] = (10, 5),
    axis: Optional[plt.axis] = None,
) -> None:
    """
    Plots the Partial ACF of `ts`, highlighting it at lag `m`, with corresponding significance interval.
    Uses :func:`statsmodels.tsa.stattools.pacf` [1]_

    Parameters
    ----------
    ts
        The TimeSeries whose PACF should be plotted.
    m
        Optionally, a time lag to highlight on the plot.
    max_lag
        The maximal lag order to consider.
    method
        The method to be used for the PACF calculation.
        - | "yw" or "ywadjusted" : Yule-Walker with sample-size adjustment in
          | denominator for acovf. Default.
        - "ywm" or "ywmle" : Yule-Walker without adjustment.
        - "ols" : regression of time series on lags of it and on constant.
        - "ols-inefficient" : regression of time series on lags using a single
          common sample to estimate all pacf coefficients.
        - "ols-adjusted" : regression of time series on lags with a bias
          adjustment.
        - "ld" or "ldadjusted" : Levinson-Durbin recursion with bias
          correction.
        - "ldb" or "ldbiased" : Levinson-Durbin recursion without bias
          correction.
    alpha
        The confidence interval to display.
    fig_size
        The size of the figure to be displayed.
    axis
        Optionally, an axis object to plot the PACF on.

    References
    ----------
    .. [1] https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.pacf.html
    """

    ts._assert_univariate()
    raise_if(
        max_lag is None or not (1 <= max_lag < len(ts) // 2),
        "max_lag must be greater than or equal to 1 and less than len(ts)//2.",
    )
    raise_if(
        m is not None and not (0 <= m <= max_lag),
        "m must be greater than or equal to 0 and less than or equal to max_lag.",
    )
    raise_if(
        alpha is None or not (0 < alpha < 1),
        "alpha must be greater than 0 and less than 1.",
    )

    r, confint = pacf(ts.values(), nlags=max_lag, method=method, alpha=alpha)

    if axis is None:
        plt.figure(figsize=fig_size)
        axis = plt

    for i in range(len(r)):
        axis.plot(
            (i, i),
            (0, r[i]),
            color=("#b512b8" if m is not None and i == m else "black"),
            lw=(1 if m is not None and i == m else 0.5),
        )

    # Adjusts the upper band of the confidence interval to center it on the x axis.
    upp_band = [confint[lag][1] - r[lag] for lag in range(1, max_lag + 1)]

    axis.fill_between(
        np.arange(1, max_lag + 1),
        upp_band,
        [-x for x in upp_band],
        color="#003DFD",
        alpha=0.25,
    )
    axis.plot((0, max_lag + 1), (0, 0), color="black")
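
A companion sketch for `plot_pacf`, under the same hedged assumptions as for `plot_acf` above (note that `max_lag` must stay below len(ts)//2, i.e. 72 for the 144-point AirPassengers series):

from darts.datasets import AirPassengersDataset
from darts.utils.statistics import plot_pacf

series = AirPassengersDataset().load()
plot_pacf(series, m=12, max_lag=36, method="ywadjusted")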