Code example #1
File: model_based.py  Project: FelSiq/ts-pymfe
    def _fit_ortho_pol_reg(
        ts_trend: np.ndarray,
        degree: int = 2
    ) -> statsmodels.regression.linear_model.RegressionResults:
        """Regress the time-series trend on orthogonal polinomials.

        Parameters
        ----------
        ts_trend : :obj:`np.ndarray`
            One-dimensional time-series trend component.

        degree : int, optional (default=2)
            Degree of the highest order polynomial (and, therefore, the number
            of distinct polynomials used).

        Returns
        -------
        :obj:`statsmodels.regression.linear_model.RegressionResults`
            Optimized parameters of the linear model of the time-series trend
            component regressed on the orthogonal polynomials.
        """
        X = _orthopoly.ortho_poly(
            ts=np.linspace(0, 1, ts_trend.size),
            degree=degree,
            return_coeffs=False,
        )

        X = statsmodels.tools.add_constant(X)

        ts_trend_scaled = _utils.standardize_ts(ts=ts_trend)

        return statsmodels.regression.linear_model.OLS(ts_trend_scaled,
                                                       X).fit()
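
The helper `_orthopoly.ortho_poly` is internal to the project. As a self-contained sketch of the same idea (an assumption about its behavior, not the project's implementation), orthonormal polynomial features can be built from the QR decomposition of a centered Vandermonde matrix:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
t = np.linspace(0, 1, 128)
trend = 1.5 * t - 0.8 * t ** 2 + 0.05 * rng.standard_normal(t.size)

# Columns of 'q_mat' are orthonormal polynomial features of degrees 1 and 2
# evaluated at 't' (a stand-in for '_orthopoly.ortho_poly').
vander = np.vander(t, N=3, increasing=True)[:, 1:]
q_mat, _ = np.linalg.qr(vander - vander.mean(axis=0))

X = sm.add_constant(q_mat)
trend_scaled = (trend - trend.mean()) / trend.std()
print(sm.OLS(trend_scaled, X).fit().rsquared)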
Code example #2
File: randomize.py  Project: FelSiq/ts-pymfe
    def precompute_ts_scaled(cls, ts: np.ndarray,
                             **kwargs) -> t.Dict[str, np.ndarray]:
        """Precompute a standardized time series.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        kwargs:
            Additional arguments and previous precomputed items. May
            speed up this precomputation.

        Returns
        -------
        dict
            The following precomputed item is returned:
                * ``ts_scaled`` (:obj:`np.ndarray`): standardized time-series
                    values (z-score).
        """
        precomp_vals = {}  # type: t.Dict[str, np.ndarray]

        if "ts_scaled" not in kwargs:
            precomp_vals["ts_scaled"] = _utils.standardize_ts(ts=ts)

        return precomp_vals
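
Outside the package, the precompute pattern above can be sketched in a few lines; `standardize_ts` below is a hypothetical stand-in for `_utils.standardize_ts`, assumed to compute a plain z-score:

import numpy as np

def standardize_ts(ts: np.ndarray) -> np.ndarray:
    # Z-score: zero mean and unit standard deviation.
    return (ts - ts.mean()) / ts.std()

ts = np.random.default_rng(1).standard_normal(100)
kwargs = {}  # items produced by previously executed 'precompute_*' hooks

# Each hook only fills in what is still missing, so chained hooks never
# repeat work.
if "ts_scaled" not in kwargs:
    kwargs["ts_scaled"] = standardize_ts(ts)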
Code example #3
File: autocorr.py  Project: FelSiq/ts-pymfe
    def precompute_gaussian_model(cls,
                                  ts: np.ndarray,
                                  random_state: t.Optional[int] = None,
                                  **kwargs) -> t.Dict[str, t.Any]:
        """Precompute a gaussian process model.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        random_state : int, optional
            Random seed to optimize the Gaussian process model, to keep
            the results reproducible.

        kwargs:
            Additional arguments and previous precomputed items. May
            speed up this precomputation.

        Returns
        -------
        dict
            The following precomputed item is returned:
                * ``gaussian_model`` (:obj:`GaussianProcessRegressor`):
                    Gaussian process fitted model.
                * ``gaussian_resid`` (:obj:`np.ndarray`): Gaussian process
                    model residuals (difference from the original time-series).

            The following item is required by the precomputations above and,
            therefore, is also precomputed if not already given:
                * ``ts_scaled`` (:obj:`np.ndarray`): standardized time-series
                    values (z-score).
        """
        precomp_vals = {}  # type: t.Dict[str, t.Any]

        ts_scaled = kwargs.get("ts_scaled")

        if ts_scaled is None:
            precomp_vals["ts_scaled"] = _utils.standardize_ts(ts=ts)
            ts_scaled = precomp_vals["ts_scaled"]

        if "gaussian_model" not in kwargs:
            gaussian_model = _utils.fit_gaussian_process(
                ts=ts, ts_scaled=ts_scaled, random_state=random_state)
            precomp_vals["gaussian_model"] = gaussian_model

        gaussian_model = kwargs.get("gaussian_model",
                                    precomp_vals["gaussian_model"])

        if "gaussian_resid" not in kwargs:
            gaussian_resid = _utils.fit_gaussian_process(
                ts=ts,
                ts_scaled=ts_scaled,
                gaussian_model=gaussian_model,
                return_residuals=True,
            )

            precomp_vals["gaussian_resid"] = gaussian_resid

        return precomp_vals
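
A hedged sketch of what a helper like `_utils.fit_gaussian_process` plausibly does, inferred from the call sites above (not the package's actual code): fit scikit-learn's `GaussianProcessRegressor` on a normalized time index and, optionally, return the in-sample residuals:

import numpy as np
import sklearn.gaussian_process

def fit_gaussian_process(ts_scaled, random_state=None, gaussian_model=None,
                         return_residuals=False):
    X = np.linspace(0, 1, ts_scaled.size).reshape(-1, 1)

    if gaussian_model is None:
        gaussian_model = sklearn.gaussian_process.GaussianProcessRegressor(
            random_state=random_state).fit(X, ts_scaled)

    if return_residuals:
        # Residuals: difference between the series and the GP predictions.
        return ts_scaled - gaussian_model.predict(X)

    return gaussian_model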
Code example #4
File: model_based.py  Project: FelSiq/ts-pymfe
    def ft_gaussian_r_sqr(
        cls,
        ts: np.ndarray,
        random_state: t.Optional[int] = None,
        ts_scaled: t.Optional[np.ndarray] = None,
        gaussian_model: t.Optional[
            sklearn.gaussian_process.GaussianProcessRegressor] = None,
    ) -> float:
        """R^2 from a gaussian process model.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        random_state : int, optional
            Random seed to optimize the Gaussian process model, to keep
            the results reproducible.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        gaussian_model : :obj:`GaussianProcessRegressor`, optional
            A fitted Gaussian process model. Used to take advantage of
            precomputations.

        Returns
        -------
        float
            R^2 of a Gaussian process model.

        References
        ----------
        .. [1] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
            for Automated Time-Series Phenotyping Using Massive Feature
            Extraction", Cell Systems 5: 527 (2017).
            DOI: 10.1016/j.cels.2017.10.001
        .. [2] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
            time-series analysis: the empirical structure of time series and
            their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
            DOI: 10.1098/rsif.2013.0048
        """
        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        gaussian_model = _utils.fit_gaussian_process(
            ts=ts_scaled,
            random_state=random_state,
            gaussian_model=gaussian_model,
            ts_scaled=ts_scaled,
        )

        X = np.linspace(0, 1, ts_scaled.size).reshape(-1, 1)
        r_squared = gaussian_model.score(X=X, y=ts_scaled)

        return r_squared
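
For reference, `GaussianProcessRegressor.score` returns the coefficient of determination, R^2 = 1 - SS_res / SS_tot; a quick NumPy cross-check of that identity:

import numpy as np

def r_squared(y: np.ndarray, y_pred: np.ndarray) -> float:
    ss_res = np.sum((y - y_pred) ** 2)      # residual sum of squares
    ss_tot = np.sum((y - np.mean(y)) ** 2)  # total sum of squares
    return 1.0 - ss_res / ss_tot

# r_squared(ts_scaled, gaussian_model.predict(X)) should match
# gaussian_model.score(X=X, y=ts_scaled).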
Code example #5
File: randomize.py  Project: FelSiq/ts-pymfe
    def ft_resample_std(
        cls,
        ts: np.ndarray,
        num_samples: int = 64,
        sample_size_frac: float = 0.1,
        ddof: int = 1,
        random_state: t.Optional[int] = None,
        ts_scaled: t.Optional[np.ndarray] = None,
    ) -> np.ndarray:
        """Time-series standard deviation from repeated subsampling.

        A subsample of size L consists of L consecutive observations from the
        time-series, starting at a random index in the [0, len(ts) - L] range.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        num_samples : int, optional (default=64)
            Number of time-series subsamples.

        sample_size_frac : float, optional (default=0.1)
            Size of each subsample, as a fraction of the time-series length.

        ddof : int, optional (default=1)
            Degrees of freedom of the standard deviation.

        random_state : int, optional
            Random seed to ensure reproducibility.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        Returns
        -------
        :obj:`np.ndarray`
            Standard deviations from repeated subsampling.
        """
        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        sample_std = _utils.apply_on_samples(
            ts=ts_scaled,
            func=np.std,
            num_samples=num_samples,
            sample_size_frac=sample_size_frac,
            random_state=random_state,
            ddof=ddof,
        )

        return sample_std
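
A plausible reading of `_utils.apply_on_samples`, inferred from the docstring above (an assumption, not the package's code): draw `num_samples` random contiguous windows and apply `func` to each one:

import numpy as np

def apply_on_samples(ts, func, num_samples=64, sample_size_frac=0.1,
                     random_state=None, **kwargs):
    rng = np.random.default_rng(random_state)
    size = max(1, int(np.ceil(ts.size * sample_size_frac)))
    # Random starting indices in the [0, len(ts) - size] range.
    starts = rng.integers(0, ts.size - size + 1, size=num_samples)
    return np.asarray([func(ts[s:s + size], **kwargs) for s in starts])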
Code example #6
File: model_based.py  Project: FelSiq/ts-pymfe
    def _fit_res_model_des(
        ts: np.ndarray,
        damped_trend: bool = False,
        ts_scaled: t.Optional[np.ndarray] = None,
    ) -> statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper:
        """Fit a double exponential smoothing model with additive trend.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        damped_trend : bool, optional (default=False)
            Whether or not the exponential smoothing model should include a
            damping component.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        Returns
        -------
        :obj:`statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper`
            Results of an optimized double exponential smoothing model.

        References
        ----------
        .. [1] Holt, C. C. (1957). Forecasting seasonals and trends by
            exponentially weighted averages (O.N.R. Memorandum No. 52).
            Carnegie Institute of Technology, Pittsburgh USA.
            https://doi.org/10.1016/j.ijforecast.2003.09.015
        """
        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                module="statsmodels",
                category=statsmodels.tools.sm_exceptions.ConvergenceWarning,
            )

            model = statsmodels.tsa.holtwinters.ExponentialSmoothing(
                endog=ts_scaled,
                initialization_method="estimated",
                trend="additive",
                damped_trend=damped_trend,
                seasonal=None,
            ).fit()

        return model
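
A minimal usage sketch on synthetic data (assuming statsmodels >= 0.12, where the `initialization_method` and `damped_trend` keywords exist):

import numpy as np
import statsmodels.tsa.holtwinters

rng = np.random.default_rng(0)
ts = 0.05 * np.arange(200) + np.cumsum(0.1 * rng.standard_normal(200))

res = statsmodels.tsa.holtwinters.ExponentialSmoothing(
    endog=ts,
    initialization_method="estimated",
    trend="additive",
).fit()

# Optimized smoothing parameters for the level and trend components.
print(res.params["smoothing_level"], res.params["smoothing_trend"])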
Code example #7
File: randomize.py  Project: FelSiq/ts-pymfe
    def _itrand_stat(
        cls,
        ts: np.ndarray,
        func_stats: t.Collection[t.Callable[[np.ndarray], float]],
        strategy: str = "dist-dynamic",
        prop_rep: t.Union[int, float] = 2,
        prop_interval: float = 0.1,
        ts_scaled: t.Optional[np.ndarray] = None,
        random_state: t.Optional[int] = None,
    ) -> np.ndarray:
        """Calculate global statistics with iterative perturbation method.

        In the iterative perturbation method, a copy of the time-series is
        modified at each iteration. The number of observations modified and
        the sample pool from which the new values are drawn depend on the
        selected ``strategy``. Then, a statistic is extracted after every `k`
        iterations (with `k` given by ceil(ts.size * ``prop_interval``)).

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        func_stats : sequence of callable
            Sequence of callables to extract the statistic values. Each
            callable must receive a list of numeric values as the first
            argument, and return a single numeric value.

        strategy : str, optional (default="dist-dynamic")
            The strategy used to perturb the current population. Must be one
            of the following:

                1. `dist-static`: (static distribution) one observation of the
                current population is overwritten by one observation from the
                original time-series.

                2. `dist-dynamic`: (dynamic distribution) one observation of
                the current population is overwritten by another observation
                of the current population.

                3. `permute`: two observations of the current population swap
                their positions.

        prop_rep : int or float, optional (default=2)
            Number of total iterations proportional to the time-series size.
            This means that this process will iterate for approximately
            ceil(prop_rep * ts.size) iterations. More precisely, the number
            of iterations is also adjusted by the statistic extraction
            interval, to avoid wasting trailing iterations that would not
            produce a statistic.

        prop_interval : float, optional (default=0.1)
            Interval at which the statistics are extracted from the current
            population, proportional to the time-series length.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        random_state : int, optional
            Random seed to ensure reproducibility.

        Returns
        -------
        :obj:`np.ndarray`
            Statistics extracted from the dynamic population. Each row is
            associated with a method from ``func_stats``, and each column is
            one distinct extraction event, ordered temporally by index (i.e.
            lower indices correspond to populations more similar to the
            starting state, and higher indices to populations more affected
            by the process).

        References
        ----------
        .. [1] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
            for Automated Time-Series Phenotyping Using Massive Feature
            Extraction", Cell Systems 5: 527 (2017).
            DOI: 10.1016/j.cels.2017.10.001
        .. [2] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
            time-series analysis: the empirical structure of time series and
            their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
            DOI: 10.1098/rsif.2013.0048
        """
        if prop_rep <= 0:
            raise ValueError(
                "'prop_rep' must be positive (got {}).".format(prop_rep))

        if prop_interval <= 0:
            raise ValueError(
                "'prop_interval' must be positive (got {}).".format(
                    prop_interval))

        VALID_STRATEGY = ("dist-static", "dist-dynamic", "permute")

        if strategy not in VALID_STRATEGY:
            raise ValueError("'strategy' not in {} (got '{}')."
                             "".format(VALID_STRATEGY, strategy))

        if not hasattr(func_stats, "__len__"):
            func_stats = [func_stats]  # type: ignore

        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        rep_it = int(np.ceil(prop_interval * ts_scaled.size))

        # Note: adding (num_it % rep_it) to avoid losing the computation of
        # the remaining iterations that would not produce a statistic.
        num_it = int(np.ceil(prop_rep * ts_scaled.size))
        num_it += num_it % rep_it

        res = np.zeros((len(func_stats), 1 + num_it // rep_it))
        ts_rnd = np.copy(ts_scaled)
        ts_src = ts_scaled if strategy == "dist-static" else ts_rnd
        swap = strategy == "permute"
        stat_ind = 0

        if random_state is not None:
            np.random.seed(random_state)

        inds_rnd = np.random.randint(ts_scaled.size, size=(num_it, 2))

        for it, (ind_a, ind_b) in enumerate(inds_rnd):
            if swap:
                ts_rnd[ind_a], ts_src[ind_b] = ts_src[ind_b], ts_rnd[ind_a]

            else:
                ts_rnd[ind_a] = ts_src[ind_b]

            if it % rep_it == 0:
                for ind_f, func in enumerate(func_stats):
                    res[ind_f, stat_ind] = func(ts_rnd)

                stat_ind += 1

        return res if len(func_stats) > 1 else res.ravel()
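
Stripped of the surrounding machinery, the `permute` strategy boils down to a short loop. The simplified re-implementation below (for illustration only) tracks the lag-1 autocorrelation, which decays toward zero as the temporal order is destroyed:

import numpy as np

rng = np.random.default_rng(16)
ts_rnd = np.sin(np.linspace(0, 20 * np.pi, 256))
ts_rnd = (ts_rnd - ts_rnd.mean()) / ts_rnd.std()

num_it = 2 * ts_rnd.size                   # prop_rep = 2
rep_it = int(np.ceil(0.1 * ts_rnd.size))   # prop_interval = 0.1
stats = []

for it in range(num_it):
    ind_a, ind_b = rng.integers(ts_rnd.size, size=2)
    # 'permute': two observations swap their positions.
    ts_rnd[ind_a], ts_rnd[ind_b] = ts_rnd[ind_b], ts_rnd[ind_a]

    if it % rep_it == 0:
        stats.append(np.corrcoef(ts_rnd[:-1], ts_rnd[1:])[0, 1])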
Code example #8
File: model_based.py  Project: FelSiq/ts-pymfe
    def _fit_res_model_ets(
        ts: np.ndarray,
        damped_trend: bool = False,
        grid_search_guess: bool = True,
        ts_period: t.Optional[int] = None,
        ts_scaled: t.Optional[np.ndarray] = None,
    ) -> statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper:
        """Fit a triple exponential smoothing model with additive components.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        damped_trend : bool, optional (default=False)
            Whether or not the exponential smoothing model should include a
            damping component.

        grid_search_guess : bool, optional (default=True)
            If True, use grid search (a.k.a. brute force) to search for good
            starting parameters. If False, this method becomes less
            computationally intensive, but has a higher chance of failing to
            converge.

        ts_period : int, optional
            Time-series period.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        Returns
        -------
        :obj:`statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper`
            Results of an optimized triple exponential smoothing model.

        References
        ----------
        .. [1] Winters, Peter R. Forecasting Sales by Exponentially Weighted
            Moving Averages, 1960, INFORMS, Linthicum, MD, USA
            https://doi.org/10.1287/mnsc.6.3.324
        .. [2] Charles C. Holt, Forecasting seasonals and trends by
            exponentially weighted moving averages, International Journal of
            Forecasting, Volume 20, Issue 1, 2004, Pages 5-10, ISSN 0169-2070,
            https://doi.org/10.1016/j.ijforecast.2003.09.015.
        """
        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        ts_period = _period.get_ts_period(ts=ts_scaled, ts_period=ts_period)

        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                module="statsmodels",
                category=statsmodels.tools.sm_exceptions.ConvergenceWarning,
            )

            model = statsmodels.tsa.holtwinters.ExponentialSmoothing(
                endog=ts_scaled,
                initialization_method="estimated",
                trend="additive",
                seasonal="additive",
                damped_trend=damped_trend,
                seasonal_periods=ts_period,
            ).fit(use_brute=grid_search_guess)

        return model
Code example #9
File: _embed.py  Project: FelSiq/ts-pymfe-tests
def embed_dim_cao(
    ts: np.ndarray,
    lag: int,
    dims: t.Union[int, t.Sequence[int]] = 16,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> t.Tuple[np.ndarray, np.ndarray]:
    """Estimate Cao's metrics to estimate time-series embedding dimension.

    Cao's metrics are two statistics, `E1` and `E2`, used to estimate the
    appropriate embedding dimension of a time-series. From the `E1` statistic,
    the appropriate embedding dimension can be defined as the index at which
    the metric saturates over a set of ordered dimensions.

    The precise notion of `saturation` may be subjective, since this metric
    can show some curious `artifacts` related to specific lags for specific
    time-series, which may require deeper investigation.

    The `E2` statistic detects `false positives` from the `E1` statistic,
    since it is used to distinguish random white noise from data generated
    by a true, not completely random, underlying process. If the time-series
    is purely random white noise, then all values of `E2` will be close to 1.
    If there exists a dimension where the `E2` metric is estimated
    `sufficiently far` from 1, then the series is considered not to be white
    noise.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    lag : int
        Embedding lag. You may want to check the `embed_lag` function
        documentation for embedding lag estimation. Must be a strictly
        positive value.

    dims : int or sequence of int
        Dimensions at which to estimate Cao's `E1` and `E2` statistic values.
        If integer, estimate all dimensions from 1 up to the given number.
        If a sequence of integers, estimate Cao's statistics for all
        given dimensions, and return the corresponding values in the same
        order as the given dimensions.
        All non-positive dimensions will receive a `np.nan` value for both
        of Cao's metrics.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    tuple of :obj:`np.ndarray`
        `E1` and `E2` Cao's metrics, necessarily in that order, for all
        given dimensions (and with direct index correspondence for the
        given dimensions).

    References
    ----------
    .. [1] Liangyue Cao, Practical method for determining the minimum
        embedding dimension of a scalar time series, Physica D: Nonlinear
        Phenomena, Volume 110, Issues 1–2, 1997, Pages 43-50,
        ISSN 0167-2789, https://doi.org/10.1016/S0167-2789(97)00118-8.
    """
    if lag <= 0:
        raise ValueError("'lag' must be positive (got {}).".format(lag))

    _dims: t.Sequence[int]

    if np.isscalar(dims):
        _dims = np.arange(1, int(dims) + 1)  # type: ignore

    else:
        _dims = np.asarray(dims, dtype=int)

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    ed, ed_star = np.zeros((2, len(_dims)), dtype=float)

    for ind, dim in enumerate(_dims):
        try:
            emb_next = embed_ts(ts=ts_scaled, lag=lag, dim=dim + 1)
            emb_cur = emb_next[:, 1:]

        except ValueError:
            ed[ind] = np.nan
            ed_star[ind] = np.nan
            continue

        nn_inds, dist_cur = nn(embed=emb_cur)

        emb_next_abs_diff = np.abs(emb_next[:, 0] - emb_next[nn_inds, 0])
        # Note: the Chebyshev (maximum norm, 'L-inf') distance of X and Y,
        # both embedded in (d + 1) dimensions, can be defined with respect
        # to one dimension less:
        # Linf(X_{d+1}, Y_{d+1}) = max(|x_1 - y_1|, ..., |x_{d+1} - y_{d+1}|)
        #   = max(max(|x_1 - y_1|, ..., |x_d - y_d|), |x_{d+1} - y_{d+1}|)
        #   = max(Linf(X_{d}, Y_{d}), |x_{d+1} - y_{d+1}|)
        dist_next = np.maximum(dist_cur, emb_next_abs_diff)

        # Note: 'ed' and 'ed_star' refer to, respectively, E_{d} and
        # E^{*}_{d} from Cao's paper.
        ed[ind] = np.mean(dist_next / dist_cur)
        ed_star[ind] = np.mean(emb_next_abs_diff)

    # Note: the minimum embedding dimension is D such that e1[D]
    # is the first index where e1 stops changing significantly.
    e1 = ed[1:] / ed[:-1]

    # Note: This is the E2(d) Cao's metric. Its purpose is to
    # identify random time-series. For randomly generated time-
    # series, e2 will be close to 1 for any dimension. For
    # deterministic data, however, e2 != 1 for some d.
    e2 = ed_star[1:] / ed_star[:-1]

    return e1, e2
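
The snippet relies on two helpers, `embed_ts` and `nn`, that are not shown. The sketch below captures behavior consistent with the call sites above (column 0 of the embedding holds the coordinate added by the higher dimension, and neighbors are searched under the Chebyshev norm); this is an inference, not the project's code:

import numpy as np
import scipy.spatial.distance

def embed_ts(ts: np.ndarray, lag: int, dim: int) -> np.ndarray:
    # Row i: [x_{i+(dim-1)*lag}, x_{i+(dim-2)*lag}, ..., x_i], so that
    # dropping column 0 yields the same embedding with dimension 'dim - 1'.
    num_rows = ts.size - (dim - 1) * lag
    if num_rows <= 0:
        raise ValueError("Time-series too short for this (lag, dim) pair.")
    return np.column_stack([
        ts[(dim - 1 - j) * lag:(dim - 1 - j) * lag + num_rows]
        for j in range(dim)
    ])

def nn(embed: np.ndarray):
    # Index of, and Chebyshev distance to, each row's nearest neighbor
    # (excluding the row itself).
    dist = scipy.spatial.distance.cdist(embed, embed, metric="chebyshev")
    np.fill_diagonal(dist, np.inf)
    nn_inds = dist.argmin(axis=1)
    return nn_inds, dist[np.arange(embed.shape[0]), nn_inds]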
Code example #10
File: _embed.py  Project: FelSiq/ts-pymfe-tests
def ft_emb_dim_cao(ts: np.ndarray,
                   dims: t.Union[int, t.Sequence[int]] = 16,
                   lag: t.Optional[t.Union[str, int]] = None,
                   tol_threshold: float = 0.05,
                   check_e2: bool = True,
                   max_nlags: t.Optional[int] = None,
                   ts_scaled: t.Optional[np.ndarray] = None,
                   detrended_acfs: t.Optional[np.ndarray] = None,
                   detrended_ami: t.Optional[np.ndarray] = None,
                   emb_dim_cao_e1: t.Optional[np.ndarray] = None,
                   emb_dim_cao_e2: t.Optional[np.ndarray] = None) -> int:
    """Embedding dimension estimation using Cao's method.

    Using the Cao's embedding dimension estimation, it is calculated both
    of its metrics, `E1` and `E2` whose purpose is to, respectively,
    detect the appropriate embedding dimension and detect whether the given
    time-series is generated by a completely random process (white noise).

    The appropriate embedding dimension is the saturation dimension of `E1`
    if and only if exists a dimension `E2` sufficiently distinct from 1.
    If `E2` is approximately constant at 1 over all dimensions, the series
    is considered white noise and, therefore, the embedding dimension is
    assumed to be 1.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    dims : int or a sequence of int, optional (default=16)
        The embedding dimension candidates. If int, investigate all values
        between 1 and ``dims`` (both inclusive). If a sequence of integers
        is used, then investigate only the given dimensions.

    lag : int or str, optional
        Lag of the time-series embedding. It must be a strictly positive
        value, None, or a string in {`acf`, `acf-nonsig`, `ami`}. If None or
        a string, the lag is estimated within this method using the given
        strategy (or, if None, using the `acf-nonsig` strategy by default)
        up to ``max_nlags``.
            1. `acf`: the lag corresponds to the first non-positive value
                in the autocorrelation function.
            2. `acf-nonsig`: lag corresponds to the first non-significant
                value in the autocorrelation function (absolute value below
                the critical value of 1.96 / sqrt(ts.size)).
            3. `ami`: lag corresponds to the first local minimum of the
                time-series automutual information function.

    tol_threshold : float, optional (default=0.05)
        Tolerance threshold defining the maximum absolute difference
        between two E1 values in order to assume saturation. This same
        threshold is the minimum absolute deviation that E2 values must
        have in order to be considered different from 1.

    check_e2 : bool, optional (default=True)
        If True, check whether there exists a Cao's E2 value different from
        1, and return 1 if this condition is not satisfied. If False, ignore
        the E2 values.

    max_nlags : int, optional
        If ``lag`` is not a numeric value, then it will be estimated using
        either the time-series autocorrelation or automutual information
        function estimated up to this argument value.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    detrended_acfs : :obj:`np.ndarray`, optional
        Array of time-series autocorrelation function values (for distinct
        ordered lags) of the detrended time-series. Used only if ``lag`` is
        any of `acf`, `acf-nonsig` or None. If this argument is not given
        and the previous condition is met, the autocorrelation function will
        be calculated inside this method up to ``max_nlags``.

    detrended_ami : :obj:`np.ndarray`, optional
        Array of time-series automutual information function values (for
        distinct ordered lags). Used only if ``lag`` is `ami`. If not given
        and the previous condition is met, the automutual information
        function will be calculated inside this method up to ``max_nlags``.

    emb_dim_cao_e1 : :obj:`np.ndarray`, optional
        E1 values from the Cao's method. Used to take advantage of
        precomputations.

    emb_dim_cao_e2 : :obj:`np.ndarray`, optional
        E2 values from the Cao's method. Used to take advantage of
        precomputations.

    Returns
    -------
    int
        Estimation of the appropriate embedding dimension using Cao's
        method.

    References
    ----------
    .. [1] Liangyue Cao, Practical method for determining the minimum
        embedding dimension of a scalar time series, Physica D: Nonlinear
        Phenomena, Volume 110, Issues 1–2, 1997, Pages 43-50,
        ISSN 0167-2789, https://doi.org/10.1016/S0167-2789(97)00118-8.
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    lag = embed_lag(ts=ts_scaled,
                    lag=lag,
                    detrended_acfs=detrended_acfs,
                    detrended_ami=detrended_ami,
                    max_nlags=max_nlags)

    if emb_dim_cao_e1 is None or (check_e2 and emb_dim_cao_e2 is None):
        emb_dim_cao_e1, emb_dim_cao_e2 = embed_dim_cao(ts=ts,
                                                       ts_scaled=ts_scaled,
                                                       dims=dims,
                                                       lag=lag)

    if (check_e2 and emb_dim_cao_e2 is not None
            and np.all(np.abs(emb_dim_cao_e2 - 1) < tol_threshold)):
        return 1

    e1_abs_diff = np.abs(np.diff(emb_dim_cao_e1))

    first_max_ind = 0

    try:
        first_max_ind = np.flatnonzero(e1_abs_diff <= tol_threshold)[0]

    except IndexError:
        pass

    return first_max_ind + 1
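
A tiny worked example of the saturation rule above: with E1 = [0.55, 0.80, 0.97, 0.99, 1.00] and the default tol_threshold of 0.05, the absolute first differences are [0.25, 0.17, 0.02, 0.01]; the first index at or below the threshold is 2, so the estimated embedding dimension is 2 + 1 = 3:

import numpy as np

e1 = np.asarray([0.55, 0.80, 0.97, 0.99, 1.00])
first_ind = np.flatnonzero(np.abs(np.diff(e1)) <= 0.05)[0]  # -> 2
print(first_ind + 1)  # estimated embedding dimension -> 3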
Code example #11
    def ft_sample_entropy(
        cls,
        ts: np.ndarray,
        embed_dim: int = 2,
        embed_lag: int = 1,
        threshold: float = 0.2,
        metric: str = "chebyshev",
        p: t.Union[int, float] = 2,
        ts_scaled: t.Optional[np.ndarray] = None,
    ) -> float:
        """Sample entropy of the time-series.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        embed_dim : int, optional (default=2)
            Embedding dimension.

        embed_lag : int, optional (default=1)
            Embedding lag.

        threshold : float, optional (default=0.2)
            Threshold to consider which observations are next to each other
            after embedding.

        metric : str, optional (default="chebyshev")
            Distance metric to calculate the pairwise distance of the
            observations after each embedding.
            Check the `scipy.spatial.distance.pdist` documentation for the
            complete list of available distance metrics.

        p : int or float, optional (default=2)
            Power parameter for the minkowski metric. Used only if metric is
            `minkowski`.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        Returns
        -------
        float
            Estimated sample entropy.

        References
        ----------
        .. [1] Joshua S. Richman and J. Randall Moorman, "Physiological
            time-series analysis using approximate entropy and sample
            entropy", American Journal of Physiology-Heart and Circulatory
            Physiology 2000 278:6, H2039-H2049
        .. [2] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
            for Automated Time-Series Phenotyping Using Massive Feature
            Extraction", Cell Systems 5: 527 (2017).
            DOI: 10.1016/j.cels.2017.10.001
        .. [3] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
            time-series analysis: the empirical structure of time series and
            their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
            DOI: 10.1098/rsif.2013.0048
        """
        def log_neigh_num(dim: int) -> float:
            """Log of the number of embedded pairs within 'threshold'."""
            embed = _embed.embed_ts(ts_scaled, dim=dim, lag=embed_lag)
            dist_mat = scipy.spatial.distance.pdist(embed, metric=metric, p=p)
            return np.log(np.sum(dist_mat < threshold))

        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        sample_entropy = log_neigh_num(embed_dim) - log_neigh_num(embed_dim +
                                                                  1)

        return sample_entropy
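
The difference of logarithms above is the usual closed form SampEn = -log(A / B), where B counts the close pairs at dimension `embed_dim` and A the close pairs at `embed_dim + 1`. A self-contained sketch with the lag fixed at 1 (illustrative, not the class's exact implementation; it assumes at least one close pair at each dimension):

import numpy as np
import scipy.spatial.distance

def sample_entropy(ts: np.ndarray, m: int = 2, r: float = 0.2) -> float:
    def count_close(dim: int) -> int:
        # Pairs of embedded vectors within radius 'r' (Chebyshev norm).
        emb = np.column_stack([ts[j:ts.size - dim + 1 + j]
                               for j in range(dim)])
        return np.sum(scipy.spatial.distance.pdist(emb,
                                                   metric="chebyshev") < r)

    return np.log(count_close(m)) - np.log(count_close(m + 1))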
Code example #12
    def ft_approx_entropy(
        cls,
        ts: np.ndarray,
        embed_dim: int = 2,
        embed_lag: int = 1,
        threshold: float = 0.2,
        metric: str = "chebyshev",
        p: t.Union[int, float] = 2,
        ts_scaled: t.Optional[np.ndarray] = None,
    ) -> float:
        """Approximate entropy of the time-series.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        embed_dim : int, optional (default=2)
            Embedding dimension.

        embed_lag : int, optional (default=1)
            Embedding lag.

        threshold : float, optional (default=0.2)
            Threshold to consider which observations are next to each other
            after embedding.

        metric : str, optional (default="chebyshev")
            Distance metric to calculate the pairwise distance of the
            observations after each embedding.
            Check `scipy.spatial.distance.cdist` documentation for the complete
            list of available distance metrics.

        p : int or float, optional (default=2)
            Power parameter for the minkowski metric. Used only if metric is
            `minkowski`.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        Returns
        -------
        float
            Estimated approximate entropy.

        References
        ----------
        .. [1] Pincus, S.M., Gladstone, I.M. & Ehrenkranz, R.A. A regularity
            statistic for medical data analysis. J Clin Monitor Comput 7,
            335–345 (1991). https://doi.org/10.1007/BF01619355
        .. [2] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
            for Automated Time-Series Phenotyping Using Massive Feature
            Extraction", Cell Systems 5: 527 (2017).
            DOI: 10.1016/j.cels.2017.10.001
        .. [3] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
            time-series analysis: the empirical structure of time series and
            their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
            DOI: 10.1098/rsif.2013.0048
        """
        def neigh_num(dim: int) -> float:
            """Mean of the log-proportion of radius neighbors."""
            embed = _embed.embed_ts(ts_scaled, dim=dim, lag=embed_lag)
            dist_mat = scipy.spatial.distance.cdist(embed,
                                                    embed,
                                                    metric=metric,
                                                    p=p)
            return np.mean(np.log(np.mean(dist_mat < threshold, axis=1)))

        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        approx_entropy = neigh_num(embed_dim) - neigh_num(embed_dim + 1)

        return approx_entropy
Code example #13
    def ft_ami_curvature(
        cls,
        ts: np.ndarray,
        noise_range: t.Tuple[float, float] = (0, 3),
        noise_inc_num: int = 10,
        lag: t.Optional[t.Union[str, int]] = None,
        random_state: t.Optional[int] = None,
        ts_scaled: t.Optional[np.ndarray] = None,
        max_nlags: t.Optional[int] = None,
        detrended_acfs: t.Optional[np.ndarray] = None,
    ) -> float:
        """Estimate the Automutual information curvature.

        The Automutual information curvature is estimated using iterative noise
        amplification strategy.

        In the iterative noise amplification strategy, a random white noise
        is sampled from a normal distribution (mean 0 and variance 1). Then,
        this same noise is iteratively amplified from a uniformly spaced
        scales in ``noise_range`` range and added to the time-series. The
        automutual information is calculated from the perturbed time-series
        for each noise amplification.

        The automutual information curvature is the slope (angular
        coefficient) of a linear regression of the automutual information
        onto the noise scales.

        The lag used for every iteration is fixed from the start and, if not
        fixed by the user, it is estimated from the autocorrelation function
        by default.

        Parameters
        ----------
        ts : :obj:`np.ndarray`
            One-dimensional time-series values.

        noise_range : tuple of float, optional (default=(0, 3))
            A tuple of floats in the form (min_scale, max_scale) for the noise
            amplification range.

        noise_inc_num : int, optional (default=10)
            Number of noise amplifications. The parameter ``noise_range`` will
            be split evenly into ``noise_inc_num`` parts.

        lag : int or str, optional
            Lag to calculate the statistic. It must be a strictly positive
            value, None, or a string in {`acf`, `acf-nonsig`, `ami`}. If None
            or a string, the lag is estimated within this method using the
            given strategy (or, if None, using the `acf-nonsig` strategy by
            default) up to ``max_nlags``.
                1. `acf`: the lag corresponds to the first non-positive value
                    in the autocorrelation function.
                2. `acf-nonsig`: lag corresponds to the first non-significant
                    value in the autocorrelation function (absolute value below
                    the critical value of 1.96 / sqrt(ts.size)).
                3. `ami`: lag corresponds to the first local minimum of the
                    time-series automutual information function.

        random_state : int, optional
            Random seed to ensure reproducibility.

        ts_scaled : :obj:`np.ndarray`, optional
            Standardized time-series values. Used to take advantage of
            precomputations.

        max_nlags : int, optional
            If ``lag`` is None, then a single lag will be estimated from the
            first negative value of the detrended time-series autocorrelation
            function up to `max_nlags`, if any. Otherwise, lag 1 will be used.
            Used only if ``detrended_acfs`` is None.

        detrended_acfs : :obj:`np.ndarray`, optional
            Array of time-series autocorrelation function values (for
            distinct ordered lags) of the detrended time-series. Used only if
            ``lag`` is None. If this argument is not given and the previous
            condition is met, the autocorrelation function will be calculated
            inside this method up to ``max_nlags``.

        Returns
        -------
        float
            Estimated automutual information curvature.

        References
        ----------
        .. [1] Fraser AM, Swinney HL. Independent coordinates for strange
            attractors from mutual information. Phys Rev A Gen Phys.
            1986;33(2):1134‐1140. doi:10.1103/physreva.33.1134
        .. [2] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
            for Automated Time-Series Phenotyping Using Massive Feature
            Extraction", Cell Systems 5: 527 (2017).
            DOI: 10.1016/j.cels.2017.10.001
        .. [3] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
            time-series analysis: the empirical structure of time series and
            their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
            DOI: 10.1098/rsif.2013.0048
        .. [4] Thomas M. Cover and Joy A. Thomas. 1991. Elements of information
            theory. Wiley-Interscience, USA.
        """
        ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

        # Note: casting lag to an array since 'ft_ami_detrended' demands
        # a sequence of lags.
        _lag = np.asarray([
            _embed.embed_lag(
                ts=ts_scaled,
                lag=lag,
                max_nlags=max_nlags,
                detrended_acfs=detrended_acfs,
            )
        ])

        if random_state is not None:
            np.random.seed(random_state)

        # Note: the noise is fixed from the start, and amplified at each
        # iteration.
        gaussian_noise = np.random.randn(ts_scaled.size)
        noise_std = np.linspace(*noise_range, noise_inc_num)

        ami = np.zeros(noise_inc_num, dtype=float)

        for ind, cur_std in enumerate(noise_std):
            ts_corrupted = ts_scaled + cur_std * gaussian_noise

            ami[ind] = cls.ft_ami_detrended(ts=ts_corrupted,
                                            num_bins=32,
                                            lags=_lag,
                                            return_dist=False)

        model = sklearn.linear_model.LinearRegression().fit(
            X=noise_std.reshape(-1, 1), y=ami)

        curvature = model.coef_[0]

        return curvature
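
Stripped of the package machinery, the noise-amplification idea fits in a few lines. In this simplified illustration, the lag-1 autocorrelation stands in for the automutual information, and the 'curvature' is the slope of the fitted line:

import numpy as np
import sklearn.linear_model

rng = np.random.default_rng(16)
ts = np.sin(np.linspace(0, 16 * np.pi, 512))
noise = rng.standard_normal(ts.size)  # fixed noise, amplified below
scales = np.linspace(0, 3, 10)

def lag1_autocorr(x: np.ndarray) -> float:
    return float(np.corrcoef(x[:-1], x[1:])[0, 1])

stat = np.asarray([lag1_autocorr(ts + s * noise) for s in scales])

model = sklearn.linear_model.LinearRegression().fit(
    scales.reshape(-1, 1), stat)
print(model.coef_[0])  # slope of the statistic versus the noise scale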
Code example #14
def embed_dim_fnn(
    ts: np.ndarray,
    lag: int,
    dims: t.Union[int, t.Sequence[int]] = 16,
    rtol: t.Union[int, float] = 10,
    atol: t.Union[int, float] = 2,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> np.ndarray:
    """Estimate the False Nearest Neighbors proportion for each dimension.

    The False Nearest Neighbors method calculates the proportion of false
    nearest neighbors among the time-series observations, given a fixed
    embedding dimension.

    A pair of false nearest neighbors are two instances that are far apart
    in the appropriate embedding dimension, but close together in a smaller
    dimension simply because both were projected onto an inappropriate
    dimension. Of course, we could just use a `sufficiently large` embedding
    dimension to remove any possibility of false nearest neighbors. However,
    this strategy implies a loss of computational efficiency, along with all
    the statistical concerns that arise in high-dimensional data analysis.
    The idea behind analysing the proportion of false neighbors is to
    estimate the minimum embedding dimension that makes only true neighbors
    be close together in that given space.

    Thus, it is expected that, given the appropriate embedding dimension, the
    proportion of false neighbors will be close to zero.

    Unlike the reference paper, here we use the Chebyshev distance (maximum
    norm distance) rather than the Euclidean distance.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    lag : int
        Embedding lag. You may want to check the `embed_lag` function
        documentation for embedding lag estimation. Must be a strictly
        positive value.

    dims : int or sequence of int
        Dimensions at which to estimate the FNN proportion.
        If integer, estimate all dimensions from 1 up to the given number.
        If a sequence of integers, estimate the FNN proportion for all
        given dimensions, and return the corresponding values in the same
        order as the given dimensions.
        All non-positive dimensions will receive a `np.nan` value.

    rtol : float, optional (default=10)
        Relative tolerance between the distance $D_{d}$ of each observation
        to its nearest neighbor in a given dimension $d$, and the distance
        $D_{d+1}$ between the observation and that same neighbor in the next
        embedding dimension. It is used in the first criterion from the
        reference paper to define which instances are false neighbors. The
        default value (10) is the value recommended in the original paper,
        and it means that nearest neighbors that are ten times farther apart
        in the next dimension, relative to their distance in the current
        dimension, are considered false nearest neighbors.

    atol : float, optional (default=2)
        Number of time-series standard deviations that an observation and
        its nearest neighbor must be apart in the next dimension in order
        to be considered false neighbors. This is the second criterion from
        the reference paper.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    :obj:`np.ndarray`
        Proportion of false nearest neighbors for each given dimension. The
        union of both criteria is used to determine whether a pair of
        neighbors are false neighbors in a fixed embedding dimension (i.e.,
        any pair of neighbors considered false by either criterion alone is
        considered false).

    References
    ----------
    .. [1] Kennel, Matthew B., Brown, Reggie, and Abarbanel, Henry D. I.,
        "Determining embedding dimension for phase-space reconstruction
        using a geometrical construction", Phys. Rev. A, volume 45, 1992,
        American Physical Society.
    """
    if lag <= 0:
        raise ValueError("'lag' must be positive (got {}).".format(lag))

    _dims: t.Sequence[int]

    if np.isscalar(dims):
        _dims = np.arange(1, int(dims) + 1)  # type: ignore

    else:
        _dims = np.asarray(dims, dtype=int)

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    fnn_prop = np.zeros(len(_dims), dtype=float)

    # Note: since we are using the standardized time-series, its standard
    # deviation is always 1. However, we keep this variable to make clear
    # the correspondence between the reference paper's formulas and what
    # are programmed here.
    ts_std = 1.0  # = np.std(ts_scaled)

    for ind, dim in enumerate(_dims):
        try:
            emb_next = embed_ts(ts=ts_scaled, lag=lag, dim=dim + 1)
            emb_cur = emb_next[:, 1:]

        except ValueError:
            fnn_prop[ind] = np.nan
            continue

        nn_inds, dist_cur = nn(embed=emb_cur)

        emb_next_abs_diff = np.abs(emb_next[:, 0] - emb_next[nn_inds, 0])
        dist_next = np.maximum(dist_cur, emb_next_abs_diff)

        # Note: the reference paper gives three criteria for determining
        # what counts as a False Nearest Neighbor. The first and second
        # ones are related, respectively, to the `crit_1` and `crit_2`
        # variables. The third criterion is the union of the first two,
        # meaning that an observation is considered a False Neighbor if
        # either criterion flags it as such. Here, we use the third and
        # therefore the most conservative criterion.
        crit_1 = emb_next_abs_diff > rtol * dist_cur
        crit_2 = dist_next > atol * ts_std

        fnn_prop[ind] = np.mean(np.logical_or(crit_1, crit_2))

    return fnn_prop
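
Hypothetical usage (the lag of 4 and the 1% cutoff are arbitrary illustration values, not recommendations): the estimated embedding dimension is the first one whose FNN proportion drops to (near) zero:

import numpy as np

rng = np.random.default_rng(0)
series = (np.sin(np.linspace(0, 24 * np.pi, 1024))
          + 0.05 * rng.standard_normal(1024))

fnn = embed_dim_fnn(ts=series, lag=4, dims=10)
est_dim = int(1 + np.argmax(fnn <= 0.01))  # first dimension with FNN <= 1%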