def _fit_ortho_pol_reg(
    ts_trend: np.ndarray,
    degree: int = 2,
) -> statsmodels.regression.linear_model.RegressionResults:
    """Regress the time-series trend on orthogonal polynomials.

    Parameters
    ----------
    ts_trend : :obj:`np.ndarray`
        One-dimensional time-series trend component.

    degree : int, optional (default=2)
        Degree of the highest order polynomial (and, therefore, the
        number of distinct polynomials used).

    Returns
    -------
    :obj:`statsmodels.regression.linear_model.RegressionResults`
        Optimized parameters of the linear model of the time-series trend
        component regressed on the orthogonal polynomials.
    """
    X = _orthopoly.ortho_poly(
        ts=np.linspace(0, 1, ts_trend.size),
        degree=degree,
        return_coeffs=False,
    )

    X = statsmodels.tools.add_constant(X)

    ts_trend_scaled = _utils.standardize_ts(ts=ts_trend)

    return statsmodels.regression.linear_model.OLS(ts_trend_scaled, X).fit()
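# Illustrative sketch (not part of the module API): the same idea as
# '_fit_ortho_pol_reg' using only numpy and statsmodels. The orthogonal basis
# is built here from a QR decomposition of a Vandermonde matrix, which is an
# assumption standing in for '_orthopoly.ortho_poly'.
def _demo_ortho_poly_reg() -> None:
    t = np.linspace(0, 1, 128)
    trend = 1.5 * t + 0.5 * t ** 2  # Synthetic trend component.

    # Columns for t and t^2; the constant column is added separately.
    vander = np.vander(t, N=3, increasing=True)[:, 1:]
    basis, _ = np.linalg.qr(vander)  # Orthonormal polynomial basis.

    X = statsmodels.tools.add_constant(basis)
    trend_scaled = (trend - trend.mean()) / trend.std()

    res = statsmodels.regression.linear_model.OLS(trend_scaled, X).fit()
    print(res.rsquared)  # Close to 1.0 for a purely polynomial trend.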
def precompute_ts_scaled(cls, ts: np.ndarray,
                         **kwargs) -> t.Dict[str, np.ndarray]:
    """Precompute a standardized time series.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    kwargs:
        Additional arguments and previous precomputed items. May
        speed up this precomputation.

    Returns
    -------
    dict
        The following precomputed item is returned:
            * ``ts_scaled`` (:obj:`np.ndarray`): standardized time-series
              values (z-score).
    """
    precomp_vals = {}  # type: t.Dict[str, np.ndarray]

    if "ts_scaled" not in kwargs:
        precomp_vals["ts_scaled"] = _utils.standardize_ts(ts=ts)

    return precomp_vals
def precompute_gaussian_model(cls,
                              ts: np.ndarray,
                              random_state: t.Optional[int] = None,
                              **kwargs) -> t.Dict[str, t.Any]:
    """Precompute a Gaussian process model.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    random_state : int, optional
        Random seed used to optimize the Gaussian process model, to keep
        the results reproducible.

    kwargs:
        Additional arguments and previous precomputed items. May
        speed up this precomputation.

    Returns
    -------
    dict
        The following precomputed items are returned:
            * ``gaussian_model`` (:obj:`GaussianProcessRegressor`):
              fitted Gaussian process model.
            * ``gaussian_resid`` (:obj:`np.ndarray`): Gaussian process
              model residuals (difference from the original time-series).

        The following item is required by this precomputation and,
        therefore, is also precomputed if absent:
            * ``ts_scaled`` (:obj:`np.ndarray`): standardized time-series
              values (z-score).
    """
    precomp_vals = {}  # type: t.Dict[str, t.Any]

    ts_scaled = kwargs.get("ts_scaled")

    if ts_scaled is None:
        precomp_vals["ts_scaled"] = _utils.standardize_ts(ts=ts)
        ts_scaled = precomp_vals["ts_scaled"]

    gaussian_model = kwargs.get("gaussian_model")

    if gaussian_model is None:
        gaussian_model = _utils.fit_gaussian_process(
            ts=ts, ts_scaled=ts_scaled, random_state=random_state)

        precomp_vals["gaussian_model"] = gaussian_model

    if "gaussian_resid" not in kwargs:
        gaussian_resid = _utils.fit_gaussian_process(
            ts=ts,
            ts_scaled=ts_scaled,
            gaussian_model=gaussian_model,
            return_residuals=True,
        )

        precomp_vals["gaussian_resid"] = gaussian_resid

    return precomp_vals
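# Illustrative sketch (not part of the module API): fitting a Gaussian process
# over a normalized time index and taking residuals, as a rough stand-in for
# '_utils.fit_gaussian_process'. The default RBF-based kernel used here is an
# assumption; the module may configure its model differently.
def _demo_gaussian_resid() -> np.ndarray:
    rng = np.random.RandomState(16)
    ts = np.sin(np.linspace(0, 4 * np.pi, 128)) + 0.1 * rng.randn(128)
    ts = (ts - ts.mean()) / ts.std()  # Standardize (z-score).

    X = np.linspace(0, 1, ts.size).reshape(-1, 1)
    model = sklearn.gaussian_process.GaussianProcessRegressor(random_state=16)
    model.fit(X, ts)

    return ts - model.predict(X)  # Residuals w.r.t. the standardized series.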
def ft_gaussian_r_sqr(
    cls,
    ts: np.ndarray,
    random_state: t.Optional[int] = None,
    ts_scaled: t.Optional[np.ndarray] = None,
    gaussian_model: t.Optional[
        sklearn.gaussian_process.GaussianProcessRegressor] = None,
) -> float:
    """R^2 from a Gaussian process model.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    random_state : int, optional
        Random seed used to optimize the Gaussian process model, to keep
        the results reproducible.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    gaussian_model : :obj:`GaussianProcessRegressor`, optional
        A fitted model of a Gaussian process. Used to take advantage of
        precomputations.

    Returns
    -------
    float
        R^2 of a Gaussian process model.

    References
    ----------
    .. [1] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
        for Automated Time-Series Phenotyping Using Massive Feature
        Extraction", Cell Systems 5: 527 (2017).
        DOI: 10.1016/j.cels.2017.10.001
    .. [2] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
        time-series analysis: the empirical structure of time series and
        their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
        DOI: 10.1098/rsif.2013.0048
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    gaussian_model = _utils.fit_gaussian_process(
        ts=ts_scaled,
        random_state=random_state,
        gaussian_model=gaussian_model,
        ts_scaled=ts_scaled,
    )

    X = np.linspace(0, 1, ts_scaled.size).reshape(-1, 1)

    r_squared = gaussian_model.score(X=X, y=ts_scaled)

    return r_squared
def ft_resample_std(
    cls,
    ts: np.ndarray,
    num_samples: int = 64,
    sample_size_frac: float = 0.1,
    ddof: int = 1,
    random_state: t.Optional[int] = None,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> np.ndarray:
    """Time-series standard deviation from repeated subsampling.

    A subsample of size L is L consecutive observations from the
    time-series, starting from a random index in the [0, len(ts) - L]
    range.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    num_samples : int, optional (default=64)
        Number of time-series subsamples.

    sample_size_frac : float, optional (default=0.1)
        Size of each subsample proportional to the time-series length.

    ddof : int, optional (default=1)
        Degrees of freedom of the standard deviation.

    random_state : int, optional
        Random seed to ensure reproducibility.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    :obj:`np.ndarray`
        Standard deviations from repeated subsampling.
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    sample_std = _utils.apply_on_samples(
        ts=ts_scaled,
        func=np.std,
        num_samples=num_samples,
        sample_size_frac=sample_size_frac,
        random_state=random_state,
        ddof=ddof,
    )

    return sample_std
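# Illustrative sketch (not part of the module API) of the subsampling scheme
# above, using plain numpy as a stand-in for '_utils.apply_on_samples': each
# subsample is a contiguous window starting at a random index.
def _demo_resample_std(num_samples: int = 64,
                       sample_size_frac: float = 0.1) -> np.ndarray:
    rng = np.random.RandomState(16)
    ts = rng.randn(500).cumsum()  # Synthetic random-walk series.
    ts = (ts - ts.mean()) / ts.std()

    win = int(np.ceil(ts.size * sample_size_frac))
    starts = rng.randint(ts.size - win + 1, size=num_samples)

    return np.asarray([np.std(ts[i:i + win], ddof=1) for i in starts])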
def _fit_res_model_des(
    ts: np.ndarray,
    damped_trend: bool = False,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper:
    """Fit a double exponential smoothing model with additive trend.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    damped_trend : bool, optional (default=False)
        Whether or not the exponential smoothing model should include a
        damping component.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    :obj:`statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper`
        Results of an optimized double exponential smoothing model.

    References
    ----------
    .. [1] Holt, C. E. (1957). Forecasting seasonals and trends by
        exponentially weighted averages (O.N.R. Memorandum No. 52).
        Carnegie Institute of Technology, Pittsburgh USA.
        https://doi.org/10.1016/j.ijforecast.2003.09.015
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            module="statsmodels",
            category=statsmodels.tools.sm_exceptions.ConvergenceWarning,
        )

        model = statsmodels.tsa.holtwinters.ExponentialSmoothing(
            endog=ts_scaled,
            initialization_method="estimated",
            trend="additive",
            damped_trend=damped_trend,
            seasonal=None,
        ).fit()

    return model
def _itrand_stat(
    cls,
    ts: np.ndarray,
    func_stats: t.Collection[t.Callable[[np.ndarray], float]],
    strategy: str = "dist-dynamic",
    prop_rep: t.Union[int, float] = 2,
    prop_interval: float = 0.1,
    ts_scaled: t.Optional[np.ndarray] = None,
    random_state: t.Optional[int] = None,
) -> np.ndarray:
    """Calculate global statistics with the iterative perturbation method.

    In the iterative perturbation method, a copy of the time-series is
    modified at each iteration. The number of observations modified and
    the sample pool from which the new values are drawn depend on the
    selected ``strategy``. A statistic is then extracted every `k`
    iterations (with `k` given by ceil(ts.size * ``prop_interval``)).

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    func_stats : sequence of callable
        Sequence of callables to extract the statistic values. Each
        callable must receive a list of numeric values as the first
        argument, and return a single numeric value.

    strategy : str, optional (default="dist-dynamic")
        The strategy used to perturb the current population. Must be one
        of the following:
            1. `dist-static`: (static distribution) one observation of
               the current population is overwritten by one observation
               from the original time-series.
            2. `dist-dynamic`: (dynamic distribution) one observation of
               the current population is overwritten by another
               observation of the current population.
            3. `permute`: two observations of the current population
               swap their positions.

    prop_rep : int or float, optional (default=2)
        Number of total iterations proportional to the time-series size.
        This means that this process will iterate for approximately
        ceil(prop_rep * ts.size) iterations. More rigorously, the number
        of iterations is also adjusted by the extraction interval, so no
        trailing iterations are wasted without producing a statistic.

    prop_interval : float, optional (default=0.1)
        Interval between two statistic extractions from the current
        population, proportional to the time-series length.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    random_state : int, optional
        Random seed to ensure reproducibility.

    Returns
    -------
    :obj:`np.ndarray`
        Statistics extracted from the dynamic population. Each row is
        associated with a method from ``func_stats``, and each column is
        one distinct extraction event, ordered temporally by index (i.e.,
        lower indices correspond to populations more similar to the
        starting state, and higher indices to populations more affected
        by the process).

    References
    ----------
    .. [1] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
        for Automated Time-Series Phenotyping Using Massive Feature
        Extraction", Cell Systems 5: 527 (2017).
        DOI: 10.1016/j.cels.2017.10.001
    .. [2] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
        time-series analysis: the empirical structure of time series and
        their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
    """
    if prop_rep <= 0:
        raise ValueError(
            "'prop_rep' must be positive (got {}).".format(prop_rep))

    if prop_interval <= 0:
        raise ValueError(
            "'prop_interval' must be positive (got {}).".format(
                prop_interval))

    VALID_STRATEGY = ("dist-static", "dist-dynamic", "permute")

    if strategy not in VALID_STRATEGY:
        raise ValueError("'strategy' not in {} (got '{}')."
                         "".format(VALID_STRATEGY, strategy))

    if not hasattr(func_stats, "__len__"):
        func_stats = [func_stats]  # type: ignore

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    rep_it = int(np.ceil(prop_interval * ts_scaled.size))

    # Note: adding (num_it % rep_it) to avoid wasting the computation of
    # trailing iterations that would not produce a statistic.
    num_it = int(np.ceil(prop_rep * ts_scaled.size))
    num_it += num_it % rep_it

    res = np.zeros((len(func_stats), 1 + num_it // rep_it))

    ts_rnd = np.copy(ts_scaled)
    ts_src = ts_scaled if strategy == "dist-static" else ts_rnd
    swap = strategy == "permute"
    stat_ind = 0

    if random_state is not None:
        np.random.seed(random_state)

    inds_rnd = np.random.randint(ts_scaled.size, size=(num_it, 2))

    for it, (ind_a, ind_b) in enumerate(inds_rnd):
        if swap:
            ts_rnd[ind_a], ts_src[ind_b] = ts_src[ind_b], ts_rnd[ind_a]

        else:
            ts_rnd[ind_a] = ts_src[ind_b]

        if it % rep_it == 0:
            for ind_f, func in enumerate(func_stats):
                res[ind_f, stat_ind] = func(ts_rnd)

            stat_ind += 1

    return res if len(func_stats) > 1 else res.ravel()
def _fit_res_model_ets(
    ts: np.ndarray,
    damped_trend: bool = False,
    grid_search_guess: bool = True,
    ts_period: t.Optional[int] = None,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper:
    """Fit a triple exponential smoothing model with additive components.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    damped_trend : bool, optional (default=False)
        Whether or not the exponential smoothing model should include a
        damping component.

    grid_search_guess : bool, optional (default=True)
        If True, use grid search (a.k.a. brute force) to search for good
        starting parameters. If False, this method becomes much less
        computationally intensive, but is more likely to fail to
        converge.

    ts_period : int, optional
        Time-series period.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    :obj:`statsmodels.tsa.holtwinters.results.HoltWintersResultsWrapper`
        Results of an optimized triple exponential smoothing model.

    References
    ----------
    .. [1] Winters, Peter R. Forecasting Sales by Exponentially Weighted
        Moving Averages, 1960, INFORMS, Linthicum, MD, USA.
        https://doi.org/10.1287/mnsc.6.3.324
    .. [2] Charles C. Holt, Forecasting seasonals and trends by
        exponentially weighted moving averages, International Journal of
        Forecasting, Volume 20, Issue 1, 2004, Pages 5-10, ISSN 0169-2070,
        https://doi.org/10.1016/j.ijforecast.2003.09.015.
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    ts_period = _period.get_ts_period(ts=ts_scaled, ts_period=ts_period)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            module="statsmodels",
            category=statsmodels.tools.sm_exceptions.ConvergenceWarning,
        )

        model = statsmodels.tsa.holtwinters.ExponentialSmoothing(
            endog=ts_scaled,
            initialization_method="estimated",
            trend="additive",
            seasonal="additive",
            damped_trend=damped_trend,
            seasonal_periods=ts_period,
        ).fit(use_brute=grid_search_guess)

    return model
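# Illustrative sketch (not part of the module API): fitting the same kind of
# Holt-Winters model on a synthetic seasonal series and checking the
# in-sample residual spread. The period of 12 is an assumption made only for
# the synthetic data.
def _demo_fit_ets() -> None:
    rng = np.random.RandomState(16)
    t = np.arange(240)
    ts = 0.02 * t + np.sin(2 * np.pi * t / 12) + 0.1 * rng.randn(t.size)
    ts = (ts - ts.mean()) / ts.std()

    model = statsmodels.tsa.holtwinters.ExponentialSmoothing(
        endog=ts,
        initialization_method="estimated",
        trend="additive",
        seasonal="additive",
        seasonal_periods=12,
    ).fit(use_brute=True)

    print(np.std(model.resid))  # Residual spread of the fitted model.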
def embed_dim_cao(
    ts: np.ndarray,
    lag: int,
    dims: t.Union[int, t.Sequence[int]] = 16,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> t.Tuple[np.ndarray, np.ndarray]:
    """Estimate Cao's metrics for the time-series embedding dimension.

    Cao's metrics are two statistics, `E1` and `E2`, used to estimate the
    appropriate embedding dimension of a time-series. From the `E1`
    statistic, the appropriate embedding dimension can be defined as the
    index at which the metric saturates over a set of ordered dimensions.

    The precise notion of `saturation` may be subjective, since this
    metric can show curious `artifacts` related to specific lags for
    specific time-series, which may need deeper investigation.

    The `E2` statistic is used to detect `false positives` from the `E1`
    statistic, since it distinguishes random white noise from a series
    generated by a true, not completely random, underlying process. If
    the time-series is purely random white noise, then all values of `E2`
    will be close to 1. If there exists a dimension with the `E2` metric
    `sufficiently far` from 1, then this series is considered not to be
    white random noise.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    lag : int
        Embedding lag. You may want to check the `embed_lag` function
        documentation for embedding lag estimation. Must be a strictly
        positive value.

    dims : int or sequence of int
        Dimensions for which Cao's `E1` and `E2` statistic values are
        estimated. If integer, estimate all dimensions from 1 up to the
        given number. If a sequence of integers, estimate Cao's
        statistics for all given dimensions, and return the corresponding
        values in the same order as the given dimensions. All dimensions
        with non-positive values will receive a `np.nan` value for both
        of Cao's metrics.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    tuple of :obj:`np.ndarray`
        `E1` and `E2` Cao's metrics, necessarily in that order, for all
        given dimensions (with direct index correspondence to the given
        dimensions).

    References
    ----------
    .. [1] Liangyue Cao, Practical method for determining the minimum
        embedding dimension of a scalar time series, Physica D: Nonlinear
        Phenomena, Volume 110, Issues 1–2, 1997, Pages 43-50,
        ISSN 0167-2789, https://doi.org/10.1016/S0167-2789(97)00118-8.
    """
    if lag <= 0:
        raise ValueError("'lag' must be positive (got {}).".format(lag))

    _dims: t.Sequence[int]

    if np.isscalar(dims):
        _dims = np.arange(1, int(dims) + 1)  # type: ignore

    else:
        _dims = np.asarray(dims, dtype=int)

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    ed, ed_star = np.zeros((2, len(_dims)), dtype=float)

    for ind, dim in enumerate(_dims):
        try:
            emb_next = embed_ts(ts=ts_scaled, lag=lag, dim=dim + 1)
            emb_cur = emb_next[:, 1:]

        except ValueError:
            ed[ind] = np.nan
            ed_star[ind] = np.nan
            continue

        nn_inds, dist_cur = nn(embed=emb_cur)

        emb_next_abs_diff = np.abs(emb_next[:, 0] - emb_next[nn_inds, 0])

        # Note: the Chebyshev (maximum norm, L-infinity) distance between
        # X and Y, both in the (d + 1)-dimensional embedding, can be
        # defined in terms of one dimension less:
        # D_inf(X_{d+1}, Y_{d+1})
        #     = max(|x_1 - y_1|, ..., |x_{d+1} - y_{d+1}|)
        #     = max(max(|x_1 - y_1|, ..., |x_d - y_d|), |x_{d+1} - y_{d+1}|)
        #     = max(D_inf(X_{d}, Y_{d}), |x_{d+1} - y_{d+1}|)
        dist_next = np.maximum(dist_cur, emb_next_abs_diff)

        # Note: 'ed' and 'ed_star' refer to, respectively, E_{d} and
        # E^{*}_{d} from Cao's paper.
        ed[ind] = np.mean(dist_next / dist_cur)
        ed_star[ind] = np.mean(emb_next_abs_diff)

    # Note: the minimum embedding dimension is the dimension D at which
    # e1[D] stops changing significantly (i.e., where e1 saturates).
    e1 = ed[1:] / ed[:-1]

    # Note: this is Cao's E2(d) metric. Its purpose is to separate purely
    # random time-series. For randomly generated time-series, e2 will be
    # close to 1 for every dimension. For deterministic data, however,
    # e2 != 1 for some dimension d.
    e2 = ed_star[1:] / ed_star[:-1]

    return e1, e2
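# Illustrative sketch (not part of the module API): computing Cao's E1 for a
# noisy sine wave with scipy's cKDTree (Chebyshev norm), mirroring the loop
# above but with the newest coordinate in the last embedding column.
def _demo_cao_e1() -> np.ndarray:
    rng = np.random.RandomState(16)
    ts = np.sin(np.linspace(0, 16 * np.pi, 512)) + 1e-4 * rng.randn(512)
    lag, max_dim = 4, 6

    ed = np.zeros(max_dim, dtype=float)

    for dim in range(1, max_dim + 1):
        n = ts.size - dim * lag
        emb_next = np.array([ts[i:i + (dim + 1) * lag:lag] for i in range(n)])
        emb_cur = emb_next[:, :-1]

        tree = scipy.spatial.cKDTree(emb_cur)
        dist_cur, nn_inds = tree.query(emb_cur, k=2, p=np.inf)
        dist_cur, nn_inds = dist_cur[:, 1], nn_inds[:, 1]  # Skip self-match.

        abs_diff = np.abs(emb_next[:, -1] - emb_next[nn_inds, -1])
        ed[dim - 1] = np.mean(np.maximum(dist_cur, abs_diff) / dist_cur)

    return ed[1:] / ed[:-1]  # E1(d); saturates near 1 at the proper dimension.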
def ft_emb_dim_cao(ts: np.ndarray,
                   dims: t.Union[int, t.Sequence[int]] = 16,
                   lag: t.Optional[t.Union[str, int]] = None,
                   tol_threshold: float = 0.05,
                   check_e2: bool = True,
                   max_nlags: t.Optional[int] = None,
                   ts_scaled: t.Optional[np.ndarray] = None,
                   detrended_acfs: t.Optional[np.ndarray] = None,
                   detrended_ami: t.Optional[np.ndarray] = None,
                   emb_dim_cao_e1: t.Optional[np.ndarray] = None,
                   emb_dim_cao_e2: t.Optional[np.ndarray] = None) -> int:
    """Embedding dimension estimation using Cao's method.

    Cao's embedding dimension estimation calculates both of its metrics,
    `E1` and `E2`, whose purposes are, respectively, to detect the
    appropriate embedding dimension and to detect whether the given
    time-series is generated by a completely random process (white
    noise). The appropriate embedding dimension is the saturation
    dimension of `E1` if and only if there exists a dimension whose `E2`
    value is sufficiently distinct from 1. If `E2` is approximately
    constant at 1 over all dimensions, the series is considered white
    noise and, therefore, the embedding dimension is assumed to be 1.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    dims : int or a sequence of int, optional (default=16)
        The embedding dimension candidates. If int, investigate all
        values between 1 and the ``dims`` value (both inclusive). If a
        sequence of integers is used, then investigate only the given
        dimensions.

    lag : int or str, optional
        Lag of the time-series embedding. It must be a strictly positive
        value, None, or a string in {`acf`, `acf-nonsig`, `ami`}. In the
        latter options, the lag is estimated within this method using the
        given strategy (or, if None, the `acf-nonsig` strategy by
        default) up to ``max_nlags``.
            1. `acf`: the lag corresponds to the first non-positive value
               in the autocorrelation function.
            2. `acf-nonsig`: the lag corresponds to the first
               non-significant value in the autocorrelation function
               (absolute value below the critical value of
               1.96 / sqrt(ts.size)).
            3. `ami`: the lag corresponds to the first local minimum of
               the time-series automutual information function.

    tol_threshold : float, optional (default=0.05)
        Tolerance threshold that defines the maximum absolute difference
        between two E1 values in order to assume saturation. This same
        threshold is the minimum absolute deviation that E2 values must
        have in order to be considered different from 1.

    check_e2 : bool, optional (default=True)
        If True, check whether there exists a Cao's E2 value different
        from 1, and return 1 if this condition is not satisfied. If
        False, ignore the E2 values.

    max_nlags : int, optional
        If ``lag`` is not a numeric value, then it will be estimated
        using either the time-series autocorrelation or automutual
        information function computed up to this argument value.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    detrended_acfs : :obj:`np.ndarray`, optional
        Array of time-series autocorrelation function values (for
        distinct ordered lags) of the detrended time-series. Used only if
        ``lag`` is any of `acf`, `acf-nonsig` or None. If this argument
        is not given and the previous condition is met, the
        autocorrelation function will be calculated inside this method up
        to ``max_nlags``.

    detrended_ami : :obj:`np.ndarray`, optional
        Array of time-series automutual information function values (for
        distinct ordered lags). Used only if ``lag`` is `ami`. If not
        given and the previous condition is met, the automutual
        information function will be calculated inside this method up to
        ``max_nlags``.

    emb_dim_cao_e1 : :obj:`np.ndarray`, optional
        E1 values from Cao's method. Used to take advantage of
        precomputations.

    emb_dim_cao_e2 : :obj:`np.ndarray`, optional
        E2 values from Cao's method. Used to take advantage of
        precomputations.

    Returns
    -------
    int
        Estimation of the appropriate embedding dimension using Cao's
        method.

    References
    ----------
    .. [1] Liangyue Cao, Practical method for determining the minimum
        embedding dimension of a scalar time series, Physica D: Nonlinear
        Phenomena, Volume 110, Issues 1–2, 1997, Pages 43-50,
        ISSN 0167-2789, https://doi.org/10.1016/S0167-2789(97)00118-8.
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    lag = embed_lag(ts=ts_scaled,
                    lag=lag,
                    detrended_acfs=detrended_acfs,
                    detrended_ami=detrended_ami,
                    max_nlags=max_nlags)

    if emb_dim_cao_e1 is None or (check_e2 and emb_dim_cao_e2 is None):
        emb_dim_cao_e1, emb_dim_cao_e2 = embed_dim_cao(ts=ts,
                                                       ts_scaled=ts_scaled,
                                                       dims=dims,
                                                       lag=lag)

    if (check_e2 and emb_dim_cao_e2 is not None
            and np.all(np.abs(emb_dim_cao_e2 - 1) < tol_threshold)):
        return 1

    e1_abs_diff = np.abs(np.diff(emb_dim_cao_e1))

    first_max_ind = 0

    try:
        first_max_ind = np.flatnonzero(e1_abs_diff <= tol_threshold)[0]

    except IndexError:
        pass

    return first_max_ind + 1
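# Illustrative sketch (not part of the module API): the saturation rule used
# above, applied to a hand-made (hypothetical) E1 curve.
def _demo_cao_saturation(tol_threshold: float = 0.05) -> int:
    e1 = np.array([0.4, 0.7, 0.9, 0.98, 0.99, 1.0])  # Hypothetical E1 values.
    e1_abs_diff = np.abs(np.diff(e1))
    sat_inds = np.flatnonzero(e1_abs_diff <= tol_threshold)
    # First saturated index (plus one, as dimensions are 1-based): here, 4.
    return int(sat_inds[0]) + 1 if sat_inds.size else 1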
def ft_sample_entropy(
    cls,
    ts: np.ndarray,
    embed_dim: int = 2,
    embed_lag: int = 1,
    threshold: float = 0.2,
    metric: str = "chebyshev",
    p: t.Union[int, float] = 2,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> float:
    """Sample entropy of the time-series.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    embed_dim : int, optional (default=2)
        Embedding dimension.

    embed_lag : int, optional (default=1)
        Embedding lag.

    threshold : float, optional (default=0.2)
        Threshold to consider which observations are next to each other
        after embedding.

    metric : str, optional (default="chebyshev")
        Distance metric used to calculate the pairwise distance of the
        observations after each embedding. Check the
        `scipy.spatial.distance.cdist` documentation for the complete
        list of available distance metrics.

    p : int or float, optional (default=2)
        Power parameter for the Minkowski metric. Used only if ``metric``
        is `minkowski`.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    float
        Estimated sample entropy.

    References
    ----------
    .. [1] Joshua S. Richman and J. Randall Moorman, Physiological
        time-series analysis using approximate entropy and sample
        entropy, American Journal of Physiology-Heart and Circulatory
        Physiology 2000 278:6, H2039-H2049
    .. [2] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
        for Automated Time-Series Phenotyping Using Massive Feature
        Extraction", Cell Systems 5: 527 (2017).
        DOI: 10.1016/j.cels.2017.10.001
    .. [3] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
        time-series analysis: the empirical structure of time series and
        their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
        DOI: 10.1098/rsif.2013.0048
    """
    def log_neigh_num(dim: int) -> float:
        """Log of the number of neighbor pairs within the threshold."""
        embed = _embed.embed_ts(ts_scaled, dim=dim, lag=embed_lag)
        dist_mat = scipy.spatial.distance.pdist(embed, metric=metric, p=p)
        return np.log(np.sum(dist_mat < threshold))

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    sample_entropy = log_neigh_num(embed_dim) - log_neigh_num(embed_dim + 1)

    return sample_entropy
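# Illustrative sketch (not part of the module API): sample entropy of a white
# noise series, mirroring the pairwise-distance formulation above with plain
# numpy/scipy (embedding lag fixed at 1).
def _demo_sample_entropy(embed_dim: int = 2, threshold: float = 0.2) -> float:
    rng = np.random.RandomState(16)
    ts = rng.randn(256)
    ts = (ts - ts.mean()) / ts.std()

    def log_pair_count(dim: int) -> float:
        emb = np.array([ts[i:i + dim] for i in range(ts.size - dim + 1)])
        dists = scipy.spatial.distance.pdist(emb, metric="chebyshev")
        return float(np.log(np.sum(dists < threshold)))

    return log_pair_count(embed_dim) - log_pair_count(embed_dim + 1)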
def ft_approx_entropy(
    cls,
    ts: np.ndarray,
    embed_dim: int = 2,
    embed_lag: int = 1,
    threshold: float = 0.2,
    metric: str = "chebyshev",
    p: t.Union[int, float] = 2,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> float:
    """Approximate entropy of the time-series.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    embed_dim : int, optional (default=2)
        Embedding dimension.

    embed_lag : int, optional (default=1)
        Embedding lag.

    threshold : float, optional (default=0.2)
        Threshold to consider which observations are next to each other
        after embedding.

    metric : str, optional (default="chebyshev")
        Distance metric used to calculate the pairwise distance of the
        observations after each embedding. Check the
        `scipy.spatial.distance.cdist` documentation for the complete
        list of available distance metrics.

    p : int or float, optional (default=2)
        Power parameter for the Minkowski metric. Used only if ``metric``
        is `minkowski`.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    float
        Estimated approximate entropy.

    References
    ----------
    .. [1] Pincus, S.M., Gladstone, I.M. & Ehrenkranz, R.A. A regularity
        statistic for medical data analysis. J Clin Monitor Comput 7,
        335–345 (1991). https://doi.org/10.1007/BF01619355
    .. [2] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
        for Automated Time-Series Phenotyping Using Massive Feature
        Extraction", Cell Systems 5: 527 (2017).
        DOI: 10.1016/j.cels.2017.10.001
    .. [3] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
        time-series analysis: the empirical structure of time series and
        their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
        DOI: 10.1098/rsif.2013.0048
    """
    def neigh_num(dim: int) -> float:
        """Mean log-proportion of radius neighbors per observation."""
        embed = _embed.embed_ts(ts_scaled, dim=dim, lag=embed_lag)
        dist_mat = scipy.spatial.distance.cdist(embed,
                                                embed,
                                                metric=metric,
                                                p=p)
        return np.mean(np.log(np.mean(dist_mat < threshold, axis=1)))

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    approx_entropy = neigh_num(embed_dim) - neigh_num(embed_dim + 1)

    return approx_entropy
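# Illustrative sketch (not part of the module API): approximate entropy for
# the same kind of series. Unlike the sample entropy sketch, self-matches are
# counted here, since 'cdist' compares the embedding against itself.
def _demo_approx_entropy(embed_dim: int = 2, threshold: float = 0.2) -> float:
    rng = np.random.RandomState(16)
    ts = rng.randn(256)
    ts = (ts - ts.mean()) / ts.std()

    def mean_log_neigh_frac(dim: int) -> float:
        emb = np.array([ts[i:i + dim] for i in range(ts.size - dim + 1)])
        dist_mat = scipy.spatial.distance.cdist(emb, emb, metric="chebyshev")
        return float(np.mean(np.log(np.mean(dist_mat < threshold, axis=1))))

    return mean_log_neigh_frac(embed_dim) - mean_log_neigh_frac(embed_dim + 1)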
def ft_ami_curvature(
    cls,
    ts: np.ndarray,
    noise_range: t.Tuple[float, float] = (0, 3),
    noise_inc_num: int = 10,
    lag: t.Optional[t.Union[str, int]] = None,
    random_state: t.Optional[int] = None,
    ts_scaled: t.Optional[np.ndarray] = None,
    max_nlags: t.Optional[int] = None,
    detrended_acfs: t.Optional[np.ndarray] = None,
) -> float:
    """Estimate the automutual information curvature.

    The automutual information curvature is estimated using an iterative
    noise amplification strategy.

    In the iterative noise amplification strategy, a random white noise
    is sampled from a normal distribution (mean 0 and variance 1). Then,
    this same noise is iteratively amplified at uniformly spaced scales
    within the ``noise_range`` interval and added to the time-series. The
    automutual information is calculated from the perturbed time-series
    for each noise amplification.

    The automutual information curvature is the slope of a linear
    regression of the automutual information onto the noise scales.

    The lag used for every iteration is fixed from the start and, if not
    fixed by the user, it is estimated from the autocorrelation function
    by default.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    noise_range : tuple of float, optional (default=(0, 3))
        A tuple of floats in the form (min_scale, max_scale) for the
        noise amplification range.

    noise_inc_num : int, optional (default=10)
        Number of noise amplifications. The ``noise_range`` interval will
        be split evenly into ``noise_inc_num`` parts.

    lag : int or str, optional
        Lag used to calculate the statistic. It must be a strictly
        positive value, None, or a string in {`acf`, `acf-nonsig`,
        `ami`}. In the latter options, the lag is estimated within this
        method using the given strategy (or, if None, the `acf-nonsig`
        strategy by default) up to ``max_nlags``.
            1. `acf`: the lag corresponds to the first non-positive value
               in the autocorrelation function.
            2. `acf-nonsig`: the lag corresponds to the first
               non-significant value in the autocorrelation function
               (absolute value below the critical value of
               1.96 / sqrt(ts.size)).
            3. `ami`: the lag corresponds to the first local minimum of
               the time-series automutual information function.

    random_state : int, optional
        Random seed to ensure reproducibility.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    max_nlags : int, optional
        If ``lag`` is None, then a single lag will be estimated from the
        first negative value of the detrended time-series
        autocorrelation function up to ``max_nlags``, if any. Otherwise,
        lag 1 will be used. Used only if ``detrended_acfs`` is None.

    detrended_acfs : :obj:`np.ndarray`, optional
        Array of time-series autocorrelation function values (for
        distinct ordered lags) of the detrended time-series. Used only if
        ``lag`` is None. If this argument is not given and the previous
        condition is met, the autocorrelation function will be calculated
        inside this method up to ``max_nlags``.

    Returns
    -------
    float
        Estimated automutual information curvature.

    References
    ----------
    .. [1] Fraser AM, Swinney HL. Independent coordinates for strange
        attractors from mutual information. Phys Rev A Gen Phys.
        1986;33(2):1134-1140. doi:10.1103/physreva.33.1134
    .. [2] B.D. Fulcher and N.S. Jones, "hctsa: A Computational Framework
        for Automated Time-Series Phenotyping Using Massive Feature
        Extraction", Cell Systems 5: 527 (2017).
        DOI: 10.1016/j.cels.2017.10.001
    .. [3] B.D. Fulcher, M.A. Little, N.S. Jones, "Highly comparative
        time-series analysis: the empirical structure of time series and
        their methods", J. Roy. Soc. Interface 10(83) 20130048 (2013).
        DOI: 10.1098/rsif.2013.0048
    .. [4] Thomas M. Cover and Joy A. Thomas. 1991. Elements of
        information theory. Wiley-Interscience, USA.
    """
    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    # Note: casting lag to an array since 'ft_ami_detrended' demands
    # a sequence of lags.
    _lag = np.asarray([
        _embed.embed_lag(
            ts=ts_scaled,
            lag=lag,
            max_nlags=max_nlags,
            detrended_acfs=detrended_acfs,
        )
    ])

    if random_state is not None:
        np.random.seed(random_state)

    # Note: the noise is fixed from the start, and amplified at each
    # iteration.
    gaussian_noise = np.random.randn(ts_scaled.size)

    noise_std = np.linspace(*noise_range, noise_inc_num)
    ami = np.zeros(noise_inc_num, dtype=float)

    for ind, cur_std in enumerate(noise_std):
        ts_corrupted = ts_scaled + cur_std * gaussian_noise

        ami[ind] = cls.ft_ami_detrended(ts=ts_corrupted,
                                        num_bins=32,
                                        lags=_lag,
                                        return_dist=False)

    model = sklearn.linear_model.LinearRegression().fit(
        X=noise_std.reshape(-1, 1), y=ami)

    curvature = model.coef_[0]

    return curvature
def embed_dim_fnn(
    ts: np.ndarray,
    lag: int,
    dims: t.Union[int, t.Sequence[int]] = 16,
    rtol: t.Union[int, float] = 10,
    atol: t.Union[int, float] = 2,
    ts_scaled: t.Optional[np.ndarray] = None,
) -> np.ndarray:
    """Estimate the False Nearest Neighbors proportion for each dimension.

    The False Nearest Neighbors method calculates the average number of
    false nearest neighbors of each time-series observation, given a
    fixed embedding dimension. A false nearest neighbor is a pair of
    instances that are far apart in the appropriate embedding dimension,
    but close together in a smaller dimension simply because both are
    projected into an inappropriate dimension.

    Sure enough, we could simply use a `sufficiently large` embedding
    dimension to remove any possibility of false nearest neighbors.
    However, this strategy implies a loss of computational efficiency,
    alongside all the statistical concerns that arise in high-dimensional
    data analysis. The idea behind analyzing the proportion of false
    neighbors is to estimate the minimum embedding dimension that makes
    only true neighbors be close together in that space. Thus, it is
    expected that, given the appropriate embedding dimension, the
    proportion of false neighbors will be close to zero.

    Differently from the reference paper, here we are using the Chebyshev
    distance (or maximum norm distance) rather than the Euclidean
    distance.

    Parameters
    ----------
    ts : :obj:`np.ndarray`
        One-dimensional time-series values.

    lag : int
        Embedding lag. You may want to check the `embed_lag` function
        documentation for embedding lag estimation. Must be a strictly
        positive value.

    dims : int or sequence of int
        Dimensions for which the FNN proportion is estimated. If integer,
        estimate all dimensions from 1 up to the given number. If a
        sequence of integers, estimate the FNN proportion for all given
        dimensions, and return the corresponding values in the same order
        as the given dimensions. All dimensions with non-positive values
        will receive a `np.nan`.

    rtol : float, optional (default=10)
        Relative tolerance between the distance $D_{d}$ of each
        observation to its nearest neighbor in a given dimension $d$, and
        the distance $D_{d+1}$ of the observation to the same nearest
        neighbor in the next embedding dimension. It is used in the first
        criterion from the reference paper to define which instances are
        false neighbors. The default value (10) is the value recommended
        by the original paper, and it means that nearest neighbors that
        are ten times farther away in the next dimension, relative to the
        distance in the current dimension, are considered false nearest
        neighbors.

    atol : float, optional (default=2)
        Number of time-series standard deviations that an observation and
        its nearest neighbor must be apart in the next dimension in order
        to be considered false neighbors. This is the reference paper's
        second criterion.

    ts_scaled : :obj:`np.ndarray`, optional
        Standardized time-series values. Used to take advantage of
        precomputations.

    Returns
    -------
    :obj:`np.ndarray`
        Proportion of false nearest neighbors for each given dimension.
        The union of both criteria is used to determine whether a pair of
        neighbors are false neighbors in a fixed embedding dimension
        (i.e., any pair of neighbors considered false by either criterion
        alone is considered false).

    References
    ----------
    .. [1] Determining embedding dimension for phase-space reconstruction
        using a geometrical construction, Kennel, Matthew B. and Brown,
        Reggie and Abarbanel, Henry D. I., Phys. Rev. A, volume 45, 1992,
        American Physical Society.
    """
    if lag <= 0:
        raise ValueError("'lag' must be positive (got {}).".format(lag))

    _dims: t.Sequence[int]

    if np.isscalar(dims):
        _dims = np.arange(1, int(dims) + 1)  # type: ignore

    else:
        _dims = np.asarray(dims, dtype=int)

    ts_scaled = _utils.standardize_ts(ts=ts, ts_scaled=ts_scaled)

    fnn_prop = np.zeros(len(_dims), dtype=float)

    # Note: since we are using the standardized time-series, its standard
    # deviation is always 1. However, we keep this variable to make clear
    # the correspondence between the reference paper's formulas and what
    # is programmed here.
    ts_std = 1.0  # = np.std(ts_scaled)

    for ind, dim in enumerate(_dims):
        try:
            emb_next = embed_ts(ts=ts_scaled, lag=lag, dim=dim + 1)
            emb_cur = emb_next[:, 1:]

        except ValueError:
            fnn_prop[ind] = np.nan
            continue

        nn_inds, dist_cur = nn(embed=emb_cur)

        emb_next_abs_diff = np.abs(emb_next[:, 0] - emb_next[nn_inds, 0])
        dist_next = np.maximum(dist_cur, emb_next_abs_diff)

        # Note: in the reference paper, there are three criteria for
        # determining what is a False Nearest Neighbor. The first and
        # second are related to, respectively, the `crit_1` and `crit_2`
        # variables. The third criterion is the union of the other two,
        # meaning that an observation is considered a False Neighbor if
        # either criterion accuses it as such. Here, we use the third
        # and, therefore, the most conservative criterion.
        crit_1 = emb_next_abs_diff > rtol * dist_cur
        crit_2 = dist_next > atol * ts_std

        fnn_prop[ind] = np.mean(np.logical_or(crit_1, crit_2))

    return fnn_prop
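# Illustrative sketch (not part of the module API): both FNN criteria above
# evaluated for a single dimension on a noisy sine, with scipy's cKDTree
# (Chebyshev norm) and the newest coordinate in the last embedding column.
def _demo_fnn_single_dim(dim: int = 2, lag: int = 4,
                         rtol: float = 10.0, atol: float = 2.0) -> float:
    rng = np.random.RandomState(16)
    ts = np.sin(np.linspace(0, 16 * np.pi, 512)) + 0.05 * rng.randn(512)
    ts = (ts - ts.mean()) / ts.std()

    n = ts.size - dim * lag
    emb_next = np.array([ts[i:i + (dim + 1) * lag:lag] for i in range(n)])
    emb_cur = emb_next[:, :-1]

    tree = scipy.spatial.cKDTree(emb_cur)
    dist_cur, nn_inds = tree.query(emb_cur, k=2, p=np.inf)
    dist_cur, nn_inds = dist_cur[:, 1], nn_inds[:, 1]  # Skip self-match.

    abs_diff = np.abs(emb_next[:, -1] - emb_next[nn_inds, -1])
    crit_1 = abs_diff > rtol * dist_cur
    crit_2 = np.maximum(dist_cur, abs_diff) > atol * 1.0  # ts_std == 1.

    return float(np.mean(np.logical_or(crit_1, crit_2)))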