def mad(a, c=Gaussian.ppf(3 / 4.), axis=0, center=np.median):  # c \approx .6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be
        applied via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')
    if not a.size:
        center = 0.0
    elif callable(center):
        center = np.apply_over_axes(center, a, axis)
    else:
        center = float_like(center, "center")
    return np.median((np.abs(a - center)) / c, axis=axis)
def test_float_like(floating):
    assert isinstance(float_like(floating, "floating"), float)
    assert isinstance(float_like(floating, "floating", optional=True), float)
    assert float_like(None, "floating", optional=True) is None
    if isinstance(floating, (int, np.integer, float, np.inexact)):
        assert isinstance(float_like(floating, "floating", strict=True), float)
        assert float_like(None, "floating", optional=True, strict=True) is None
def __init__(
    self,
    index: Union[Sequence[Hashable], pd.Index],
    *,
    period: Optional[Union[float, int]] = None,
    constant: bool = False,
    order: int = 0,
    seasonal: bool = False,
    fourier: int = 0,
    additional_terms: Sequence[DeterministicTerm] = (),
    drop: bool = False,
):
    if not isinstance(index, pd.Index):
        index = pd.Index(index)
    self._index = index
    self._deterministic_terms: List[DeterministicTerm] = []
    self._extendable = False
    self._index_freq = None
    self._validate_index()
    period = float_like(period, "period", optional=True)
    self._constant = constant = bool_like(constant, "constant")
    self._order = required_int_like(order, "order")
    self._seasonal = seasonal = bool_like(seasonal, "seasonal")
    self._fourier = required_int_like(fourier, "fourier")
    additional_terms = tuple(additional_terms)
    self._cached_in_sample = None
    self._drop = bool_like(drop, "drop")
    self._additional_terms = additional_terms
    if constant or order:
        self._deterministic_terms.append(TimeTrend(constant, order))
    if seasonal and fourier:
        raise ValueError(
            "seasonal and fourier cannot both be set through the "
            "constructor since these terms are necessarily perfectly "
            "collinear. Instead, you can pass additional components "
            "using the additional_terms input."
        )
    if (seasonal or fourier) and period is None:
        self._period = period = freq_to_period(self._index_freq)
    if seasonal:
        period = required_int_like(period, "period")
        self._deterministic_terms.append(Seasonality(period))
    elif fourier:
        period = float_like(period, "period")
        assert period is not None
        self._deterministic_terms.append(Fourier(period, order=fourier))
    for term in additional_terms:
        if not isinstance(term, DeterministicTerm):
            raise TypeError(
                "All additional terms must be instances of subclasses "
                "of DeterministicTerm"
            )
        if term not in self._deterministic_terms:
            self._deterministic_terms.append(term)
        else:
            raise ValueError(
                "One or more terms in additional_terms has been added "
                "through the parameters of the constructor. Terms must "
                "be unique."
            )
    self._period = period
    self._retain_cols: Optional[List[Hashable]] = None
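# Usage sketch for the constructor above (hedged: it assumes the class is
# statsmodels.tsa.deterministic.DeterministicProcess and that the monthly
# PeriodIndex lets the seasonal period be inferred from the index frequency).
import pandas as pd
from statsmodels.tsa.deterministic import DeterministicProcess

index = pd.period_range("2000-01", periods=48, freq="M")
dp = DeterministicProcess(index, constant=True, order=1, seasonal=True,
                          drop=True)
terms = dp.in_sample()  # DataFrame of constant, trend and seasonal columns
print(terms.head())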
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution. Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.

    Returns
    -------
    The normalized interquartile range
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")
    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    elif a.size == 0:
        return np.nan
    else:
        quantiles = np.quantile(a, [0.25, 0.75], axis=axis)
        return np.squeeze(np.diff(quantiles, axis=0) / c)
def mad(a, c=Gaussian.ppf(3 / 4.0), axis=0, center=np.median):
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately 0.6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be
        applied via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = array_like(a, "a", ndim=None)
    c = float_like(c, "c")
    if not a.size:
        center_val = 0.0
    elif callable(center):
        if axis is not None:
            center_val = np.apply_over_axes(center, a, axis)
        else:
            center_val = center(a.ravel())
    else:
        center_val = float_like(center, "center")
    err = (np.abs(a - center_val)) / c
    if not err.size:
        if axis is None or err.ndim == 1:
            return np.nan
        else:
            shape = list(err.shape)
            shape.pop(axis)
            return np.empty(shape)
    return np.median(err, axis=axis)
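# Quick consistency check for `mad` (a sketch, assuming the import path
# statsmodels.robust.scale.mad): with the default c, the MAD of normal data
# should be close to the standard deviation.
import numpy as np
from statsmodels.robust.scale import mad

rng = np.random.default_rng(0)
x = rng.standard_normal(100_000)
print(mad(x))     # approximately 1.0 for standard normal data
print(np.std(x))  # compare against the sample standard deviation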
def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
    """
    Computes the Qn robust estimator of scale

    The Qn scale estimator is a more efficient alternative to the MAD.
    The Qn scale estimator of an array a of length n is defined as
    c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2.
    Thus, the Qn estimator is the k-th order statistic of the absolute
    differences of the array. The optional constant is used to normalize
    the estimate as explained below. The implementation follows the
    algorithm described in Croux and Rousseeuw (1992).

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant. The default value is used to get
        consistent estimates of the standard deviation at the normal
        distribution.
    axis : int, optional
        The default is 0.

    Returns
    -------
    {float, ndarray}
        The Qn robust estimator of scale
    """
    a = array_like(
        a, "a", ndim=None, dtype=np.float64, contiguous=True, order="C"
    )
    c = float_like(c, "c")
    if a.ndim == 0:
        raise ValueError("a should have at least one dimension")
    elif a.size == 0:
        return np.nan
    else:
        out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
        if out.ndim == 0:
            return float(out)
        return out
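# Sketch of the normalization described in the docstring above (hedged: it
# assumes statsmodels.robust.scale.qn_scale is available): at the normal
# distribution the Qn estimator should also track the standard deviation.
import numpy as np
from statsmodels.robust.scale import qn_scale

rng = np.random.default_rng(1)
x = rng.standard_normal(50_000)
print(qn_scale(x))  # close to 1.0, like mad() and iqr() on the same data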
def iqr(a, c=Gaussian.ppf(3 / 4) - Gaussian.ppf(1 / 4), axis=0,
        center=np.median):
    """
    The normalized interquartile range along given axis of an array

    Parameters
    ----------
    a : array_like
        Input array.
    c : float, optional
        The normalization constant, used to get consistent estimates of the
        standard deviation at the normal distribution. Defined as
        scipy.stats.norm.ppf(3/4.) - scipy.stats.norm.ppf(1/4.), which is
        approximately 1.349.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median` then it
        is expected to be called center(a). The axis argument will be
        applied via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    The normalized interquartile range
    """
    a = array_like(a, 'a', ndim=None)
    c = float_like(c, 'c')
    if a.size == 0:
        return np.nan
    else:
        if callable(center):
            center = np.apply_over_axes(center, a, axis)
        else:
            center = float_like(center, "center")
        quantiles = np.quantile(a - center, [0.25, 0.75], axis=axis)
        return np.squeeze(np.diff(quantiles, axis=0) / c)
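# Sketch showing why c = ppf(3/4) - ppf(1/4) makes `iqr` a consistent scale
# estimate at the normal distribution (hedged: it assumes the function is
# importable as statsmodels.robust.scale.iqr).
import numpy as np
from scipy import stats
from statsmodels.robust.scale import iqr

c = stats.norm.ppf(0.75) - stats.norm.ppf(0.25)
print(round(c, 3))  # ~1.349, the default normalization constant

rng = np.random.default_rng(2)
x = rng.standard_normal(100_000)
print(iqr(x))  # approximately 1.0 for standard normal data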
def __init__(
    self,
    data: Union[np.ndarray, pd.Series, pd.DataFrame],
    stats: Sequence[str] = None,
    *,
    numeric: bool = True,
    categorical: bool = True,
    alpha: float = 0.05,
    use_t: bool = False,
    percentiles: Sequence[Union[int, float]] = PERCENTILES,
    ntop: int = 5,
):
    data_arr = data
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data_arr = array_like(data, "data", maxdim=2)
    if data_arr.ndim == 1:
        data = pd.Series(data)
    numeric = bool_like(numeric, "numeric")
    categorical = bool_like(categorical, "categorical")
    include = []
    col_types = ""
    if numeric:
        include.append(np.number)
        col_types = "numeric"
    if categorical:
        include.append("category")
        col_types += " and " if col_types != "" else ""
        col_types += "categorical"
    if not numeric and not categorical:
        raise ValueError(
            "At least one of numeric and categorical must be True"
        )
    self._data = pd.DataFrame(data).select_dtypes(include)
    if self._data.shape[1] == 0:
        raise ValueError(
            f"Selecting {col_types} results in an empty DataFrame"
        )
    self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
    self._is_cat_like = [
        is_categorical_dtype(dt) for dt in self._data.dtypes
    ]

    if stats is not None:
        undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
        if undef:
            raise ValueError(
                f"{', '.join(undef)} are not known statistics"
            )
    self._stats = (
        list(DEFAULT_STATISTICS) if stats is None else list(stats)
    )
    self._ntop = int_like(ntop, "ntop")
    self._compute_top = "top" in self._stats
    self._compute_freq = "freq" in self._stats
    if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
        raise ValueError("top must be a non-negative integer")
    self._compute_perc = "percentiles" in self._stats
    self._percentiles = array_like(
        percentiles, "percentiles", maxdim=1, dtype="d"
    )
    self._percentiles = np.sort(self._percentiles)
    if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
        raise ValueError("percentiles must be distinct")
    if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
        raise ValueError("percentiles must be strictly between 0 and 100")

    # Expand special stats
    replacements = {
        "mode": ["mode", "mode_freq"],
        "ci": ["upper_ci", "lower_ci"],
        "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
        "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
        "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
        "percentiles": [f"{i}%" for i in percentiles],
    }

    for key in replacements:
        if key in self._stats:
            idx = self._stats.index(key)
            self._stats = (
                self._stats[:idx] + replacements[key] + self._stats[idx + 1:]
            )

    self._alpha = float_like(alpha, "alpha")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    self._use_t = bool_like(use_t, "use_t")
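# Usage sketch for the constructor above (hedged: it assumes the class is
# statsmodels.stats.descriptivestats.Description and that the stats names
# below are members of its DEFAULT_STATISTICS).
import numpy as np
import pandas as pd
from statsmodels.stats.descriptivestats import Description

df = pd.DataFrame({
    "x": np.random.default_rng(3).standard_normal(200),
    "cat": pd.Categorical(["a", "b"] * 100),
})
desc = Description(df, stats=["nobs", "mean", "std", "percentiles"])
print(desc.frame)  # summary table; .summary() returns a text version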
def test_not_float_like(not_floating):
    with pytest.raises(TypeError):
        float_like(not_floating, "floating")
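# Illustrative sketch (not part of the test suite): the behavior the
# float_like tests above exercise. It assumes
# statsmodels.tools.validation.float_like is importable.
import numpy as np
from statsmodels.tools.validation import float_like

assert float_like(3, "x") == 3.0                     # ints are coerced to float
assert float_like(np.float32(2.5), "x") == 2.5       # numpy floats are accepted
assert float_like(None, "x", optional=True) is None  # None passes when optional
try:
    float_like("not a number", "x")
except TypeError as exc:
    print(exc)  # non-numeric input raises TypeError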
def fit(self, smoothing_level=None, smoothing_slope=None,
        smoothing_seasonal=None, damping_slope=None, optimized=True,
        use_boxcox=False, remove_bias=False, use_basinhopping=False,
        start_params=None, initial_level=None, initial_slope=None,
        use_brute=True):
    """
    Fit the model

    Parameters
    ----------
    smoothing_level : float, optional
        The alpha value of the simple exponential smoothing. If set, this
        value is used rather than estimated.
    smoothing_slope : float, optional
        The beta value of Holt's trend method. If set, this value is used
        rather than estimated.
    smoothing_seasonal : float, optional
        The gamma value of the Holt-Winters seasonal method. If set, this
        value is used rather than estimated.
    damping_slope : float, optional
        The phi value of the damped method. If set, this value is used
        rather than estimated.
    optimized : bool, optional
        Estimate model parameters by maximizing the log-likelihood.
    use_boxcox : {True, False, 'log', float}, optional
        Should the Box-Cox transform be applied to the data first? If 'log'
        then apply the log. If float then use lambda equal to float.
    remove_bias : bool, optional
        Remove bias from forecast values and fitted values by enforcing
        that the average residual is equal to zero.
    use_basinhopping : bool, optional
        Use the Basin Hopping optimizer to find optimal values.
    start_params : ndarray, optional
        Starting values to use when optimizing the fit. If not provided,
        starting values are determined using a combination of grid search
        and reasonable values based on the initial values of the data.
    initial_level : float, optional
        Value to use when initializing the fitted level.
    initial_slope : float, optional
        Value to use when initializing the fitted slope.
    use_brute : bool, optional
        Search for good starting values using a brute force (grid)
        optimizer. If False, a naive set of starting values is used.

    Returns
    -------
    results : HoltWintersResults class
        See statsmodels.tsa.holtwinters.HoltWintersResults

    Notes
    -----
    This is a full implementation of the Holt-Winters exponential smoothing
    as per [1]. This includes all the unstable methods as well as the
    stable methods. The implementation of the library covers the
    functionality of the R library as much as possible whilst still being
    Pythonic.

    References
    ----------
    [1] Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles
        and practice. OTexts, 2014.
""" # Variable renames to alpha,beta, etc as this helps with following the # mathematical notation in general alpha = float_like(smoothing_level, 'smoothing_level', True) beta = float_like(smoothing_slope, 'smoothing_slope', True) gamma = float_like(smoothing_seasonal, 'smoothing_seasonal', True) phi = float_like(damping_slope, 'damping_slope', True) l0 = self._l0 = float_like(initial_level, 'initial_level', True) b0 = self._b0 = float_like(initial_slope, 'initial_slope', True) if start_params is not None: start_params = array_like(start_params, 'start_params', contiguous=True) data = self._data damped = self.damped seasoning = self.seasoning trending = self.trending trend = self.trend seasonal = self.seasonal m = self.seasonal_periods opt = None phi = phi if damped else 1.0 if use_boxcox == 'log': lamda = 0.0 y = boxcox(data, lamda) elif isinstance(use_boxcox, float): lamda = use_boxcox y = boxcox(data, lamda) elif use_boxcox: y, lamda = boxcox(data) else: lamda = None y = data.squeeze() self._y = y lvls = np.zeros(self.nobs) b = np.zeros(self.nobs) s = np.zeros(self.nobs + m - 1) p = np.zeros(6 + m) max_seen = np.finfo(np.double).max l0, b0, s0 = self.initial_values() xi = np.zeros_like(p, dtype=np.bool) if optimized: init_alpha = alpha if alpha is not None else 0.5 / max(m, 1) init_beta = beta if beta is not None else 0.1 * init_alpha if trending else beta init_gamma = None init_phi = phi if phi is not None else 0.99 # Selection of functions to optimize for appropriate parameters if seasoning: init_gamma = gamma if gamma is not None else 0.05 * \ (1 - init_alpha) xi = np.array([ alpha is None, trending and beta is None, gamma is None, initial_level is None, trending and initial_slope is None, phi is None and damped ] + [True] * m) func = SMOOTHERS[(seasonal, trend)] elif trending: xi = np.array([ alpha is None, beta is None, False, initial_level is None, initial_slope is None, phi is None and damped ] + [False] * m) func = SMOOTHERS[(None, trend)] else: xi = np.array([ alpha is None, False, False, initial_level is None, False, False ] + [False] * m) func = SMOOTHERS[(None, None)] p[:] = [init_alpha, init_beta, init_gamma, l0, b0, init_phi] + s0 if np.any(xi): # txi [alpha, beta, gamma, l0, b0, phi, s0,..,s_(m-1)] # Have a quick look in the region for a good starting place for alpha etc. 
            # using guesstimates for the levels
            txi = xi & np.array([True, True, True, False, False, True] +
                                [False] * m)
            txi = txi.astype(bool)
            bounds = ([(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, None),
                       (0.0, None), (0.0, 1.0)] + [(None, None)] * m)
            args = (txi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs,
                    max_seen)
            if start_params is None and np.any(txi) and use_brute:
                _bounds = [bnd for bnd, flag in zip(bounds, txi) if flag]
                res = brute(func, _bounds, args, Ns=20,
                            full_output=True, finish=None)
                p[txi], max_seen, _, _ = res
            else:
                if start_params is not None:
                    if len(start_params) != xi.sum():
                        msg = ('start_params must have {0} values but '
                               'has {1} instead')
                        nxi, nsp = int(xi.sum()), len(start_params)
                        raise ValueError(msg.format(nxi, nsp))
                    p[xi] = start_params
                args = (xi.astype(np.uint8), p, y, lvls, b, s, m,
                        self.nobs, max_seen)
                max_seen = func(np.ascontiguousarray(p[xi]), *args)
            # alpha, beta, gamma, l0, b0, phi = p[:6]
            # s0 = p[6:]
            # bounds = np.array([(0.0,1.0),(0.0,1.0),(0.0,1.0),(0.0,None),
            #                    (0.0,None),(0.8,1.0)] + [(None,None),]*m)
            args = (xi.astype(np.uint8), p, y, lvls, b, s, m, self.nobs,
                    max_seen)
            if use_basinhopping:
                # Take a deeper look in the local minimum we are in to find
                # the best solution to parameters, maybe hop around to try
                # escape the local minimum we may be in.
                _bounds = [bnd for bnd, flag in zip(bounds, xi) if flag]
                res = basinhopping(func, p[xi],
                                   minimizer_kwargs={'args': args,
                                                     'bounds': _bounds},
                                   stepsize=0.01)
                success = res.lowest_optimization_result.success
            else:
                # Take a deeper look in the local minimum we are in to find
                # the best solution to parameters
                _bounds = [bnd for bnd, flag in zip(bounds, xi) if flag]
                lb, ub = np.asarray(_bounds).T.astype(float)
                initial_p = p[xi]

                # Ensure strictly inbounds
                loc = initial_p <= lb
                upper = ub[loc].copy()
                upper[~np.isfinite(upper)] = 100.0
                eps = 1e-4
                initial_p[loc] = lb[loc] + eps * (upper - lb[loc])

                loc = initial_p >= ub
                lower = lb[loc].copy()
                lower[~np.isfinite(lower)] = -100.0
                eps = 1e-4
                initial_p[loc] = ub[loc] - eps * (ub[loc] - lower)

                res = minimize(func, initial_p, args=args, bounds=_bounds)
                success = res.success
            if not success:
                from warnings import warn
                from statsmodels.tools.sm_exceptions import ConvergenceWarning
                warn("Optimization failed to converge. Check mle_retvals.",
                     ConvergenceWarning)
            p[xi] = res.x
            opt = res
        else:
            from warnings import warn
            from statsmodels.tools.sm_exceptions import EstimationWarning
            message = ("Model has no free parameters to estimate. Set "
                       "optimized=False to suppress this warning")
            warn(message, EstimationWarning)

        [alpha, beta, gamma, l0, b0, phi] = p[:6]
        s0 = p[6:]

    hwfit = self._predict(h=0, smoothing_level=alpha, smoothing_slope=beta,
                          smoothing_seasonal=gamma, damping_slope=phi,
                          initial_level=l0, initial_slope=b0,
                          initial_seasons=s0, use_boxcox=use_boxcox,
                          remove_bias=remove_bias, is_optimized=xi)
    hwfit._results.mle_retvals = opt
    return hwfit
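# Usage sketch for the `fit` method above (hedged: it assumes the older
# statsmodels.tsa.holtwinters.ExponentialSmoothing API; only keyword names
# shared across versions are used here).
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

rng = np.random.default_rng(4)
y = pd.Series(10 + 0.1 * np.arange(120) + rng.normal(scale=0.5, size=120))
model = ExponentialSmoothing(y, trend="add")
res = model.fit(smoothing_level=0.2, optimized=True)  # fix alpha, estimate rest
print(res.params["smoothing_level"])
print(res.fittedvalues.tail())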
def fit(
    self,
    kernel="gau",
    bw="normal_reference",
    fft=True,
    weights=None,
    gridsize=None,
    adjust=1,
    cut=3,
    clip=(-np.inf, np.inf),
):
    """
    Attach the density estimate to the KDEUnivariate class.

    Parameters
    ----------
    kernel : str
        The Kernel to be used. Choices are:

        - "biw" for biweight
        - "cos" for cosine
        - "epa" for Epanechnikov
        - "gau" for Gaussian.
        - "tri" for triangular
        - "triw" for triweight
        - "uni" for uniform

    bw : str, float, callable
        The bandwidth to use. Choices are:

        - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "silverman" - .9 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "normal_reference" - C * A * nobs ** (-1/5.), where C is
          calculated from the kernel. Equivalent (up to 2 dp) to the
          "scott" bandwidth for gaussian kernels. See bandwidths.py
        - If a float is given, its value is used as the bandwidth.
        - If a callable is given, its return value is used.
          The callable should take exactly two parameters, i.e.,
          fn(x, kern), and return a float, where:

          * x - the clipped input data
          * kern - the kernel instance used

    fft : bool
        Whether or not to use FFT. FFT implementation is more
        computationally efficient. However, only the Gaussian kernel
        is implemented. If FFT is False, then a 'nobs' x 'gridsize'
        intermediate array is created.
    gridsize : int
        If gridsize is None, max(len(x), 50) is used.
    cut : float
        Defines the length of the grid past the lowest and highest values
        of x so that the kernel goes to zero. The end points are
        -/+ cut*bw*{min(x) or max(x)}
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.

    Returns
    -------
    KDEUnivariate
        The instance fit.
    """
    if isinstance(bw, str):
        self.bw_method = bw
    else:
        self.bw_method = "user-given"
        if not callable(bw):
            bw = float_like(bw, "bw")
    endog = self.endog

    if fft:
        if kernel != "gau":
            msg = "Only gaussian kernel is available for fft"
            raise NotImplementedError(msg)
        if weights is not None:
            msg = "Weights are not implemented for fft"
            raise NotImplementedError(msg)
        density, grid, bw = kdensityfft(
            endog,
            kernel=kernel,
            bw=bw,
            adjust=adjust,
            weights=weights,
            gridsize=gridsize,
            clip=clip,
            cut=cut,
        )
    else:
        density, grid, bw = kdensity(
            endog,
            kernel=kernel,
            bw=bw,
            adjust=adjust,
            weights=weights,
            gridsize=gridsize,
            clip=clip,
            cut=cut,
        )
    self.density = density
    self.support = grid
    self.bw = bw
    self.kernel = kernel_switch[kernel](h=bw)  # we instantiate twice,
    # should this be passed to funcs?
    # put here to ensure empty cache after re-fit with new options
    self.kernel.weights = weights
    if weights is not None:
        self.kernel.weights /= weights.sum()
    self._cache = {}
    return self
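# Usage sketch for the `fit` method above (hedged: it assumes the class is
# statsmodels.nonparametric.kde.KDEUnivariate, using the FFT path and the
# Gaussian kernel as in the default arguments).
import numpy as np
from statsmodels.nonparametric.kde import KDEUnivariate

rng = np.random.default_rng(5)
data = rng.standard_normal(500)
kde = KDEUnivariate(data)
kde.fit(kernel="gau", bw="normal_reference", fft=True)
print(kde.bw)           # selected bandwidth
print(kde.support[:3])  # grid points
print(kde.density[:3])  # density estimates on the grid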
def kdensityfft(
    x,
    kernel="gau",
    bw="normal_reference",
    weights=None,
    gridsize=None,
    adjust=1,
    clip=(-np.inf, np.inf),
    cut=3,
    retgrid=True,
):
    """
    Rosenblatt-Parzen univariate kernel density estimator

    Parameters
    ----------
    x : array_like
        The variable for which the density estimate is desired.
    kernel : str
        ONLY GAUSSIAN IS CURRENTLY IMPLEMENTED.
        "bi" for biweight
        "cos" for cosine
        "epa" for Epanechnikov, default
        "epa2" for alternative Epanechnikov
        "gau" for Gaussian.
        "par" for Parzen
        "rect" for rectangular
        "tri" for triangular
    bw : str, float, callable
        The bandwidth to use. Choices are:

        - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "silverman" - .9 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "normal_reference" - C * A * nobs ** (-1/5.), where C is
          calculated from the kernel. Equivalent (up to 2 dp) to the
          "scott" bandwidth for gaussian kernels. See bandwidths.py
        - If a float is given, its value is used as the bandwidth.
        - If a callable is given, its return value is used.
          The callable should take exactly two parameters, i.e.,
          fn(x, kern), and return a float, where:

          * x - the clipped input data
          * kern - the kernel instance used
    weights : array or None
        WEIGHTS ARE NOT CURRENTLY IMPLEMENTED.
        Optional weights. If the x value is clipped, then this weight is
        also dropped.
    gridsize : int
        If gridsize is None, max(len(x), 512) is used. Note that the
        provided number is rounded up to the next highest power of 2.
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.
    clip : tuple
        Observations in x that are outside of the range given by clip are
        dropped. The number of observations in x is then shortened.
    cut : float
        Defines the length of the grid past the lowest and highest values
        of x so that the kernel goes to zero. The end points are
        -/+ cut*bw*{x.min() or x.max()}
    retgrid : bool
        Whether or not to return the grid over which the density is
        estimated.

    Returns
    -------
    density : ndarray
        The densities estimated at the grid points.
    grid : ndarray, optional
        The grid points at which the density is estimated.

    Notes
    -----
    Only the default kernel is implemented. Weights are not implemented
    yet. This follows Silverman (1982) with changes suggested by Jones and
    Lotwick (1984). However, the discretization step is replaced by linear
    binning of Fan and Marron (1994). This should be extended to accept the
    parts that are dependent only on the data to speed things up for
    cross-validation.

    References
    ----------
    Fan, J. and J.S. Marron. (1994) `Fast implementations of nonparametric
        curve estimators`. Journal of Computational and Graphical
        Statistics. 3.1, 35-56.
    Jones, M.C. and H.W. Lotwick. (1984) `Remark AS R50: A Remark on
        Algorithm AS 176. Kernel Density Estimation Using the Fast Fourier
        Transform`. Journal of the Royal Statistical Society. Series C.
        33.1, 120-2.
    Silverman, B.W. (1982) `Algorithm AS 176. Kernel density estimation
        using the Fast Fourier Transform.` Journal of the Royal Statistical
        Society. Series C. 31.2, 93-9.
    """
    x = np.asarray(x)
    # will not work for two columns.
    x = x[np.logical_and(x > clip[0], x < clip[1])]

    # Get kernel object corresponding to selection
    kern = kernel_switch[kernel]()

    if callable(bw):
        # user passed a callable custom bandwidth function
        bw = float(bw(x, kern))
    elif isinstance(bw, str):
        # if bw is None, select optimal bandwidth for kernel
        bw = bandwidths.select_bandwidth(x, bw, kern)
        # will cross-val fit this pattern?
    else:
        bw = float_like(bw, "bw")

    bw *= adjust

    nobs = len(x)  # after trim

    # 1 Make grid and discretize the data
    if gridsize is None:
        gridsize = np.max((nobs, 512.0))
    gridsize = 2 ** np.ceil(np.log2(gridsize))  # round to next power of 2

    a = np.min(x) - cut * bw
    b = np.max(x) + cut * bw
    grid, delta = np.linspace(a, b, int(gridsize), retstep=True)
    RANGE = b - a

    # TODO: Fix this?
    # This is the Silverman binning function, but I believe it's buggy (SS)
    # weighting according to Silverman
    # count = counts(x, grid)
    # binned = np.zeros_like(grid)    # xi_{k} in Silverman
    # j = 0
    # for k in range(int(gridsize - 1)):
    #     if count[k] > 0:  # there are points of x in the grid here
    #         Xingrid = x[j:j + count[k]]  # get all these points
    #         # get weights at grid[k], grid[k+1]
    #         binned[k] += np.sum(grid[k + 1] - Xingrid)
    #         binned[k + 1] += np.sum(Xingrid - grid[k])
    #         j += count[k]
    # binned /= (nobs) * delta ** 2  # normalize binned to sum to 1/delta

    # NOTE: THE ABOVE IS WRONG, JUST TRY WITH LINEAR BINNING
    binned = fast_linbin(x, a, b, gridsize) / (delta * nobs)

    # step 2 compute FFT of the weights, using Munro (1976) FFT convention
    y = forrt(binned)

    # step 3 and 4 for optimal bw compute zstar and the density estimate f
    # do not have to redo the above if just changing bw, ie., for cross val

    # NOTE: silverman_transform is the closed form solution of the FFT of the
    # gaussian kernel. Not yet sure how to generalize it.
    zstar = silverman_transform(bw, gridsize, RANGE) * y  # 3.49 in Silverman
    f = revrt(zstar)  # 3.50 w Gaussian kernel
    if retgrid:
        return f, grid, bw
    else:
        return f, bw
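# Direct-call sketch for kdensityfft (hedged: it assumes the function is
# exported from statsmodels.nonparametric.kde; it is normally reached through
# KDEUnivariate.fit(fft=True) rather than called directly).
import numpy as np
from statsmodels.nonparametric.kde import kdensityfft

rng = np.random.default_rng(6)
x = np.concatenate([rng.normal(-2, 1, 400), rng.normal(2, 0.5, 400)])
f, grid, bw = kdensityfft(x, kernel="gau", bw="normal_reference")
print(len(grid), bw)                   # grid length is rounded up to a power of 2
print(f.sum() * (grid[1] - grid[0]))   # density integrates to roughly 1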
def __init__(self, data, ncomp=None, standardize=True, demean=True,
             normalize=True, gls=False, weights=None, method='svd',
             missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
             max_em_iter=100):
    self._index = None
    self._columns = []
    if isinstance(data, pd.DataFrame):
        self._index = data.index
        self._columns = data.columns

    self.data = array_like(data, "data", ndim=2)
    # Store inputs
    self._gls = bool_like(gls, "gls")
    self._normalize = bool_like(normalize, "normalize")
    self._tol = float_like(tol, "tol")
    if not 0 < self._tol < 1:
        raise ValueError('tol must be strictly between 0 and 1')
    self._max_iter = int_like(max_iter, "max_iter")
    self._max_em_iter = int_like(max_em_iter, "max_em_iter")
    self._tol_em = float_like(tol_em, "tol_em")

    # Prepare data
    self._standardize = bool_like(standardize, "standardize")
    self._demean = bool_like(demean, "demean")

    self._nobs, self._nvar = self.data.shape
    weights = array_like(weights, "weights", maxdim=1, optional=True)
    if weights is None:
        weights = np.ones(self._nvar)
    else:
        weights = np.array(weights).flatten()
        if weights.shape[0] != self._nvar:
            raise ValueError('weights should have nvar elements')
        weights = weights / np.sqrt((weights ** 2.0).mean())
    self.weights = weights

    # Check ncomp against maximum
    min_dim = min(self._nobs, self._nvar)
    self._ncomp = min_dim if ncomp is None else ncomp
    if self._ncomp > min_dim:
        import warnings

        warn = 'The requested number of components is more than can be ' \
               'computed from data. The maximum number of components is ' \
               'the minimum of the number of observations or variables'
        warnings.warn(warn, ValueWarning)
        self._ncomp = min_dim

    self._method = method
    # Workaround to avoid instance methods in __dict__
    if self._method not in ('eig', 'svd', 'nipals'):
        raise ValueError('method {0} is not known.'.format(method))

    self.rows = np.arange(self._nobs)
    self.cols = np.arange(self._nvar)
    # Handle missing
    self._missing = string_like(missing, "missing", optional=True)
    self._adjusted_data = self.data
    self._adjust_missing()

    # Update size
    self._nobs, self._nvar = self._adjusted_data.shape
    if self._ncomp == np.min(self.data.shape):
        self._ncomp = np.min(self._adjusted_data.shape)
    elif self._ncomp > np.min(self._adjusted_data.shape):
        raise ValueError('When adjusting for missing values, user '
                         'provided ncomp must be no larger than the '
                         'smallest dimension of the '
                         'missing-value-adjusted data size.')

    # Attributes and internal values
    self._tss = 0.0
    self._ess = None
    self.transformed_data = None
    self._mu = None
    self._sigma = None
    self._ess_indiv = None
    self._tss_indiv = None
    self.scores = self.factors = None
    self.loadings = None
    self.coeff = None
    self.eigenvals = None
    self.eigenvecs = None
    self.projection = None
    self.rsquare = None
    self.ic = None

    # Prepare data
    self.transformed_data = self._prepare_data()
    # Perform the PCA
    self._pca()
    if gls:
        self._compute_gls_weights()
        self.transformed_data = self._prepare_data()
        self._pca()

    # Final calculations
    self._compute_rsquare_and_ic()
    if self._index is not None:
        self._to_pandas()
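# Usage sketch for the constructor above (hedged: it assumes the class is
# statsmodels.multivariate.pca.PCA, using the svd method and standardized,
# demeaned data as in the defaults).
import numpy as np
from statsmodels.multivariate.pca import PCA

rng = np.random.default_rng(7)
x = rng.standard_normal((200, 10))
pc = PCA(x, ncomp=3, standardize=True, demean=True, method="svd")
print(pc.factors.shape)   # (200, 3) principal component scores
print(pc.loadings.shape)  # (10, 3) variable loadings
print(pc.rsquare[:4])     # cumulative R-square using 0..3 components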
def forecast(self, steps: int = 1, theta: float = 2) -> pd.Series:
    r"""
    Forecast the model for a given theta

    Parameters
    ----------
    steps : int
        The number of steps ahead to compute the forecast components.
    theta : float
        The theta value to use when computing the weight to combine
        the trend and the SES forecasts.

    Returns
    -------
    Series
        A Series containing the forecasts

    Notes
    -----
    The forecast is computed as

    .. math::

       \hat{X}_{T+h|T} = \frac{\theta-1}{\theta} b_0
                         \left[h - 1 + \frac{1}{\alpha}
                         - \frac{(1-\alpha)^T}{\alpha}\right]
                         + \tilde{X}_{T+h|T}

    where :math:`\tilde{X}_{T+h|T}` is the SES forecast of the endogenous
    variable using the parameter :math:`\alpha`. :math:`b_0` is the slope
    of a time trend line fitted to X using the terms 0, 1, ..., T-1.

    This expression follows from [1]_ and [2]_ when the combination
    weights are restricted to be (theta-1)/theta and 1/theta. This nests
    the original implementation when theta=2 and the two weights are both
    1/2.

    References
    ----------
    .. [1] Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method.
       International Journal of Forecasting, 19(2), 287-290.
    .. [2] Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos,
       F. (2015). The optimized theta method. arXiv preprint
       arXiv:1503.03529.
    """
    steps = int_like(steps, "steps")
    if steps < 1:
        raise ValueError("steps must be a positive integer")
    theta = float_like(theta, "theta")
    if theta < 1:
        raise ValueError("theta must be a float >= 1")
    thresh = 4.0 / np.finfo(np.double).eps
    trend_weight = (theta - 1) / theta if theta < thresh else 1.0
    comp = self.forecast_components(steps=steps)
    fcast = trend_weight * comp.trend + np.asarray(comp.ses)
    # Re-seasonalize if needed
    if self.model.deseasonalize:
        seasonal = np.asarray(comp.seasonal)
        if self.model.method.startswith("mul"):
            fcast *= seasonal
        else:
            fcast += seasonal
    fcast.name = "forecast"

    return fcast
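# Worked sketch of the trend term in the docstring formula above (hedged:
# it recomputes (theta - 1)/theta * b0 * [h - 1 + 1/alpha - (1 - alpha)**T / alpha]
# with stand-in values for b0, alpha and T, not values taken from a fitted model).
import numpy as np

theta, alpha, b0, T = 2.0, 0.3, 0.05, 100
h = np.arange(1, 6)  # forecast horizons 1..5
trend_weight = (theta - 1) / theta
drift = trend_weight * b0 * (h - 1 + 1 / alpha - (1 - alpha) ** T / alpha)
print(drift)  # the amount added to the SES forecast at each horizon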
def __init__(self, endog, trend=False, damped_trend=False, seasonal=None,
             initialization_method='estimated', initial_level=None,
             initial_trend=None, initial_seasonal=None, bounds=None,
             concentrate_scale=True, dates=None, freq=None,
             missing='none'):
    # Model definition
    self.trend = bool_like(trend, 'trend')
    self.damped_trend = bool_like(damped_trend, 'damped_trend')
    self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True)
    self.seasonal = self.seasonal_periods is not None
    self.initialization_method = string_like(
        initialization_method, 'initialization_method').lower()
    self.concentrate_scale = bool_like(concentrate_scale,
                                       'concentrate_scale')

    # TODO: add validation for bounds (e.g. have all bounds, upper > lower)
    # TODO: add `bounds_method` argument to choose between "usual" and
    # "admissible" as in Hyndman et al. (2008)
    self.bounds = bounds
    if self.bounds is None:
        self.bounds = [(1e-4, 1 - 1e-4)] * 3 + [(0.8, 0.98)]

    # Validation
    if self.seasonal_periods == 1:
        raise ValueError('Cannot have a seasonal period of 1.')

    if self.seasonal and self.seasonal_periods is None:
        raise NotImplementedError('Unable to detect season automatically;'
                                  ' please specify `seasonal_periods`.')

    if self.initialization_method not in ['concentrated', 'estimated',
                                          'simple', 'heuristic', 'known']:
        raise ValueError('Invalid initialization method "%s".'
                         % initialization_method)

    if self.initialization_method == 'known':
        if initial_level is None:
            raise ValueError('`initial_level` argument must be provided'
                             ' when initialization method is set to'
                             ' "known".')
        if initial_trend is None and self.trend:
            raise ValueError('`initial_trend` argument must be provided'
                             ' for models with a trend component when'
                             ' initialization method is set to "known".')
        if initial_seasonal is None and self.seasonal:
            raise ValueError('`initial_seasonal` argument must be provided'
                             ' for models with a seasonal component when'
                             ' initialization method is set to "known".')

    # Initialize the state space model
    if not self.seasonal or self.seasonal_periods is None:
        self._seasonal_periods = 0
    else:
        self._seasonal_periods = self.seasonal_periods

    k_states = 2 + int(self.trend) + self._seasonal_periods
    k_posdef = 1

    init = ss_init.Initialization(k_states, 'known',
                                  constant=[0] * k_states)
    super(ExponentialSmoothing, self).__init__(
        endog, k_states=k_states, k_posdef=k_posdef, initialization=init,
        dates=dates, freq=freq, missing=missing)

    # Concentrate the scale out of the likelihood function
    if self.concentrate_scale:
        self.ssm.filter_concentrated = True

    # Setup fixed elements of the system matrices
    # Observation error
    self.ssm['design', 0, 0] = 1.
    self.ssm['selection', 0, 0] = 1.
    self.ssm['state_cov', 0, 0] = 1.

    # Level
    self.ssm['design', 0, 1] = 1.
    self.ssm['transition', 1, 1] = 1.

    # Trend
    if self.trend:
        self.ssm['transition', 1:3, 2] = 1.

    # Seasonal
    if self.seasonal:
        k = 2 + int(self.trend)
        self.ssm['design', 0, k] = 1.
        self.ssm['transition', k, -1] = 1.
        self.ssm['transition', k + 1:k_states, k:k_states - 1] = (
            np.eye(self.seasonal_periods - 1))

    # Initialization of the states
    if self.initialization_method != 'known':
        msg = ('Cannot give `%%s` argument when initialization is "%s"'
               % initialization_method)
        if initial_level is not None:
            raise ValueError(msg % 'initial_level')
        if initial_trend is not None:
            raise ValueError(msg % 'initial_trend')
        if initial_seasonal is not None:
            raise ValueError(msg % 'initial_seasonal')

    if self.initialization_method == 'simple':
        initial_level, initial_trend, initial_seasonal = (
            es_init._initialization_simple(
                self.endog[:, 0],
                trend='add' if self.trend else None,
                seasonal='add' if self.seasonal else None,
                seasonal_periods=self.seasonal_periods))
    elif self.initialization_method == 'heuristic':
        initial_level, initial_trend, initial_seasonal = (
            es_init._initialization_heuristic(
                self.endog[:, 0],
                trend='add' if self.trend else None,
                seasonal='add' if self.seasonal else None,
                seasonal_periods=self.seasonal_periods))
    elif self.initialization_method == 'known':
        initial_level = float_like(initial_level, 'initial_level')
        if self.trend:
            initial_trend = float_like(initial_trend, 'initial_trend')
        if self.seasonal:
            initial_seasonal = array_like(initial_seasonal,
                                          'initial_seasonal')
            if len(initial_seasonal) == self.seasonal_periods - 1:
                initial_seasonal = np.r_[initial_seasonal,
                                         0 - np.sum(initial_seasonal)]
            if len(initial_seasonal) != self.seasonal_periods:
                raise ValueError(
                    'Invalid length of initial seasonal values. Must be'
                    ' one of s or s-1, where s is the number of seasonal'
                    ' periods.')

    # Note that the simple and heuristic methods of computing initial
    # seasonal factors return estimated seasonal factors associated with
    # the first t = 1, 2, ..., `n_seasons` observations. To use these as
    # the initial state, we lag them by `n_seasons`. This yields, for
    # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0.
    # As described above, the state vector in this model should have
    # seasonal factors ordered L0, L1, L2, L3, and as a result we need to
    # reverse the order of the computed initial seasonal factors from
    # these methods.
    methods = ['simple', 'heuristic']
    if (self.initialization_method in methods
            and initial_seasonal is not None):
        initial_seasonal = initial_seasonal[::-1]

    self._initial_level = initial_level
    self._initial_trend = initial_trend
    self._initial_seasonal = initial_seasonal
    self._initial_state = None

    # Initialize now if possible (if we have a damped trend, then
    # initialization will depend on the phi parameter, and so has to be
    # done at each `update`)
    methods = ['simple', 'heuristic', 'known']
    if not self.damped_trend and self.initialization_method in methods:
        self._initialize_constant_statespace(initial_level, initial_trend,
                                             initial_seasonal)

    # Save keys for kwarg initialization
    self._init_keys += ['trend', 'damped_trend', 'seasonal',
                        'initialization_method', 'initial_level',
                        'initial_trend', 'initial_seasonal', 'bounds',
                        'concentrate_scale', 'dates', 'freq', 'missing']
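# Usage sketch for the state space constructor above (hedged: it assumes the
# class is statsmodels.tsa.statespace.exponential_smoothing.ExponentialSmoothing
# and that a monthly index makes a seasonal period of 12 meaningful).
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.exponential_smoothing import (
    ExponentialSmoothing,
)

rng = np.random.default_rng(8)
idx = pd.date_range("2000-01-31", periods=96, freq="M")
y = pd.Series(10 + 0.05 * np.arange(96) + rng.normal(scale=0.3, size=96),
              index=idx)
mod = ExponentialSmoothing(y, trend=True, seasonal=12,
                           initialization_method="estimated")
res = mod.fit(disp=False)
print(res.params)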
def __init__(self, period: float, order: int):
    super().__init__(order)
    self._period = float_like(period, "period")
    if 2 * self._order > self._period:
        raise ValueError("2 * order must be <= period")
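# Usage sketch for the Fourier term above (hedged: it assumes the class is
# statsmodels.tsa.deterministic.Fourier; note 2 * order must not exceed period).
import pandas as pd
from statsmodels.tsa.deterministic import Fourier

fourier = Fourier(period=12, order=3)  # 6 columns: sin/cos pairs for orders 1..3
index = pd.RangeIndex(24)
print(fourier.in_sample(index).head())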
def kdensity(
    x,
    kernel="gau",
    bw="normal_reference",
    weights=None,
    gridsize=None,
    adjust=1,
    clip=(-np.inf, np.inf),
    cut=3,
    retgrid=True,
):
    """
    Rosenblatt-Parzen univariate kernel density estimator.

    Parameters
    ----------
    x : array_like
        The variable for which the density estimate is desired.
    kernel : str
        The Kernel to be used. Choices are

        - "biw" for biweight
        - "cos" for cosine
        - "epa" for Epanechnikov
        - "gau" for Gaussian.
        - "tri" for triangular
        - "triw" for triweight
        - "uni" for uniform

    bw : str, float, callable
        The bandwidth to use. Choices are:

        - "scott" - 1.059 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "silverman" - .9 * A * nobs ** (-1/5.), where A is
          `min(std(x),IQR/1.34)`
        - "normal_reference" - C * A * nobs ** (-1/5.), where C is
          calculated from the kernel. Equivalent (up to 2 dp) to the
          "scott" bandwidth for gaussian kernels. See bandwidths.py
        - If a float is given, its value is used as the bandwidth.
        - If a callable is given, its return value is used.
          The callable should take exactly two parameters, i.e.,
          fn(x, kern), and return a float, where:

          * x - the clipped input data
          * kern - the kernel instance used

    weights : array or None
        Optional weights. If the x value is clipped, then this weight is
        also dropped.
    gridsize : int
        If gridsize is None, max(len(x), 50) is used.
    adjust : float
        An adjustment factor for the bw. Bandwidth becomes bw * adjust.
    clip : tuple
        Observations in x that are outside of the range given by clip are
        dropped. The number of observations in x is then shortened.
    cut : float
        Defines the length of the grid past the lowest and highest values
        of x so that the kernel goes to zero. The end points are
        -/+ cut*bw*{min(x) or max(x)}
    retgrid : bool
        Whether or not to return the grid over which the density is
        estimated.

    Returns
    -------
    density : ndarray
        The densities estimated at the grid points.
    grid : ndarray, optional
        The grid points at which the density is estimated.

    Notes
    -----
    Creates an intermediate (`gridsize` x `nobs`) array. Use FFT for a more
    computationally efficient version.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    clip_x = np.logical_and(x > clip[0], x < clip[1])
    x = x[clip_x]

    nobs = len(x)  # after trim

    if gridsize is None:
        gridsize = max(nobs, 50)  # do not need to resize if no FFT

    # handle weights
    if weights is None:
        weights = np.ones(nobs)
        q = nobs
    else:
        # ensure weights is a numpy array
        weights = np.asarray(weights)

        if len(weights) != len(clip_x):
            msg = "The length of the weights must be the same as the given x."
            raise ValueError(msg)
        weights = weights[clip_x.squeeze()]
        q = weights.sum()

    # Get kernel object corresponding to selection
    kern = kernel_switch[kernel]()

    if callable(bw):
        # user passed a callable custom bandwidth function
        bw = float(bw(x, kern))
    elif isinstance(bw, str):
        bw = bandwidths.select_bandwidth(x, bw, kern)
        # will cross-val fit this pattern?
    else:
        bw = float_like(bw, "bw")
    bw *= adjust

    a = np.min(x, axis=0) - cut * bw
    b = np.max(x, axis=0) + cut * bw
    grid = np.linspace(a, b, gridsize)

    k = (x.T - grid[:, None]) / bw  # uses broadcasting to make a
                                    # gridsize x nobs array

    # set kernel bandwidth
    kern.seth(bw)

    # truncate to domain
    if kern.domain is not None:
        # will not work for piecewise kernels like parzen
        z_lo, z_high = kern.domain
        domain_mask = (k < z_lo) | (k > z_high)
        k = kern(k)  # estimate density
        k[domain_mask] = 0
    else:
        k = kern(k)  # estimate density

    k[k < 0] = 0  # get rid of any negative values, do we need this?

    dens = np.dot(k, weights) / (q * bw)

    if retgrid:
        return dens, grid, bw
    else:
        return dens, bw
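# Direct-call sketch for kdensity (hedged: it assumes the same import path as
# kdensityfft above; this non-FFT path also supports non-Gaussian kernels and
# observation weights).
import numpy as np
from statsmodels.nonparametric.kde import kdensity

rng = np.random.default_rng(9)
x = rng.standard_normal(300)
dens, grid, bw = kdensity(x, kernel="epa", bw="silverman",
                          weights=np.ones_like(x))
print(bw)                                 # selected bandwidth
print(dens.sum() * (grid[1] - grid[0]))   # approximate total mass, roughly 1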