def _hessian_finite_difference(self, params, approx_centered=False,
                               **kwargs):
    params = np.array(params, ndmin=1)

    warnings.warn('Calculation of the Hessian using finite differences'
                  ' is usually subject to substantial approximation errors.',
                  PrecisionWarning, stacklevel=3)

    if not approx_centered:
        epsilon = _get_epsilon(params, 3, None, len(params))
    else:
        epsilon = _get_epsilon(params, 4, None, len(params)) / 2
    hessian = approx_fprime(params, self._score_finite_difference,
                            epsilon=epsilon, kwargs=kwargs,
                            centered=approx_centered)

    # TODO: changed this to nobs_effective, has to be changed when merging
    # with statespace mlemodel
    return hessian / (self.nobs_effective)
def approx_fprime(x, f, epsilon=None, args=(), kwargs=None, centered=True):
    """
    Gradient of function, or Jacobian if function f returns 1d array

    Parameters
    ----------
    x : ndarray
        Parameters at which the derivative is evaluated.
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array.
    epsilon : float, optional
        Stepsize. If None, the optimal stepsize is used. This is
        _EPS**(1/2)*x for `centered` == False and _EPS**(1/3)*x for
        `centered` == True.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.
    centered : bool
        Whether central (two-sided) differences are used. If False, forward
        differencing is used.

    Returns
    -------
    grad : ndarray
        Gradient or Jacobian.

    Notes
    -----
    If f returns a 1d array, the result is a Jacobian. If a 2d array is
    returned by f (e.g., with a value for each observation), the result is a
    3d array containing the Jacobian of each observation with shape
    xk x nobs x xk. That is, the Jacobian of the first observation is
    [:, 0, :].
    """
    kwargs = {} if kwargs is None else kwargs
    x = np.atleast_1d(x).ravel()
    n = len(x)
    f0 = f(*(x,) + args, **kwargs)
    dim = np.atleast_1d(f0).shape  # it could be a scalar
    grad = np.zeros((n,) + dim, float)
    ei = np.zeros(np.shape(x), float)
    if not centered:
        epsilon = _get_epsilon(x, 2, epsilon, n)
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*(x + ei,) + args, **kwargs) - f0) / epsilon[k]
            ei[k] = 0.0
    else:
        epsilon = _get_epsilon(x, 3, epsilon, n) / 2.
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*(x + ei,) + args, **kwargs)
                          - f(*(x - ei,) + args, **kwargs)) / (2 * epsilon[k])
            ei[k] = 0.0

    # Swap the first two axes so that the parameter axis comes second:
    # (n,) + dim  ->  (dim[0], n) + dim[1:]
    axes = list(range(grad.ndim))
    axes[:2] = axes[1::-1]
    return np.transpose(grad, axes=axes)
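# A minimal usage sketch (not from the source): check the gradient returned by
# approx_fprime against an analytic gradient. It assumes the function above is
# importable, e.g. as statsmodels.tools.numdiff.approx_fprime; np.squeeze is
# used so the check does not depend on the exact output-shape convention.
import numpy as np
from statsmodels.tools.numdiff import approx_fprime

def f(x):
    # f(x0, x1) = x0**2 * x1, with analytic gradient [2*x0*x1, x0**2]
    return x[0] ** 2 * x[1]

x = np.array([1.5, 2.0])
num_grad = np.squeeze(approx_fprime(x, f, centered=True))
ana_grad = np.array([2 * x[0] * x[1], x[0] ** 2])
assert np.allclose(num_grad, ana_grad, atol=1e-6)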
def _score_complex_step(self, params, **kwargs):
    # the default epsilon can be too small
    # inversion_method = INVERT_UNIVARIATE | SOLVE_LU
    epsilon = _get_epsilon(params, 2., None, len(params))
    kwargs['transformed'] = True
    kwargs['complex_step'] = True
    return approx_fprime_cs(params, self.loglike, epsilon=epsilon,
                            kwargs=kwargs)
def approx_jacobian(x, func, epsilon, *args):
    '''Approximate the Jacobian matrix of the callable function func.

    Parameters:

    * x - The state vector at which the Jacobian matrix is desired.
    * func - A vector-valued function of the form f(x, *args).
    * epsilon - The step size used to determine the partial derivatives.
      Set to None to select the optimal step size.
    * *args - Additional arguments passed to func.

    Returns:

    An array of dimensions (lenf, lenx), where lenf is the length of the
    outputs of func and lenx is the number of inputs of func.

    Notes:

    The approximation is done using the fourth-order central difference
    method.
    '''
    if np.shape(x) == ():
        n = 1
        x = np.asarray([x])
    else:
        n = len(x)

    # The first assignment is immediately overridden, so only the
    # fourth-order central difference branch below is actually used.
    method = 'FirstOrderCentralDifference'
    method = 'FourthOrderCentralDifference'

    x0 = np.asarray(x)
    f0 = func(x0, *args)

    if method == 'FirstOrderCentralDifference':
        jac = np.zeros([len(x0), len(f0)])
        df1 = np.zeros([len(x0), len(f0)])
        df2 = np.zeros([len(x0), len(f0)])
        dx = np.zeros(len(x0))
        for i in range(len(x0)):
            dx[i] = epsilon
            df1[i] = func(*((x0 + dx / 2,) + args))
            df2[i] = func(*((x0 - dx / 2,) + args))
            jac[i] = (df1[i] - df2[i]) / epsilon
            dx[i] = 0.0

    if method == 'FourthOrderCentralDifference':
        epsilon = nd._get_epsilon(x, 3, epsilon, n) / 2.
        jac = np.zeros([len(x0), len(f0)])
        df1 = np.zeros([len(x0), len(f0)])
        df2 = np.zeros([len(x0), len(f0)])
        df3 = np.zeros([len(x0), len(f0)])
        df4 = np.zeros([len(x0), len(f0)])
        dx = np.zeros(len(x0))
        for i in range(len(x0)):
            dx[i] = epsilon[i]
            # five-point stencil: (-f(x+2h) + 8f(x+h) - 8f(x-h) + f(x-2h)) / (12h)
            df1[i] = -func(*((x0 + 2 * dx,) + args))
            df2[i] = 8 * func(*((x0 + dx,) + args))
            df3[i] = -8 * func(*((x0 - dx,) + args))
            df4[i] = func(*((x0 - 2 * dx,) + args))
            jac[i] = (df1[i] + df2[i] + df3[i] + df4[i]) / (12 * dx[i])
            dx[i] = 0.0

    return jac.transpose()
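# A minimal usage sketch (not from the source), assuming approx_jacobian above
# is in scope together with `import statsmodels.tools.numdiff as nd`, which
# provides the _get_epsilon helper it uses: the Jacobian of a simple
# R^2 -> R^2 function, compared with the analytic result.
import numpy as np

def h(x):
    return np.array([x[0] * x[1], np.sin(x[0])])

x = np.array([0.5, 2.0])
J = approx_jacobian(x, h, None)           # epsilon=None -> automatic step size
J_exact = np.array([[x[1], x[0]],
                    [np.cos(x[0]), 0.0]])
assert np.allclose(J, J_exact, atol=1e-8)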
def arma_scoreobs(endog, ar_params=None, ma_params=None, sigma2=1,
                  prefix=None):
    """
    Compute the score per observation (gradient of the loglikelihood
    function).

    Parameters
    ----------
    endog : ndarray
        The observed time-series process.
    ar_params : ndarray, optional
        Autoregressive coefficients, not including the zero lag.
    ma_params : ndarray, optional
        Moving average coefficients, not including the zero lag, where the
        sign convention assumes the coefficients are part of the lag
        polynomial on the right-hand side of the ARMA definition (i.e. they
        have the same sign as in the usual econometrics convention in which
        the coefficients are on the right-hand side of the ARMA definition).
    sigma2 : ndarray, optional
        The ARMA innovation variance. Default is 1.
    prefix : str, optional
        The BLAS prefix associated with the datatype. Default is to find
        the best datatype based on given input. This argument is typically
        only used internally.

    Returns
    -------
    scoreobs : ndarray
        Score per observation, evaluated at the given parameters.

    Notes
    -----
    This is a numerical approximation, calculated using first-order complex
    step differentiation on the `arma_loglikeobs` function.
    """
    ar_params = [] if ar_params is None else ar_params
    ma_params = [] if ma_params is None else ma_params

    p = len(ar_params)
    q = len(ma_params)

    def func(params):
        return arma_loglikeobs(endog, params[:p], params[p:p + q],
                               params[p + q:])

    params0 = np.r_[ar_params, ma_params, sigma2]
    epsilon = _get_epsilon(params0, 2., None, len(params0))
    return approx_fprime_cs(params0, func, epsilon)
def _hessian_complex_step(self, params, **kwargs):
    """
    Hessian matrix computed by second-order complex-step differentiation
    on the `loglike` function.
    """
    # the default epsilon can be too small
    epsilon = _get_epsilon(params, 3., None, len(params))
    kwargs['transformed'] = True
    kwargs['complex_step'] = True
    hessian = approx_hess_cs(
        params, self.loglike, epsilon=epsilon, kwargs=kwargs)

    # TODO: changed this to nobs_effective, has to be changed when merging
    # with statespace mlemodel
    return hessian / (self.nobs_effective)
def approx_fprime_cs(x, f, epsilon=None, args=(), kwargs=None):
    '''
    Calculate gradient or Jacobian with complex step derivative approximation

    Parameters
    ----------
    x : ndarray
        Parameters at which the derivative is evaluated.
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array.
    epsilon : float, optional
        Stepsize. If None, the optimal stepsize is used. The optimal
        step-size is EPS*x. See note.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.

    Returns
    -------
    partials : ndarray
        Array of partial derivatives, gradient or Jacobian.

    Notes
    -----
    The complex-step derivative has truncation error O(epsilon**2), so
    truncation error can be eliminated by choosing epsilon to be very small.
    The complex-step derivative avoids the problem of round-off error with
    small epsilon because there is no subtraction.
    '''
    # From Guilherme P. de Freitas, numpy mailing list
    # May 04 2010 thread "Improvement of performance"
    # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html
    kwargs = {} if kwargs is None else kwargs
    x = np.atleast_1d(x).ravel()
    n = len(x)

    epsilon = _get_epsilon(x, 1, epsilon, n)
    increments = np.identity(n) * 1j * epsilon
    # TODO: see if this can be vectorized, but usually dim is small
    partials = [f(x + ih, *args, **kwargs).imag / epsilon[i]
                for i, ih in enumerate(increments)]

    # Swap the first two axes so the parameter axis comes second
    axes = list(range(partials[0].ndim + 1))
    axes[:2] = axes[1::-1]
    return np.transpose(partials, axes=axes)
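# A small illustration (not from the source) of the point made in the Notes
# above: a pure imaginary perturbation involves no subtraction, so the step can
# be made arbitrarily small without round-off error, whereas a forward
# difference with the same tiny step collapses entirely.
import numpy as np

def g(x):
    # classic complex-step test function (Squire & Trapp, 1998)
    return np.exp(x) / np.sqrt(np.sin(x) ** 3 + np.cos(x) ** 3)

x0 = 1.5
h = 1e-20
d_cs = np.imag(g(x0 + 1j * h)) / h   # complex step: accurate to machine precision
d_fd = (g(x0 + h) - g(x0)) / h       # forward difference: x0 + h == x0 in floats, so this is 0.0
print(d_cs, d_fd)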
def _approx_hess1_backward(x, f, epsilon=None, args=(), kwargs=None):
    n = len(x)
    # A negative step size makes approx_hess1 difference in the backward
    # direction.
    epsilon = -np.abs(_get_epsilon(x, 3, epsilon, n))
    return approx_hess1(x, f, epsilon, args, kwargs, centered=False)
def observed_information_matrix(self, params, **kwargs):
    """
    Observed information matrix

    Parameters
    ----------
    params : array_like, optional
        Array of parameters at which to evaluate the loglikelihood
        function.
    **kwargs
        Additional keyword arguments to pass to the Kalman filter. See
        `KalmanFilter.filter` for more details.

    Notes
    -----
    This method follows Harvey (1989), which shows that the information
    matrix only depends on terms from the gradient. The implementation is
    partially analytic and partially numerical: it uses the analytic
    formula for the information matrix together with numerically computed
    elements of the gradient.

    References
    ----------
    Harvey, Andrew C. 1990.
    Forecasting, Structural Time Series Models and the Kalman Filter.
    Cambridge University Press.
    """
    # Setup
    n = len(params)
    epsilon = _get_epsilon(params, 1, None, n)
    increments = np.identity(n) * 1j * epsilon

    # Get values at the params themselves
    self.update(params)
    res = self.ssm.filter(**kwargs)
    dtype = self.ssm.dtype

    # Save this for inversion later
    inv_forecasts_error_cov = res.forecasts_error_cov.copy()

    # Compute partial derivatives via complex-step differentiation
    partials_forecasts_error = np.zeros((self.k_endog, self.nobs, n))
    partials_forecasts_error_cov = np.zeros(
        (self.k_endog, self.k_endog, self.nobs, n))
    for i, ih in enumerate(increments):
        self.update(params + ih)
        res = self.ssm.filter(**kwargs)

        partials_forecasts_error[:, :, i] = (
            res.forecasts_error.imag / epsilon[i])

        partials_forecasts_error_cov[:, :, :, i] = (
            res.forecasts_error_cov.imag / epsilon[i])

    # Compute the information matrix
    tmp = np.zeros((self.k_endog, self.k_endog, self.nobs, n), dtype=dtype)

    information_matrix = np.zeros((n, n), dtype=dtype)
    for t in range(self.ssm.loglikelihood_burn, self.nobs):
        inv_forecasts_error_cov[:, :, t] = np.linalg.inv(
            inv_forecasts_error_cov[:, :, t])
        for i in range(n):
            tmp[:, :, t, i] = np.dot(
                inv_forecasts_error_cov[:, :, t],
                partials_forecasts_error_cov[:, :, t, i])
        for i in range(n):
            for j in range(n):
                information_matrix[i, j] += 0.5 * np.trace(
                    np.dot(tmp[:, :, t, i], tmp[:, :, t, j]))
                information_matrix[i, j] += np.inner(
                    partials_forecasts_error[:, t, i],
                    np.dot(inv_forecasts_error_cov[:, :, t],
                           partials_forecasts_error[:, t, j]))

    return information_matrix / (self.nobs - self.ssm.loglikelihood_burn)
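# A sketch of the per-period expression the double loop above accumulates
# (notation mine, following the Harvey reference in the docstring; v_t is the
# one-step-ahead forecast error and F_t its covariance):
#
#   I(\theta)_{ij} \approx \frac{1}{T - d} \sum_{t=d}^{T-1}
#       \Big[ \tfrac{1}{2}\,\mathrm{tr}\big(F_t^{-1}\,\partial_i F_t\,
#             F_t^{-1}\,\partial_j F_t\big)
#             + (\partial_i v_t)'\, F_t^{-1}\, (\partial_j v_t) \Big]
#
# where d is `loglikelihood_burn`, T is `nobs`, and the partial derivatives
# with respect to each parameter are obtained by complex-step differentiation
# as in the loop over `increments`.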
def _approx_fprime_backward(x, f, epsilon=None, args=(), kwargs=None):
    x = np.atleast_1d(x).ravel()
    n = len(x)
    # A negative step turns the forward difference in approx_fprime into a
    # backward difference.
    epsilon = -np.abs(_get_epsilon(x, 2, epsilon, n))
    return approx_fprime(x, f, epsilon, args, kwargs, centered=False)
def _approx_hess1_backward(x, f, epsilon=None, args=(), kwargs=None):
    n = len(x)
    kwargs = {} if kwargs is None else kwargs
    # A negative step size makes approx_hess1 difference in the backward
    # direction.
    epsilon = -np.abs(_get_epsilon(x, 3, epsilon, n))
    return approx_hess1(x, f, epsilon, args, kwargs)
def observed_information_matrix(self, params, **kwargs):
    """
    Observed information matrix

    Parameters
    ----------
    params : array_like, optional
        Array of parameters at which to evaluate the loglikelihood
        function.
    **kwargs
        Additional keyword arguments to pass to the Kalman filter. See
        `KalmanFilter.filter` for more details.

    Notes
    -----
    This method follows Harvey (1989), which shows that the information
    matrix only depends on terms from the gradient. The implementation is
    partially analytic and partially numerical: it uses the analytic
    formula for the information matrix together with numerically computed
    elements of the gradient.

    References
    ----------
    Harvey, Andrew C. 1990.
    Forecasting, Structural Time Series Models and the Kalman Filter.
    Cambridge University Press.
    """
    # Setup
    n = len(params)
    epsilon = _get_epsilon(params, 1, None, n)
    increments = np.identity(n) * 1j * epsilon

    kwargs['results'] = FilterResults

    # Get values at the params themselves
    self.update(params)
    res = self.filter(**kwargs)
    dtype = self.dtype

    # Save this for inversion later
    inv_forecasts_error_cov = res.forecasts_error_cov.copy()

    # Compute partial derivatives via complex-step differentiation
    partials_forecasts_error = np.zeros((self.k_endog, self.nobs, n))
    partials_forecasts_error_cov = np.zeros(
        (self.k_endog, self.k_endog, self.nobs, n))
    for i, ih in enumerate(increments):
        self.update(params + ih)
        res = self.filter(**kwargs)

        partials_forecasts_error[:, :, i] = (
            res.forecasts_error.imag / epsilon[i])

        partials_forecasts_error_cov[:, :, :, i] = (
            res.forecasts_error_cov.imag / epsilon[i])

    # Compute the information matrix
    tmp = np.zeros((self.k_endog, self.k_endog, self.nobs, n), dtype=dtype)

    information_matrix = np.zeros((n, n), dtype=dtype)
    for t in range(self.loglikelihood_burn, self.nobs):
        inv_forecasts_error_cov[:, :, t] = np.linalg.inv(
            inv_forecasts_error_cov[:, :, t])
        for i in range(n):
            tmp[:, :, t, i] = np.dot(
                inv_forecasts_error_cov[:, :, t],
                partials_forecasts_error_cov[:, :, t, i])
        for i in range(n):
            for j in range(n):
                information_matrix[i, j] += 0.5 * np.trace(
                    np.dot(tmp[:, :, t, i], tmp[:, :, t, j]))
                information_matrix[i, j] += np.inner(
                    partials_forecasts_error[:, t, i],
                    np.dot(inv_forecasts_error_cov[:, :, t],
                           partials_forecasts_error[:, t, j]))

    return information_matrix / (self.nobs - self.loglikelihood_burn)
def approx_hess(x, f, epsilon=None, args=(), kwargs={}):
    """
    Calculate Hessian with finite difference derivative approximation.

    Parameters
    ----------
    x : array_like
        value at which function derivative is evaluated
    f : function
        function of one array f(x, `*args`, `**kwargs`)
    epsilon : float or array_like, optional
        Stepsize used, if None, then stepsize is automatically chosen
        according to EPS**(1/4)*x.
    args : tuple
        Arguments for function `f`.
    kwargs : dict
        Keyword arguments for function `f`.

    Returns
    -------
    hess : ndarray
        array of partial second derivatives, Hessian

    Notes
    -----
    Equation (9) in Ridout. Computes the Hessian as::

        1/(4*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k])
                          - f(x + d[j]*e[j] - d[k]*e[k]))
                         - (f(x - d[j]*e[j] + d[k]*e[k])
                            - f(x - d[j]*e[j] - d[k]*e[k])))

    where e[j] is a vector with element j == 1 and the rest are zero and
    d[i] is epsilon[i].

    References
    ----------
    Ridout, M.S. (2009) Statistical applications of the complex-step method
    of numerical differentiation. The American Statistician, 63, 66-74.

    Copyright
    ---------
    This is an adaptation of the function approx_hess3() in
    statsmodels.tools.numdiff. That code is BSD (3 clause) licensed as
    follows:

    Copyright (C) 2006, Jonathan E. Taylor
    All rights reserved.

    Copyright (c) 2006-2008 Scipy Developers.
    All rights reserved.

    Copyright (c) 2009-2012 Statsmodels Developers.
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:

    a. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    b. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    c. Neither the name of Statsmodels nor the names of its contributors
       may be used to endorse or promote products derived from this
       software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL STATSMODELS OR
    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    """
    n = len(x)
    h = smnd._get_epsilon(x, 4, epsilon, n)
    ee = np.diag(h)
    # hess initially holds the products h[i]*h[j], used as denominators
    # below, and is then overwritten entry by entry with the Hessian.
    hess = np.outer(h, h)

    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs)
                          - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs)
                          - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs)
                             - f(*((x - ee[i, :] - ee[j, :],) + args),
                                 **kwargs))
                          ) / (4. * hess[i, j])
            hess[j, i] = hess[i, j]
    return hess
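# A minimal usage sketch (not from the source), assuming approx_hess above is
# in scope together with `import statsmodels.tools.numdiff as smnd`, which
# provides the _get_epsilon helper it uses. For a quadratic form the Hessian
# is known exactly, so this makes a simple check.
import numpy as np

A = np.array([[3.0, 1.0],
              [1.0, 2.0]])

def quad(x):
    # f(x) = 0.5 * x' A x, whose Hessian is exactly A
    return 0.5 * x @ A @ x

x0 = np.array([0.7, -1.2])
H = approx_hess(x0, quad)
assert np.allclose(H, A, atol=1e-6)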