def mad(a, c=Gaussian.ppf(3 / 4.), axis=0, center=np.median):
    # c \approx .6745
    """
    The Median Absolute Deviation along given axis of an array

    Parameters
    ----------
    a : array-like
        Input array.
    c : float, optional
        The normalization constant. Defined as scipy.stats.norm.ppf(3/4.),
        which is approximately .6745.
    axis : int, optional
        The default is 0. Can also be None.
    center : callable or float
        If a callable is provided, such as the default `np.median`, then it
        is expected to be called center(a). The axis argument will be
        applied via np.apply_over_axes. Otherwise, provide a float.

    Returns
    -------
    mad : float
        `mad` = median(abs(`a` - center))/`c`
    """
    a = np.asarray(a)
    if callable(center):
        center = np.apply_over_axes(center, a, axis)
    return np.median((np.fabs(a - center)) / c, axis=axis)
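
# Hedged usage sketch, assuming numpy is imported as np and `Gaussian` is
# scipy.stats.norm (as the default `c` implies). For draws from N(0, 1) the
# rescaled MAD is a robust estimate of the standard deviation:
#
# >>> import numpy as np
# >>> x = np.random.RandomState(0).standard_normal(10000)
# >>> mad(x)        # should be close to 1.0 for standard normal data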
def __init__(self, shape, h=1.0, domain=None, norm=None):
    """
    shape should be a function taking and returning numeric type.

    For sanity it should always return positive or zero, but this isn't
    enforced in case you want to do weird things. Bear in mind that the
    statistical tests etc. may not be valid for non-positive kernels.

    The bandwidth of the kernel is supplied as h.

    You may specify a domain as a list of 2 values [min, max], in which
    case the kernel will be treated as zero outside these values. This
    will speed up calculation.

    You may also specify the normalisation constant for the supplied
    kernel. If you do, this number will be stored and used as the
    normalisation without calculation. It is recommended that you do this
    if you know the constant, to speed up calculation. In particular, if
    the shape function provided is already normalised you should provide
    norm = 1.0.

    Warning: I think several calculations assume that the kernel is
    normalized. There are no tests for non-normalized kernels.
    """
    self._normconst = norm   # a value or None; if None, calculate it
    self.domain = domain
    self.weights = None
    if callable(shape):
        self._shape = shape
    else:
        raise TypeError("shape must be a callable object/function")
    self._h = h
    self._L2Norm = None
    self._kernel_var = None
    self._normal_reference_constant = None
    self._order = None
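
# Hedged usage sketch. The enclosing class is not shown here; `CustomKernel`
# below is an illustrative placeholder name. An Epanechnikov-style shape on
# [-1, 1] that is already normalised, so norm=1.0 is supplied:
#
# >>> epa = CustomKernel(lambda x: 0.75 * (1.0 - x**2), h=1.0,
# ...                    domain=[-1.0, 1.0], norm=1.0)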
def date_parser(timestr, parserinfo=None, **kwargs):
    """
    Uses dateutil.parser.parse, but also handles monthly dates of the form
    1999m4, 1999:m4, 1999:mIV, 1999mIV and the same for quarterly data
    with q instead of m. It is not case sensitive. The default for annual
    data is the end of the year, which also differs from dateutil.
    """
    flags = re.IGNORECASE | re.VERBOSE
    if re.search(_q_pattern, timestr, flags):
        y, q = timestr.replace(":", "").lower().split('q')
        month, day = _quarter_to_day[q.upper()]
        year = int(y)
    elif re.search(_m_pattern, timestr, flags):
        y, m = timestr.replace(":", "").lower().split('m')
        month, day = _month_to_day[m.upper()]
        year = int(y)
        if _is_leap(y) and month == 2:
            day += 1
    elif re.search(_y_pattern, timestr, flags):
        month, day = 12, 31
        year = int(timestr)
    else:
        if (hasattr(pandas_datetools, 'parser') and
                not callable(pandas_datetools.parser)):
            # exists in 0.8.0 pandas, but it's the class not the module
            return pandas_datetools.parser.parse(timestr, parserinfo,
                                                 **kwargs)
        else:
            # 0.8.1 pandas version didn't import this into namespace
            from dateutil import parser
            return parser.parse(timestr, parserinfo, **kwargs)
    return datetime.datetime(year, month, day)
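
# Hedged usage sketch, assuming the module-level regexes and lookup tables
# this function relies on (_q_pattern, _quarter_to_day, _month_to_day, ...)
# behave as their names suggest. Periods map to end-of-period dates:
#
# >>> date_parser('1999q2')      # datetime.datetime(1999, 6, 30, 0, 0)
# >>> date_parser('1999:mIV')    # datetime.datetime(1999, 4, 30, 0, 0)
# >>> date_parser('1999')        # datetime.datetime(1999, 12, 31, 0, 0)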
def get_columns(self, *args, **kw):
    """
    Calling function for factor instance.
    """
    v = self.namespace[self._name]
    while True:
        if callable(v):
            if isinstance(v, (Term, Formula)):
                v = copy.copy(v)
                v.namespace = self.namespace
            v = v(*args, **kw)
        else:
            break

    n = len(v)

    if self.ordinal:
        col = [float(self.keys.index(v[i])) for i in range(n)]
        return np.array(col)
    else:
        value = []
        for key in self.keys:
            col = [float((v[i] == key)) for i in range(n)]
            value.append(col)
        return np.array(value)
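
# Illustrative sketch (not part of the original class): the non-ordinal
# branch above builds one indicator row per factor level. For keys
# ['a', 'b'] and observations ['a', 'b', 'a'] it is equivalent to:
#
# >>> keys, v = ['a', 'b'], ['a', 'b', 'a']
# >>> [[float(vi == key) for vi in v] for key in keys]
# [[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]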
def __call__(self, *args, **kw):
    """
    Return the columns associated to self in a design matrix.
    If the term has no 'func' attribute, it returns
    ``self.namespace[self.termname]``
    else, it returns
    ``self.func(*args, **kw)``
    """
    if not hasattr(self, 'func'):
        val = self.namespace[self.termname]
    else:
        val = self.func
    if callable(val):
        if isinstance(val, (Term, Formula)):
            val = copy.copy(val)
            val.namespace = self.namespace
        val = val(*args, **kw)

    val = np.asarray(val)
    return np.squeeze(val)
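
# Illustrative sketch (construction details hypothetical): a term with no
# `func` attribute simply looks its values up in the attached namespace:
#
# >>> t = Term('x')
# >>> t.namespace = {'x': [1.0, 2.0, 3.0]}
# >>> t()                    # array([1., 2., 3.])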
def __init__(self, rvs, cdf, args=(), N=20):
    if isinstance(rvs, string_types):
        #cdf = getattr(stats, rvs).cdf
        if (not cdf) or (cdf == rvs):
            cdf = getattr(distributions, rvs).cdf
            rvs = getattr(distributions, rvs).rvs
        else:
            raise AttributeError(
                'if rvs is string, cdf has to be the same distribution')

    if isinstance(cdf, string_types):
        cdf = getattr(distributions, cdf).cdf
    if callable(rvs):
        kwds = {'size': N}
        vals = np.sort(rvs(*args, **kwds))
    else:
        vals = np.sort(rvs)
        N = len(vals)
    cdfvals = cdf(vals, *args)

    self.nobs = N
    self.vals_sorted = vals
    self.cdfvals = cdfvals
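
# Hedged usage sketch. The enclosing class is not shown; `GofContainer`
# below is an illustrative placeholder, and `distributions` is assumed to
# be scipy.stats. Passing matching string names simulates N draws and
# stores their sorted values and theoretical cdf values:
#
# >>> g = GofContainer('norm', 'norm', N=100)
# >>> g.nobs                 # 100
# >>> g.cdfvals.shape        # (100,)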
def _updateloglike(self, params, xi10, ntrain, penalty, upperbounds,
                   lowerbounds, F, A, H, Q, R, history):
    """
    Evaluate the log-likelihood at params, calling any system matrices
    that are callable and applying an optional quadratic penalty when the
    parameters hit the bounds.
    """
    paramsorig = params
    # are the bounds binding?
    if penalty:
        params = np.min((np.max((lowerbounds, params), axis=0),
                         upperbounds), axis=0)
    #TODO: does it make sense for all of these to be allowed to be None?
    if F is not None and callable(F):
        F = F(params)
    elif F is None:
        F = 0
    if A is not None and callable(A):
        A = A(params)
    elif A is None:
        A = 0
    if H is not None and callable(H):
        H = H(params)
    elif H is None:
        H = 0
    if Q is not None and callable(Q):
        Q = Q(params)
    elif Q is None:
        Q = 0
    if R is not None and callable(R):
        R = R(params)
    elif R is None:
        R = 0
    X = self.exog
    if X is None:
        X = 0
    y = self.endog
    loglike = kalmanfilter(F, A, H, Q, R, y, X, xi10, ntrain, history)
    # use a quadratic penalty function to move away from bounds
    if penalty:
        loglike += penalty * np.sum((paramsorig - params) ** 2)
    return loglike
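
# Illustrative note (not part of the original module): the nested min/max
# above clips `params` elementwise into [lowerbounds, upperbounds], i.e. it
# is equivalent to np.clip:
#
# >>> import numpy as np
# >>> params = np.array([-2.0, 0.5, 3.0])
# >>> lo, hi = np.array([-1., -1., -1.]), np.array([1., 1., 1.])
# >>> np.min((np.max((lo, params), axis=0), hi), axis=0)
# array([-1. ,  0.5,  1. ])
# >>> np.clip(params, lo, hi)    # same result
# array([-1. ,  0.5,  1. ])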
def margeff_cov_params(model, params, exog, cov_params, at, derivative,
                       dummy_ind, count_ind, method, J):
    """
    Computes the variance-covariance of marginal effects by the delta method.

    Parameters
    ----------
    model : model instance
        The model that returned the fitted results. Its pdf method is used
        for computing the Jacobian of discrete variables in dummy_ind and
        count_ind
    params : array-like
        estimated model parameters
    exog : array-like
        exogenous variables at which to calculate the derivative
    cov_params : array-like
        The variance-covariance of the parameters
    at : str
        Options are:

        - 'overall', The average of the marginal effects at each
          observation.
        - 'mean', The marginal effects at the mean of each regressor.
        - 'median', The marginal effects at the median of each regressor.
        - 'zero', The marginal effects at zero for each regressor.
        - 'all', The marginal effects at each observation.

        Only 'overall' has any effect here.
    derivative : function or array-like
        If a function, it returns the marginal effects of the model with
        respect to the exogenous variables evaluated at exog. Expected to
        be called derivative(params, exog). This will be numerically
        differentiated. Otherwise, it can be the Jacobian of the marginal
        effects with respect to the parameters.
    dummy_ind : array-like
        Indices of the columns of exog that contain dummy variables
    count_ind : array-like
        Indices of the columns of exog that contain count variables

    Notes
    -----
    For continuous regressors, the variance-covariance is given by

    Asy. Var[MargEff] = [d margeff / d params] V [d margeff / d params]'

    where V is the parameter variance-covariance.

    The outer Jacobians are computed via numerical differentiation if
    derivative is a function.
    """
    if callable(derivative):
        from statsmodels.tools.numdiff import approx_fprime_cs
        params = params.ravel('F')  # for Multinomial
        try:
            jacobian_mat = approx_fprime_cs(params, derivative,
                                            args=(exog, method))
        except TypeError:  # norm.cdf doesn't take complex values
            from statsmodels.tools.numdiff import approx_fprime
            jacobian_mat = approx_fprime(params, derivative,
                                         args=(exog, method))
        if at == 'overall':
            jacobian_mat = np.mean(jacobian_mat, axis=1)
        else:
            jacobian_mat = jacobian_mat.squeeze()  # exog was 2d row vector
        if dummy_ind is not None:
            jacobian_mat = _margeff_cov_params_dummy(model, jacobian_mat,
                                    params, exog, dummy_ind, method, J)
        if count_ind is not None:
            jacobian_mat = _margeff_cov_params_count(model, jacobian_mat,
                                    params, exog, count_ind, method, J)
    else:
        jacobian_mat = derivative

    #NOTE: this won't go through for at == 'all'
    return np.dot(np.dot(jacobian_mat, cov_params), jacobian_mat.T)
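
# Illustrative delta-method sketch (not from the original module): for a
# statistic g(params) with Jacobian J = dg/dparams and parameter covariance
# V, Asy. Var[g] = J V J'. A toy check with hypothetical values:
#
# >>> import numpy as np
# >>> from statsmodels.tools.numdiff import approx_fprime
# >>> g = lambda b: np.array([b[0] * b[1], b[1] ** 2])  # toy statistic
# >>> b = np.array([1.0, 2.0])
# >>> V = np.eye(2) * 0.01                              # toy parameter cov
# >>> Jm = approx_fprime(b, g)
# >>> Jm.dot(V).dot(Jm.T)                               # delta-method cov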
def kstest(rvs, cdf, args=(), N=20, alternative='two_sided', mode='approx',
           **kwds):
    """
    Perform the Kolmogorov-Smirnov test for goodness of fit

    This performs a test of the distribution G(x) of an observed random
    variable against a given distribution F(x). Under the null hypothesis
    the two distributions are identical, G(x)=F(x). The alternative
    hypothesis can be either 'two_sided' (default), 'less' or 'greater'.
    The KS test is only valid for continuous distributions.

    Parameters
    ----------
    rvs : string or array or callable
        string: name of a distribution in scipy.stats

        array: 1-D observations of random variables

        callable: function to generate random variables, requires keyword
        argument `size`
    cdf : string or callable
        string: name of a distribution in scipy.stats, if rvs is a string
        then cdf can evaluate to `False` or be the same as rvs

        callable: function to evaluate cdf
    args : tuple, sequence
        distribution parameters, used if rvs or cdf are strings
    N : int
        sample size if rvs is string or callable
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)
    mode : 'approx' (default) or 'asymp'
        defines the distribution used for calculating p-value

        'approx' : use approximation to exact distribution of test statistic

        'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-
    p-value : float
        one-tailed or two-tailed p-value

    Notes
    -----
    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less" or
    "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    Examples
    --------
    >>> from scipy import stats
    >>> import numpy as np
    >>> from scipy.stats import kstest

    >>> x = np.linspace(-15, 15, 9)
    >>> kstest(x, 'norm')
    (0.44435602715924361, 0.038850142705171065)

    >>> np.random.seed(987654321)  # set random seed to get the same result
    >>> kstest('norm', '', N=100)
    (0.058352892479417884, 0.88531190944151261)

    is equivalent to this

    >>> np.random.seed(987654321)
    >>> kstest(stats.norm.rvs(size=100), 'norm')
    (0.058352892479417884, 0.88531190944151261)

    Test against one-sided alternative hypothesis:

    >>> np.random.seed(987654321)

    Shift distribution to larger values, so that cdf_dgp(x) < norm.cdf(x):

    >>> x = stats.norm.rvs(loc=0.2, size=100)
    >>> kstest(x, 'norm', alternative='less')
    (0.12464329735846891, 0.040989164077641749)

    Reject equal distribution against alternative hypothesis: less

    >>> kstest(x, 'norm', alternative='greater')
    (0.0072115233216311081, 0.98531158590396395)

    Don't reject equal distribution against alternative hypothesis: greater

    >>> kstest(x, 'norm', mode='asymp')
    (0.12464329735846891, 0.08944488871182088)

    Testing t distributed random variables against normal distribution:

    With 100 degrees of freedom the t distribution looks close to the
    normal distribution, and the kstest does not reject the hypothesis
    that the sample came from the normal distribution

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(100, size=100), 'norm')
    (0.072018929165471257, 0.67630062862479168)

    With 3 degrees of freedom the t distribution looks sufficiently
    different from the normal distribution, that we can reject the
    hypothesis that the sample came from the normal distribution at an
    alpha=10% level

    >>> np.random.seed(987654321)
    >>> stats.kstest(stats.t.rvs(3, size=100), 'norm')
    (0.131016895759829, 0.058826222555312224)
    """
    if isinstance(rvs, string_types):
        #cdf = getattr(stats, rvs).cdf
        if (not cdf) or (cdf == rvs):
            cdf = getattr(distributions, rvs).cdf
            rvs = getattr(distributions, rvs).rvs
        else:
            raise AttributeError(
                'if rvs is string, cdf has to be the same distribution')

    if isinstance(cdf, string_types):
        cdf = getattr(distributions, cdf).cdf
    if callable(rvs):
        kwds = {'size': N}
        vals = np.sort(rvs(*args, **kwds))
    else:
        vals = np.sort(rvs)
        N = len(vals)
    cdfvals = cdf(vals, *args)

    if alternative in ['two_sided', 'greater']:
        Dplus = (np.arange(1.0, N + 1) / N - cdfvals).max()
        if alternative == 'greater':
            return Dplus, distributions.ksone.sf(Dplus, N)

    if alternative in ['two_sided', 'less']:
        Dmin = (cdfvals - np.arange(0.0, N) / N).max()
        if alternative == 'less':
            return Dmin, distributions.ksone.sf(Dmin, N)

    if alternative == 'two_sided':
        D = np.max([Dplus, Dmin])
        if mode == 'asymp':
            return D, distributions.kstwobign.sf(D * np.sqrt(N))
        if mode == 'approx':
            pval_two = distributions.kstwobign.sf(D * np.sqrt(N))
            if N > 2666 or pval_two > 0.80 - N * 0.3 / 1000.0:
                return D, distributions.kstwobign.sf(D * np.sqrt(N))
            else:
                return D, distributions.ksone.sf(D, N) * 2