def __iter__(self):
    n = self.n
    k = self.k
    start = self.start
    if self.return_slice:
        for i in range(start, n - k):
            train_slice = slice(None, i, None)
            if self.kall:
                test_slice = slice(i, i + k)
            else:
                test_slice = slice(i + k - 1, i + k)
            yield train_slice, test_slice
    else:  # for compatibility with other iterators
        for i in range(start, n - k):
            train_index = np.zeros(n, dtype=bool)
            train_index[:i] = True
            test_index = np.zeros(n, dtype=bool)
            if self.kall:
                test_index[i:i + k] = True
            else:
                test_index[i + k - 1:i + k] = True
            # or faster to return np.arange(i, i+k)?
            # returning a slice should be faster in this case
            yield train_index, test_index
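# Hedged usage sketch: the class around __iter__ above is not shown, so this
# only demonstrates, for one hypothetical step (n=6, k=2, i=3, kall=True),
# that the slice mode and the boolean-mask mode select the same observations.
import numpy as np

n, k, i = 6, 2, 3
x = np.arange(n)

train_slice, test_slice = slice(None, i, None), slice(i, i + k)
train_mask = np.zeros(n, dtype=bool)
train_mask[:i] = True
test_mask = np.zeros(n, dtype=bool)
test_mask[i:i + k] = True

assert (x[train_slice] == x[train_mask]).all()
assert (x[test_slice] == x[test_mask]).all()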
def test_generate_sample(self):
    process = ArmaProcess.from_coeffs([0.9])
    np.random.seed(12345)
    sample = process.generate_sample()
    np.random.seed(12345)
    expected = np.random.randn(100)
    for i in range(1, 100):
        expected[i] = 0.9 * expected[i - 1] + expected[i]
    assert_almost_equal(sample, expected)

    process = ArmaProcess.from_coeffs([1.6, -0.9])
    np.random.seed(12345)
    sample = process.generate_sample()
    np.random.seed(12345)
    expected = np.random.randn(100)
    expected[1] = 1.6 * expected[0] + expected[1]
    for i in range(2, 100):
        expected[i] = (1.6 * expected[i - 1] - 0.9 * expected[i - 2]
                       + expected[i])
    assert_almost_equal(sample, expected)

    process = ArmaProcess.from_coeffs([1.6, -0.9])
    np.random.seed(12345)
    sample = process.generate_sample(burnin=100)
    np.random.seed(12345)
    expected = np.random.randn(200)
    expected[1] = 1.6 * expected[0] + expected[1]
    for i in range(2, 200):
        expected[i] = (1.6 * expected[i - 1] - 0.9 * expected[i - 2]
                       + expected[i])
    assert_almost_equal(sample, expected[100:])

    np.random.seed(12345)
    sample = process.generate_sample(nsample=(100, 5))
    assert_equal(sample.shape, (100, 5))
def test_ftest_pvalues(self):
    res = self.results
    use_t = res.use_t
    k_vars = len(res.params)
    # check default use_t
    pvals = [res.wald_test(np.eye(k_vars)[k], use_f=use_t).pvalue
             for k in range(k_vars)]
    assert_allclose(pvals, res.pvalues, rtol=5e-10, atol=1e-25)
    # automatic use_f based on results class use_t
    pvals = [res.wald_test(np.eye(k_vars)[k]).pvalue
             for k in range(k_vars)]
    assert_allclose(pvals, res.pvalues, rtol=5e-10, atol=1e-25)

    # label for pvalues in summary
    string_use_t = 'P>|z|' if use_t is False else 'P>|t|'
    summ = str(res.summary())
    assert_(string_use_t in summ)

    # try/except for models that don't have summary2
    try:
        summ2 = str(res.summary2())
    except AttributeError:
        summ2 = None
    if summ2 is not None:
        assert_(string_use_t in summ2)
def varsim(coefs, intercept, sig_u, steps=100, initvalues=None, seed=None):
    """
    Simulate VAR(p) process, given coefficients and assuming Gaussian noise

    Parameters
    ----------
    coefs : ndarray
        Coefficients for the VAR lags of endog.
    intercept : None or ndarray 1-D (neqs,) or (steps, neqs)
        This can be either the intercept for each equation or an offset.
        If None, then the VAR process has a zero intercept.
        If intercept is 1-D, then the same (endog specific) intercept is
        added to all observations.
        If intercept is 2-D, then it is treated as an offset and is added
        as an observation specific intercept to the autoregression. In this
        case, the intercept/offset should have the same number of rows as
        steps, and the same number of columns as endogenous variables
        (neqs).
    sig_u : ndarray
        Covariance matrix of the residuals or innovations.
        If sig_u is None, then an identity matrix is used.
    steps : None or int
        number of observations to simulate, this includes the initial
        observations to start the autoregressive process.
        If offset is not None, then exog of the model are used if they were
        provided in the model
    seed : None or int
        If seed is not None, then it will be used with
        numpy.random.RandomState for the random variables generated by
        numpy.random.

    Returns
    -------
    endog_simulated : ndarray
        Endog of the simulated VAR process
    """
    rs = np.random.RandomState(seed=seed)
    rmvnorm = rs.multivariate_normal
    p, k, k = coefs.shape
    if sig_u is None:
        sig_u = np.eye(k)
    ugen = rmvnorm(np.zeros(len(sig_u)), sig_u, steps)
    result = np.zeros((steps, k))
    if intercept is not None:
        # intercept can be 2-D like an offset variable
        if np.ndim(intercept) > 1:
            if not len(intercept) == len(ugen):
                raise ValueError('2-D intercept needs to have length `steps`')
        # add intercept/offset also to initial values
        result += intercept
        result[p:] += ugen[p:]
    else:
        result[p:] = ugen[p:]

    # add in AR terms
    for t in range(p, steps):
        ygen = result[t]
        for j in range(p):
            ygen += np.dot(coefs[j], result[t - j - 1])

    return result
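# Minimal usage sketch for varsim (all values made up): simulate 250
# observations of a bivariate VAR(1), y_t = c + A1 y_{t-1} + u_t with
# u_t ~ N(0, sig_u).
import numpy as np

coefs = np.array([[[0.5, 0.1],
                   [0.0, 0.4]]])      # shape (p, k, k) = (1, 2, 2)
intercept = np.array([1.0, -0.5])     # one intercept per equation
sig_u = np.array([[1.0, 0.3],
                  [0.3, 1.0]])

sim = varsim(coefs, intercept, sig_u, steps=250, seed=12345)
print(sim.shape)                      # (250, 2)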
def make_lag_names(names, lag_order, trendorder=1, exog=None):
    """
    Produce list of lag-variable names. Constant / trends go at the beginning

    Examples
    --------
    >>> make_lag_names(['foo', 'bar'], 2, 1)
    ['const', 'L1.foo', 'L1.bar', 'L2.foo', 'L2.bar']
    """
    lag_names = []
    if isinstance(names, string_types):
        names = [names]

    # take care of lagged endogenous names
    for i in range(1, lag_order + 1):
        for name in names:
            if not isinstance(name, string_types):
                name = str(name)  # will need consistent unicode handling
            lag_names.append('L' + str(i) + '.' + name)

    # handle the constant name
    if trendorder != 0:
        lag_names.insert(0, 'const')
    if trendorder > 1:
        lag_names.insert(1, 'trend')
    if trendorder > 2:
        lag_names.insert(2, 'trend**2')
    if exog is not None:
        for i in range(exog.shape[1]):
            lag_names.insert(trendorder + i, "exog" + str(i))
    return lag_names
def initialize(self, model):
    super(GlobalOddsRatio, self).initialize(model)

    if self.model.weights is not None:
        warnings.warn("weights not implemented for GlobalOddsRatio "
                      "cov_struct, using unweighted covariance estimate",
                      NotImplementedWarning)

    # Need to restrict to between-subject pairs
    cpp = []
    for v in model.endog_li:

        # Number of subjects in this group
        m = int(len(v) / self._ncut)
        i1, i2 = np.tril_indices(m, -1)

        cpp1 = {}
        for k1 in range(self._ncut):
            for k2 in range(k1 + 1):
                jj = np.zeros((len(i1), 2), dtype=np.int64)
                jj[:, 0] = i1 * self._ncut + k1
                jj[:, 1] = i2 * self._ncut + k2
                cpp1[(k2, k1)] = jj

        cpp.append(cpp1)

    self.cpp = cpp

    # Initialize the dependence parameters
    self.crude_or = self.observed_crude_oddsratio()
    if self.model.update_dep:
        self.dep_params = self.crude_or
def _eigval_decomp_SZ(self, irf_resim):
    """
    Returns
    -------
    W : array
        eigenvectors
    eigva : list
        eigenvalues
    k : matrix
        indicating column # of largest eigenvalue for each c_i,j
    """
    neqs = self.neqs
    periods = self.periods

    cov_hold = np.zeros((neqs, neqs, periods, periods))
    for i in range(neqs):
        for j in range(neqs):
            cov_hold[i, j, :, :] = np.cov(irf_resim[:, 1:, i, j], rowvar=0)

    W = np.zeros((neqs, neqs, periods, periods))
    eigva = np.zeros((neqs, neqs, periods, 1))
    # integer dtype so k can be used directly as a column index
    k = np.zeros((neqs, neqs), dtype=int)

    for i in range(neqs):
        for j in range(neqs):
            W[i, j, :, :], eigva[i, j, :, 0], k[i, j] = \
                util.eigval_decomp(cov_hold[i, j, :, :])
    return W, eigva, k
def plot_full_acorr(acorr, fontsize=8, linewidth=8, xlabel=None,
                    err_bound=None):
    """
    Plot a grid of autocorrelation functions.

    Parameters
    ----------
    acorr : ndarray
        Autocorrelations with shape (nlags, k, k); panel (i, j) plots
        acorr[:, i, j].
    fontsize : int
        Font size used in the figure.
    linewidth : int
        Line width passed to acorr_plot.
    xlabel : str, optional
        Label for the x-axis of each panel.
    err_bound : float, optional
        If given, horizontal dashed lines are drawn at +/- err_bound.
    """
    import matplotlib.pyplot as plt

    config = MPLConfigurator()
    config.set_fontsize(fontsize)

    k = acorr.shape[1]
    fig, axes = plt.subplots(k, k, figsize=(10, 10), squeeze=False)

    for i in range(k):
        for j in range(k):
            ax = axes[i][j]
            acorr_plot(acorr[:, i, j], linewidth=linewidth,
                       xlabel=xlabel, ax=ax)

            if err_bound is not None:
                ax.axhline(err_bound, color='k', linestyle='--')
                ax.axhline(-err_bound, color='k', linestyle='--')

    adjust_subplots()
    config.revert()
    return fig
def levinson_durbin_nitime(s, order=10, isacov=False):
    '''Levinson-Durbin recursion for autoregressive processes

    '''
    # from nitime
    ## if sxx is not None and type(sxx) == np.ndarray:
    ##     sxx_m = sxx[:order+1]
    ## else:
    ##     sxx_m = ut.autocov(s)[:order+1]

    if isacov:
        sxx_m = s
    else:
        sxx_m = acovf(s)[:order + 1]  # not tested

    phi = np.zeros((order + 1, order + 1), 'd')
    sig = np.zeros(order + 1)
    # initial points for the recursion
    phi[1, 1] = sxx_m[1] / sxx_m[0]
    sig[1] = sxx_m[0] - phi[1, 1] * sxx_m[1]
    for k in range(2, order + 1):
        phi[k, k] = (sxx_m[k] - np.dot(phi[1:k, k - 1],
                                       sxx_m[1:k][::-1])) / sig[k - 1]
        for j in range(1, k):
            phi[j, k] = phi[j, k - 1] - phi[k, k] * phi[k - j, k - 1]
        sig[k] = sig[k - 1] * (1 - phi[k, k]**2)

    sigma_v = sig[-1]
    arcoefs = phi[1:, -1]
    # partial autocorrelations are the diagonal of phi; pacf was previously
    # returned without being defined
    pacf = np.diag(phi).copy()
    pacf[0] = 1.
    return sigma_v, arcoefs, pacf, phi  # return everything
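# Sanity-check sketch: an AR(1) with coefficient a and unit innovation
# variance has autocovariance gamma(k) = a**k / (1 - a**2); feeding the exact
# autocovariance in (isacov=True) should recover the coefficient and the
# innovation variance.
import numpy as np

a = 0.6
acov = a ** np.arange(11) / (1 - a**2)
sigma_v, arcoefs, pacf, phi = levinson_durbin_nitime(acov, order=10,
                                                     isacov=True)
print(arcoefs[0])   # ~0.6; higher-order coefficients ~0
print(sigma_v)      # ~1.0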
def prob_quantize_cdf(binsx, binsy, cdf):
    '''quantize a continuous distribution given by a cdf

    Parameters
    ----------
    binsx : array_like, 1d
        binedges for the first variable
    binsy : array_like, 1d
        binedges for the second variable
    cdf : callable
        bivariate cdf, evaluated on the grid of bin edges

    '''
    binsx = np.asarray(binsx)
    binsy = np.asarray(binsy)
    nx = len(binsx) - 1
    ny = len(binsy) - 1
    probs = np.nan * np.ones((nx, ny))  # np.empty(nx, ny)
    cdf_values = cdf(binsx[:, None], binsy)
    cdf_func = lambda x, y: cdf_values[x, y]
    for xind in range(1, nx + 1):
        for yind in range(1, ny + 1):
            upper = (xind, yind)
            lower = (xind - 1, yind - 1)
            probs[xind - 1, yind - 1] = prob_bv_rectangle(lower, upper,
                                                          cdf_func)

    assert not np.isnan(probs).any()
    return probs
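# Hedged sketch: quantize the independence cdf F(x, y) = x * y on [0, 1]^2.
# prob_bv_rectangle is not shown above; the version here assumes the usual
# inclusion-exclusion rectangle probability and is only for illustration.
import numpy as np

def prob_bv_rectangle(lower, upper, cdf):
    # P(l1 < X <= u1, l2 < Y <= u2) from the joint cdf
    return (cdf(upper[0], upper[1]) - cdf(upper[0], lower[1])
            - cdf(lower[0], upper[1]) + cdf(lower[0], lower[1]))

bins = np.linspace(0, 1, 5)
probs = prob_quantize_cdf(bins, bins, lambda x, y: x * y)
print(probs.sum())   # 1.0; each of the 16 cells gets probability 1/16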
def dataset(self, as_dict=False):
    """
    Returns a Python generator object for iterating over the dataset.

    Parameters
    ----------
    as_dict : bool, optional
        If as_dict is True, yield each row of observations as a dict.
        If False, yields each row of observations as a list.

    Returns
    -------
    Generator object for iterating over the dataset.  Yields each row of
    observations as a list by default.

    Notes
    -----
    If missing_values is True during instantiation of StataReader then
    observations with _StataMissingValue(s) are not filtered and should
    be handled by your application.
    """
    try:
        self._file.seek(self._data_location)
    except Exception:
        pass

    if as_dict:
        vars = lmap(str, self.variables())
        for i in range(len(self)):
            yield dict(zip(vars, self._next()))
    else:
        for i in range(self._header['nobs']):
            yield self._next()
def prob_quantize_cdf_old(binsx, binsy, cdf):
    '''quantize a continuous distribution given by a cdf

    old version without precomputing cdf values

    Parameters
    ----------
    binsx : array_like, 1d
        binedges for the first variable
    binsy : array_like, 1d
        binedges for the second variable
    cdf : callable
        bivariate cdf

    '''
    binsx = np.asarray(binsx)
    binsy = np.asarray(binsy)
    nx = len(binsx) - 1
    ny = len(binsy) - 1
    probs = np.nan * np.ones((nx, ny))  # np.empty(nx, ny)
    for xind in range(1, nx + 1):
        for yind in range(1, ny + 1):
            upper = (binsx[xind], binsy[yind])
            lower = (binsx[xind - 1], binsy[yind - 1])
            probs[xind - 1, yind - 1] = prob_bv_rectangle(lower, upper, cdf)

    assert not np.isnan(probs).any()
    return probs
def approx_hess2(x, f, epsilon=None, args=(), kwargs={}, return_grad=False):
    n = len(x)
    # NOTE: Ridout suggests using eps**(1/4)*theta
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)
    f0 = f(*((x,) + args), **kwargs)
    # Compute forward step
    g = np.zeros(n)
    gg = np.zeros(n)
    for i in range(n):
        g[i] = f(*((x + ee[i, :],) + args), **kwargs)
        gg[i] = f(*((x - ee[i, :],) + args), **kwargs)

    hess = np.outer(h, h)  # this is now epsilon**2
    # Compute "double" forward step
    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args),
                            **kwargs) -
                          g[i] - g[j] + f0 +
                          f(*((x - ee[i, :] - ee[j, :],) + args),
                            **kwargs) -
                          gg[i] - gg[j] + f0) / (2 * hess[i, j])
            hess[j, i] = hess[i, j]
    if return_grad:
        grad = (g - f0) / h
        return hess, grad
    else:
        return hess
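# Quick-check sketch: for the quadratic f(x) = x' A x the exact Hessian is
# 2 A, so the finite-difference approximation should be close to
# [[2, 0], [0, 6]] at any evaluation point.
import numpy as np

A = np.diag([1.0, 3.0])
f = lambda x: x @ A @ x
H = approx_hess2(np.array([0.5, -1.0]), f)
print(np.round(H, 4))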
def _prepare_structured_array(self, data):
    self.nobs = len(data)
    self.nvar = len(data.dtype)
    self.data = data
    self.datarows = iter(data)
    dtype = data.dtype
    descr = dtype.descr
    if dtype.names is None:
        varlist = _default_names(self.nvar)
    else:
        varlist = dtype.names

    # check for datetime and change the type
    convert_dates = self._convert_dates
    if convert_dates is not None:
        convert_dates = _maybe_convert_to_int_keys(convert_dates,
                                                   varlist)
        self._convert_dates = convert_dates
        for key in convert_dates:
            descr[key] = (
                descr[key][0],
                _convert_datetime_to_stata_type(convert_dates[key])
            )
        dtype = np.dtype(descr)

    self.varlist = varlist
    self.typlist = [_dtype_to_stata_type(dtype[i])
                    for i in range(self.nvar)]
    self.fmtlist = [_dtype_to_default_stata_fmt(dtype[i])
                    for i in range(self.nvar)]
    # set the given format for the datetime cols
    if convert_dates is not None:
        for key in convert_dates:
            self.fmtlist[key] = convert_dates[key]
def get_columns(self, *args, **kw):
    """
    Calling function for factor instance.
    """
    v = self.namespace[self._name]
    while True:
        if callable(v):
            if isinstance(v, (Term, Formula)):
                v = copy.copy(v)
                v.namespace = self.namespace
            v = v(*args, **kw)
        else:
            break

    n = len(v)

    if self.ordinal:
        col = [float(self.keys.index(v[i])) for i in range(n)]
        return np.array(col)
    else:
        value = []
        for key in self.keys:
            col = [float((v[i] == key)) for i in range(n)]
            value.append(col)
        return np.array(value)
def _omega_forc_cov(self, steps):
    # Approximate MSE matrix \Omega(h) as defined in Lutkepohl p. 97
    G = self._zz
    Ginv = L.inv(G)

    # memoize powers of B for speedup
    # TODO: see if can memoize better
    B = self._bmat_forc_cov()
    _B = {}

    def bpow(i):
        if i not in _B:
            _B[i] = np.linalg.matrix_power(B, i)
        return _B[i]

    phis = self.ma_rep(steps)
    sig_u = self.sigma_u

    omegas = np.zeros((steps, self.neqs, self.neqs))
    for h in range(1, steps + 1):
        if h == 1:
            omegas[h - 1] = self.df_model * self.sigma_u
            continue

        om = omegas[h - 1]
        for i in range(h):
            for j in range(h):
                Bi = bpow(h - 1 - i)
                Bj = bpow(h - 1 - j)
                mult = np.trace(chain_dot(Bi.T, Ginv, Bj, G))
                om += mult * chain_dot(phis[i], sig_u, phis[j].T)
        omegas[h - 1] = om

    return omegas
def _band2array(a, lower=0, symmetric=False, hermitian=False):
    """
    Take an upper or lower triangular banded matrix and return a
    numpy array.

    INPUTS:
       a         -- a matrix in upper or lower triangular banded matrix
       lower     -- is the matrix upper or lower triangular?
       symmetric -- if True, return the original result plus its transpose
       hermitian -- if True (and symmetric False), return the original
                    result plus its conjugate transposed
    """
    n = a.shape[1]
    r = a.shape[0]
    _a = 0

    if not lower:
        for j in range(r):
            _b = np.diag(a[r - 1 - j], k=j)[j:(n + j), j:(n + j)]
            _a += _b
            if symmetric and j > 0:
                _a += _b.T
            elif hermitian and j > 0:
                _a += _b.conjugate().T
    else:
        for j in range(r):
            _b = np.diag(a[j], k=j)[0:n, 0:n]
            _a += _b
            if symmetric and j > 0:
                _a += _b.T
            elif hermitian and j > 0:
                _a += _b.conjugate().T
        _a = _a.T

    return _a
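# Hedged sketch: rebuild a symmetric tridiagonal matrix from lower banded
# storage, where row 0 holds the main diagonal and row j the j-th
# subdiagonal (zero padded at the end), matching how the loop above indexes.
import numpy as np

band = np.array([[2.0, 2.0, 2.0],     # main diagonal
                 [-1.0, -1.0, 0.0]])  # first subdiagonal
full = _band2array(band, lower=1, symmetric=True)
print(full)
# [[ 2. -1.  0.]
#  [-1.  2. -1.]
#  [ 0. -1.  2.]]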
def product_func(value, d1=d1, d2=d2):
    out = []
    for r in range(d1):
        for s in range(d2):
            out.append(value[r] * value[d1 + s])
    return np.array(out)
def generate_ordinal():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 1
    beta[4] = -1

    rz = 0.5

    OUT = open("gee_ordinal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, 5))
        for j in range(5):
            x[:, j] += np.random.normal()
        pr = np.dot(x, beta)
        pr = np.array([1, 0, -0.5]) + pr[:, None]
        pr = 1 / (1 + np.exp(-pr))

        z = rz * np.random.normal() +\
            np.sqrt(1 - rz**2) * np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:, None] > pr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()
def generate_poisson():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 0.5
    beta[4] = -0.5

    nclust = 100

    rz = 0.5

    OUT = open("gee_poisson_1.csv", "w")

    for i in range(nclust):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, 5))
        for j in range(5):
            x[:, j] += np.random.normal()

        lp = np.dot(x, beta)
        E = np.exp(lp)
        y = [np.random.poisson(e) for e in E]
        y = np.array(y)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()
def prob_mv_grid(bins, cdf, axis=-1):
    '''helper function for probability of a rectangle grid in a
    multivariate distribution

    how does this generalize to more than 2 variates ?

    bins : tuple
        tuple of bin edges, currently it is assumed that they broadcast
        correctly

    '''
    if not isinstance(bins, np.ndarray):
        bins = lmap(np.asarray, bins)
        n_dim = len(bins)
        bins_ = []
        # broadcast if binedges are 1d
        if all(lmap(np.ndim, bins) == np.ones(n_dim)):
            for d in range(n_dim):
                sl = [None] * n_dim
                sl[d] = slice(None)
                # tuple indexing required by modern numpy
                bins_.append(bins[d][tuple(sl)])
    else:  # assume it is already correctly broadcasted
        n_dim = bins.shape[0]
        bins_ = bins

    cdf_values = cdf(bins_)
    probs = cdf_values.copy()
    for d in range(n_dim):
        probs = np.diff(probs, axis=d)

    return probs
def _compute_J(self, A_solve, B_solve):

    # first compute appropriate duplication matrix
    # taken from Magnus and Neudecker (1980),
    # "The Elimination Matrix: Some Lemmas and Applications"
    # the creation of the D_n matrix follows MN (1980) directly,
    # while the rest follows Hamilton (1994)

    neqs = self.neqs
    sigma_u = self.sigma_u
    A_mask = self.A_mask
    B_mask = self.B_mask

    # first generate duplication matrix, see MN (1980) for notation
    D_nT = np.zeros([int((1.0 / 2) * (neqs) * (neqs + 1)), neqs**2])

    for j in range(neqs):
        i = j
        while j <= i < neqs:
            u = np.zeros([int((1.0 / 2) * neqs * (neqs + 1)), 1])
            u[int(j * neqs + (i + 1) - (1.0 / 2) * (j + 1) * j - 1)] = 1
            Tij = np.zeros([neqs, neqs])
            Tij[i, j] = 1
            Tij[j, i] = 1
            D_nT = D_nT + np.dot(u, (Tij.ravel('F')[:, None]).T)
            i = i + 1

    D_n = D_nT.T
    D_pl = npl.pinv(D_n)

    # generate S_B and S_D
    S_B = np.zeros((neqs**2, len(A_solve[A_mask])))
    S_D = np.zeros((neqs**2, len(B_solve[B_mask])))

    j = 0
    j_d = 0
    if len(A_solve[A_mask]) != 0:
        A_vec = np.ravel(A_mask, order='F')
        for k in range(neqs**2):
            if A_vec[k]:
                S_B[k, j] = -1
                j += 1
    if len(B_solve[B_mask]) != 0:
        B_vec = np.ravel(B_mask, order='F')
        for k in range(neqs**2):
            if B_vec[k]:
                S_D[k, j_d] = 1
                j_d += 1

    # now compute J
    invA = npl.inv(A_solve)
    J_p1i = np.dot(np.dot(D_pl, np.kron(sigma_u, invA)), S_B)
    J_p1 = -2.0 * J_p1i
    J_p2 = np.dot(np.dot(D_pl, np.kron(invA, invA)), S_D)
    J = np.append(J_p1, J_p2, axis=1)

    return J
def test_zero_constrained(self):
    # not completely generic yet
    if isinstance(self.results.model, (sm.GEE)):
        # GEE does not subclass LikelihoodModel
        pytest.skip('GEE does not subclass LikelihoodModel')

    use_start_params = not isinstance(self.results.model,
                                      (sm.RLM, sm.OLS, sm.WLS))
    self.use_start_params = use_start_params  # attach for _get_constrained

    keep_index = list(range(self.results.model.exog.shape[1]))
    # index for params might include extra params
    keep_index_p = list(range(self.results.params.shape[0]))
    drop_index = [1]
    for i in drop_index:
        del keep_index[i]
        del keep_index_p[i]

    if use_start_params:
        res1 = self.results.model._fit_zeros(
            keep_index, maxiter=500,
            start_params=self.results.params)
    else:
        res1 = self.results.model._fit_zeros(keep_index, maxiter=500)

    res2 = self._get_constrained(keep_index, keep_index_p)

    assert_allclose(res1.params[keep_index_p], res2.params, rtol=1e-10,
                    atol=1e-10)
    assert_equal(res1.params[drop_index], 0)
    assert_allclose(res1.bse[keep_index_p], res2.bse, rtol=1e-10,
                    atol=1e-10)
    assert_equal(res1.bse[drop_index], 0)
    # OSX has many slight failures on this test
    tol = 1e-8 if PLATFORM_OSX else 1e-10
    assert_allclose(res1.tvalues[keep_index_p], res2.tvalues,
                    rtol=tol, atol=tol)
    assert_allclose(res1.pvalues[keep_index_p], res2.pvalues,
                    rtol=tol, atol=tol)

    if hasattr(res1, 'resid'):
        # discrete models, Logit don't have `resid` yet
        # atol discussion at gh-5158
        rtol = 1e-10
        atol = 1e-12
        if PLATFORM_OSX:
            # GH 5628
            rtol = 1e-8
            atol = 1e-10
        assert_allclose(res1.resid, res2.resid, rtol=rtol, atol=atol)

    ex = self.results.model.exog.mean(0)
    predicted1 = res1.predict(ex, **self.predict_kwds)
    predicted2 = res2.predict(ex[keep_index], **self.predict_kwds)
    assert_allclose(predicted1, predicted2, rtol=1e-10)

    ex = self.results.model.exog[:5]
    predicted1 = res1.predict(ex, **self.predict_kwds)
    predicted2 = res2.predict(ex[:, keep_index], **self.predict_kwds)
    assert_allclose(predicted1, predicted2, rtol=1e-10)
def err_band_sz1(self, orth=False, svar=False, repl=1000,
                 signif=0.05, seed=None, burn=100, component=None):
    """
    IRF Sims-Zha error band method 1. Assumes symmetric error bands around
    mean.

    Parameters
    ----------
    orth : bool, default False
        Compute orthogonalized impulse responses
    repl : int, default 1000
        Number of MC replications
    signif : float (0 < signif < 1)
        Significance level for error bars, defaults to 95% CI
    seed : int, default None
        np.random seed
    burn : int, default 100
        Number of initial simulated obs to discard
    component : neqs x neqs array, default to largest for each
        Index of column of eigenvector/value to use for each error band
        Note: period of impulse (t=0) is not included when computing
        principal component

    References
    ----------
    Sims, Christopher A., and Tao Zha. 1999. "Error Bands for Impulse
    Response". Econometrica 67: 1113-1155.
    """
    model = self.model
    periods = self.periods
    irfs = self._choose_irfs(orth, svar)
    neqs = self.neqs
    irf_resim = model.irf_resim(orth=orth, repl=repl, T=periods,
                                seed=seed, burn=burn)
    q = util.norm_signif_level(signif)

    W, eigva, k = self._eigval_decomp_SZ(irf_resim)

    if component is not None:
        if np.shape(component) != (neqs, neqs):
            raise ValueError("Component array must be " + str(neqs) +
                             " x " + str(neqs))
        if np.argmax(component) >= neqs * periods:
            raise ValueError("At least one of the components does not exist")
        else:
            k = component

    # here take the kth column of W, which we determine by finding
    # the largest eigenvalue of the covariance matrix
    lower = np.copy(irfs)
    upper = np.copy(irfs)
    for i in range(neqs):
        for j in range(neqs):
            lower[1:, i, j] = irfs[1:, i, j] + W[i, j, :, k[i, j]] * \
                q * np.sqrt(eigva[i, j, k[i, j]])
            upper[1:, i, j] = irfs[1:, i, j] - W[i, j, :, k[i, j]] * \
                q * np.sqrt(eigva[i, j, k[i, j]])

    return lower, upper
def select_order(self, maxlag, ic, trend='c', method='mle'):
    """
    Select the lag order according to the information criterion.

    Parameters
    ----------
    maxlag : int
        The highest lag length tried. See `AR.fit`.
    ic : str {'aic','bic','hqic','t-stat'}
        Criterion used for selecting the optimal lag length.
        See `AR.fit`.
    trend : str {'c','nc'}
        Whether to include a constant or not. 'c' - include constant.
        'nc' - no constant.
    method : str
        The fitting method to use. See `AR.fit`.

    Returns
    -------
    bestlag : int
        Best lag according to IC.
    """
    endog = self.endog

    # make Y and X with same nobs to compare ICs
    Y = endog[maxlag:]
    self.Y = Y  # attach to get correct fit stats
    X = self._stackX(maxlag, trend)  # sets k_trend
    self.X = X
    k = self.k_trend  # k_trend set in _stackX
    k = max(1, k)  # handle if startlag is 0
    results = {}

    if ic != 't-stat':
        for lag in range(k, maxlag + 1):
            # have to reinstantiate the model to keep comparable models
            endog_tmp = endog[maxlag - lag:]
            fit = AR(endog_tmp).fit(maxlag=lag, method=method,
                                    full_output=0, trend=trend,
                                    maxiter=100, disp=0)
            results[lag] = getattr(fit, ic)
        bestic, bestlag = min((res, k) for k, res in iteritems(results))
    else:  # choose by last t-stat.
        stop = 1.6448536269514722  # for t-stat, norm.ppf(.95)
        for lag in range(maxlag, k - 1, -1):
            # have to reinstantiate the model to keep comparable models
            endog_tmp = endog[maxlag - lag:]
            fit = AR(endog_tmp).fit(maxlag=lag, method=method,
                                    full_output=0, trend=trend,
                                    maxiter=35, disp=-1)

            bestlag = 0
            if np.abs(fit.tvalues[-1]) >= stop:
                bestlag = lag
                break
    return bestlag
def interactions(terms, order=[1, 2]):
    """
    Output all pairwise interactions of given order of a
    sequence of terms.

    The argument order is a sequence specifying which order
    of interactions should be generated -- the default
    creates main effects and two-way interactions. If order
    is an integer, it is changed to range(1,order+1), so
    order=3 is equivalent to order=[1,2,3], generating
    all one, two and three-way interactions.

    If any entry of order is greater than len(terms), it is
    effectively treated as len(terms).

    >>> print interactions([Term(l) for l in ['a', 'b', 'c']])
    <formula: a*b + a*c + b*c + a + b + c>
    >>>
    >>> print interactions([Term(l) for l in ['a', 'b', 'c']],
    ...                    order=list(range(5)))
    <formula: a*b + a*b*c + a*c + b*c + a + b + c>
    >>>
    """
    l = len(terms)

    values = {}

    if np.asarray(order).shape == ():
        order = lrange(1, int(order) + 1)

    # First order
    for o in order:
        I = np.indices((l,) * (o))
        I.shape = (I.shape[0], np.prod(I.shape[1:]))
        for m in range(I.shape[1]):
            # only keep combinations that have unique entries
            if (np.unique(I[:, m]).shape == I[:, m].shape and
                    np.all(np.equal(np.sort(I[:, m]), I[:, m]))):
                ll = [terms[j] for j in I[:, m]]
                v = ll[0]
                for ii in range(len(ll) - 1):
                    v *= ll[ii + 1]
                values[tuple(I[:, m])] = v

    key = list(iterkeys(values))[0]
    value = values[key]
    del(values[key])

    for v in itervalues(values):
        value += v
    return value
def _prepare_ndarray(self, data):
    if data.ndim == 1:
        data = data[:, None]
    self.nobs, self.nvar = data.shape
    self.data = data
    self.datarows = iter(data)
    # TODO: this should be user settable
    dtype = data.dtype
    self.varlist = _default_names(self.nvar)
    self.typlist = [_dtype_to_stata_type(dtype) for i in range(self.nvar)]
    self.fmtlist = [_dtype_to_default_stata_fmt(dtype)
                    for i in range(self.nvar)]
def _make_exog_names(exog):
    exog_var = exog.var(0)
    if (exog_var == 0).any():
        # assumes one constant in first or last position
        # avoid exception if more than one constant
        const_idx = exog_var.argmin()
        exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
        exog_names.insert(const_idx, 'const')
    else:
        exog_names = ['x%d' % i for i in range(1, exog.shape[1] + 1)]

    return exog_names
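# Usage sketch: the constant column is detected through its zero variance
# and named 'const'; the remaining columns are numbered x1, x2, ...
import numpy as np

exog = np.column_stack([np.ones(10), np.arange(10.), np.arange(10.) ** 2])
print(_make_exog_names(exog))   # ['const', 'x1', 'x2']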
def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False):
    '''
    Gradient of function, or Jacobian if function f returns 1d array

    Parameters
    ----------
    x : array
        parameters at which the derivative is evaluated
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x
        for `centered` == False and EPS**(1/3)*x for `centered` == True.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.
    centered : bool
        Whether central difference should be returned. If not, does forward
        differencing.

    Returns
    -------
    grad : array
        gradient or Jacobian

    Notes
    -----
    If f returns a 1d array, it returns a Jacobian. If a 2d array is
    returned by f (e.g., with a value for each observation), it returns a
    3d array with the Jacobian of each observation with shape
    xk x nobs x xk. I.e., the Jacobian of the first observation would
    be [:, 0, :]
    '''
    n = len(x)
    # TODO: add scaled stepsize
    f0 = f(*((x,) + args), **kwargs)
    dim = np.atleast_1d(f0).shape  # it could be a scalar
    grad = np.zeros((n,) + dim, np.promote_types(float, x.dtype))
    ei = np.zeros((n,), float)
    if not centered:
        epsilon = _get_epsilon(x, 2, epsilon, n)
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x + ei,) + args), **kwargs) - f0) / epsilon[k]
            ei[k] = 0.0
    else:
        epsilon = _get_epsilon(x, 3, epsilon, n) / 2.
        for k in range(len(x)):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x + ei,) + args), **kwargs) -
                          f(*((x - ei,) + args), **kwargs)) / (2 * epsilon[k])
            ei[k] = 0.0

    return grad.squeeze().T
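# Usage sketch: the gradient of f(x) = sum(x**2) is 2*x, so both difference
# schemes should be close to [2., -4., 6.]; the centered version is
# typically more accurate.
import numpy as np

f = lambda x: np.sum(x ** 2)
x0 = np.array([1.0, -2.0, 3.0])
print(approx_fprime(x0, f))                 # forward differences
print(approx_fprime(x0, f, centered=True))  # central differences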
def gram(self, d=0):
    """
    Compute Gram inner product matrix, storing it in lower
    triangular banded form.

    The (i,j) entry is

    G_ij = integral b_i^(d) b_j^(d)

    where b_i are the basis elements of the BSpline and (d) is the
    d-th derivative.

    If d is a matrix then, it is assumed to specify a differential
    operator as follows: the first row represents the order of derivative
    with the second row the coefficient corresponding to that order.

    For instance:

    [[2, 3],
     [3, 1]]

    represents 3 * f^(2) + 1 * f^(3).

    INPUTS:
       d    -- which derivative to apply to each basis element,
               if d is a matrix, it is assumed to specify
               a differential operator as above

    OUTPUTS: gram
       gram -- the matrix of inner products of (derivatives)
               of the BSpline elements
    """
    d = np.squeeze(d)
    if np.asarray(d).shape == ():
        self.g = _hbspline.gram(self.tau, self.m, int(d), int(d))
    else:
        d = np.asarray(d)
        if d.shape[0] != 2:
            raise ValueError("if d is not an integer, expecting a jx2 "
                             "array with first row indicating order "
                             "of derivative, second row coefficient "
                             "in front.")
        if d.shape == (2,):
            d.shape = (2, 1)
        self.g = 0
        for i in range(d.shape[1]):
            for j in range(d.shape[1]):
                self.g += d[1, i] * d[1, j] * _hbspline.gram(
                    self.tau, self.m, int(d[0, i]), int(d[0, j]))
    self.g = self.g.T
    self.d = d
    return np.nan_to_num(self.g)
def summary_params_2dflat(result, endog_names=None, exog_names=None,
                          alpha=0.05, use_t=True, keep_headers=True,
                          endog_cols=False):
    '''summary table for parameters that are 2d, e.g. multi-equation models

    Parameters
    ----------
    result : result instance
        the result instance with params, bse, tvalues and conf_int
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    alpha : float
        significance level for the confidence intervals, default alpha=0.05
        gives 95% confidence intervals
    use_t : bool
        indicator whether the p-values are based on the Student-t
        distribution (if True) or on the normal distribution (if False)
    keep_headers : bool
        If true (default), then sub-tables keep their headers. If false,
        then only the first headers are kept, the other headers are
        blanked out
    endog_cols : bool
        If false (default) then params and other result statistics have
        equations by rows. If true, then equations are assumed to be in
        columns. Not implemented yet.

    Returns
    -------
    tables : list of SimpleTable
        this contains a list of all separate Subtables
    table_all : SimpleTable
        the merged table with results concatenated for each row of the
        parameter array

    '''
    res = result
    params = res.params
    if params.ndim == 2:  # we've got multiple equations
        n_equ = params.shape[1]
        if not len(endog_names) == params.shape[1]:
            raise ValueError('endog_names has wrong length')
    else:
        if not len(endog_names) == len(params):
            raise ValueError('endog_names has wrong length')
        n_equ = 1

    # VAR doesn't have conf_int
    # params = res.params.T  # this is a convention for multi-eq models

    if not isinstance(endog_names, list):
        # this might be specific to multinomial logit type, move?
        if endog_names is None:
            endog_basename = 'endog'
        else:
            endog_basename = endog_names
        # TODO: note, the [1:] is specific to current MNLogit
        endog_names = res.model.endog_names[1:]

    # check if we have the right length of names
    tables = []
    for eq in range(n_equ):
        restup = (res, res.params[:, eq], res.bse[:, eq],
                  res.tvalues[:, eq], res.pvalues[:, eq],
                  res.conf_int(alpha)[eq])

        # not used anymore in current version
        # if skip_headers2:
        #     skiph = (row != 0)
        # else:
        #     skiph = False
        skiph = False
        tble = summary_params(restup, yname=endog_names[eq],
                              xname=exog_names, alpha=alpha,
                              use_t=use_t, skip_header=skiph)

        tables.append(tble)

    # add titles, they will be moved to header lines in table_extend
    for i in range(len(endog_names)):
        tables[i].title = endog_names[i]

    table_all = table_extend(tables, keep_headers=keep_headers)

    return tables, table_all
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Parameters
    ----------
    x : array
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array
        with observations in columns.
    col : str, int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column containing the variable. Or `col`
        can be an int of the zero-based column index. If it's a 1d array
        `col` can be None.
    lags : int
        The number of lags desired.
    drop : bool
        Whether to keep the contemporaneous variable for the data.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If int inserts the lags at int.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load(as_pandas=False)
    >>> data = data.data[['year','quarter','realgdp','cpi']]
    >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

    Notes
    -----
    Trims the array both forward and backward, so that the length of the
    returned array is len(`X`) - lags. The lags are returned in increasing
    order, i.e., t-1, t-2, ..., t-lags
    """
    if x.dtype.names:
        names = x.dtype.names
        if not col and np.squeeze(x).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        elif len(names) == 1:
            col = names[0]
        if isinstance(col, (int, long)):
            col = x.dtype.names[col]
        if not PY3:
            # TODO: Get rid of this kludge.  See GH # 3658
            names = [bytes(name)
                     if isinstance(name, unicode)  # noqa:F821
                     else name for name in names]
            # Fail loudly if there is a non-ascii name.
            x.dtype.names = names
            if isinstance(col, unicode):  # noqa:F821
                col = bytes(col)

        contemp = x[col]

        # make names for lags
        tmp_names = [col + '_' + 'L(%i)' % i for i in range(1, lags + 1)]
        ndlags = lagmat(contemp, maxlag=lags, trim='Both')

        # get index for return
        if insert is True:
            ins_idx = list(names).index(col) + 1
        elif insert is False:
            ins_idx = len(names) + 1
        else:  # insert is an int
            if insert > len(names):
                import warnings
                warnings.warn("insert > number of variables, inserting at"
                              " the last position", ValueWarning)
            ins_idx = insert

        first_names = list(names[:ins_idx])
        last_names = list(names[ins_idx:])

        if drop:
            if col in first_names:
                first_names.pop(first_names.index(col))
            else:
                last_names.pop(last_names.index(col))

        if first_names:  # only do this if x isn't "empty"
            # Workaround to avoid NumPy FutureWarning
            _x = recarray_select(x, first_names)
            first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T,
                                           usemask=False)
        else:
            first_arr = np.zeros(len(x) - lags,
                                 dtype=lzip(tmp_names,
                                            (x[col].dtype,) * lags))
            for i, name in enumerate(tmp_names):
                first_arr[name] = ndlags[:, i]

        if last_names:
            return nprf.append_fields(first_arr, last_names,
                                      [x[name][lags:] for name in
                                       last_names], usemask=False)
        else:  # lags for last variable
            return first_arr
    else:  # we have an ndarray
        if x.ndim == 1:  # make 2d if 1d
            x = x[:, None]
        if col is None:
            col = 0

        # handle negative index
        if col < 0:
            col = x.shape[1] + col

        contemp = x[:, col]

        if insert is True:
            ins_idx = col + 1
        elif insert is False:
            ins_idx = x.shape[1]
        else:
            if insert < 0:  # handle negative index
                insert = x.shape[1] + insert + 1
            if insert > x.shape[1]:
                insert = x.shape[1]
                import warnings
                warnings.warn("insert > number of variables, inserting at"
                              " the last position", ValueWarning)
            ins_idx = insert

        ndlags = lagmat(contemp, lags, trim='Both')
        first_cols = lrange(ins_idx)
        last_cols = lrange(ins_idx, x.shape[1])
        if drop:
            if col in first_cols:
                first_cols.pop(first_cols.index(col))
            else:
                last_cols.pop(last_cols.index(col))
        return np.column_stack((x[lags:, first_cols], ndlags,
                                x[lags:, last_cols]))
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables

    Parameters
    ----------
    x : array_like, 2d
        2d data, observation in rows and variables in columns
    maxlag0 : int
        for first variable all lags from zero to maxlag are included
    maxlagex : None or int
        max lag for all other variables all lags from zero to maxlag are
        included
    dropex : int (default is 0)
        exclude first dropex lags from other variables
        for all variables, except the first, lags from dropex to maxlagex
        are included
    trim : str
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none' : no trimming of observations
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations, columns ordered by variable

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience
    """
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0], maxlag, trim=trim,
                      original='in', use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k], maxlag, trim=trim,
                          original='in', use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [lagmat(x[:, 0], maxlag, trim=trim,
                     original='in')[:, :maxlag0 + 1]]
    for k in range(1, nvar):
        lagsli.append(lagmat(x[:, k], maxlag, trim=trim,
                             original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
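# Usage sketch: two variables with lags 0..2 each; with trim='both' the rows
# without a full set of lags are dropped, so 6 observations give 4 rows and
# the columns are [y_t, y_{t-1}, y_{t-2}, z_t, z_{t-1}, z_{t-2}].
import numpy as np

x = np.column_stack([np.arange(6.0), 10 * np.arange(6.0)])
out = lagmat2ds(x, maxlag0=2, trim='both')
print(out.shape)   # (4, 6)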
thread_pool_num = int(sys.argv[3])
alpha = float(sys.argv[4])

# data_input = spark.read.csv(data_file_name, header=True,
#                             inferSchema=True).cache()
data_input = spark.read.csv(data_file_name, header=True,
                            inferSchema=True).persist(
    pyspark.StorageLevel.MEMORY_AND_DISK_2)
k = len(data_input.columns)
data_input = data_input.withColumn("id", monotonically_increasing_id())

# add time lag for all x_name columns
w = Window().orderBy(col("id"))
x_list = data_input.columns
x_list.remove("id")
for x_name_item in x_list:
    for i in range(1, maxlag + 1):
        data_input = data_input.withColumn(
            "%s_t-%s" % (x_name_item, str(i)),
            lag(data_input[x_name_item], i, 0).over(w))
data_input.cache()
# data_input.show(5)
# spark.stop()
n = data_input.count()
# print(x_list)
# maxlag = 3


def regression(x_name, y_name, maxlag, data=data_input):
    print("!!!!!!!!!start regression!!!!!!!!!")
def test_lutkepohl_parse():
    files = ['e%d' % i for i in range(1, 7)]
    for f in files:
        get_lutkepohl_data(f)
def test_irf_stderr(self):
    irf_stderr = self.irf.stderr(orth=False)
    for i in range(1, 1 + len(self.lut.irf_stderr)):
        assert_almost_equal(np.round(irf_stderr[i], 3),
                            self.lut.irf_stderr[i - 1])
def update(self, params):

    if self.model.weights is not None:
        warnings.warn("weights not implemented for autoregressive "
                      "cov_struct, using unweighted covariance estimate",
                      NotImplementedWarning)

    endog = self.model.endog_li
    time = self.model.time_li

    # Only need to compute this once
    if self.designx is not None:
        designx = self.designx
    else:
        designx = []
        for i in range(self.model.num_group):

            ngrp = len(endog[i])
            if ngrp == 0:
                continue

            # Loop over pairs of observations within a cluster
            for j1 in range(ngrp):
                for j2 in range(j1):
                    designx.append(self.dist_func(time[i][j1, :],
                                                  time[i][j2, :]))

        designx = np.array(designx)
        self.designx = designx

    scale = self.model.estimate_scale()
    varfunc = self.model.family.variance
    cached_means = self.model.cached_means

    # Weights
    var = 1. - self.dep_params**(2 * designx)
    var /= 1. - self.dep_params**2
    wts = 1. / var
    wts /= wts.sum()

    residmat = []
    for i in range(self.model.num_group):

        expval, _ = cached_means[i]
        stdev = np.sqrt(scale * varfunc(expval))
        resid = (endog[i] - expval) / stdev

        ngrp = len(resid)
        for j1 in range(ngrp):
            for j2 in range(j1):
                residmat.append([resid[j1], resid[j2]])

    residmat = np.array(residmat)

    # Need to minimize this
    def fitfunc(a):
        dif = residmat[:, 0] - (a**designx) * residmat[:, 1]
        return np.dot(dif**2, wts)

    # Left bracket point
    b_lft, f_lft = 0., fitfunc(0.)

    # Center bracket point
    b_ctr, f_ctr = 0.5, fitfunc(0.5)
    while f_ctr > f_lft:
        b_ctr /= 2
        f_ctr = fitfunc(b_ctr)
        if b_ctr < 1e-8:
            self.dep_params = 0
            return

    # Right bracket point
    b_rgt, f_rgt = 0.75, fitfunc(0.75)
    while f_rgt < f_ctr:
        b_rgt = b_rgt + (1. - b_rgt) / 2
        f_rgt = fitfunc(b_rgt)
        if b_rgt > 1. - 1e-6:
            raise ValueError(
                "Autoregressive: unable to find right bracket")

    from scipy.optimize import brent
    self.dep_params = brent(fitfunc, brack=[b_lft, b_ctr, b_rgt])
def covariance_matrix_solve(self, expval, index, stdev, rhs):
    """
    Solves matrix equations of the form `covmat * soln = rhs` and
    returns the values of `soln`, where `covmat` is the covariance
    matrix represented by this class.

    Parameters
    ----------
    expval : array-like
        The expected value of endog for each observed value in the
        group.
    index : int
        The group index.
    stdev : array-like
        The standard deviation of endog for each observation in
        the group.
    rhs : list/tuple of array-like
        A set of right-hand sides; each defines a matrix equation
        to be solved.

    Returns
    -------
    soln : list/tuple of array-like
        The solutions to the matrix equations.

    Notes
    -----
    Returns None if the solver fails.

    Some dependence structures do not use `expval` and/or `index`
    to determine the correlation matrix.  Some families
    (e.g. binomial) do not use the `stdev` parameter when forming
    the covariance matrix.

    If the covariance matrix is singular or not SPD, it is
    projected to the nearest such matrix.  These projection events
    are recorded in the fit_history member of the GEE model.

    Systems of linear equations with the covariance matrix as the
    left hand side (LHS) are solved for different right hand sides
    (RHS); the LHS is only factorized once to save time.

    This is a default implementation, it can be reimplemented in
    subclasses to optimize the linear algebra according to the
    structure of the covariance matrix.
    """
    vmat, is_cor = self.covariance_matrix(expval, index)
    if is_cor:
        vmat *= np.outer(stdev, stdev)

    # Factor the covariance matrix.  If the factorization fails,
    # attempt to condition it into a factorizable matrix.
    threshold = 1e-2
    success = False
    cov_adjust = 0
    for itr in range(20):
        try:
            vco = spl.cho_factor(vmat)
            success = True
            break
        except np.linalg.LinAlgError:
            vmat = cov_nearest(vmat, method=self.cov_nearest_method,
                               threshold=threshold)
            threshold *= 2
            cov_adjust += 1

    self.cov_adjust.append(cov_adjust)

    # Last resort if we still can't factor the covariance matrix.
    if not success:
        warnings.warn("Unable to condition covariance matrix to an SPD "
                      "matrix using cov_nearest", ConvergenceWarning)
        vmat = np.diag(np.diag(vmat))
        vco = spl.cho_factor(vmat)

    soln = [spl.cho_solve(vco, x) for x in rhs]
    return soln
def lowess(endog, exog, frac=2. / 3, it=3):
    """
    LOWESS (Locally Weighted Scatterplot Smoothing)

    A lowess function that outputs smoothed estimates of endog
    at the given exog values from points (exog, endog)

    Parameters
    ----------
    endog : 1-D numpy array
        The y-values of the observed points
    exog : 1-D numpy array
        The x-values of the observed points
    frac : float
        Between 0 and 1. The fraction of the data used
        when estimating each y-value.
    it : int
        The number of residual-based reweightings
        to perform.

    Returns
    -------
    out : numpy array
        A numpy array with two columns. The first column
        is the sorted x values and the second column the
        associated estimated y-values.

    Notes
    -----
    This lowess function implements the algorithm given in the
    reference below using local linear estimates.

    Suppose the input data has N points. The algorithm works by
    estimating the true ``y_i`` by taking the frac*N closest points
    to ``(x_i,y_i)`` based on their x values and estimating ``y_i``
    using a weighted linear regression. The weight for ``(x_j,y_j)``
    is the `_lowess_tricube` function applied to ``|x_i-x_j|``.

    If ``it > 0``, then further weighted local linear regressions
    are performed, where the weights are the same as above
    times the `_lowess_bisquare` function of the residuals. Each iteration
    takes approximately the same amount of time as the original fit,
    so these iterations are expensive. They are most useful when
    the noise has extremely heavy tails, such as Cauchy noise.
    Noise with less heavy-tails, such as t-distributions with ``df > 2``,
    are less problematic. The weights downgrade the influence of points
    with large residuals. In the extreme case, points whose residuals are
    larger than 6 times the median absolute residual are given weight 0.

    Some experimentation is likely required to find a good
    choice of frac and it for a particular dataset.

    References
    ----------
    Cleveland, W.S. (1979) "Robust Locally Weighted Regression
    and Smoothing Scatterplots". Journal of the American Statistical
    Association 74 (368): 829-836.

    Examples
    --------
    The below allows a comparison between how different the fits from
    `lowess` for different values of frac can be.

    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> lowess = sm.nonparametric.lowess
    >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
    >>> y = np.sin(x) + np.random.normal(size=len(x))
    >>> z = lowess(y, x)
    >>> w = lowess(y, x, frac=1./3)

    This gives a similar comparison for when it is 0 vs not.

    >>> import scipy.stats as stats
    >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
    >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x))
    >>> z = lowess(y, x, frac=1./3, it=0)
    >>> w = lowess(y, x, frac=1./3)
    """
    x = exog

    if exog.ndim != 1:
        raise ValueError('exog must be a vector')
    if endog.ndim != 1:
        raise ValueError('endog must be a vector')
    if endog.shape[0] != x.shape[0]:
        raise ValueError('exog and endog must have same length')

    n = exog.shape[0]
    fitted = np.zeros(n)

    k = int(frac * n)

    index_array = np.argsort(exog)
    x_copy = np.array(exog[index_array])  # , dtype='float32')
    y_copy = endog[index_array]

    fitted, weights = _lowess_initial_fit(x_copy, y_copy, k, n)

    for i in range(it):
        _lowess_robustify_fit(x_copy, y_copy, fitted,
                              weights, k, n)

    out = np.array([x_copy, fitted]).T
    out.shape = (n, 2)

    return out
def convolution_filter(x, filt, nsides=2):
    '''
    Linear filtering via convolution. Centered and backward displaced moving
    weighted average.

    Parameters
    ----------
    x : array_like
        data array, 1d or 2d, if 2d then observations in rows
    filt : array_like
        Linear filter coefficients in reverse time-order. Should have the
        same number of dimensions as x though if 1d and ``x`` is 2d will be
        coerced to 2d.
    nsides : int, optional
        If 2, a centered moving average is computed using the filter
        coefficients. If 1, the filter coefficients are for past values
        only. Both methods use scipy.signal.convolve.

    Returns
    -------
    y : ndarray, 2d
        Filtered array, number of columns determined by x and filt. If a
        pandas object is given, a pandas object is returned. The index of
        the return is the exact same as the time period in ``x``

    Notes
    -----
    If nsides == 1, x is filtered ::

        y[n] = filt[0]*x[n-1] + ... + filt[n_filt-1]*x[n-n_filt]

    where n_filt is len(filt).

    If nsides == 2, x is filtered around lag 0 ::

        y[n] = filt[0]*x[n - n_filt/2] + ... + filt[n_filt / 2] * x[n]
               + ... + x[n + n_filt/2]

    where n_filt is len(filt). If n_filt is even, then more of the filter
    is forward in time than backward.

    If filt is 1d or (nlags,1) one lag polynomial is applied to all
    variables (columns of x). If filt is 2d, (nlags, nvars) each series is
    independently filtered with its own lag polynomial, uses loop over
    nvar. This is different than the usual 2d vs 2d convolution.

    Filtering is done with scipy.signal.convolve, so it will be reasonably
    fast for medium sized data. For large data fft convolution would be
    faster.
    '''
    # for nsides shift the index instead of using 0 for 0 lag this
    # allows correct handling of NaNs
    if nsides == 1:
        trim_head = len(filt) - 1
        trim_tail = None
    elif nsides == 2:
        trim_head = int(np.ceil(len(filt) / 2.) - 1) or None
        trim_tail = int(np.ceil(len(filt) / 2.) - len(filt) % 2) or None
    else:  # pragma : no cover
        raise ValueError("nsides must be 1 or 2")

    pw = PandasWrapper(x)
    x = array_like(x, 'x', maxdim=2)
    filt = array_like(filt, 'filt', ndim=x.ndim)

    if filt.ndim == 1 or min(filt.shape) == 1:
        result = signal.convolve(x, filt, mode='valid')
    elif filt.ndim == 2:
        nlags = filt.shape[0]
        nvar = x.shape[1]
        result = np.zeros((x.shape[0] - nlags + 1, nvar))
        if nsides == 2:
            for i in range(nvar):
                # could also use np.convolve, but easier for switching
                # to fft
                result[:, i] = signal.convolve(x[:, i], filt[:, i],
                                               mode='valid')
        elif nsides == 1:
            for i in range(nvar):
                result[:, i] = signal.convolve(x[:, i],
                                               np.r_[0, filt[:, i]],
                                               mode='valid')
    result = _pad_nans(result, trim_head, trim_tail)
    return pw.wrap(result)
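# Usage sketch: a centered 3-term moving average; endpoints without a full
# window come back as NaN, and for a linear sequence the interior values
# reproduce the input.
import numpy as np

x = np.arange(10.0)
filt = np.ones(3) / 3
smoothed = convolution_filter(x, filt, nsides=2)
print(smoothed)   # [nan, 1., 2., ..., 8., nan]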
def summary(self, yname=None, xname=None, title=0, alpha=.05,
            returns='text', model_info=None):
    """
    Parameters
    ----------
    yname : string
        optional, Default is `Y`
    xname : list of strings
        optional, Default is `X.#` for # in p the number of regressors
    Confidence interval : (0,1) not implemented
    title : string
        optional, Default is 'Generalized linear model'
    returns : string
        'text', 'table', 'csv', 'latex', 'html'

    Returns
    -------
    Default :
        returns='print'
            Prints the summarized results

    Option :
        returns='text'
            Prints the summarized results

    Option :
        returns='table'
            SimpleTable instance : summarizing the fit of a linear model.

    Option :
        returns='csv'
            returns a string of csv of the results, to import into a
            spreadsheet

    Option :
        returns='latex'
            Not implemented yet

    Option :
        returns='HTML'
            Not implemented yet

    Examples (needs updating)
    --------
    >>> import statsmodels as sm
    >>> data = sm.datasets.longley.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> ols_results = sm.OLS(data.endog, data.exog).results
    >>> print ols_results.summary()
    ...

    Notes
    -----
    conf_int calculated from normal dist.
    """
    import time as time

    # TODO: make sure all self.model.__class__.__name__ are listed
    model_types = {'OLS': 'Ordinary least squares',
                   'GLS': 'Generalized least squares',
                   'GLSAR': 'Generalized least squares with AR(p)',
                   'WLS': 'Weighted least squares',
                   'RLM': 'Robust linear model',
                   'GLM': 'Generalized linear model'}
    model_methods = {'OLS': 'Least Squares',
                     'GLS': 'Least Squares',
                     'GLSAR': 'Least Squares',
                     'WLS': 'Least Squares',
                     'RLM': '?',
                     'GLM': '?'}
    if title == 0:
        title = model_types[self.model.__class__.__name__]
    if yname is None:
        try:
            yname = self.model.endog_names
        except AttributeError:
            yname = 'y'
    if xname is None:
        try:
            xname = self.model.exog_names
        except AttributeError:
            xname = ['var_%d' % i for i in range(len(self.params))]
    time_now = time.localtime()
    time_of_day = [time.strftime("%H:%M:%S", time_now)]
    date = time.strftime("%a, %d %b %Y", time_now)
    modeltype = self.model.__class__.__name__
    # dist_family = self.model.family.__class__.__name__
    nobs = self.nobs
    df_model = self.df_model
    df_resid = self.df_resid

    # General part of the summary table, Applicable to all? models
    # ------------------------------------------------------------
    # TODO: define this generically, overwrite in model classes
    # replace definition of stubs data by single list
    # e.g.
    gen_left = [('Model type:', [modeltype]),
                ('Date:', [date]),
                ('Dependent Variable:', yname),  # What happens with multiple names?
                ('df model', [df_model])
                ]
    gen_stubs_left, gen_data_left = zip_longest(*gen_left)  # transpose row col

    gen_title = title
    gen_header = None
    ## gen_stubs_left = ('Model type:',
    ##                   'Date:',
    ##                   'Dependent Variable:',
    ##                   'df model'
    ##                   )
    ## gen_data_left = [[modeltype],
    ##                  [date],
    ##                  yname,  # What happens with multiple names?
    ##                  [df_model]
    ##                  ]
    gen_table_left = SimpleTable(gen_data_left,
                                 gen_header,
                                 gen_stubs_left,
                                 title=gen_title,
                                 txt_fmt=gen_fmt
                                 )

    gen_stubs_right = ('Method:',
                       'Time:',
                       'Number of Obs:',
                       'df resid')
    gen_data_right = ([modeltype],  # was dist family need to look at more
                      time_of_day,
                      [nobs],
                      [df_resid]
                      )
    gen_table_right = SimpleTable(gen_data_right,
                                  gen_header,
                                  gen_stubs_right,
                                  title=gen_title,
                                  txt_fmt=gen_fmt)
    gen_table_left.extend_right(gen_table_right)
    general_table = gen_table_left

    # Parameters part of the summary table
    # ------------------------------------
    # Note: this is not necessary since we standardized names,
    # only t versus normal
    tstats = {'OLS': self.t(),
              'GLS': self.t(),
              'GLSAR': self.t(),
              'WLS': self.t(),
              'RLM': self.t(),
              'GLM': self.t()}
    prob_stats = {'OLS': self.pvalues,
                  'GLS': self.pvalues,
                  'GLSAR': self.pvalues,
                  'WLS': self.pvalues,
                  'RLM': self.pvalues,
                  'GLM': self.pvalues
                  }
    # Dictionary to store the header names for the parameter part of the
    # summary table. look up by modeltype
    alp = str((1 - alpha) * 100) + '%'
    param_header = {
        'OLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLSAR': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'WLS': ['coef', 'std err', 't', 'P>|t|', alp + ' Conf. Interval'],
        'GLM': ['coef', 'std err', 't', 'P>|t|',
                alp + ' Conf. Interval'],  # glm uses t-distribution
        'RLM': ['coef', 'std err', 'z', 'P>|z|',
                alp + ' Conf. Interval']  # check z
        }
    params_stubs = xname
    params = self.params
    conf_int = self.conf_int(alpha)
    std_err = self.bse
    exog_len = lrange(len(xname))
    tstat = tstats[modeltype]
    prob_stat = prob_stats[modeltype]

    # Simpletable should be able to handle the formatting
    params_data = lzip(["%#6.4g" % (params[i]) for i in exog_len],
                       ["%#6.4f" % (std_err[i]) for i in exog_len],
                       ["%#6.4f" % (tstat[i]) for i in exog_len],
                       ["%#6.4f" % (prob_stat[i]) for i in exog_len],
                       ["(%#5g, %#5g)" % tuple(conf_int[i]) for i in
                        exog_len])
    parameter_table = SimpleTable(params_data,
                                  param_header[modeltype],
                                  params_stubs,
                                  title=None,
                                  txt_fmt=fmt_2,  # gen_fmt,
                                  )

    # special table
    # -------------
    # TODO: exists in linear_model, what about other models
    # residual diagnostics

    # output options
    # --------------
    # TODO: JP the rest needs to be fixed, similar to summary in
    # linear_model

    def ols_printer():
        """
        print summary table for ols models
        """
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    def ols_to_csv():
        """
        exports ols summary data to csv
        """
        pass

    def glm_printer():
        table = str(general_table) + '\n' + str(parameter_table)
        return table

    printers = {'OLS': ols_printer,
                'GLM': glm_printer}

    if returns == 'print':
        try:
            return printers[modeltype]()
        except KeyError:
            return printers['OLS']()
def summary_params_2d(result, extras=None, endog_names=None,
                      exog_names=None, title=None):
    '''create summary table of regression parameters with several equations

    This allows interleaving of parameters with bse and/or tvalues

    Parameters
    ----------
    result : result instance
        the result instance with params and attributes in extras
    extras : list of strings
        additional attributes to add below a parameter row, e.g. bse or
        tvalues
    endog_names : None or list of strings
        names for rows of the parameter array (multivariate endog)
    exog_names : None or list of strings
        names for columns of the parameter array (exog)
    alpha : float
        level for confidence intervals, default 0.95
    title : None or string

    Returns
    -------
    tables : list of SimpleTable
        this contains a list of all separate Subtables
    table_all : SimpleTable
        the merged table with results concatenated for each row of the
        parameter array

    '''
    if endog_names is None:
        # TODO: note the [1:] is specific to current MNLogit
        endog_names = ['endog_%d' % i for i in
                       np.unique(result.model.endog)[1:]]
    if exog_names is None:
        exog_names = ['var%d' % i for i in range(len(result.params))]

    # TODO: check formatting options with different values
    # res_params = [['%10.4f'%item for item in row] for row in
    #               result.params]
    res_params = [[forg(item, prec=4) for item in row] for row in
                  result.params]
    if extras:  # not None or non-empty
        # maybe this should be a simple triple loop instead of list
        # comprehension?
        # below_list = [[['%10s' % ('('+('%10.3f'%v).strip()+')')
        extras_list = [[['%10s' % ('(' + forg(v, prec=3).strip() + ')')
                         for v in col]
                        for col in getattr(result, what)]
                       for what in extras]
        data = lzip(res_params, *extras_list)
        data = [i for j in data for i in j]  # flatten
        stubs = lzip(endog_names, *[[''] * len(endog_names)] * len(extras))
        stubs = [i for j in stubs for i in j]  # flatten
        # return SimpleTable(data, headers=exog_names, stubs=stubs)
    else:
        data = res_params
        stubs = endog_names
        # return SimpleTable(data, headers=exog_names, stubs=stubs,
        #                    data_fmts=['%10.4f'])

    import copy
    txt_fmt = copy.deepcopy(fmt_params)
    txt_fmt.update(dict(data_fmts=["%s"] * result.params.shape[1]))
    return SimpleTable(data, headers=exog_names,
                       stubs=stubs,
                       title=title,
                       # data_fmts=["%s"]),
                       txt_fmt=txt_fmt)
def test_rmse(self):
    results = self.res1.results
    for i in range(len(results)):
        assert_almost_equal(results[i].mse_resid**.5,
                            getattr(self.res2, 'rmse_' + str(i + 1)),
                            DECIMAL_6)
def test_cum_irf_stderr(self):
    stderr = self.irf.cum_effect_stderr(orth=False)
    for i in range(1, 1 + len(self.lut.cum_irf_stderr)):
        assert_almost_equal(np.round(stderr[i], 3),
                            self.lut.cum_irf_stderr[i - 1])
def initialize(self, model):
    """
    Called on the first call to update

    `ilabels` is a list of n_i x n_i matrices containing integer
    labels that correspond to specific correlation parameters.  Two
    elements of ilabels[i] with the same label share identical
    variance components.

    `designx` is a matrix, with each row containing dummy variables
    indicating which variance components are associated with the
    corresponding element of QY.
    """
    super(Nested, self).initialize(model)

    if self.model.weights is not None:
        warnings.warn("weights not implemented for nested cov_struct, "
                      "using unweighted covariance estimate",
                      NotImplementedWarning)

    # A bit of processing of the nest data
    id_matrix = np.asarray(self.model.dep_data)
    if id_matrix.ndim == 1:
        id_matrix = id_matrix[:, None]
    self.id_matrix = id_matrix

    endog = self.model.endog_li
    designx, ilabels = [], []

    # The number of layers of nesting
    n_nest = self.id_matrix.shape[1]

    for i in range(self.model.num_group):
        ngrp = len(endog[i])
        glab = self.model.group_labels[i]
        rix = self.model.group_indices[glab]

        # Determine the number of common variance components
        # shared by each pair of observations.
        ix1, ix2 = np.tril_indices(ngrp, -1)
        ncm = (self.id_matrix[rix[ix1], :] ==
               self.id_matrix[rix[ix2], :]).sum(1)

        # This is used to construct the working correlation matrix.
        ilabel = np.zeros((ngrp, ngrp), dtype=np.int32)
        ilabel[ix1, ix2] = ncm + 1
        ilabel[ix2, ix1] = ncm + 1
        ilabels.append(ilabel)

        # This is used to estimate the variance components.
        dsx = np.zeros((len(ix1), n_nest + 1), dtype=np.float64)
        dsx[:, 0] = 1
        for k in np.unique(ncm):
            ii = np.flatnonzero(ncm == k)
            dsx[ii, 1:k + 1] = 1
        designx.append(dsx)

    self.designx = np.concatenate(designx, axis=0)
    self.ilabels = ilabels

    svd = np.linalg.svd(self.designx, full_matrices=False)
    self.designx_u = svd[0]
    self.designx_s = svd[1]
    self.designx_v = svd[2].T
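# Illustrative sketch of the pairwise-label comparison above: for a toy
# id_matrix with two nesting levels, count how many levels each pair of
# observations shares (this is exactly the `ncm` computation).
import numpy as np

id_matrix = np.array([[1, 1],
                      [1, 1],
                      [1, 2],
                      [2, 3]])  # 4 observations, 2 nesting levels
ix1, ix2 = np.tril_indices(id_matrix.shape[0], -1)
ncm = (id_matrix[ix1, :] == id_matrix[ix2, :]).sum(1)
# obs 0 and 1 share both levels, obs 0 and 2 share only the first, ...
print(list(zip(ix1.tolist(), ix2.tolist(), ncm.tolist())))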
def cdf(self, endog_predict=None, exog_predict=None): r""" Cumulative distribution function for the conditional density. Parameters ---------- endog_predict: array_like, optional The evaluation dependent variables at which the cdf is estimated. If not specified the training dependent variables are used. exog_predict: array_like, optional The evaluation independent variables at which the cdf is estimated. If not specified the training independent variables are used. Returns ------- cdf_est: array_like The estimate of the cdf. Notes ----- For more details on the estimation see [2]_, and p.181 in [1]_. The multivariate conditional CDF for mixed data (continuous and ordered/unordered discrete) is estimated by: .. math:: F(y|x)=\frac{n^{-1}\sum_{i=1}^{n}G(\frac{y-Y_{i}}{h_{0}}) W_{h}(X_{i},x)}{\widehat{\mu}(x)} where G() is the product kernel CDF estimator for the dependent (y) variable(s) and W() is the product kernel CDF estimator for the independent variable(s). References ---------- .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice. Princeton University Press. (2007) .. [2] Liu, R., Yang, L. "Kernel estimation of multivariate cumulative distribution function." Journal of Nonparametric Statistics (2008) """ if endog_predict is None: endog_predict = self.endog else: endog_predict = _adjust_shape(endog_predict, self.k_dep) if exog_predict is None: exog_predict = self.exog else: exog_predict = _adjust_shape(exog_predict, self.k_indep) N_data_predict = np.shape(exog_predict)[0] cdf_est = np.empty(N_data_predict) for i in range(N_data_predict): mu_x = gpke(self.bw[self.k_dep:], data=self.exog, data_predict=exog_predict[i, :], var_type=self.indep_type) / self.nobs mu_x = np.squeeze(mu_x) cdf_endog = gpke(self.bw[0:self.k_dep], data=self.endog, data_predict=endog_predict[i, :], var_type=self.dep_type, ckertype="gaussian_cdf", ukertype="aitchisonaitken_cdf", okertype='wangryzin_cdf', tosum=False) cdf_exog = gpke(self.bw[self.k_dep:], data=self.exog, data_predict=exog_predict[i, :], var_type=self.indep_type, tosum=False) S = (cdf_endog * cdf_exog).sum(axis=0) cdf_est[i] = S / (self.nobs * mu_x) return cdf_est
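# Hedged usage sketch, assuming this cdf is the method of
# KDEMultivariateConditional; the data are synthetic.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
x = np.random.randn(400)
y = 0.5 * x + np.random.randn(400)
dens = sm.nonparametric.KDEMultivariateConditional(
    endog=[y], exog=[x], dep_type='c', indep_type='c',
    bw='normal_reference')
# P(Y <= 0 | X = 0), roughly 0.5 for this symmetric design
print(dens.cdf(endog_predict=[0.0], exog_predict=[0.0]))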
def imse(self, bw):
    r"""
    Returns the Integrated Mean Square Error for the unconditional KDE.

    Parameters
    ----------
    bw : array_like
        The bandwidth parameter(s).

    Returns
    -------
    CV : float
        The cross-validation objective function.

    Notes
    -----
    See p. 27 in [1]_ for details.  For how to handle the multivariate
    estimation with mixed data types, see p. 6 in [2]_.

    The formula for the cross-validation objective function is:

    .. math:: CV=\frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{N}
        \bar{K}_{h}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
        \sum_{j=1,j\neq i}^{N}K_{h}(X_{i},X_{j})

    Where :math:`\bar{K}_{h}` is the multivariate product convolution
    kernel (consult [2]_ for mixed data types).

    References
    ----------
    .. [1] Racine, J., Li, Q. Nonparametric econometrics: theory and
        practice. Princeton University Press. (2007)
    .. [2] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
        with Categorical and Continuous Data." Working Paper. (2000)
    """
    # F = 0
    # for i in range(self.nobs):
    #     k_bar_sum = gpke(bw, data=-self.data,
    #                      data_predict=-self.data[i, :],
    #                      var_type=self.var_type,
    #                      ckertype='gauss_convolution',
    #                      okertype='wangryzin_convolution',
    #                      ukertype='aitchisonaitken_convolution')
    #     F += k_bar_sum
    # # there is a + because loo_likelihood returns the negative
    # return (F / self.nobs**2 + self.loo_likelihood(bw) *
    #         2 / ((self.nobs) * (self.nobs - 1)))

    # The code below is equivalent to the commented-out code above.  It's
    # about 20% faster due to some code being moved outside the for-loops
    # and shared by gpke() and loo_likelihood().
    F = 0
    kertypes = dict(c=kernels.gaussian_convolution,
                    o=kernels.wang_ryzin_convolution,
                    u=kernels.aitchison_aitken_convolution)
    nobs = self.nobs
    data = -self.data
    var_type = self.var_type
    ix_cont = np.array([c == 'c' for c in var_type])
    _bw_cont_product = bw[ix_cont].prod()
    Kval = np.empty(data.shape)
    for i in range(nobs):
        for ii, vtype in enumerate(var_type):
            Kval[:, ii] = kertypes[vtype](bw[ii], data[:, ii], data[i, ii])

        dens = Kval.prod(axis=1) / _bw_cont_product
        k_bar_sum = dens.sum(axis=0)
        F += k_bar_sum  # sum of prod kernel over nobs

    kertypes = dict(c=kernels.gaussian,
                    o=kernels.wang_ryzin,
                    u=kernels.aitchison_aitken)
    LOO = LeaveOneOut(self.data)
    L = 0  # leave-one-out likelihood
    Kval = np.empty((data.shape[0] - 1, data.shape[1]))
    for i, X_not_i in enumerate(LOO):
        for ii, vtype in enumerate(var_type):
            Kval[:, ii] = kertypes[vtype](bw[ii], -X_not_i[:, ii],
                                          data[i, ii])
        dens = Kval.prod(axis=1) / _bw_cont_product
        L += dens.sum(axis=0)

    # CV objective function, eq. (2.4) of Ref. [2]
    return (F / nobs**2 - 2 * L / (nobs * (nobs - 1)))
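# imse is the objective minimized by least-squares cross-validation; a
# hedged sketch of triggering it through the public bandwidth selection
# (bw='cv_ls') on synthetic data.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
data = np.random.randn(100)
kde = sm.nonparametric.KDEMultivariate(data=[data], var_type='c',
                                       bw='cv_ls')
print(kde.bw)  # bandwidth chosen by minimizing the CV objective above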
def regression(x_name, y_name, maxlag, data=data_input):
    print("start regression")
    print(x_name)
    print(y_name)
    v = 0.1  # learning rate, applied after the first round
    data.printSchema()
    dataFrame = data
    input_feature_name = []
    for lagnumber in range(1, maxlag + 1):
        newname = "{}_t-{}".format(x_name, lagnumber)
        input_feature_name.append(newname)
    print("input_feature_name are")
    print(input_feature_name)
    assembler_for_lag = VectorAssembler(inputCols=input_feature_name,
                                        outputCol="features")
    dt = DecisionTreeRegressor(featuresCol="features",
                               labelCol='{}'.format(y_name),
                               maxDepth=6, minInstancesPerNode=10, seed=0)
    pipeline = Pipeline(stages=[assembler_for_lag, dt])
    model = pipeline.fit(dataFrame)
    predictions = model.transform(dataFrame)
    # predictions is now the working dataFrame instead of the original one
    predictions = predictions.withColumnRenamed(
        "prediction", 'predicted_{}'.format(y_name))
    evaluator = RegressionEvaluator(
        labelCol='{}'.format(y_name),
        predictionCol='predicted_{}'.format(y_name),
        metricName="mse")
    mse = evaluator.evaluate(predictions)
    print("Mean Squared Error (MSE) on test data = %g" % mse)
    featureImportances = model.stages[1].featureImportances
    print("Feature Importance")
    print(featureImportances)
    y_hat = predictions.select('predicted_{}'.format(y_name))
    y_hat = y_hat.withColumn("yid", monotonically_increasing_id())
    # compute the residual y - y_hat; the residual becomes the y in the
    # next round of the loop
    if y_name == x_name:
        # learning rate is not applied in model 0
        # dataFrame = dataFrame.join(y_hat, col("id") == (col("yid")+maxlag))
        dataFrame = dataFrame.join(y_hat, col("id") == col("yid"))
        residual = (dataFrame['{}'.format(y_name)] -
                    dataFrame['predicted_{}'.format(y_name)])
        dataFrame = dataFrame.withColumn(
            "{}res{}".format(y_name, x_name), residual)
        dataFrame = dataFrame.drop("yid")
        return_col = dataFrame.select("{}res{}".format(y_name, x_name))
        print("still round 1")
    else:
        # apply the learning rate
        dataFrame = dataFrame.join(y_hat, col("id") == col("yid"))
        dataFrame = dataFrame.withColumn(
            'v_predicted_{}'.format(y_name),
            col('predicted_{}'.format(y_name)) * v)
        residual = (dataFrame['{}'.format(y_name)] -
                    dataFrame['v_predicted_{}'.format(y_name)])
        dataFrame = dataFrame.withColumn(
            "{}res{}".format(y_name, x_name), residual)
        dataFrame = dataFrame.drop("yid")
        return_col = dataFrame.select("{}res{}".format(y_name, x_name))
        print("after round 1")
    print("data for next step is ")
    return return_col, mse, featureImportances
def arma_acovf(ar, ma, nobs=10, sigma2=1, dtype=None):
    """
    Theoretical autocovariance function of ARMA process

    Parameters
    ----------
    ar : array_like, 1d
        coefficients for the autoregressive lag polynomial, including the
        zero lag
    ma : array_like, 1d
        coefficients for the moving-average lag polynomial, including the
        zero lag
    nobs : int
        number of terms (lags plus zero lag) to include in returned acovf
    sigma2 : float
        Variance of the innovation term.
    dtype : np.dtype, optional
        Data type of the returned autocovariances.  If None, a common type
        is inferred from ar, ma and sigma2.

    Returns
    -------
    acovf : array
        autocovariance of ARMA process given by ar, ma

    See Also
    --------
    arma_acf
    acovf

    References
    ----------
    .. [*] Brockwell, Peter J., and Richard A. Davis. 2009. Time Series:
        Theory and Methods. 2nd ed. 1991. New York, NY: Springer.
    """
    if dtype is None:
        dtype = np.common_type(np.array(ar), np.array(ma), np.array(sigma2))

    p = len(ar) - 1
    q = len(ma) - 1
    m = max(p, q) + 1

    if sigma2.real < 0:
        raise ValueError('Must have positive innovation variance.')

    # Short-circuit for trivial corner-case
    if p == q == 0:
        out = np.zeros(nobs, dtype=dtype)
        out[0] = sigma2
        return out

    # Get the moving average representation coefficients that we need
    ma_coeffs = arma2ma(ar, ma, lags=m)

    # Solve for the first m autocovariances via the linear system
    # described by (BD, eq. 3.3.8)
    A = np.zeros((m, m), dtype=dtype)
    b = np.zeros((m, 1), dtype=dtype)
    # We need a zero-right-padded version of ar params
    tmp_ar = np.zeros(m, dtype=dtype)
    tmp_ar[:p + 1] = ar
    for k in range(m):
        A[k, :(k + 1)] = tmp_ar[:(k + 1)][::-1]
        A[k, 1:m - k] += tmp_ar[(k + 1):m]
        b[k] = sigma2 * np.dot(ma[k:q + 1], ma_coeffs[:max((q + 1 - k), 0)])
    acovf = np.zeros(max(nobs, m), dtype=dtype)
    acovf[:m] = np.linalg.solve(A, b)[:, 0]

    # Iteratively apply (BD, eq. 3.3.9) to solve for remaining
    # autocovariances
    if nobs > m:
        zi = signal.lfiltic([1], ar, acovf[:m:][::-1])
        acovf[m:] = signal.lfilter([1], ar,
                                   np.zeros(nobs - m, dtype=dtype),
                                   zi=zi)[0]

    return acovf[:nobs]
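# Quick sanity check of arma_acovf against the closed-form AR(1)
# autocovariance gamma_k = sigma2 * phi**k / (1 - phi**2); the lag
# polynomials follow the zero-lag-included convention documented above.
import numpy as np

phi, sigma2 = 0.5, 2.0
acovf = arma_acovf(ar=[1, -phi], ma=[1], nobs=5, sigma2=sigma2)
expected = sigma2 * phi ** np.arange(5) / (1 - phi ** 2)
np.testing.assert_allclose(acovf, expected)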
def test_zero_collinear(self): # not completely generic yet if isinstance(self.results.model, (sm.GEE)): pytest.skip('Not completely generic yet') use_start_params = not isinstance(self.results.model, (sm.RLM, sm.OLS, sm.WLS, sm.GLM)) self.use_start_params = use_start_params # attach for _get_constrained keep_index = list(range(self.results.model.exog.shape[1])) # index for params might include extra params keep_index_p = list(range(self.results.params.shape[0])) drop_index = [] for i in drop_index: del keep_index[i] del keep_index_p[i] keep_index_p = list(range(self.results.params.shape[0])) # create collinear model mod2 = self.results.model mod_cls = mod2.__class__ init_kwds = mod2._get_init_kwds() ex = np.column_stack((mod2.exog, mod2.exog)) mod = mod_cls(mod2.endog, ex, **init_kwds) keep_index = list(range(self.results.model.exog.shape[1])) keep_index_p = list(range(self.results.model.exog.shape[1])) k_vars = ex.shape[1] k_extra = 0 if hasattr(mod, 'k_extra') and mod.k_extra > 0: keep_index_p += list(range(k_vars, k_vars + mod.k_extra)) k_extra = mod.k_extra cov_types = ['nonrobust', 'HC0'] for cov_type in cov_types: # Note: for RLM we only check default when cov_type is 'nonrobust' # cov_type is otherwise ignored if cov_type != 'nonrobust' and (isinstance(self.results.model, sm.RLM)): return if use_start_params: start_params = np.zeros(k_vars + k_extra) method = self.results.mle_settings['optimizer'] # string in `method` is not mutable, so no need for copy sp = self.results.mle_settings['start_params'].copy() if self.transform_index is not None: # work around internal transform_params, currently in NB sp[self.transform_index] = np.exp(sp[self.transform_index]) start_params[keep_index_p] = sp res1 = mod._fit_collinear(cov_type=cov_type, start_params=start_params, method=method, disp=0) if cov_type != 'nonrobust': # reestimate original model to get robust cov res2 = self.results.model.fit(cov_type=cov_type, start_params=sp, method=method, disp=0) else: # more special casing RLM if (isinstance(self.results.model, (sm.RLM))): res1 = mod._fit_collinear() else: res1 = mod._fit_collinear(cov_type=cov_type) if cov_type != 'nonrobust': # reestimate original model to get robust cov res2 = self.results.model.fit(cov_type=cov_type) if cov_type == 'nonrobust': res2 = self.results # check fit optimizer arguments, if mle_settings is available if hasattr(res2, 'mle_settings'): assert_equal( res1.results_constrained.mle_settings['optimizer'], res2.mle_settings['optimizer']) if 'start_params' in res2.mle_settings: spc = res1.results_constrained.mle_settings['start_params'] assert_allclose(spc, res2.mle_settings['start_params'], rtol=1e-10, atol=1e-20) assert_equal(res1.mle_settings['optimizer'], res2.mle_settings['optimizer']) assert_allclose(res1.mle_settings['start_params'], res2.mle_settings['start_params'], rtol=1e-10, atol=1e-20) # Poisson has reduced precision in params, difficult optimization? 
assert_allclose(res1.params[keep_index_p], res2.params, rtol=1e-6) assert_allclose(res1.params[drop_index], 0, rtol=1e-10) assert_allclose(res1.bse[keep_index_p], res2.bse, rtol=1e-8) assert_allclose(res1.bse[drop_index], 0, rtol=1e-10) assert_allclose(res1.tvalues[keep_index_p], res2.tvalues, rtol=5e-8) assert_allclose(res1.pvalues[keep_index_p], res2.pvalues, rtol=1e-6, atol=1e-30) if hasattr(res1, 'resid'): # discrete models, Logit don't have `resid` yet assert_allclose(res1.resid, res2.resid, rtol=1e-5, atol=1e-10) ex = res1.model.exog.mean(0) predicted1 = res1.predict(ex, **self.predict_kwds) predicted2 = res2.predict(ex[keep_index], **self.predict_kwds) assert_allclose(predicted1, predicted2, rtol=1e-8, atol=1e-11) ex = res1.model.exog[:5] kwds = getattr(self, 'predict_kwds_5', {}) predicted1 = res1.predict(ex, **kwds) predicted2 = res2.predict(ex[:, keep_index], **kwds) assert_allclose(predicted1, predicted2, rtol=1e-8, atol=1e-11)
def kdesum(x, axis=0):
    """Return, for each observation i, the sum of differences x[i] - x."""
    return np.asarray([np.sum(x[i] - x, axis) for i in range(len(x))])
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observation in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a
          single array.
        * 'sep' : returns a tuple (original array, lagged values). The
          original array is truncated to have the same number of rows as
          the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can
    only be 'forward' or 'both' since it is not possible to consistently
    extend index values.
    """
    # TODO: allow list of lags additional to maxlag
    is_pandas = _is_using_pandas(x, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'backward' when used "
                         "on Series or DataFrames")

    xa = np.asarray(x)
    dropidx = 0
    if xa.ndim == 1:
        xa = xa[:, None]
    nobs, nvar = xa.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = xa

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str
                            for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
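# Hedged pandas example for use_pandas=True; the '.L.' column-name
# convention comes from the code above.
import pandas as pd

s = pd.Series([1., 2., 3., 4.], name='y')
print(lagmat(s, maxlag=2, trim='both', original='in', use_pandas=True))
#      y  y.L.1  y.L.2
# 2  3.0    2.0    1.0
# 3  4.0    3.0    2.0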
def test_llf(self):
    results = self.res1.results
    assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2)
    for i in range(len(results)):
        assert_almost_equal(results[i].llf,
                            getattr(self.res2, 'llf_%d' % (i + 1)),
                            DECIMAL_2)
def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : string, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list of strings, optional
        Must have same length as the number of results. If the names are
        not unique, a roman number will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add
        a (nested) info_dict with the model name as the key.
        Example: `info_dict = {"N":..., "R2": ..., "OLS":{"R2":...}}`
        would only show `R2` for OLS regression models, but additionally
        `N` for all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list of strings, optional
        list of names of the regressors in the desired order. All
        regressors not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        If True, only regressors named in regressor_order are included;
        if False, regressors not in regressor_order are appended to the
        end of the table.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format)
            for x in results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for i in range(len(cols)):
        cols[i].columns = [colnames[i]]

    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    summ = reduce(merg, cols)

    if regressor_order:
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        unordered = [x for x in varnames if x not in regressor_order + ['']]
        order = ordered + list(np.unique(unordered))

        f = lambda idx: sum([[x + 'coef', x + 'stde'] for x in idx], [])
        summ.index = f(pd.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]
        if drop_omitted:
            summ = summ.loc[regressor_order]

    idx = pd.Series(lrange(summ.shape[0])) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None))
                for x in results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]
    merg = lambda x, y: x.merge(y, how='outer', right_index=True,
                                left_index=True)
    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, *** p<.01')
    return smry
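# Hedged usage sketch for summary_col: two OLS fits on synthetic data,
# shown side by side with stars and an extra info row.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
x = sm.add_constant(np.random.randn(100, 2))
y = x @ np.array([1.0, 0.5, -0.3]) + np.random.randn(100)
res_full = sm.OLS(y, x).fit()
res_small = sm.OLS(y, x[:, :2]).fit()
print(summary_col([res_full, res_small], stars=True,
                  model_names=['full', 'restricted'],
                  info_dict={'N': lambda r: '%d' % int(r.nobs)}))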
def test_rsquared(self):
    results = self.res1.results
    for i in range(len(results)):
        assert_almost_equal(results[i].rsquared,
                            getattr(self.res2, 'rsquared_%d' % (i + 1)),
                            DECIMAL_3)
def _hierarchical_split(count_dict, horizontal=True, gap=0.05):
    """
    Split a square in a hierarchical way given a contingency table.

    Hierarchically split the unit square in alternate directions
    in proportion to the subdivision contained in the contingency table
    count_dict.  This is the function that actually performs the tiling
    for the creation of the mosaic plot.  If the gap array has been
    specified it will insert a corresponding amount of space (proportional
    to the unit length), while retaining the proportionality of the tiles.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the contingency table.  Each category should
        contain a non-negative number with a tuple as index.  It expects
        that all the combinations of keys are represented; if that is not
        true, the missing values are automatically considered as 0.
    horizontal : bool
        The starting direction of the split (by default along the
        horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.  If the length
        of the given array is less than the number of subcategories (or if
        it's a single number) it will be extended with exponentially
        decreasing gaps.

    Returns
    -------
    base_rect : dict
        A dictionary containing the result of the split.  To each key is
        associated a 4-tuple of coordinates that are required to create
        the corresponding rectangle:

            0 - x position of the lower left corner
            1 - y position of the lower left corner
            2 - width of the rectangle
            3 - height of the rectangle
    """
    # this is the unit square that we are going to divide
    base_rect = OrderedDict([(tuple(), (0, 0, 1, 1))])
    # get the list of each possible value for each level
    categories_levels = _categories_level(list(iterkeys(count_dict)))
    L = len(categories_levels)

    # recreate the gaps vector starting from an int
    if not np.iterable(gap):
        gap = [gap / 1.5**idx for idx in range(L)]
    # extend if it's too short
    if len(gap) < L:
        last = gap[-1]
        gap = list(gap) + [last / 1.5**idx for idx in range(L)]
    # trim if it's too long
    gap = gap[:L]
    # put the count dictionary in order for the keys
    # this will allow some code simplification
    count_ordered = OrderedDict([(k, count_dict[k])
                                 for k in list(product(*categories_levels))])
    for cat_idx, cat_enum in enumerate(categories_levels):
        # get the partial key up to the actual level
        base_keys = list(product(*categories_levels[:cat_idx]))
        for key in base_keys:
            # for each partial key and each value, calculate how many
            # observations we have in the counting dictionary
            part_count = [_reduce_dict(count_ordered, key + (partial,))
                          for partial in cat_enum]
            # reduce the gap for subsequent levels
            new_gap = gap[cat_idx]
            # split the given subkeys in the rectangle dictionary
            base_rect = _key_splitting(base_rect, cat_enum, part_count, key,
                                       horizontal, new_gap)
        horizontal = not horizontal
    return base_rect
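# Hedged sketch: split the unit square for a 2x2 contingency table the way
# mosaic() does internally via the helper above; every key combination
# must be present.
counts = {('a', 'x'): 10, ('a', 'y'): 30,
          ('b', 'x'): 40, ('b', 'y'): 20}
rects = _hierarchical_split(counts, horizontal=True, gap=0.05)
for key, (x, y, w, h) in rects.items():
    print(key, round(x, 3), round(y, 3), round(w, 3), round(h, 3))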
def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
        max_iter=1000, p_tol=1e-6, **kwargs):
    '''Solve by Iterative Weighted Least Squares

    Parameters
    ----------
    q : float
        Quantile must be between 0 and 1
    vcov : string, method used to calculate the variance-covariance
        matrix of the parameters. Default is ``robust``:

        - robust : heteroskedasticity robust standard errors (as
          suggested in Greene 6th edition)
        - iid : iid errors (as in Stata 12)

    kernel : string, kernel to use in the kernel density estimation for
        the asymptotic covariance matrix:

        - epa: Epanechnikov
        - cos: Cosine
        - gau: Gaussian
        - par: Parzen
        - biw: Biweight

    bandwidth : string, Bandwidth selection method in kernel density
        estimation for asymptotic covariance estimate (full references
        in QuantReg docstring):

        - hsheather: Hall-Sheather (1988)
        - bofinger: Bofinger (1975)
        - chamberlain: Chamberlain (1994)
    '''
    if q < 0 or q > 1:
        raise ValueError('q must be between 0 and 1')

    kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
    if kernel not in kern_names:
        raise ValueError("kernel must be one of " + ', '.join(kern_names))
    else:
        kernel = kernels[kernel]

    if bandwidth == 'hsheather':
        bandwidth = hall_sheather
    elif bandwidth == 'bofinger':
        bandwidth = bofinger
    elif bandwidth == 'chamberlain':
        bandwidth = chamberlain
    else:
        raise ValueError("bandwidth must be in 'hsheather', 'bofinger', "
                         "'chamberlain'")

    endog = self.endog
    exog = self.exog
    nobs = self.nobs
    exog_rank = np_matrix_rank(self.exog)
    self.rank = exog_rank
    self.df_model = float(self.rank - self.k_constant)
    self.df_resid = self.nobs - self.rank
    n_iter = 0
    xstar = exog

    beta = np.ones(exog_rank)
    # TODO: better start, initial beta is used only for convergence check

    # Note the following doesn't work yet,
    # the iteration loop always starts with OLS as initial beta
    # if start_params is not None:
    #     if len(start_params) != rank:
    #         raise ValueError('start_params has wrong length')
    #     beta = start_params
    # else:
    #     # start with OLS
    #     beta = np.dot(np.linalg.pinv(exog), endog)

    diff = 10
    cycle = False

    history = dict(params=[], mse=[])
    while n_iter < max_iter and diff > p_tol and not cycle:
        n_iter += 1
        beta0 = beta
        xtx = np.dot(xstar.T, exog)
        xty = np.dot(xstar.T, endog)
        beta = np.dot(pinv(xtx), xty)
        resid = endog - np.dot(exog, beta)

        mask = np.abs(resid) < .000001
        resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
        resid = np.where(resid < 0, q * resid, (1 - q) * resid)
        resid = np.abs(resid)
        xstar = exog / resid[:, np.newaxis]
        diff = np.max(np.abs(beta - beta0))
        history['params'].append(beta)
        history['mse'].append(np.mean(resid * resid))

        if (n_iter >= 300) and (n_iter % 100 == 0):
            # check for convergence cycle, shouldn't happen
            for ii in range(2, 10):
                if np.all(beta == history['params'][-ii]):
                    cycle = True
                    warnings.warn("Convergence cycle detected",
                                  ConvergenceWarning)
                    break

    if n_iter == max_iter:
        warnings.warn("Maximum number of iterations (" + str(max_iter) +
                      ") reached.", IterationLimitWarning)

    e = endog - np.dot(exog, beta)
    # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
    # h = 0.9 * np.std(e) / (nobs**0.2)
    # Instead, we calculate bandwidth as in Stata 12
    iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
    h = bandwidth(nobs, q)
    h = min(np.std(endog),
            iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

    fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

    if vcov == 'robust':
        d = np.where(e > 0, (q / fhat0)**2, ((1 - q) / fhat0)**2)
        xtxi = pinv(np.dot(exog.T, exog))
        xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
        vcov = chain_dot(xtxi, xtdx, xtxi)
    elif vcov == 'iid':
        vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
    else:
        raise ValueError("vcov must be 'robust' or 'iid'")

    lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

    lfit.q = q
    lfit.iterations = n_iter
    lfit.sparsity = 1. / fhat0
    lfit.bandwidth = h
    lfit.history = history

    return RegressionResultsWrapper(lfit)
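# Hedged usage sketch: median regression via the public QuantReg API on
# synthetic data with heavy-tailed noise.
import numpy as np
import statsmodels.api as sm

np.random.seed(0)
x = sm.add_constant(np.random.uniform(0, 10, size=200))
y = x @ np.array([1.0, 2.0]) + np.random.standard_t(df=3, size=200)
res = sm.QuantReg(y, x).fit(q=0.5, vcov='robust', kernel='epa',
                            bandwidth='hsheather')
print(res.params)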
def _create_labels(rects, horizontal, ax, rotation):
    """find the position of the label for each value of each category

    right now it supports only up to four categories

    ax: the axis on which the label should be applied
    rotation: the rotation list for each side
    """
    categories = _categories_level(list(iterkeys(rects)))
    if len(categories) > 4:
        msg = ("maximum of 4 levels supported for axes labeling... and 4 "
               "is already a lot of levels, are you sure you need them "
               "all?")
        raise NotImplementedError(msg)
    labels = {}
    # keep it fixed as it will be used a lot of times
    items = list(iteritems(rects))
    vertical = not horizontal

    # get the axis ticks and labels locator to put the correct values!
    ax2 = ax.twinx()
    ax3 = ax.twiny()
    # this is the order of execution for horizontal disposition
    ticks_pos = [ax.set_xticks, ax.set_yticks, ax3.set_xticks,
                 ax2.set_yticks]
    ticks_lab = [ax.set_xticklabels, ax.set_yticklabels,
                 ax3.set_xticklabels, ax2.set_yticklabels]
    # for the vertical one, rotate it by one
    if vertical:
        ticks_pos = ticks_pos[1:] + ticks_pos[:1]
        ticks_lab = ticks_lab[1:] + ticks_lab[:1]
    # clean them
    for pos, lab in zip(ticks_pos, ticks_lab):
        pos([])
        lab([])
    # for each level, for each value in the level, take the mean of all
    # the sublevels that correspond to that partial key
    for level_idx, level in enumerate(categories):
        # this dictionary keeps the labels only for this level
        level_ticks = dict()
        for value in level:
            # to which level should it refer to get the preceding
            # values of labels?  it's rather a tricky question...
            # this is dependent on the side.  It's a very crude management
            # but I couldn't think of a more general way...
            if horizontal:
                if level_idx == 3:
                    index_select = [-1, -1, -1]
                else:
                    index_select = [+0, -1, -1]
            else:
                if level_idx == 3:
                    index_select = [+0, -1, +0]
                else:
                    index_select = [-1, -1, -1]
            # now I create the base key name and append the current value.
            # It will search on all the rects to find the corresponding
            # ones and use them to evaluate the mean position
            basekey = tuple(categories[i][index_select[i]]
                            for i in range(level_idx))
            basekey = basekey + (value,)
            subset = dict((k, v) for k, v in items
                          if basekey == k[:level_idx + 1])
            # now I extract the center of all the tiles and take a
            # weighted mean of these centers, weighted by the tile area.
            # this should give me the (more or less) correct position
            # of the center of the category
            vals = list(itervalues(subset))
            W = sum(w * h for (x, y, w, h) in vals)
            x_lab = sum((x + w / 2.0) * w * h / W for (x, y, w, h) in vals)
            y_lab = sum((y + h / 2.0) * w * h / W for (x, y, w, h) in vals)
            # now, based on the ordering, select which position to keep.
            # needs to be written in a more general form, or are 4 levels
            # enough?  should give also the horizontal and vertical
            # alignment
            side = (level_idx + vertical) % 4
            level_ticks[value] = y_lab if side % 2 else x_lab
        # now we add the labels of this level to the correct axis
        ticks_pos[level_idx](list(itervalues(level_ticks)))
        ticks_lab[level_idx](list(iterkeys(level_ticks)),
                             rotation=rotation[level_idx])
    return labels
def sirf_errband_mc(self, orth=False, repl=1000, T=10, signif=0.05,
                    seed=None, burn=100, cum=False):
    """
    Compute Monte Carlo integrated error bands for impulse response
    functions, assuming normally distributed innovations

    Parameters
    ----------
    orth : bool, default False
        Compute orthogonalized impulse response error bands
    repl : int
        number of Monte Carlo replications to perform
    T : int, default 10
        number of impulse response periods
    signif : float (0 < signif < 1)
        Significance level for error bars, defaults to 95% CI
    seed : int
        np.random.seed for replications
    burn : int
        number of initial observations to discard for simulation
    cum : bool, default False
        produce cumulative irf error bands

    Notes
    -----
    Lutkepohl (2005) Appendix D

    Returns
    -------
    Tuple of lower and upper arrays of ma_rep monte carlo standard errors
    """
    neqs = self.neqs
    mean = self.mean()
    k_ar = self.k_ar
    coefs = self.coefs
    sigma_u = self.sigma_u
    intercept = self.intercept
    df_model = self.df_model
    nobs = self.nobs

    ma_coll = np.zeros((repl, T + 1, neqs, neqs))
    A = self.A
    B = self.B
    A_mask = self.A_mask
    B_mask = self.B_mask
    A_pass = np.zeros(A.shape, dtype='|S1')
    B_pass = np.zeros(B.shape, dtype='|S1')
    A_pass[~A_mask] = A[~A_mask]
    B_pass[~B_mask] = B[~B_mask]
    A_pass[A_mask] = 'E'
    B_pass[B_mask] = 'E'
    if A_mask.sum() == 0:
        s_type = 'B'
    elif B_mask.sum() == 0:
        s_type = 'A'
    else:
        s_type = 'AB'
    g_list = []

    for i in range(repl):
        # discard the burn-in observations to correct for starting bias
        sim = util.varsim(coefs, intercept, sigma_u, steps=nobs + burn)
        sim = sim[burn:]
        if cum:
            if i < 10:
                sol = SVAR(sim, svar_type=s_type, A=A_pass,
                           B=B_pass).fit(maxlags=k_ar)
                g_list.append(np.append(sol.A[sol.A_mask].tolist(),
                                        sol.B[sol.B_mask].tolist()))
                ma_coll[i] = sol.svar_ma_rep(maxn=T).cumsum(axis=0)
            else:
                if i == 10:
                    mean_AB = np.mean(g_list, axis=0)
                    split = len(A_pass[A_mask])
                    opt_A = mean_AB[:split]
                    opt_B = mean_AB[split:]
                ma_coll[i] = SVAR(sim, svar_type=s_type, A=A_pass,
                                  B=B_pass).fit(
                    maxlags=k_ar, A_guess=opt_A,
                    B_guess=opt_B).svar_ma_rep(maxn=T).cumsum(axis=0)
        else:
            if i < 10:
                sol = SVAR(sim, svar_type=s_type, A=A_pass,
                           B=B_pass).fit(maxlags=k_ar)
                g_list.append(np.append(sol.A[A_mask].tolist(),
                                        sol.B[B_mask].tolist()))
                ma_coll[i] = sol.svar_ma_rep(maxn=T)
            else:
                if i == 10:
                    mean_AB = np.mean(g_list, axis=0)
                    split = len(A[A_mask])
                    opt_A = mean_AB[:split]
                    opt_B = mean_AB[split:]
                ma_coll[i] = SVAR(sim, svar_type=s_type, A=A_pass,
                                  B=B_pass).fit(
                    maxlags=k_ar, A_guess=opt_A,
                    B_guess=opt_B).svar_ma_rep(maxn=T)

    ma_sort = np.sort(ma_coll, axis=0)  # sort to get quantiles
    index = (int(round(signif / 2 * repl)) - 1,
             int(round((1 - signif / 2) * repl)) - 1)
    lower = ma_sort[index[0], :, :, :]
    upper = ma_sort[index[1], :, :, :]
    return lower, upper
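# The error bands are empirical quantiles over the replication axis; a
# minimal sketch of that final step with stand-in draws in place of the
# simulated MA coefficients.
import numpy as np

repl, T, neqs, signif = 1000, 10, 2, 0.05
ma_coll = np.random.randn(repl, T + 1, neqs, neqs)  # stand-in draws
ma_sort = np.sort(ma_coll, axis=0)
lower = ma_sort[int(round(signif / 2 * repl)) - 1]
upper = ma_sort[int(round((1 - signif / 2) * repl)) - 1]
print(lower.shape, upper.shape)  # (11, 2, 2) each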