def __init__(self, y, x, w):
    """
    Estimate the model in two OLS passes around a GMM step for lambda.

    An initial ``OLS.BaseOLS`` fit supplies residuals, from which
    ``_momentsGM_Error`` and ``optim_moments`` produce ``lambda1``. ``x`` and
    ``y`` are then passed through ``get_spFilter`` with ``w`` and ``lambda1``
    and a second ``OLS.BaseOLS`` is fitted on the filtered data.

    Parameters
    ----------
    y : array
        dependent variable, forwarded to ``OLS.BaseOLS``
    x : array
        independent variables, forwarded to ``OLS.BaseOLS``
    w : weights object
        passed to ``_momentsGM_Error`` and ``get_spFilter`` and multiplied
        with the residual vector
        (NOTE(review): presumably a sparse weights matrix -- confirm
        against callers)

    Attributes set
    --------------
    n, k      : shape of the design matrix used by the first OLS
    x, y      : data as stored by the first OLS object
    predy     : ``spdot(self.x, ols2.betas)`` (unfiltered x, second-step betas)
    u         : ``y - predy``
    betas     : second-step betas with ``lambda1`` appended as the last row
    sig2      : ``ols2.sig2n``
    e_filtered: ``u - lambda1 * w * u``
    vm        : ``sig2 * ols2.xtxi``
    """
    # 1a. OLS --> \tilde{betas}
    ols = OLS.BaseOLS(y=y, x=x)
    self.n, self.k = ols.x.shape
    self.x = ols.x
    self.y = ols.y

    # 1b. GMM --> \tilde{\lambda1}
    moments = _momentsGM_Error(w, ols.u)
    lambda1 = optim_moments(moments)

    # 2a. OLS --> \hat{betas}
    xs = get_spFilter(w, lambda1, self.x)
    ys = get_spFilter(w, lambda1, self.y)
    ols2 = OLS.BaseOLS(y=ys, x=xs)

    # Output
    # Predictions and residuals are on the original (unfiltered) scale.
    self.predy = spdot(self.x, ols2.betas)
    self.u = y - self.predy
    # lambda1 is reported as the last coefficient.
    self.betas = np.vstack((ols2.betas, np.array([[lambda1]])))
    self.sig2 = ols2.sig2n
    self.e_filtered = self.u - lambda1 * w * self.u
    self.vm = self.sig2 * ols2.xtxi
    self._cache = {}
def vif(reg):
    """
    Calculates the variance inflation factor for each independent
    variable. For the ease of indexing the results, the constant is currently
    included. This should be omitted when reporting the results to the output
    text. [Greene2003]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    vif_result      : list of tuples
                      each tuple includes the vif and the tolerance, the order
                      of the variables corresponds to their order in the reg.x
                      matrix. A column whose total sum of squares is zero
                      (i.e. a constant column) gets pysal.MISSINGVALUE in
                      place of a tuple.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the variance inflation factor (VIF).

    >>> testresult = diagnostics.vif(reg)

    Select the tuple for the income variable.

    >>> incvif = testresult[1]

    Print the VIF for income.

    >>> print("%12.12f"%incvif[0])
    1.333117497189

    Print the tolerance for income.

    >>> print("%12.12f"%incvif[1])
    0.750121427487

    Repeat for the home value variable.

    >>> hovalvif = testresult[2]
    >>> print("%12.12f"%hovalvif[0])
    1.333117497189
    >>> print("%12.12f"%hovalvif[1])
    0.750121427487

    """
    # Function-scope import, done once rather than on every loop iteration.
    import ols as OLS

    X = reg.x
    _, k = X.shape
    vif_result = []

    for j in range(k):
        # Auxiliary regression: column j on all remaining columns.
        # np.delete returns a new array, so reg.x is never modified.
        Z = np.delete(X, j, 1)
        y = X[:, j]
        aux = OLS.BaseOLS(y, Z)
        mean_y = aux.mean_y
        utu = aux.utu
        ss_tot = sum((y - mean_y) ** 2)
        if ss_tot == 0:
            # No variation in column j: R^2 is undefined.
            resj = pysal.MISSINGVALUE
        else:
            r2aux = 1 - utu / ss_tot
            tolj = 1 - r2aux
            vifj = 1 / tolj
            resj = (vifj, tolj)
        vif_result.append(resj)

    return vif_result
def white(reg):
    """
    Calculates the White test to check for heteroscedasticity. [White1980]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (wh), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test. If the condition index of the regression
                      exceeds 30, the string
                      "Not computed due to multicollinearity." is returned
                      instead of a dictionary.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Raises
    ------
    Exception
        if reg.x (or the augmented matrix built from it) is neither a numpy
        ndarray nor a scipy csc/csr sparse matrix.

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value.

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u ** 2
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        return "Not computed due to multicollinearity."

    # Compute cross-products and squares of the regression variables
    x_type = type(X).__name__
    n_cross = (k * (k + 1)) // 2
    if x_type == 'ndarray':
        A = np.zeros((n, n_cross))
    elif x_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        A = SP.lil_matrix((n, n_cross))
    else:
        raise Exception("unknown X type, %s" % x_type)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omit = set()
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omit.add(i)
        # do not allow duplicates: drop every later column equal to this one
        for j in range(i + 1, k):
            if abs(current - A[:, j]).sum() == 0:
                omit.add(j)

    # Now the identified columns must be removed
    a_type = type(A).__name__
    if a_type == 'ndarray':
        A = np.delete(A, sorted(omit), 1)
    elif a_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        keepcolumn = [col for col in range(k) if col not in omit]
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % a_type)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    pvalue = chisqprob(wh, df)
    return {'df': df, 'wh': wh, 'pvalue': pvalue}
if len(name_y) > 1 and isinstance(name_y, list): name_y = ''.join([i for i in name_y[0] if not i.isdigit()]) if len(name_y) == 1 and isinstance(name_y, list): name_y = name_y[0] if name_x: if len(name_x) != k*T and len(name_x) != k: raise Exception("Names of columns in X must have exactly either k or k*t elements.") if len(name_x) > k: name_bigx = [] for i in range(k): name_bigx.append(''.join([j for j in name_x[i*T] if not j.isdigit()])) name_x = name_bigx return bigy, bigx, name_y, name_x ols = OLS.BaseOLS(y=y, x=x) x, y, n, k, xtx = ols.x, ols.y, ols.n, ols.k, ols.xtx N = w.n T = y.shape[0]//N moments, trace_w2 = _moments_kkp(w.sparse, ols.u, 0) lambda1, sig_v = optim_moments(moments, all_par=True) Tw = SP.kron(SP.identity(T),w.sparse) ub = Tw.dot(ols.u) ulu = ols.u - lambda1*ub Q1 = SP.kron(np.ones((T,T))/T,SP.identity(N)) sig_1 = float(np.dot(ulu.T,Q1.dot(ulu))/N) #print('initial_lamb_sig:',lambda1,sig_v,sig_1) #print('theta:', 1 - np.sqrt(sig_v)/ np.sqrt(sig_1)) Xi_a = SP.diags([(sig_v*sig_v)/(T-1),sig_1*sig_1]) if full_weights: Tau = _get_Tau(w.sparse,trace_w2)