def __init__(self, y, x, yend, q=None, h=None, robust=None, gwk=None,
             sig2n_k=False):
    """Estimate the two-stage least squares coefficients and store them.

    With ``z = [x, yend]`` and instrument matrix ``h``, the coefficients
    are computed as ``(z'h (h'h)^-1 h'z)^-1 z'h (h'h)^-1 h'y``.

    Parameters
    ----------
    y : array
        Dependent variable; ``y.shape[0]`` is stored as ``self.n``.
    x : array
        Exogenous regressors, stacked first in both ``z`` and ``h``.
    yend : array
        Endogenous regressors; ``yend.shape[1]`` is stored as
        ``self.kstar``.
    q : array, optional
        External instruments, stacked after ``x`` to build ``h`` when
        ``h`` is not supplied as an ``ndarray``/``csr_matrix``.
    h : ndarray or csr_matrix, optional
        Complete instrument matrix, used as given.
    robust : optional
        When truthy, ``self.vm`` is set from ``ROBUST.robust_vm``.
    gwk : optional
        Forwarded to ``ROBUST.robust_vm``.
    sig2n_k : bool
        Selects ``self.sig2n_k`` (truthy) or ``self.sig2n`` for
        ``self.sig2``; also forwarded to ``ROBUST.robust_vm``.

    Raises
    ------
    Exception
        If ``q`` and ``h`` are both numpy arrays, or both ``None``.
    """
    # Call-style raise is valid on both Python 2 and Python 3.
    if isinstance(q, np.ndarray) and isinstance(h, np.ndarray):
        raise Exception("Please do not provide 'q' and 'h' together")
    if q is None and h is None:
        raise Exception("Please provide either 'q' or 'h'")

    self.y = y
    self.n = y.shape[0]
    self.x = x
    self.kstar = yend.shape[1]

    # including exogenous and endogenous variables
    z = sphstack(self.x, yend)
    if type(h).__name__ not in ['ndarray', 'csr_matrix']:
        # including exogenous variables and instrument
        h = sphstack(self.x, q)
    self.z = z
    self.h = h
    self.q = q
    self.yend = yend
    # k = number of exogenous variables and endogenous variables
    self.k = z.shape[1]

    hth = spdot(h.T, h)
    hthi = la.inv(hth)
    zth = spdot(z.T, h)
    hty = spdot(h.T, y)

    factor_1 = np.dot(zth, hthi)
    factor_2 = np.dot(factor_1, zth.T)
    # this one needs to be in cache to be used in AK
    varb = la.inv(factor_2)
    factor_3 = np.dot(varb, factor_1)
    betas = np.dot(factor_3, hty)
    self.betas = betas
    self.varb = varb
    self.zthhthi = factor_1

    # predicted values
    self.predy = spdot(z, betas)

    # residuals
    self.u = y - self.predy

    # attributes used in property
    self.hth = hth    # Required for condition index
    self.hthi = hthi  # Used in error models
    self.htz = zth.T

    if robust:
        self.vm = ROBUST.robust_vm(reg=self, gwk=gwk, sig2n_k=sig2n_k)

    self._cache = {}
    # Only the selected attribute is read, as in the if/else form.
    self.sig2 = self.sig2n_k if sig2n_k else self.sig2n
def __init__(self, y, x, yend, q=None, h=None, robust=None, gwk=None,
             sig2n_k=False):
    """Estimate the two-stage least squares coefficients and store them.

    With ``z = [x, yend]`` and instrument matrix ``h``, the coefficients
    are computed as ``(z'h (h'h)^-1 h'z)^-1 z'h (h'h)^-1 h'y``.

    Parameters
    ----------
    y : array
        Dependent variable; ``y.shape[0]`` is stored as ``self.n``.
    x : array
        Exogenous regressors, stacked first in both ``z`` and ``h``.
    yend : array
        Endogenous regressors; ``yend.shape[1]`` is stored as
        ``self.kstar``.
    q : array, optional
        External instruments, stacked after ``x`` to build ``h`` when
        ``h`` is not supplied as an ``ndarray``/``csr_matrix``.
    h : ndarray or csr_matrix, optional
        Complete instrument matrix, used as given.
    robust : optional
        When truthy, ``self.vm`` is set from ``ROBUST.robust_vm``.
    gwk : optional
        Forwarded to ``ROBUST.robust_vm``.
    sig2n_k : bool
        Selects ``self.sig2n_k`` (truthy) or ``self.sig2n`` for
        ``self.sig2``; also forwarded to ``ROBUST.robust_vm``.

    Raises
    ------
    Exception
        If ``q`` and ``h`` are both numpy arrays, or both ``None``.
    """
    if isinstance(q, np.ndarray) and isinstance(h, np.ndarray):
        raise Exception("Please do not provide 'q' and 'h' together")
    # Identity test, not ``== None``: comparing an ndarray with ``==``
    # is elementwise, so its truth value is ambiguous (or an error).
    if q is None and h is None:
        raise Exception("Please provide either 'q' or 'h'")

    self.y = y
    self.n = y.shape[0]
    self.x = x
    self.kstar = yend.shape[1]

    # including exogenous and endogenous variables
    z = sphstack(self.x, yend)
    if type(h).__name__ not in ['ndarray', 'csr_matrix']:
        # including exogenous variables and instrument
        h = sphstack(self.x, q)
    self.z = z
    self.h = h
    self.q = q
    self.yend = yend
    # k = number of exogenous variables and endogenous variables
    self.k = z.shape[1]

    hth = spdot(h.T, h)
    hthi = la.inv(hth)
    zth = spdot(z.T, h)
    hty = spdot(h.T, y)

    factor_1 = np.dot(zth, hthi)
    factor_2 = np.dot(factor_1, zth.T)
    # this one needs to be in cache to be used in AK
    varb = la.inv(factor_2)
    factor_3 = np.dot(varb, factor_1)
    betas = np.dot(factor_3, hty)
    self.betas = betas
    self.varb = varb
    self.zthhthi = factor_1

    # predicted values
    self.predy = spdot(z, betas)

    # residuals
    self.u = y - self.predy

    # attributes used in property
    self.hth = hth    # Required for condition index
    self.hthi = hthi  # Used in error models
    self.htz = zth.T

    if robust:
        self.vm = ROBUST.robust_vm(reg=self, gwk=gwk, sig2n_k=sig2n_k)

    self._cache = {}
    if sig2n_k:
        self.sig2 = self.sig2n_k
    else:
        self.sig2 = self.sig2n
def _get_spat_diag_props(self, y, x, w, yend, q, w_lags, lag_q):
    """Rebuild ``self.z``, ``self.h`` and ``self.varb`` on the regime-expanded data.

    The cache is reset, ``yend`` and ``q`` are replaced by the pair
    returned from ``set_endog``, and ``x`` gets the output of
    ``USER.check_constant`` before every block is passed through
    ``REGI.regimeX_setup``.  ``self.varb`` is the inverse of
    ``z'h (h'h)^-1 h'z``.
    """
    self._cache = {}
    yend, q = set_endog(y, x, w, yend, q, w_lags, lag_q)

    x_const = USER.check_constant(x)
    regimes = self.regimes
    regimes_set = self.regimes_set
    x_regi = REGI.regimeX_setup(
        x_const, regimes, [True] * x_const.shape[1], regimes_set)

    # All yend columns are flagged True except the last one.
    # NOTE(review): presumably the flags mark columns that vary by
    # regime -- confirm against REGI.regimeX_setup.
    yend_flags = [True] * (yend.shape[1] - 1) + [False]
    yend_regi = REGI.regimeX_setup(yend, regimes, yend_flags, regimes_set)
    self.z = sphstack(x_regi, yend_regi)

    q_regi = REGI.regimeX_setup(
        q, regimes, [True] * q.shape[1], regimes_set)
    self.h = sphstack(x_regi, q_regi)

    hth_inv = np.linalg.inv(spdot(self.h.T, self.h))
    z_t_h = spdot(self.z.T, self.h)
    self.varb = np.linalg.inv(spdot(spdot(z_t_h, hth_inv), z_t_h.T))
def _get_spat_diag_props(self, results, regi_ids, x, yend, q):
    """Rebuild ``self.z``, ``self.h`` and ``self.varb`` on the regime-expanded data.

    ``results`` and ``regi_ids`` are accepted but not read here.  The
    cache is reset, ``x`` gets the output of ``USER.check_constant``,
    and ``x``, ``yend`` and ``q`` are each passed through
    ``REGI.regimeX_setup`` with every column flagged ``True``.
    ``self.varb`` is the inverse of ``z'h (h'h)^-1 h'z``.
    """
    self._cache = {}

    x_const = USER.check_constant(x)
    regimes = self.regimes
    regimes_set = self.regimes_set
    x_regi = REGI.regimeX_setup(
        x_const, regimes, [True] * x_const.shape[1], regimes_set)

    yend_regi = REGI.regimeX_setup(
        yend, regimes, [True] * yend.shape[1], regimes_set)
    self.z = sphstack(x_regi, yend_regi)

    q_regi = REGI.regimeX_setup(
        q, regimes, [True] * q.shape[1], regimes_set)
    self.h = sphstack(x_regi, q_regi)

    hth_inv = np.linalg.inv(spdot(self.h.T, self.h))
    z_t_h = spdot(self.z.T, self.h)
    self.varb = np.linalg.inv(spdot(spdot(z_t_h, hth_inv), z_t_h.T))
def _get_spat_diag_props(self, y, x, w, yend, q, w_lags, lag_q):
    """Recompute the matrices used by the spatial diagnostics.

    Resets ``self._cache``, takes ``yend``/``q`` from ``set_endog``,
    and stores the regime-expanded ``self.z`` and ``self.h`` together
    with ``self.varb = (z'h (h'h)^-1 h'z)^-1``.
    """
    def _expand(block, flags):
        # Single call site for the regime expansion of one data block.
        return REGI.regimeX_setup(
            block, self.regimes, flags, self.regimes_set)

    self._cache = {}
    yend, q = set_endog(y, x, w, yend, q, w_lags, lag_q)

    x = USER.check_constant(x)
    x = _expand(x, [True] * x.shape[1])

    # Every yend column is flagged True except the final one.
    n_yend = yend.shape[1]
    self.z = sphstack(x, _expand(yend, [True] * (n_yend - 1) + [False]))
    self.h = sphstack(x, _expand(q, [True] * q.shape[1]))

    inv_hth = np.linalg.inv(spdot(self.h.T, self.h))
    zt_h = spdot(self.z.T, self.h)
    inner = spdot(spdot(zt_h, inv_hth), zt_h.T)
    self.varb = np.linalg.inv(inner)
def _get_spat_diag_props(self, results, regi_ids, x, yend, q):
    """Recompute the matrices used by the spatial diagnostics.

    Resets ``self._cache`` and stores the regime-expanded ``self.z``
    and ``self.h`` together with ``self.varb = (z'h (h'h)^-1 h'z)^-1``.
    ``results`` and ``regi_ids`` are part of the signature but unused.
    """
    def _expand(block):
        # Regime expansion with every column of the block flagged True.
        return REGI.regimeX_setup(
            block, self.regimes, [True] * block.shape[1],
            self.regimes_set)

    self._cache = {}

    x = USER.check_constant(x)
    x = _expand(x)

    self.z = sphstack(x, _expand(yend))
    self.h = sphstack(x, _expand(q))

    inv_hth = np.linalg.inv(spdot(self.h.T, self.h))
    zt_h = spdot(self.z.T, self.h)
    inner = spdot(spdot(zt_h, inv_hth), zt_h.T)
    self.varb = np.linalg.inv(inner)
def _tsls_regimes_multi(self, x, yend, q, w_i, regi_ids, cores,
                        gwk, sig2n_k, robust, spat_diag, vm,
                        name_x, name_yend, name_q):
    """Run one regression per regime via ``_work`` and assemble the results.

    On Windows the regimes are processed serially in this process; on
    other platforms each regime is submitted to a ``multiprocessing``
    pool of ``cores`` workers.  The per-regime results are written into
    block-diagonal ``self.vm``, stacked ``self.betas`` and the rows of
    ``self.u``/``self.predy`` selected by ``regi_ids[r]``.

    Parameters
    ----------
    x, yend, q : arrays
        Data forwarded unchanged to ``_work``; ``x`` and ``q`` are also
        stacked into ``self.hac_var``.
    w_i : mapping or falsy
        When truthy, ``w_i[r]`` is attached to each regime result as
        ``.w``; otherwise ``.w`` is set to ``None``.
    regi_ids : mapping
        Row selector per regime, used to place ``u`` and ``predy``.
    cores : int or None
        Passed to ``mp.Pool`` on non-Windows platforms.
    gwk
        Forwarded to ``hac_multi`` when ``robust == 'hac'``.
    sig2n_k, robust
        Forwarded to ``_work``.
    spat_diag, vm
        Forwarded to ``SUMMARY.TSLS_multi``.
    name_x, name_yend, name_q
        Variable names forwarded to ``_work``.
    """
    # Decide once, and only start worker processes when they are used:
    # the pool is neither created nor leaked on Windows.
    is_win = system() == 'Windows'
    pool = None if is_win else mp.Pool(cores)

    results_p = {}
    for r in self.regimes_set:
        work_args = (self.y, x, regi_ids, r, yend, q, robust, sig2n_k,
                     self.name_ds, self.name_y, name_x, name_yend,
                     name_q, self.name_w, self.name_regimes)
        if is_win:
            results_p[r] = _work(*work_args)
        else:
            results_p[r] = pool.apply_async(_work, args=work_args)

    self.kryd = 0
    # NOTE(review): the +1 presumably accounts for a constant added
    # inside _work -- confirm.
    self.kr = x.shape[1] + yend.shape[1] + 1
    self.kf = 0
    self.nr = len(self.regimes_set)
    self.vm = np.zeros((self.nr * self.kr, self.nr * self.kr), float)
    self.betas = np.zeros((self.nr * self.kr, 1), float)
    self.u = np.zeros((self.n, 1), float)
    self.predy = np.zeros((self.n, 1), float)

    if pool is not None:
        pool.close()
        pool.join()

    results = {}
    self.name_y, self.name_x, self.name_yend = [], [], []
    self.name_q, self.name_z, self.name_h = [], [], []
    for counter, r in enumerate(self.regimes_set):
        if is_win:
            results[r] = results_p[r]
        else:
            results[r] = results_p[r].get()
        results[r].w = w_i[r] if w_i else None

        # Each regime owns one kr-sized diagonal block / row slice.
        lo = counter * self.kr
        hi = (counter + 1) * self.kr
        self.vm[lo:hi, lo:hi] = results[r].vm
        self.betas[lo:hi, ] = results[r].betas
        self.u[regi_ids[r], ] = results[r].u
        self.predy[regi_ids[r], ] = results[r].predy
        self.name_y += results[r].name_y
        self.name_x += results[r].name_x
        self.name_yend += results[r].name_yend
        self.name_q += results[r].name_q
        self.name_z += results[r].name_z
        self.name_h += results[r].name_h

    self.multi = results
    self.hac_var = sphstack(x, q)
    if robust == 'hac':
        hac_multi(self, gwk)
    self.chow = REGI.Chow(self)
    SUMMARY.TSLS_multi(reg=self, multireg=self.multi, vm=vm,
                       spat_diag=spat_diag, regimes=True)
def check_constant(x):
    """Return a copy of ``x`` with a column of ones prepended.

    Parameters
    ----------
    x : array
        Value passed by a user to a regression class.

    Returns
    -------
    x_constant : array
        Copy of ``x`` stacked to the right of an ``(n, 1)`` column of
        ones, where ``n = x.shape[0]``.

    Raises
    ------
    Exception
        Only if ``diagnostics.constant_check`` is falsy (see the review
        note in the body).

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> db = pysal.open(pysal.examples.get_path('columbus.dbf'),'r')
    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T
    >>> x_constant = check_constant(X)
    >>> x_constant.shape
    (49, 3)

    """
    # NOTE(review): this tests the attribute ``diagnostics.constant_check``
    # itself; it is never called with ``x``.  If it is a function the
    # condition is always false and the raise below is unreachable.
    # Presumably a check on ``x`` was intended -- confirm before
    # changing, since callers currently never see this exception.
    if not diagnostics.constant_check:
        raise Exception("x array cannot contain a constant vector; constant will be added automatically")
    x_constant = COPY.copy(x)
    return sphstack(np.ones((x_constant.shape[0], 1)), x_constant)
def white(reg):
    """
    Calculates the White test to check for heteroscedasticity.

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary or string
                      dictionary with the statistic (wh), degrees of
                      freedom (df) and the associated p-value (pvalue)
                      for the White test; the string
                      "Not computed due to multicollinearity." when the
                      condition index of reg exceeds 30.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    References
    ----------
    .. [1] H. White. 1980. A heteroscedasticity-consistent covariance
       matrix estimator and a direct test for heteroskdasticity.
       Econometrica. 48(4) 817-838.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value.

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u ** 2
    # Array dimensions and range() need true integers.
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x
    # No constant is added here: X is expected to carry one already
    # (see Notes; Greene 2003, pg. 222).

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        return "Not computed due to multicollinearity."

    # Compute cross-products and squares of the regression variables
    n_cross = (k * (k + 1)) // 2  # integer, usable as an array dimension
    x_type = type(X).__name__
    if x_type == 'ndarray':
        A = np.zeros((n, n_cross))
    elif x_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        A = SP.lil_matrix((n, n_cross))
    else:
        raise Exception("unknown X type, %s" % x_type)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            A[:, counter] = spmultiply(X[:, i], X[:, j], False)
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitcolumn = set()
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omitcolumn.add(i)
        # do not allow duplicates: drop any later column equal to this one
        for j in range(i + 1, k):
            if abs(current - A[:, j]).sum() == 0:
                omitcolumn.add(j)

    # Now the identified columns must be removed
    a_type = type(A).__name__
    if a_type == 'ndarray':
        A = np.delete(A, sorted(omitcolumn), 1)
    elif a_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        keepcolumn = [col for col in range(k) if col not in omitcolumn]
        A = A[:, keepcolumn]
    else:
        # Report the type of A, the object actually being tested here.
        raise Exception("unknown A type, %s" % a_type)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    # chi2.sf is the survival function that the removed
    # ``stats.chisqprob`` wrapped (assumes ``stats`` is scipy.stats).
    pvalue = stats.chi2.sf(wh, df)
    return {'df': df, 'wh': wh, 'pvalue': pvalue}
def white(reg):
    """
    Calculates the White test to check for heteroscedasticity. [White1980]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary or string
                      dictionary with the statistic (wh), degrees of
                      freedom (df) and the associated p-value (pvalue)
                      for the White test; the string
                      "Not computed due to multicollinearity." when the
                      condition index of reg exceeds 30.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value.

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u**2
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x
    # No constant is added here: X is expected to carry one already
    # (see Notes; Greene 2003, pg. 222).

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        return "Not computed due to multicollinearity."

    # Compute cross-products and squares of the regression variables
    n_cross = (k * (k + 1)) // 2
    x_type = type(X).__name__
    if x_type == 'ndarray':
        A = np.zeros((n, n_cross))
    elif x_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        A = SP.lil_matrix((n, n_cross))
    else:
        raise Exception("unknown X type, %s" % x_type)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            A[:, counter] = spmultiply(X[:, i], X[:, j], False)
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitcolumn = set()
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omitcolumn.add(i)
        # do not allow duplicates: drop any later column equal to this one
        for j in range(i + 1, k):
            if abs(current - A[:, j]).sum() == 0:
                omitcolumn.add(j)

    # Now the identified columns must be removed
    a_type = type(A).__name__
    if a_type == 'ndarray':
        A = np.delete(A, sorted(omitcolumn), 1)
    elif a_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        # Built as a list: ``range(k).remove`` does not exist on Python 3.
        keepcolumn = [col for col in range(k) if col not in omitcolumn]
        A = A[:, keepcolumn]
    else:
        # Report the type of A, the object actually being tested here.
        raise Exception("unknown A type, %s" % a_type)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    pvalue = chisqprob(wh, df)
    return {'df': df, 'wh': wh, 'pvalue': pvalue}
def _tsls_regimes_multi(self, x, yend, q, w, regi_ids, cores, gwk,
                        sig2n_k, robust, spat_diag, vm,
                        name_x, name_yend, name_q):
    """Run one regression per regime via ``_work`` and assemble the results.

    When ``cores`` is truthy every regime is submitted to a single
    ``multiprocessing`` pool (default worker count); otherwise the
    regimes are processed serially in this process.  The per-regime
    results are written into block-diagonal ``self.vm``, stacked
    ``self.betas`` and the rows of ``self.u``/``self.predy`` selected
    by ``regi_ids[r]``.

    Parameters
    ----------
    x, yend, q : arrays
        Data forwarded unchanged to ``_work``; ``x`` and ``q`` are also
        stacked into ``self.hac_var``.
    w
        Forwarded to ``_work`` and to ``SUMMARY.TSLS_multi``.
    regi_ids : mapping
        Row selector per regime, used to place ``u`` and ``predy``.
    cores : bool-like
        Only its truth value is used: truthy enables the process pool.
    gwk
        Forwarded to ``hac_multi`` when ``robust == 'hac'``.
    sig2n_k, robust
        Forwarded to ``_work``.
    spat_diag
        When truthy, ``self._get_spat_diag_props`` is called; also
        forwarded to ``SUMMARY.TSLS_multi``.
    vm
        Forwarded to ``SUMMARY.TSLS_multi``.
    name_x, name_yend, name_q
        Variable names forwarded to ``_work``.
    """
    # One pool for all regimes.  Creating a pool inside the loop left
    # every pool but the last one unclosed.
    pool = mp.Pool(None) if cores else None

    results_p = {}
    for r in self.regimes_set:
        work_args = (self.y, x, w, regi_ids, r, yend, q, robust,
                     sig2n_k, self.name_ds, self.name_y, name_x,
                     name_yend, name_q, self.name_w, self.name_regimes)
        if pool is not None:
            results_p[r] = pool.apply_async(_work, args=work_args)
        else:
            results_p[r] = _work(*work_args)

    self.kryd = 0
    # NOTE(review): the +1 presumably accounts for a constant added
    # inside _work -- confirm.
    self.kr = x.shape[1] + yend.shape[1] + 1
    self.kf = 0
    self.nr = len(self.regimes_set)
    self.vm = np.zeros((self.nr * self.kr, self.nr * self.kr), float)
    self.betas = np.zeros((self.nr * self.kr, 1), float)
    self.u = np.zeros((self.n, 1), float)
    self.predy = np.zeros((self.n, 1), float)

    if pool is not None:
        pool.close()
        pool.join()

    results = {}
    self.name_y, self.name_x, self.name_yend = [], [], []
    self.name_q, self.name_z, self.name_h = [], [], []
    for counter, r in enumerate(self.regimes_set):
        if pool is None:
            results[r] = results_p[r]
        else:
            results[r] = results_p[r].get()

        # Each regime owns one kr-sized diagonal block / row slice.
        lo = counter * self.kr
        hi = (counter + 1) * self.kr
        self.vm[lo:hi, lo:hi] = results[r].vm
        self.betas[lo:hi, ] = results[r].betas
        self.u[regi_ids[r], ] = results[r].u
        self.predy[regi_ids[r], ] = results[r].predy
        self.name_y += results[r].name_y
        self.name_x += results[r].name_x
        self.name_yend += results[r].name_yend
        self.name_q += results[r].name_q
        self.name_z += results[r].name_z
        self.name_h += results[r].name_h

    self.multi = results
    self.hac_var = sphstack(x, q)
    if robust == 'hac':
        hac_multi(self, gwk)
    if robust == 'ogmm':
        set_warn(
            self, "Residuals treated as homoskedastic for the purpose of diagnostics.")
    self.chow = REGI.Chow(self)
    if spat_diag:
        self._get_spat_diag_props(results, regi_ids, x, yend, q)
    SUMMARY.TSLS_multi(
        reg=self, multireg=self.multi, vm=vm, spat_diag=spat_diag,
        regimes=True, w=w)
def _tsls_regimes_multi(self, x, yend, q, w, regi_ids, cores, gwk,
                        sig2n_k, robust, spat_diag, vm,
                        name_x, name_yend, name_q):
    """Estimate each regime separately with ``_work`` and merge the output.

    A truthy ``cores`` dispatches all regimes to one ``multiprocessing``
    pool with the default number of workers; a falsy ``cores`` runs
    them one after another in the calling process.  Results are merged
    into block-diagonal ``self.vm``, stacked ``self.betas``, and the
    ``regi_ids[r]`` rows of ``self.u`` and ``self.predy``.

    Parameters
    ----------
    x, yend, q : arrays
        Passed through to ``_work``; ``x`` and ``q`` also form
        ``self.hac_var``.
    w
        Passed to ``_work`` and ``SUMMARY.TSLS_multi``.
    regi_ids : mapping
        Per-regime row selector for ``u`` and ``predy``.
    cores : bool-like
        Truthy switches on the process pool; the value itself is not
        used as a worker count.
    gwk
        Passed to ``hac_multi`` when ``robust == 'hac'``.
    sig2n_k, robust
        Passed to ``_work``.
    spat_diag
        Truthy triggers ``self._get_spat_diag_props``; also passed to
        ``SUMMARY.TSLS_multi``.
    vm
        Passed to ``SUMMARY.TSLS_multi``.
    name_x, name_yend, name_q
        Variable names passed to ``_work``.
    """
    use_pool = bool(cores)
    # Create the pool once, before the loop: a pool per regime leaks
    # every pool except the last, and leaves ``pool`` unbound when
    # there are no regimes.
    pool = mp.Pool(None) if use_pool else None

    results_p = {}
    for r in self.regimes_set:
        job = (self.y, x, w, regi_ids, r, yend, q, robust, sig2n_k,
               self.name_ds, self.name_y, name_x, name_yend, name_q,
               self.name_w, self.name_regimes)
        if use_pool:
            results_p[r] = pool.apply_async(_work, args=job)
        else:
            results_p[r] = _work(*job)

    self.kryd = 0
    # NOTE(review): the +1 presumably accounts for a constant added
    # inside _work -- confirm.
    self.kr = x.shape[1] + yend.shape[1] + 1
    self.kf = 0
    self.nr = len(self.regimes_set)
    total = self.nr * self.kr
    self.vm = np.zeros((total, total), float)
    self.betas = np.zeros((total, 1), float)
    self.u = np.zeros((self.n, 1), float)
    self.predy = np.zeros((self.n, 1), float)

    if use_pool:
        pool.close()
        pool.join()

    results = {}
    self.name_y, self.name_x, self.name_yend = [], [], []
    self.name_q, self.name_z, self.name_h = [], [], []
    for counter, r in enumerate(self.regimes_set):
        results[r] = results_p[r].get() if use_pool else results_p[r]

        # Block of kr rows/columns that belongs to this regime.
        start = counter * self.kr
        stop = start + self.kr
        self.vm[start:stop, start:stop] = results[r].vm
        self.betas[start:stop, ] = results[r].betas
        self.u[regi_ids[r], ] = results[r].u
        self.predy[regi_ids[r], ] = results[r].predy
        self.name_y += results[r].name_y
        self.name_x += results[r].name_x
        self.name_yend += results[r].name_yend
        self.name_q += results[r].name_q
        self.name_z += results[r].name_z
        self.name_h += results[r].name_h

    self.multi = results
    self.hac_var = sphstack(x, q)
    if robust == 'hac':
        hac_multi(self, gwk)
    if robust == 'ogmm':
        set_warn(
            self,
            "Residuals treated as homoskedastic for the purpose of diagnostics."
        )
    self.chow = REGI.Chow(self)
    if spat_diag:
        self._get_spat_diag_props(results, regi_ids, x, yend, q)
    SUMMARY.TSLS_multi(reg=self, multireg=self.multi, vm=vm,
                       spat_diag=spat_diag, regimes=True, w=w)