def white(reg):
    """
    Calculates the White test to check for heteroscedasticity.

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (wh), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test. When the condition index of the
                      regression exceeds 30 the string
                      "Not computed due to multicollinearity." is
                      returned instead of a dictionary.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Raises
    ------
    Exception
        If the design matrix (or the augmented matrix built from it) is
        neither a numpy ndarray nor a scipy CSC/CSR sparse matrix.

    Notes
    -----
    x attribute in the reg object must have a constant term included. This
    is standard for spreg.OLS so no testing done to confirm constant.

    References
    ----------
    .. [1] H. White. 1980. A heteroskedasticity-consistent covariance
       matrix estimator and a direct test for heteroskedasticity.
       Econometrica. 48(4) 817-838.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value.

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u ** 2
    # Cast to plain ints so the sizes below are valid array dimensions.
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x

    # NOTE: X is assumed to already contain a constant (see Notes); the
    # earlier constant check / insertion (Greene 2003, pg. 222) is disabled.

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        white_result = "Not computed due to multicollinearity."
        return white_result

    # Compute cross-products and squares of the regression variables.
    # Integer division: array dimensions must be ints, not floats.
    n_cross = (k * (k + 1)) // 2
    x_type = type(X).__name__
    if x_type == 'ndarray':
        A = np.zeros((n, n_cross))
    elif x_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        A = SP.lil_matrix((n, n_cross))
    else:
        raise Exception("unknown X type, %s" % x_type)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitted = set()
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omitted.add(i)
        # do not allow duplicates: only later columns need comparing, and
        # the later column of a duplicated pair is the one dropped
        for j in range(i + 1, k):
            check = A[:, j]
            if abs(current - check).sum() == 0:
                omitted.add(j)
    omitcolumn = sorted(omitted)

    # Now the identified columns must be removed
    a_type = type(A).__name__
    if a_type == 'ndarray':
        A = np.delete(A, omitcolumn, 1)
    elif a_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        keepcolumn = [col for col in range(k) if col not in omitted]
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % a_type)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between this module and ols -- confirm before moving to module level.
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    # chi2.sf is the survival function, i.e. the upper-tail probability that
    # the removed stats.chisqprob used to return.
    pvalue = stats.chi2.sf(wh, df)
    white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
    return white_result
def white(reg):
    """
    Calculates the White test to check for heteroscedasticity. [White1980]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (wh), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test. When the condition index of the
                      regression exceeds 30 the string
                      "Not computed due to multicollinearity." is
                      returned instead of a dictionary.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Raises
    ------
    Exception
        If the design matrix (or the augmented matrix built from it) is
        neither a numpy ndarray nor a scipy CSC/CSR sparse matrix.

    Notes
    -----
    x attribute in the reg object must have a constant term included. This
    is standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> print(testresult['df'])
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value.

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u ** 2
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x

    # NOTE: X is assumed to already contain a constant (see Notes); the
    # earlier constant check / insertion (Greene 2003, pg. 222) is disabled.

    # Check for multicollinearity in the X matrix
    ci = condition_index(reg)
    if ci > 30:
        white_result = "Not computed due to multicollinearity."
        return white_result

    # Compute cross-products and squares of the regression variables
    n_cross = (k * (k + 1)) // 2
    x_type = type(X).__name__
    if x_type == 'ndarray':
        A = np.zeros((n, n_cross))
    elif x_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        A = SP.lil_matrix((n, n_cross))
    else:
        raise Exception("unknown X type, %s" % x_type)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitted = set()
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (will add a constant back later)
        if spmax(current) == spmin(current):
            omitted.add(i)
        # do not allow duplicates: only later columns need comparing, and
        # the later column of a duplicated pair is the one dropped
        for j in range(i + 1, k):
            check = A[:, j]
            if abs(current - check).sum() == 0:
                omitted.add(j)
    omitcolumn = sorted(omitted)

    # Now the identified columns must be removed
    a_type = type(A).__name__
    if a_type == 'ndarray':
        A = np.delete(A, omitcolumn, 1)
    elif a_type in ('csc_matrix', 'csr_matrix'):
        # this is probably inefficient
        keepcolumn = [col for col in range(k) if col not in omitted]
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % a_type)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Conduct the auxiliary regression and calculate the statistic
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between this module and ols -- confirm before moving to module level.
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    pvalue = chisqprob(wh, df)
    white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
    return white_result