def white(reg): """ Calculates the White test to check for heteroscedasticity. Parameters ---------- reg : regression object output instance from a regression model Returns ------- white_result : dictionary contains the statistic (white), degrees of freedom (df) and the associated p-value (pvalue) for the White test. white : float scalar value for the White test statistic. df : integer degrees of freedom associated with the test pvalue : float p-value associated with the statistic (chi^2 distributed with k df) Notes ----- x attribute in the reg object must have a constant term included. This is standard for spreg.OLS so no testing done to confirm constant. References ---------- .. [1] H. White. 1980. A heteroscedasticity-consistent covariance matrix estimator and a direct test for heteroskdasticity. Econometrica. 48(4) 817-838. Examples -------- >>> import numpy as np >>> import pysal >>> import diagnostics >>> from ols import OLS Read the DBF associated with the Columbus data. >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r") Create the dependent variable vector. >>> y = np.array(db.by_col("CRIME")) >>> y = np.reshape(y, (49,1)) Create the matrix of independent variables. >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("HOVAL")) >>> X = np.array(X).T Run an OLS regression. >>> reg = OLS(y,X) Calculate the White test for heteroscedasticity. >>> testresult = diagnostics.white(reg) Print the degrees of freedom for the test. >>> print testresult['df'] 5 Print the test statistic. >>> print("%1.3f"%testresult['wh']) 19.946 Print the associated p-value. >>> print("%1.4f"%testresult['pvalue']) 0.0013 """ e = reg.u ** 2 k = reg.k n = reg.n y = reg.y X = reg.x #constant = constant_check(X) # Check for constant, if none add one, see Greene 2003, pg. 222 # if constant == False: # X = np.hstack((np.ones((n,1)),X)) # Check for multicollinearity in the X matrix ci = condition_index(reg) if ci > 30: white_result = "Not computed due to multicollinearity." 
return white_result # Compute cross-products and squares of the regression variables if type(X).__name__ == 'ndarray': A = np.zeros((n, (k * (k + 1)) / 2.)) elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix': # this is probably inefficient A = SP.lil_matrix((n, (k * (k + 1)) / 2.)) else: raise Exception, "unknown X type, %s" % type(X).__name__ counter = 0 for i in range(k): for j in range(i, k): v = spmultiply(X[:, i], X[:, j], False) A[:, counter] = v counter += 1 # Append the original variables A = sphstack(X, A) # note: this also converts a LIL to CSR n, k = A.shape # Check to identify any duplicate or constant columns in A omitcolumn = [] for i in range(k): current = A[:, i] # remove all constant terms (will add a constant back later) if spmax(current) == spmin(current): omitcolumn.append(i) pass # do not allow duplicates for j in range(k): check = A[:, j] if i < j: test = abs(current - check).sum() if test == 0: omitcolumn.append(j) uniqueomit = set(omitcolumn) omitcolumn = list(uniqueomit) # Now the identified columns must be removed if type(A).__name__ == 'ndarray': A = np.delete(A, omitcolumn, 1) elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix': # this is probably inefficient keepcolumn = range(k) for i in omitcolumn: keepcolumn.remove(i) A = A[:, keepcolumn] else: raise Exception, "unknown A type, %s" % type(X).__name__ A = sphstack(np.ones((A.shape[0], 1)), A) # add a constant back in n, k = A.shape # Conduct the auxiliary regression and calculate the statistic import ols as OLS aux_reg = OLS.BaseOLS(e, A) aux_r2 = r2(aux_reg) wh = aux_r2 * n df = k - 1 pvalue = stats.chisqprob(wh, df) white_result = {'df': df, 'wh': wh, 'pvalue': pvalue} return white_result
def breusch_pagan(reg, z=None): """ Calculates the Breusch-Pagan test statistic to check for heteroscedasticity. Parameters ---------- reg : regression object output instance from a regression model z : array optional input for specifying an alternative set of variables (Z) to explain the observed variance. By default this is a matrix of the squared explanatory variables (X**2) with a constant added to the first column if not already present. In the default case, the explanatory variables are squared to eliminate negative values. Returns ------- bp_result : dictionary contains the statistic (bp) for the test and the associated p-value (p-value) bp : float scalar value for the Breusch-Pagan test statistic df : integer degrees of freedom associated with the test (k) pvalue : float p-value associated with the statistic (chi^2 distributed with k df) Notes ----- x attribute in the reg object must have a constant term included. This is standard for spreg.OLS so no testing done to confirm constant. References ---------- .. [1] T. Breusch and A. Pagan. 1979. A simple test for heteroscedasticity and random coefficient variation. Econometrica: Journal of the Econometric Society, 47(5):1287-1294. Examples -------- >>> import numpy as np >>> import pysal >>> import diagnostics >>> from ols import OLS Read the DBF associated with the Columbus data. >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"), "r") Create the dependent variable vector. >>> y = np.array(db.by_col("CRIME")) >>> y = np.reshape(y, (49,1)) Create the matrix of independent variables. >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("HOVAL")) >>> X = np.array(X).T Run an OLS regression. >>> reg = OLS(y,X) Calculate the Breusch-Pagan test for heteroscedasticity. >>> testresult = diagnostics.breusch_pagan(reg) Print the degrees of freedom for the test. >>> testresult['df'] 2 Print the test statistic. >>> print("%1.3f"%testresult['bp']) 7.900 Print the associated p-value. >>> print("%1.4f"%testresult['pvalue']) 0.0193 """ e2 = reg.u ** 2 e = reg.u n = reg.n k = reg.k ete = reg.utu den = ete / n g = e2 / den - 1.0 if z == None: x = reg.x #constant = constant_check(x) # if constant == False: # z = np.hstack((np.ones((n,1)),x))**2 # else: # z = x**2 z = spmultiply(x, x) else: #constant = constant_check(z) # if constant == False: # z = np.hstack((np.ones((n,1)),z)) pass n, p = z.shape # Check to identify any duplicate columns in Z omitcolumn = [] for i in range(p): current = z[:, i] for j in range(p): check = z[:, j] if i < j: test = abs(current - check).sum() if test == 0: omitcolumn.append(j) uniqueomit = set(omitcolumn) omitcolumn = list(uniqueomit) # Now the identified columns must be removed (done in reverse to # prevent renumbering) omitcolumn.sort() omitcolumn.reverse() for c in omitcolumn: z = np.delete(z, c, 1) n, p = z.shape df = p - 1 # Now that the variables are prepared, we calculate the statistic zt = np.transpose(z) gt = np.transpose(g) gtz = np.dot(gt, z) ztg = np.dot(zt, g) ztz = np.dot(zt, z) ztzi = la.inv(ztz) part1 = np.dot(gtz, ztzi) part2 = np.dot(part1, ztg) bp_array = 0.5 * part2 bp = bp_array[0, 0] pvalue = stats.chisqprob(bp, df) bp_result = {'df': df, 'bp': bp, 'pvalue': pvalue} return bp_result
def koenker_bassett(reg, z=None): """ Calculates the Koenker-Bassett test statistic to check for heteroscedasticity. Parameters ---------- reg : regression output output from an instance of a regression class z : array optional input for specifying an alternative set of variables (Z) to explain the observed variance. By default this is a matrix of the squared explanatory variables (X**2) with a constant added to the first column if not already present. In the default case, the explanatory variables are squared to eliminate negative values. Returns ------- kb_result : dictionary contains the statistic (kb), degrees of freedom (df) and the associated p-value (pvalue) for the test. kb : float scalar value for the Koenker-Bassett test statistic. df : integer degrees of freedom associated with the test pvalue : float p-value associated with the statistic (chi^2 distributed) Notes ----- x attribute in the reg object must have a constant term included. This is standard for spreg.OLS so no testing done to confirm constant. References ---------- .. [1] R. Koenker and G. Bassett. 1982. Robust tests for heteroscedasticity based on regression quantiles. Econometrica, 50(1):43-61. .. [2] W. Greene. 2003. Econometric Analysis. Prentice Hall, Upper Saddle River. Examples -------- >>> import numpy as np >>> import pysal >>> import diagnostics >>> from ols import OLS Read the DBF associated with the Columbus data. >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r") Create the dependent variable vector. >>> y = np.array(db.by_col("CRIME")) >>> y = np.reshape(y, (49,1)) Create the matrix of independent variables. >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("HOVAL")) >>> X = np.array(X).T Run an OLS regression. >>> reg = OLS(y,X) Calculate the Koenker-Bassett test for heteroscedasticity. >>> testresult = diagnostics.koenker_bassett(reg) Print the degrees of freedom for the test. >>> testresult['df'] 2 Print the test statistic. >>> print("%1.3f"%testresult['kb']) 5.694 Print the associated p-value. >>> print("%1.4f"%testresult['pvalue']) 0.0580 """ # The notation here matches that of Greene (2003). u = reg.u ** 2 e = reg.u n = reg.n k = reg.k x = reg.x ete = reg.utu #constant = constant_check(x) ubar = ete / n ubari = ubar * np.ones((n, 1)) g = u - ubari v = (1.0 / n) * np.sum((u - ubar) ** 2) if z == None: x = reg.x #constant = constant_check(x) # if constant == False: # z = np.hstack((np.ones((n,1)),x))**2 # else: # z = x**2 z = spmultiply(x, x) else: #constant = constant_check(z) # if constant == False: # z = np.hstack((np.ones((n,1)),z)) pass n, p = z.shape # Check to identify any duplicate columns in Z omitcolumn = [] for i in range(p): current = z[:, i] for j in range(p): check = z[:, j] if i < j: test = abs(current - check).sum() if test == 0: omitcolumn.append(j) uniqueomit = set(omitcolumn) omitcolumn = list(uniqueomit) # Now the identified columns must be removed (done in reverse to # prevent renumbering) omitcolumn.sort() omitcolumn.reverse() for c in omitcolumn: z = np.delete(z, c, 1) n, p = z.shape df = p - 1 # Conduct the auxiliary regression. zt = np.transpose(z) gt = np.transpose(g) gtz = np.dot(gt, z) ztg = np.dot(zt, g) ztz = np.dot(zt, z) ztzi = la.inv(ztz) part1 = np.dot(gtz, ztzi) part2 = np.dot(part1, ztg) kb_array = (1.0 / v) * part2 kb = kb_array[0, 0] pvalue = stats.chisqprob(kb, df) kb_result = {'kb': kb, 'df': df, 'pvalue': pvalue} return kb_result
def koenker_bassett(reg, z=None): """ Calculates the Koenker-Bassett test statistic to check for heteroscedasticity. [Koenker1982]_ [Greene2003]_ Parameters ---------- reg : regression output output from an instance of a regression class z : array optional input for specifying an alternative set of variables (Z) to explain the observed variance. By default this is a matrix of the squared explanatory variables (X**2) with a constant added to the first column if not already present. In the default case, the explanatory variables are squared to eliminate negative values. Returns ------- kb_result : dictionary contains the statistic (kb), degrees of freedom (df) and the associated p-value (pvalue) for the test. kb : float scalar value for the Koenker-Bassett test statistic. df : integer degrees of freedom associated with the test pvalue : float p-value associated with the statistic (chi^2 distributed) Notes ----- x attribute in the reg object must have a constant term included. This is standard for spreg.OLS so no testing done to confirm constant. Examples -------- >>> import numpy as np >>> import pysal >>> import diagnostics >>> from ols import OLS Read the DBF associated with the Columbus data. >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r") Create the dependent variable vector. >>> y = np.array(db.by_col("CRIME")) >>> y = np.reshape(y, (49,1)) Create the matrix of independent variables. >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("HOVAL")) >>> X = np.array(X).T Run an OLS regression. >>> reg = OLS(y,X) Calculate the Koenker-Bassett test for heteroscedasticity. >>> testresult = diagnostics.koenker_bassett(reg) Print the degrees of freedom for the test. >>> testresult['df'] 2 Print the test statistic. >>> print("%1.3f"%testresult['kb']) 5.694 Print the associated p-value. >>> print("%1.4f"%testresult['pvalue']) 0.0580 """ # The notation here matches that of Greene (2003). u = reg.u**2 e = reg.u n = reg.n k = reg.k x = reg.x ete = reg.utu #constant = constant_check(x) ubar = ete / n ubari = ubar * np.ones((n, 1)) g = u - ubari v = (1.0 / n) * np.sum((u - ubar)**2) if z == None: x = reg.x #constant = constant_check(x) # if constant == False: # z = np.hstack((np.ones((n,1)),x))**2 # else: # z = x**2 z = spmultiply(x, x) else: #constant = constant_check(z) # if constant == False: # z = np.hstack((np.ones((n,1)),z)) pass n, p = z.shape # Check to identify any duplicate columns in Z omitcolumn = [] for i in range(p): current = z[:, i] for j in range(p): check = z[:, j] if i < j: test = abs(current - check).sum() if test == 0: omitcolumn.append(j) uniqueomit = set(omitcolumn) omitcolumn = list(uniqueomit) # Now the identified columns must be removed (done in reverse to # prevent renumbering) omitcolumn.sort() omitcolumn.reverse() for c in omitcolumn: z = np.delete(z, c, 1) n, p = z.shape df = p - 1 # Conduct the auxiliary regression. zt = np.transpose(z) gt = np.transpose(g) gtz = np.dot(gt, z) ztg = np.dot(zt, g) ztz = np.dot(zt, z) ztzi = la.inv(ztz) part1 = np.dot(gtz, ztzi) part2 = np.dot(part1, ztg) kb_array = (1.0 / v) * part2 kb = kb_array[0, 0] pvalue = chisqprob(kb, df) kb_result = {'kb': kb, 'df': df, 'pvalue': pvalue} return kb_result
def white(reg): """ Calculates the White test to check for heteroscedasticity. [White1980]_ Parameters ---------- reg : regression object output instance from a regression model Returns ------- white_result : dictionary contains the statistic (white), degrees of freedom (df) and the associated p-value (pvalue) for the White test. white : float scalar value for the White test statistic. df : integer degrees of freedom associated with the test pvalue : float p-value associated with the statistic (chi^2 distributed with k df) Notes ----- x attribute in the reg object must have a constant term included. This is standard for spreg.OLS so no testing done to confirm constant. Examples -------- >>> import numpy as np >>> import pysal >>> import diagnostics >>> from ols import OLS Read the DBF associated with the Columbus data. >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r") Create the dependent variable vector. >>> y = np.array(db.by_col("CRIME")) >>> y = np.reshape(y, (49,1)) Create the matrix of independent variables. >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("HOVAL")) >>> X = np.array(X).T Run an OLS regression. >>> reg = OLS(y,X) Calculate the White test for heteroscedasticity. >>> testresult = diagnostics.white(reg) Print the degrees of freedom for the test. >>> print testresult['df'] 5 Print the test statistic. >>> print("%1.3f"%testresult['wh']) 19.946 Print the associated p-value. >>> print("%1.4f"%testresult['pvalue']) 0.0013 """ e = reg.u**2 k = int(reg.k) n = int(reg.n) y = reg.y X = reg.x #constant = constant_check(X) # Check for constant, if none add one, see Greene 2003, pg. 222 # if constant == False: # X = np.hstack((np.ones((n,1)),X)) # Check for multicollinearity in the X matrix ci = condition_index(reg) if ci > 30: white_result = "Not computed due to multicollinearity." 
return white_result # Compute cross-products and squares of the regression variables if type(X).__name__ == 'ndarray': A = np.zeros((n, (k * (k + 1)) // 2)) elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix': # this is probably inefficient A = SP.lil_matrix((n, (k * (k + 1)) // 2)) else: raise Exception, "unknown X type, %s" % type(X).__name__ counter = 0 for i in range(k): for j in range(i, k): v = spmultiply(X[:, i], X[:, j], False) A[:, counter] = v counter += 1 # Append the original variables A = sphstack(X, A) # note: this also converts a LIL to CSR n, k = A.shape # Check to identify any duplicate or constant columns in A omitcolumn = [] for i in range(k): current = A[:, i] # remove all constant terms (will add a constant back later) if spmax(current) == spmin(current): omitcolumn.append(i) pass # do not allow duplicates for j in range(k): check = A[:, j] if i < j: test = abs(current - check).sum() if test == 0: omitcolumn.append(j) uniqueomit = set(omitcolumn) omitcolumn = list(uniqueomit) # Now the identified columns must be removed if type(A).__name__ == 'ndarray': A = np.delete(A, omitcolumn, 1) elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix': # this is probably inefficient keepcolumn = range(k) for i in omitcolumn: keepcolumn.remove(i) A = A[:, keepcolumn] else: raise Exception, "unknown A type, %s" % type(X).__name__ A = sphstack(np.ones((A.shape[0], 1)), A) # add a constant back in n, k = A.shape # Conduct the auxiliary regression and calculate the statistic import ols as OLS aux_reg = OLS.BaseOLS(e, A) aux_r2 = r2(aux_reg) wh = aux_r2 * n df = k - 1 pvalue = chisqprob(wh, df) white_result = {'df': df, 'wh': wh, 'pvalue': pvalue} return white_result
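
# The White statistic is n * R^2 from an auxiliary regression of the squared
# residuals on the regressors plus their squares and cross-products. The
# sketch below is a hypothetical helper for dense inputs only, not part of
# this module's API; it assumes ``x`` carries a constant column and skips the
# condition-index screen used above. Column order does not matter, since only
# the auxiliary R^2 is needed.
def _white_sketch(u, x):
    """Illustrative only: return (wh, pvalue) for residuals u and design x."""
    import numpy as np
    from scipy import stats
    n, k = x.shape
    # levels plus all squares and cross-products of the regressors
    cols = [x[:, i] * x[:, j] for i in range(k) for j in range(i, k)]
    A = np.column_stack([x] + cols)
    # drop constant and duplicate columns, then prepend a single constant
    A = np.unique(A[:, np.ptp(A, axis=0) > 0], axis=1)
    A = np.hstack((np.ones((n, 1)), A))
    # auxiliary regression of the squared residuals on A
    e2 = u ** 2
    beta = np.linalg.lstsq(A, e2, rcond=None)[0]
    resid = e2 - A @ beta
    r2_aux = 1.0 - (resid ** 2).sum() / ((e2 - e2.mean()) ** 2).sum()
    wh = n * r2_aux
    df = A.shape[1] - 1
    return wh, stats.chi2.sf(wh, df)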
def breusch_pagan(reg, z=None): """ Calculates the Breusch-Pagan test statistic to check for heteroscedasticity. [Breusch1979]_ Parameters ---------- reg : regression object output instance from a regression model z : array optional input for specifying an alternative set of variables (Z) to explain the observed variance. By default this is a matrix of the squared explanatory variables (X**2) with a constant added to the first column if not already present. In the default case, the explanatory variables are squared to eliminate negative values. Returns ------- bp_result : dictionary contains the statistic (bp) for the test and the associated p-value (p-value) bp : float scalar value for the Breusch-Pagan test statistic df : integer degrees of freedom associated with the test (k) pvalue : float p-value associated with the statistic (chi^2 distributed with k df) Notes ----- x attribute in the reg object must have a constant term included. This is standard for spreg.OLS so no testing done to confirm constant. Examples -------- >>> import numpy as np >>> import pysal >>> import diagnostics >>> from ols import OLS Read the DBF associated with the Columbus data. >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"), "r") Create the dependent variable vector. >>> y = np.array(db.by_col("CRIME")) >>> y = np.reshape(y, (49,1)) Create the matrix of independent variables. >>> X = [] >>> X.append(db.by_col("INC")) >>> X.append(db.by_col("HOVAL")) >>> X = np.array(X).T Run an OLS regression. >>> reg = OLS(y,X) Calculate the Breusch-Pagan test for heteroscedasticity. >>> testresult = diagnostics.breusch_pagan(reg) Print the degrees of freedom for the test. >>> testresult['df'] 2 Print the test statistic. >>> print("%1.3f"%testresult['bp']) 7.900 Print the associated p-value. >>> print("%1.4f"%testresult['pvalue']) 0.0193 """ e2 = reg.u**2 e = reg.u n = reg.n k = reg.k ete = reg.utu den = ete / n g = e2 / den - 1.0 if z == None: x = reg.x #constant = constant_check(x) # if constant == False: # z = np.hstack((np.ones((n,1)),x))**2 # else: # z = x**2 z = spmultiply(x, x) else: #constant = constant_check(z) # if constant == False: # z = np.hstack((np.ones((n,1)),z)) pass n, p = z.shape # Check to identify any duplicate columns in Z omitcolumn = [] for i in range(p): current = z[:, i] for j in range(p): check = z[:, j] if i < j: test = abs(current - check).sum() if test == 0: omitcolumn.append(j) uniqueomit = set(omitcolumn) omitcolumn = list(uniqueomit) # Now the identified columns must be removed (done in reverse to # prevent renumbering) omitcolumn.sort() omitcolumn.reverse() for c in omitcolumn: z = np.delete(z, c, 1) n, p = z.shape df = p - 1 # Now that the variables are prepared, we calculate the statistic zt = np.transpose(z) gt = np.transpose(g) gtz = np.dot(gt, z) ztg = np.dot(zt, g) ztz = np.dot(zt, z) ztzi = la.inv(ztz) part1 = np.dot(gtz, ztzi) part2 = np.dot(part1, ztg) bp_array = 0.5 * part2 bp = bp_array[0, 0] pvalue = chisqprob(bp, df) bp_result = {'df': df, 'bp': bp, 'pvalue': pvalue} return bp_result