def white(reg):
    """
    Calculates the White test to check for heteroscedasticity.

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (wh), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test; a string is returned instead when the
                      test cannot be computed due to multicollinearity.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    References
    ----------
    .. [1] H. White. 1980. A heteroskedasticity-consistent covariance
       matrix estimator and a direct test for heteroskedasticity.
       Econometrica. 48(4) 817-838. 

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u ** 2
    # Cast to int so the auxiliary-matrix dimensions below are integral.
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x

    # The auxiliary regression is unreliable when X is near-singular, so
    # skip the test entirely under severe multicollinearity.
    ci = condition_index(reg)
    if ci > 30:
        white_result = "Not computed due to multicollinearity."
        return white_result

    # Compute cross-products and squares of the regression variables:
    # there are k*(k+1)/2 distinct products X_i * X_j with j >= i.
    # NOTE: integer division (//) is required here -- np.zeros and
    # SP.lil_matrix reject a float dimension.
    if type(X).__name__ == 'ndarray':
        A = np.zeros((n, (k * (k + 1)) // 2))
    elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix':
        # this is probably inefficient
        A = SP.lil_matrix((n, (k * (k + 1)) // 2))
    else:
        raise Exception("unknown X type, %s" % type(X).__name__)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)   # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitcolumn = []
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (a constant is added back later)
        if spmax(current) == spmin(current):
            omitcolumn.append(i)
        # do not allow duplicates
        for j in range(k):
            if i < j:
                check = A[:, j]
                if abs(current - check).sum() == 0:
                    omitcolumn.append(j)
    omitcolumn = list(set(omitcolumn))

    # Now the identified columns must be removed
    if type(A).__name__ == 'ndarray':
        A = np.delete(A, omitcolumn, 1)
    elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix':
        # sparse matrices do not support np.delete, so keep the
        # complement of the omitted columns (probably inefficient)
        keepcolumn = list(range(k))
        for i in omitcolumn:
            keepcolumn.remove(i)
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % type(X).__name__)
    A = sphstack(np.ones((A.shape[0], 1)), A)   # add a constant back in
    n, k = A.shape

    # Auxiliary regression of e^2 on A; the statistic is n * R^2,
    # chi^2 distributed with k-1 degrees of freedom.
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    # stats.chisqprob was removed in SciPy 1.0; chi2.sf is equivalent.
    pvalue = stats.chi2.sf(wh, df)
    white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
    return white_result
def breusch_pagan(reg, z=None):
    """
    Calculates the Breusch-Pagan test statistic to check for
    heteroscedasticity. 

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model
    z               : array
                      optional input for specifying an alternative set of
                      variables (Z) to explain the observed variance. By
                      default this is a matrix of the squared explanatory
                      variables (X**2) with a constant added to the first
                      column if not already present. In the default case,
                      the explanatory variables are squared to eliminate
                      negative values. 

    Returns
    -------
    bp_result       : dictionary
                      contains the statistic (bp) for the test and the
                      associated p-value (p-value)
    bp              : float
                      scalar value for the Breusch-Pagan test statistic
    df              : integer
                      degrees of freedom associated with the test (k)
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with k df)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    References
    ----------

    .. [1] T. Breusch and A. Pagan. 1979. A simple test for
       heteroscedasticity and random coefficient variation. Econometrica:
       Journal of the Econometric Society, 47(5):1287-1294.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"), "r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Breusch-Pagan test for heteroscedasticity.

    >>> testresult = diagnostics.breusch_pagan(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic.

    >>> print("%1.3f"%testresult['bp'])
    7.900

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0193

    """
    e2 = reg.u ** 2
    n = reg.n
    ete = reg.utu

    # Normalized squared residuals: g_i = e_i^2 / (e'e / n) - 1.
    # float(n) guards against integer truncation on Python 2.
    den = ete / float(n)
    g = e2 / den - 1.0

    # 'is None' rather than '== None': z may be an ndarray, for which
    # == None is an elementwise comparison with an ambiguous truth value.
    if z is None:
        # Default to the squared explanatory variables.
        x = reg.x
        z = spmultiply(x, x)

    n, p = z.shape

    # Check to identify any duplicate columns in Z
    omitcolumn = []
    for i in range(p):
        current = z[:, i]
        for j in range(p):
            if i < j:
                check = z[:, j]
                if abs(current - check).sum() == 0:
                    omitcolumn.append(j)
    omitcolumn = list(set(omitcolumn))

    # Remove the identified columns (in reverse to prevent renumbering)
    for c in sorted(omitcolumn, reverse=True):
        z = np.delete(z, c, 1)
    n, p = z.shape

    df = p - 1

    # LM statistic: 0.5 * g'Z (Z'Z)^-1 Z'g, chi^2 with p-1 df.
    zt = np.transpose(z)
    gt = np.transpose(g)
    gtz = np.dot(gt, z)
    ztg = np.dot(zt, g)
    ztz = np.dot(zt, z)
    ztzi = la.inv(ztz)

    part1 = np.dot(gtz, ztzi)
    part2 = np.dot(part1, ztg)
    bp_array = 0.5 * part2
    bp = bp_array[0, 0]

    # stats.chisqprob was removed in SciPy 1.0; chi2.sf is equivalent.
    pvalue = stats.chi2.sf(bp, df)
    bp_result = {'df': df, 'bp': bp, 'pvalue': pvalue}
    return bp_result
def koenker_bassett(reg, z=None):
    """
    Calculates the Koenker-Bassett test statistic to check for
    heteroscedasticity. 

    Parameters
    ----------
    reg             : regression output
                      output from an instance of a regression class
    z               : array
                      optional input for specifying an alternative set of
                      variables (Z) to explain the observed variance. By
                      default this is a matrix of the squared explanatory
                      variables (X**2) with a constant added to the first
                      column if not already present. In the default case,
                      the explanatory variables are squared to eliminate
                      negative values. 

    Returns
    -------
    kb_result       : dictionary
                      contains the statistic (kb), degrees of freedom (df)
                      and the associated p-value (pvalue) for the test. 
    kb              : float
                      scalar value for the Koenker-Bassett test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    References
    ----------
    .. [1] R. Koenker and G. Bassett. 1982. Robust tests for
       heteroscedasticity based on regression quantiles. Econometrica,
       50(1):43-61. 
    .. [2] W. Greene. 2003. Econometric Analysis. Prentice Hall, Upper
       Saddle River. 

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Koenker-Bassett test for heteroscedasticity.

    >>> testresult = diagnostics.koenker_bassett(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic.

    >>> print("%1.3f"%testresult['kb'])
    5.694

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0580

    """
    # The notation here matches that of Greene (2003).
    u = reg.u ** 2
    n = reg.n
    ete = reg.utu

    # Mean squared residual; float(n) guards against integer truncation
    # on Python 2.
    ubar = ete / float(n)
    ubari = ubar * np.ones((n, 1))
    # Deviations of the squared residuals from their mean, and their
    # sample variance (the KB test replaces BP's variance assumption
    # with this robust estimate).
    g = u - ubari
    v = (1.0 / n) * np.sum((u - ubar) ** 2)

    # 'is None' rather than '== None': z may be an ndarray, for which
    # == None is an elementwise comparison with an ambiguous truth value.
    if z is None:
        # Default to the squared explanatory variables.
        x = reg.x
        z = spmultiply(x, x)

    n, p = z.shape

    # Check to identify any duplicate columns in Z
    omitcolumn = []
    for i in range(p):
        current = z[:, i]
        for j in range(p):
            if i < j:
                check = z[:, j]
                if abs(current - check).sum() == 0:
                    omitcolumn.append(j)
    omitcolumn = list(set(omitcolumn))

    # Remove the identified columns (in reverse to prevent renumbering)
    for c in sorted(omitcolumn, reverse=True):
        z = np.delete(z, c, 1)
    n, p = z.shape

    df = p - 1

    # Auxiliary form: (1/v) * g'Z (Z'Z)^-1 Z'g, chi^2 with p-1 df.
    zt = np.transpose(z)
    gt = np.transpose(g)
    gtz = np.dot(gt, z)
    ztg = np.dot(zt, g)
    ztz = np.dot(zt, z)
    ztzi = la.inv(ztz)

    part1 = np.dot(gtz, ztzi)
    part2 = np.dot(part1, ztg)
    kb_array = (1.0 / v) * part2
    kb = kb_array[0, 0]

    # stats.chisqprob was removed in SciPy 1.0; chi2.sf is equivalent.
    pvalue = stats.chi2.sf(kb, df)
    kb_result = {'kb': kb, 'df': df, 'pvalue': pvalue}
    return kb_result
# Example #4 (score: 0)
def koenker_bassett(reg, z=None):
    """
    Calculates the Koenker-Bassett test statistic to check for
    heteroscedasticity. [Koenker1982]_ [Greene2003]_

    Parameters
    ----------
    reg             : regression output
                      output from an instance of a regression class
    z               : array
                      optional input for specifying an alternative set of
                      variables (Z) to explain the observed variance. By
                      default this is a matrix of the squared explanatory
                      variables (X**2) with a constant added to the first
                      column if not already present. In the default case,
                      the explanatory variables are squared to eliminate
                      negative values. 

    Returns
    -------
    kb_result       : dictionary
                      contains the statistic (kb), degrees of freedom (df)
                      and the associated p-value (pvalue) for the test. 
    kb              : float
                      scalar value for the Koenker-Bassett test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Koenker-Bassett test for heteroscedasticity.

    >>> testresult = diagnostics.koenker_bassett(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic.

    >>> print("%1.3f"%testresult['kb'])
    5.694

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0580

    """
    # The notation here matches that of Greene (2003).
    u = reg.u**2
    n = reg.n
    ete = reg.utu

    # Mean squared residual; float(n) guards against integer truncation
    # on Python 2.
    ubar = ete / float(n)
    ubari = ubar * np.ones((n, 1))
    # Deviations of the squared residuals from their mean, and their
    # sample variance (the KB test replaces BP's variance assumption
    # with this robust estimate).
    g = u - ubari
    v = (1.0 / n) * np.sum((u - ubar)**2)

    # 'is None' rather than '== None': z may be an ndarray, for which
    # == None is an elementwise comparison with an ambiguous truth value.
    if z is None:
        # Default to the squared explanatory variables.
        x = reg.x
        z = spmultiply(x, x)

    n, p = z.shape

    # Check to identify any duplicate columns in Z
    omitcolumn = []
    for i in range(p):
        current = z[:, i]
        for j in range(p):
            if i < j:
                check = z[:, j]
                if abs(current - check).sum() == 0:
                    omitcolumn.append(j)
    omitcolumn = list(set(omitcolumn))

    # Remove the identified columns (in reverse to prevent renumbering)
    for c in sorted(omitcolumn, reverse=True):
        z = np.delete(z, c, 1)
    n, p = z.shape

    df = p - 1

    # Auxiliary form: (1/v) * g'Z (Z'Z)^-1 Z'g, chi^2 with p-1 df.
    zt = np.transpose(z)
    gt = np.transpose(g)
    gtz = np.dot(gt, z)
    ztg = np.dot(zt, g)
    ztz = np.dot(zt, z)
    ztzi = la.inv(ztz)

    part1 = np.dot(gtz, ztzi)
    part2 = np.dot(part1, ztg)
    kb_array = (1.0 / v) * part2
    kb = kb_array[0, 0]

    # chisqprob was removed from SciPy 1.0; chi2.sf is equivalent.
    pvalue = stats.chi2.sf(kb, df)
    kb_result = {'kb': kb, 'df': df, 'pvalue': pvalue}
    return kb_result
# Example #5 (score: 0)
def white(reg):
    """
    Calculates the White test to check for heteroscedasticity. [White1980]_

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model

    Returns
    -------
    white_result    : dictionary
                      contains the statistic (wh), degrees of freedom
                      (df) and the associated p-value (pvalue) for the
                      White test; a string is returned instead when the
                      test cannot be computed due to multicollinearity.
    wh              : float
                      scalar value for the White test statistic.
    df              : integer
                      degrees of freedom associated with the test
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with df degrees of freedom)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),"r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the White test for heteroscedasticity.

    >>> testresult = diagnostics.white(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    5

    Print the test statistic.

    >>> print("%1.3f"%testresult['wh'])
    19.946

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0013

    """
    e = reg.u**2
    # Cast to int so the auxiliary-matrix dimensions below are integral.
    k = int(reg.k)
    n = int(reg.n)
    X = reg.x

    # The auxiliary regression is unreliable when X is near-singular, so
    # skip the test entirely under severe multicollinearity.
    ci = condition_index(reg)
    if ci > 30:
        white_result = "Not computed due to multicollinearity."
        return white_result

    # Compute cross-products and squares of the regression variables:
    # there are k*(k+1)/2 distinct products X_i * X_j with j >= i.
    if type(X).__name__ == 'ndarray':
        A = np.zeros((n, (k * (k + 1)) // 2))
    elif type(X).__name__ == 'csc_matrix' or type(X).__name__ == 'csr_matrix':
        # this is probably inefficient
        A = SP.lil_matrix((n, (k * (k + 1)) // 2))
    else:
        raise Exception("unknown X type, %s" % type(X).__name__)
    counter = 0
    for i in range(k):
        for j in range(i, k):
            v = spmultiply(X[:, i], X[:, j], False)
            A[:, counter] = v
            counter += 1

    # Append the original variables
    A = sphstack(X, A)  # note: this also converts a LIL to CSR
    n, k = A.shape

    # Check to identify any duplicate or constant columns in A
    omitcolumn = []
    for i in range(k):
        current = A[:, i]
        # remove all constant terms (a constant is added back later)
        if spmax(current) == spmin(current):
            omitcolumn.append(i)
        # do not allow duplicates
        for j in range(k):
            if i < j:
                check = A[:, j]
                if abs(current - check).sum() == 0:
                    omitcolumn.append(j)
    omitcolumn = list(set(omitcolumn))

    # Now the identified columns must be removed
    if type(A).__name__ == 'ndarray':
        A = np.delete(A, omitcolumn, 1)
    elif type(A).__name__ == 'csc_matrix' or type(A).__name__ == 'csr_matrix':
        # sparse matrices do not support np.delete, so keep the
        # complement of the omitted columns (probably inefficient)
        keepcolumn = list(range(k))
        for i in omitcolumn:
            keepcolumn.remove(i)
        A = A[:, keepcolumn]
    else:
        raise Exception("unknown A type, %s" % type(X).__name__)
    A = sphstack(np.ones((A.shape[0], 1)), A)  # add a constant back in
    n, k = A.shape

    # Auxiliary regression of e^2 on A; the statistic is n * R^2,
    # chi^2 distributed with k-1 degrees of freedom.
    import ols as OLS
    aux_reg = OLS.BaseOLS(e, A)
    aux_r2 = r2(aux_reg)
    wh = aux_r2 * n
    df = k - 1
    # chisqprob was removed from SciPy 1.0; chi2.sf is equivalent.
    pvalue = stats.chi2.sf(wh, df)
    white_result = {'df': df, 'wh': wh, 'pvalue': pvalue}
    return white_result
# Example #6 (score: 0)
def breusch_pagan(reg, z=None):
    """
    Calculates the Breusch-Pagan test statistic to check for
    heteroscedasticity. [Breusch1979]_ 

    Parameters
    ----------
    reg             : regression object
                      output instance from a regression model
    z               : array
                      optional input for specifying an alternative set of
                      variables (Z) to explain the observed variance. By
                      default this is a matrix of the squared explanatory
                      variables (X**2) with a constant added to the first
                      column if not already present. In the default case,
                      the explanatory variables are squared to eliminate
                      negative values. 

    Returns
    -------
    bp_result       : dictionary
                      contains the statistic (bp) for the test and the
                      associated p-value (p-value)
    bp              : float
                      scalar value for the Breusch-Pagan test statistic
    df              : integer
                      degrees of freedom associated with the test (k)
    pvalue          : float
                      p-value associated with the statistic (chi^2
                      distributed with k df)

    Notes
    -----
    x attribute in the reg object must have a constant term included. This is
    standard for spreg.OLS so no testing done to confirm constant.

    Examples
    --------
    >>> import numpy as np
    >>> import pysal
    >>> import diagnostics
    >>> from ols import OLS

    Read the DBF associated with the Columbus data.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"), "r")

    Create the dependent variable vector. 

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Create the matrix of independent variables. 

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X.append(db.by_col("HOVAL"))
    >>> X = np.array(X).T

    Run an OLS regression.

    >>> reg = OLS(y,X)

    Calculate the Breusch-Pagan test for heteroscedasticity.

    >>> testresult = diagnostics.breusch_pagan(reg)

    Print the degrees of freedom for the test.

    >>> testresult['df']
    2

    Print the test statistic.

    >>> print("%1.3f"%testresult['bp'])
    7.900

    Print the associated p-value. 

    >>> print("%1.4f"%testresult['pvalue'])
    0.0193

    """
    e2 = reg.u**2
    n = reg.n
    ete = reg.utu

    # Normalized squared residuals: g_i = e_i^2 / (e'e / n) - 1.
    # float(n) guards against integer truncation on Python 2.
    den = ete / float(n)
    g = e2 / den - 1.0

    # 'is None' rather than '== None': z may be an ndarray, for which
    # == None is an elementwise comparison with an ambiguous truth value.
    if z is None:
        # Default to the squared explanatory variables.
        x = reg.x
        z = spmultiply(x, x)

    n, p = z.shape

    # Check to identify any duplicate columns in Z
    omitcolumn = []
    for i in range(p):
        current = z[:, i]
        for j in range(p):
            if i < j:
                check = z[:, j]
                if abs(current - check).sum() == 0:
                    omitcolumn.append(j)
    omitcolumn = list(set(omitcolumn))

    # Remove the identified columns (in reverse to prevent renumbering)
    for c in sorted(omitcolumn, reverse=True):
        z = np.delete(z, c, 1)
    n, p = z.shape

    df = p - 1

    # LM statistic: 0.5 * g'Z (Z'Z)^-1 Z'g, chi^2 with p-1 df.
    zt = np.transpose(z)
    gt = np.transpose(g)
    gtz = np.dot(gt, z)
    ztg = np.dot(zt, g)
    ztz = np.dot(zt, z)
    ztzi = la.inv(ztz)

    part1 = np.dot(gtz, ztzi)
    part2 = np.dot(part1, ztg)
    bp_array = 0.5 * part2
    bp = bp_array[0, 0]

    # chisqprob was removed from SciPy 1.0; chi2.sf is equivalent.
    pvalue = stats.chi2.sf(bp, df)
    bp_result = {'df': df, 'bp': bp, 'pvalue': pvalue}
    return bp_result