Esempio n. 1
0
def added_variable_resids(results,
                          focus_exog,
                          resid_type=None,
                          use_glm_weights=True,
                          fit_kwargs=None):
    """
    Residualize the endog variable and a 'focus' exog variable in a
    regression model with respect to the other exog variables.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other
        predictors of interest.
    focus_exog : integer or string
        The column of results.model.exog or a variable name that is
        to be residualized against the other predictors.
    resid_type : string
        The type of residuals to use for the dependent variable.  If
        None, uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE.  If True, the
        residuals for the focus predictor are computed using WLS, with
        the weights obtained from the IRLS calculations for fitting
        the GLM.  If False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments to be passed to fit when refitting the
        model.

    Returns
    -------
    endog_resid : array-like
        The residuals for the original exog
    focus_exog_resid : array-like
        The residuals for the focus predictor

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression.

    Currently only GLM, GEE, and OLS models are supported.
    """

    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError(
            "model type %s not supported for added variable residuals" %
            model.__class__.__name__)

    exog = model.exog
    endog = model.endog

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    focus_exog_vals = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        if isinstance(model, (GEE, GLM)):
            resid_type = "resid_deviance"
        else:
            resid_type = "resid"

    ii = range(exog.shape[1])
    ii = list(ii)
    ii.pop(focus_col)
    reduced_exog = exog[:, ii]
    start_params = results.params[ii]

    klass = model.__class__

    kwargs = model._get_init_kwds()
    new_model = klass(endog, reduced_exog, **kwargs)
    args = {"start_params": start_params}
    if fit_kwargs is not None:
        args.update(fit_kwargs)
    new_result = new_model.fit(**args)
    if not new_result.converged:
        raise ValueError(
            "fit did not converge when calculating added variable residuals")

    try:
        endog_resid = getattr(new_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    if isinstance(model, (GLM, GEE)) and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit()
    else:
        lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit()
    focus_exog_resid = lm_results.resid

    return endog_resid, focus_exog_resid
Esempio n. 2
0
def ceres_resids(results, focus_exog, frac=0.66, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : int
        The column of results.model.exog used as the 'focus variable'.
    frac : float, optional
        Lowess smoothing parameter for estimating the conditional
        means.  Not used if `cond_means` is provided.
    cond_means : array-like, optional
        If provided, the columns of this array are the conditional
        means E[exog | focus exog], where exog ranges over some
        or all of the columns of exog other than focus exog.  If
        this is an empty nx0 array, the conditional means are
        treated as being zero.  If None, the conditional means are
        estimated.

    Returns
    -------
    An array containing the CERES residuals.

    Notes
    -----
    If `cond_means` is not provided, it is obtained by smoothing each
    column of exog (except the focus column) against the focus column.

    Currently only supports GLM, GEE, and OLS models.
    """

    model = results.model

    if not isinstance(model, (GLM, GEE, OLS)):
        raise ValueError("ceres residuals not available for %s" %
                         model.__class__.__name__)

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # Indices of non-focus columns
    ix_nf = range(len(results.params))
    ix_nf = list(ix_nf)
    ix_nf.pop(focus_col)
    nnf = len(ix_nf)

    # Estimate the conditional means if not provided.
    if cond_means is None:

        # Below we calculate E[x | focus] where x is each column other
        # than the focus column.  We don't want the intercept when we do
        # this so we remove it here.
        pexog = model.exog[:, ix_nf]
        pexog -= pexog.mean(0)
        u, s, vt = np.linalg.svd(pexog, 0)
        ii = np.flatnonzero(s > 1e-6)
        pexog = u[:, ii]

        fcol = model.exog[:, focus_col]
        cond_means = np.empty((len(fcol), pexog.shape[1]))
        for j in range(pexog.shape[1]):

            # Get the fitted values for column i given the other
            # columns (skip the intercept).
            y0 = pexog[:, j]

            cf = lowess(y0, fcol, frac=frac, return_sorted=False)

            cond_means[:, j] = cf

    new_exog = np.concatenate((model.exog[:, ix_nf], cond_means), axis=1)

    # Refit the model using the adjusted exog values
    klass = model.__class__
    init_kwargs = model._get_init_kwds()
    new_model = klass(model.endog, new_exog, **init_kwargs)
    new_result = new_model.fit()

    # The partial residual, with respect to l(x2) (notation of Cook 1998)
    presid = model.endog - new_result.fittedvalues
    if isinstance(model, (GLM, GEE)):
        presid *= model.family.link.deriv(new_result.fittedvalues)
    if new_exog.shape[1] > nnf:
        presid += np.dot(new_exog[:, nnf:], new_result.params[nnf:])

    return presid
Esempio n. 3
0
def ceres_resids(results, focus_exog, frac=0.66, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : int
        The column of results.model.exog used as the 'focus variable'.
    frac : float, optional
        Lowess smoothing parameter for estimating the conditional
        means.  Not used if `cond_means` is provided.
    cond_means : array-like, optional
        If provided, the columns of this array are the conditional
        means E[exog | focus exog], where exog ranges over some
        or all of the columns of exog other than focus exog.  If
        this is an empty nx0 array, the conditional means are
        treated as being zero.  If None, the conditional means are
        estimated.

    Returns
    -------
    An array containing the CERES residuals.

    Notes
    -----
    If `cond_means` is not provided, it is obtained by smoothing each
    column of exog (except the focus column) against the focus column.

    Currently only supports GLM, GEE, and OLS models.
    """

    model = results.model

    if not isinstance(model, (GLM, GEE, OLS)):
        raise ValueError("ceres residuals not available for %s" %
                         model.__class__.__name__)

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # Indices of non-focus columns
    ix_nf = range(len(results.params))
    ix_nf = list(ix_nf)
    ix_nf.pop(focus_col)
    nnf = len(ix_nf)

    # Estimate the conditional means if not provided.
    if cond_means is None:

        # Below we calculate E[x | focus] where x is each column other
        # than the focus column.  We don't want the intercept when we do
        # this so we remove it here.
        pexog = model.exog[:, ix_nf]
        pexog -= pexog.mean(0)
        u, s, vt = np.linalg.svd(pexog, 0)
        ii = np.flatnonzero(s > 1e-6)
        pexog = u[:, ii]

        fcol = model.exog[:, focus_col]
        cond_means = np.empty((len(fcol), pexog.shape[1]))
        for j in range(pexog.shape[1]):

            # Get the fitted values for column i given the other
            # columns (skip the intercept).
            y0 = pexog[:, j]

            cf = lowess(y0, fcol, frac=frac, return_sorted=False)

            cond_means[:, j] = cf

    new_exog = np.concatenate((model.exog[:, ix_nf], cond_means), axis=1)

    # Refit the model using the adjusted exog values
    klass = model.__class__
    init_kwargs = model._get_init_kwds()
    new_model = klass(model.endog, new_exog, **init_kwargs)
    new_result = new_model.fit()

    # The partial residual, with respect to l(x2) (notation of Cook 1998)
    presid = model.endog - new_result.fittedvalues
    if isinstance(model, (GLM, GEE)):
        presid *= model.family.link.deriv(new_result.fittedvalues)
    if new_exog.shape[1] > nnf:
        presid += np.dot(new_exog[:, nnf:], new_result.params[nnf:])

    return presid
Esempio n. 4
0
def ceres_resids(results, focus_exog, frac=None, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : int
        The column of results.model.exog used as the 'focus variable'.
    frac : dict, optional
        Map from column indices of results.model.exog to lowess
        smoothing parameters (the frac keyword argument to lowess).
        Not used if `cond_means` is provided.
    cond_means : array-like, optional
        If provided, the columns of this array are the conditional
        means E[exog | focus exog], where exog ranges over some
        or all of the columns of exog other than focus exog.  If
        this is an empty nx0 array, the conditional means are
        treated as being zero.

    Returns
    -------
    An array containing the CERES residuals.

    Notes
    -----
    If `cond_means` is not provided, it is obtained by smoothing each
    column of exog (except the focus column) against the focus column.
    The values of `frac` control these lowess smooths.

    Currently only supports GLM, GEE, and OLS models.
    """

    model = results.model

    if not isinstance(model, (GLM, GEE, OLS)):
        raise ValueError("ceres residuals not available for %s" %
                         type(model))

    n = model.exog.shape[0]
    m = model.exog.shape[1] - 1

    if frac is None:
        frac = {}

    if type(focus_exog) is str:
        focus_col = results.exog_names.index(focus_exog)
    else:
        focus_col = focus_exog

    # Indices of non-focus columns
    ii = range(len(results.params))
    ii = list(ii)
    ii.pop(focus_col)

    if cond_means is None:
        cond_means = np.zeros((n, m))
        x0 = model.exog[:, focus_col]
        for j, i in enumerate(ii):
            y0 = model.exog[:, i]
            fr = frac[i] if i in frac else 0.66
            cond_means[:, j] = lowess(y0, x0, frac=fr, return_sorted=False)

    new_exog = np.concatenate((model.exog[:, ii], cond_means), axis=1)

    # Refit the model using the adjusted exog values
    klass = model.__class__
    init_kwargs = model._get_init_kwds()
    new_model = klass(model.endog, new_exog, **init_kwargs)
    new_result = new_model.fit()

    # The partial residual, with respect to l(x2) (notation of Cook 1998)
    presid = model.endog - new_result.fittedvalues
    if isinstance(model, (GLM, GEE)):
        presid *= model.family.link.deriv(new_result.fittedvalues)
    if cond_means.shape[1] > 0:
        presid += np.dot(cond_means, new_result.params[m:])

    return presid
Esempio n. 5
0
def added_variable_resids(results, focus_exog, resid_type=None,
                          use_glm_weights=True, fit_kwargs=None):
    """
    Residualize the endog variable and a 'focus' exog variable in a
    regression model with respect to the other exog variables.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other
        predictors of interest.
    focus_exog : integer or string
        The column of results.model.exog or a variable name that is
        to be residualized against the other predictors.
    resid_type : string
        The type of residuals to use for the dependent variable.  If
        None, uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE.  If True, the
        residuals for the focus predictor are computed using WLS, with
        the weights obtained from the IRLS calculations for fitting
        the GLM.  If False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments to be passed to fit when refitting the
        model.

    Returns
    -------
    endog_resid : array-like
        The residuals for the original exog
    focus_exog_resid : array-like
        The residuals for the focus predictor

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression.

    Currently only GLM, GEE, and OLS models are supported.
    """

    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError("model type %s not supported for added variable residuals" %
                         model.__class__.__name__)

    exog = model.exog
    endog = model.endog

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    focus_exog_vals = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        if isinstance(model, (GEE, GLM)):
            resid_type = "resid_deviance"
        else:
            resid_type = "resid"

    ii = range(exog.shape[1])
    ii = list(ii)
    ii.pop(focus_col)
    reduced_exog = exog[:, ii]
    start_params = results.params[ii]

    klass = model.__class__

    kwargs = model._get_init_kwds()
    new_model = klass(endog, reduced_exog, **kwargs)
    args = {"start_params": start_params}
    if fit_kwargs is not None:
        args.update(fit_kwargs)
    new_result = new_model.fit(**args)
    if not new_result.converged:
        raise ValueError("fit did not converge when calculating added variable residuals")

    try:
        endog_resid = getattr(new_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    if isinstance(model, (GLM, GEE)) and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit()
    else:
        lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit()
    focus_exog_resid = lm_results.resid

    return endog_resid, focus_exog_resid