def __init__(self,
                 results,
                 resid=None,
                 endog=None,
                 exog=None,
                 hat_matrix_diag=None,
                 cov_params=None,
                 scale=None):
        # I'm not calling super for now, OLS attributes might not be available
        # check which model is allowed
        self.results = results = maybe_unwrap_results(results)
        # TODO: check for extra params in e.g. NegBin
        self.nobs, self.k_vars = results.model.exog.shape
        self.endog = endog if endog is not None else results.model.endog
        self.exog = exog if exog is not None else results.model.exog
        self.resid = resid if resid is not None else results.resid_pearson
        self.scale = scale if scale is not None else results.scale
        self.cov_params = (cov_params
                           if cov_params is not None else results.cov_params())
        self.model_class = results.model.__class__

        self.hessian = self.results.model.hessian(self.results.params)
        self.score_obs = self.results.model.score_obs(self.results.params)
        if hat_matrix_diag is not None:
            self._hat_matrix_diag = hat_matrix_diag
Example #2
0
def ccpr_data(results, exog_idx):
    
    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)
    
    x1 = results.model.exog[:, exog_idx]
    x1beta = x1*results.params[exog_idx]
    
    return x1, x1beta + results.resid
Example #3
0
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    x_var : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if not y_true is None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k',
            alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best')

    return fig
Example #4
0
def ccpr_data(results, exog_idx):

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    x1beta = x1 * results.params[exog_idx]

    return x1, x1beta + results.resid
Example #5
0
    def __init__(self, results):
        #check which model is allowed
        self.results = maybe_unwrap_results(results)
        self.nobs, self.k_vars = results.model.exog.shape
        self.endog = results.model.endog
        self.exog = results.model.exog
        self.model_class = results.model.__class__

        self.sigma_est = np.sqrt(results.mse_resid)

        self.aux_regression_exog = {}
        self.aux_regression_endog = {}
    def __init__(self, results):
        #check which model is allowed
        self.results = maybe_unwrap_results(results)
        self.nobs, self.k_vars = results.model.exog.shape
        self.endog = results.model.endog
        self.exog = results.model.exog
        self.model_class = results.model.__class__

        self.sigma_est = np.sqrt(results.mse_resid)

        self.aux_regression_exog = {}
        self.aux_regression_endog = {}
    def __init__(self, results, resid=None, endog=None, exog=None,
                 hat_matrix_diag=None, cov_params=None, scale=None):
        # I'm not calling super for now, OLS attributes might not be available
        #check which model is allowed
        self.results = results = maybe_unwrap_results(results)
        # TODO: check for extra params in e.g. NegBin
        self.nobs, self.k_vars = results.model.exog.shape
        self.endog = endog if endog is not None else results.model.endog
        self.exog = exog if exog is not None else results.model.exog
        self.resid = resid if resid is not None else results.resid_pearson
        self.scale = scale if scale is not None else results.scale
        self.cov_params = (cov_params if cov_params is not None
                           else results.cov_params())
        self.model_class = results.model.__class__

        self.hessian = self.results.model.hessian(self.results.params)
        self.score_obs = self.results.model.score_obs(self.results.params)
        if hat_matrix_diag is not None:
            self._hat_matrix_diag = hat_matrix_diag
Example #8
0
def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
                 order=False):
    """
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults instance
        Linear model results
    method : str
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals. If labels are provided they will also be sorted.

    Returns
    -------
    table : ndarray or DataFrame
        Returns either an ndarray or a DataFrame if labels is not None.
        Will attempt to get labels from model_results if available. The
        columns are the Studentized residuals, the unadjusted p-value,
        and the corrected p-value according to method.

    Notes
    -----
    The unadjusted p-value is stats.t.sf(abs(resid), df) where
    df = df_resid - 1.
    """
    from scipy import stats # lazy import
    infl = getattr(model_results, 'get_influence', None)
    if infl is None:
        results = maybe_unwrap_results(model_results)
        raise AttributeError("model_results object %s does not have a "
                "get_influence method." % results.__class__.__name__)
    resid = infl().resid_studentized_external
    if order:
        idx = np.abs(resid).argsort()[::-1]
        resid = resid[idx]
        if labels is not None:
            labels = np.array(labels)[idx].tolist()
    df = model_results.df_resid - 1
    unadj_p = stats.t.sf(np.abs(resid), df) * 2
    adj_p = multipletests(unadj_p, alpha=alpha, method=method)

    data = np.c_[resid, unadj_p, adj_p[1]]
    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)
    if labels is not None:
        from pandas import DataFrame
        return DataFrame(data,
                         columns=['student_resid', 'unadj_p', method+"(p)"],
                         index=labels)
    return data
def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
                 order=False):
    """
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults instance
        Linear model results
    method : str
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals. If labels are provided they will also be sorted.

    Returns
    -------
    table : ndarray or DataFrame
        Returns either an ndarray or a DataFrame if labels is not None.
        Will attempt to get labels from model_results if available. The
        columns are the Studentized residuals, the unadjusted p-value,
        and the corrected p-value according to method.

    Notes
    -----
    The unadjusted p-value is stats.t.sf(abs(resid), df) where
    df = df_resid - 1.
    """
    from scipy import stats # lazy import
    infl = getattr(model_results, 'get_influence', None)
    if infl is None:
        results = maybe_unwrap_results(model_results)
        raise AttributeError("model_results object %s does not have a "
                "get_influence method." % results.__class__.__name__)
    resid = infl().resid_studentized_external
    if order:
        idx = np.abs(resid).argsort()[::-1]
        resid = resid[idx]
        if labels is not None:
            labels = np.array(labels)[idx].tolist()
    df = model_results.df_resid - 1
    unadj_p = stats.t.sf(np.abs(resid), df) * 2
    adj_p = multipletests(unadj_p, alpha=alpha, method=method)

    data = np.c_[resid, unadj_p, adj_p[1]]
    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)
    if labels is not None:
        from pandas import DataFrame
        return DataFrame(data,
                         columns=['student_resid', 'unadj_p', method+"(p)"],
                         index=labels)
    return data
Example #10
0
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    x_var : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty':

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py

    """

    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if not y_true is None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k',
              alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig
Example #11
0
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1*results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, **dict(ax=ax))
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
Example #12
0
def feature_regression_summary(df,
                               feat_idx,
                               target,
                               model_fit_results,
                               display_regress_diagnostics=False):

    feat = df.columns[feat_idx]

    v = vif(np.matrix(df), feat_idx)
    colinear = v > 10

    if display_regress_diagnostics:
        # ‘endog versus exog’, ‘residuals versus exog’, ‘fitted versus exog’ and ‘fitted plus residual versus exog’
        fig = plt.figure(constrained_layout=True,
                         figsize=(2.25 * plot_edge, 4 * plot_edge))
        fig = smgrpu.create_mpl_fig(fig)
        gs = GridSpec(3, 2, figure=fig)
        ax_fit = fig.add_subplot(gs[0, 0])
        ax_partial_residuals = fig.add_subplot(gs[0, 1])
        ax_partregress = fig.add_subplot(gs[1, 0])
        ax_ccpr = fig.add_subplot(gs[1, 1])
        ax_dist = fig.add_subplot(gs[2:4, 0:2])

        exog_name, exog_idx = smgrpu.maybe_name_or_idx(feat,
                                                       model_fit_results.model)
        smresults = smtt.maybe_unwrap_results(model_fit_results)
        y_name = smresults.model.endog_names
        x1 = smresults.model.exog[:, exog_idx]
        prstd, iv_l, iv_u = wls_prediction_std(smresults)

        # endog versus exog
        # use wrapper since it's availab;e!
        sm.graphics.plot_fit(model_fit_results, feat, ax=ax_fit)

        # residuals versus exog
        ax_partial_residuals.plot(x1, smresults.resid, 'o')
        ax_partial_residuals.axhline(y=0, color='black')
        ax_partial_residuals.set_title('Residuals versus %s' % exog_name,
                                       fontsize='large')
        ax_partial_residuals.set_xlabel(exog_name)
        ax_partial_residuals.set_ylabel("resid")

        # Partial Regression plot: fitted versus exog
        exog_noti = np.ones(smresults.model.exog.shape[1], bool)
        exog_noti[exog_idx] = False
        exog_others = smresults.model.exog[:, exog_noti]
        from pandas import Series
        smgrp.plot_partregress(smresults.model.data.orig_endog,
                               Series(x1,
                                      name=exog_name,
                                      index=smresults.model.data.row_labels),
                               exog_others,
                               obs_labels=False,
                               ax=ax_partregress)

        # CCPR: fitted plus residual versus exog
        # use wrapper since it's availab;e!
        sm.graphics.plot_ccpr(model_fit_results, feat, ax=ax_ccpr)
        ax_ccpr.set_title('CCPR Plot', fontsize='large')

        sns.distplot(df[feat], ax=ax_dist)

        fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")
        #fig.tight_layout()
        #fig.subplots_adjust(top=.90)
        plt.show()

        display(
            HTML(
                "Variance Inflation Factor (<i>VIF</i>) for <b>{}</b>: <b>{}</b> {}"
                .format(
                    feat, round(v, 2), "$\\le 10 \\iff$ low colinearity" if
                    not colinear else "$> 10 \\iff$ <b>HIGH COLINEARITY</b>")))
        display(
            HTML(
                "<b><i>p-value</i></b> (<i>VIF</i>) for <b>{}</b>: <b>{}</b><br><br>"
                .format(feat, model_fit_results.pvalues[feat_idx + 1])))

    return v
Example #13
0
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and rate
    of poverty ('poverty').


    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plot
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1 * results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, **dict(ax=ax))
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
def outlier_test(model_results,
                 method='bonf',
                 alpha=.05,
                 labels=None,
                 order=False,
                 cutoff=None):
    """
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults
        Linear model results
    method : str
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    labels : None or array_like
        If `labels` is not None, then it will be used as index to the
        returned pandas DataFrame. See also Returns below
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals. If labels are provided they will also be sorted.
    cutoff : None or float in [0, 1]
        If cutoff is not None, then the return only includes observations with
        multiple testing corrected p-values strictly below the cutoff. The
        returned array or dataframe can be empty if there are no outlier
        candidates at the specified cutoff.

    Returns
    -------
    table : ndarray or DataFrame
        Returns either an ndarray or a DataFrame if labels is not None.
        Will attempt to get labels from model_results if available. The
        columns are the Studentized residuals, the unadjusted p-value,
        and the corrected p-value according to method.

    Notes
    -----
    The unadjusted p-value is stats.t.sf(abs(resid), df) where
    df = df_resid - 1.
    """
    from scipy import stats  # lazy import
    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)
    infl = getattr(model_results, 'get_influence', None)
    if infl is None:
        results = maybe_unwrap_results(model_results)
        raise AttributeError("model_results object %s does not have a "
                             "get_influence "
                             "method." % results.__class__.__name__)
    resid = infl().resid_studentized_external
    if order:
        idx = np.abs(resid).argsort()[::-1]
        resid = resid[idx]
        if labels is not None:
            labels = np.asarray(labels)[idx]
    df = model_results.df_resid - 1
    unadj_p = stats.t.sf(np.abs(resid), df) * 2
    adj_p = multipletests(unadj_p, alpha=alpha, method=method)

    data = np.c_[resid, unadj_p, adj_p[1]]
    if cutoff is not None:
        mask = data[:, -1] < cutoff
        data = data[mask]
    else:
        mask = slice(None)

    if labels is not None:
        from pandas import DataFrame
        return DataFrame(data,
                         columns=['student_resid', 'unadj_p', method + "(p)"],
                         index=np.asarray(labels)[mask])
    return data
Example #15
0
def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int or str
        Name or index of regressor in exog matrix
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : matplotlib figure instance

    Examples
    --------
    Load the Statewide Crime data set and build a model with regressors
    including the rate of high school graduation (hs_grad), population in urban
    areas (urban), households below poverty line (poverty), and single person
    households (single).  Outcome variable is the muder rate (murder).

    Build a 2 by 2 figure based on poverty showing fitted versus actual murder
    rate, residuals versus the poverty rate, partial regression plot of poverty,
    and CCPR plot for poverty rate.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plot
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_regress_exog.py

    """

    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted',
            alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    from pandas import Series
    fig = plot_partregress(results.model.data.orig_endog,
                           Series(x1, name=exog_name,
                                  index=results.model.data.row_labels),
                           exog_others, obs_labels=False, ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')
    #ax.set_ylabel("Fitted values")
    #ax.set_xlabel(exog_name)

    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')
    #ax.set_xlabel(exog_name)
    #ax.set_ylabel("Fitted values + resids")

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig
def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression #1lab_results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int
        index of regressor in exog matrix
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : matplotlib figure instance
    """

    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted', alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    from pandas import Series
    fig = plot_partregress(results.model.data.orig_endog,
                           Series(x1,
                                  name=exog_name,
                                  index=results.model.data.row_labels),
                           exog_others,
                           obs_labels=False,
                           ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')
    #ax.set_ylabel("Fitted values")
    #ax.set_xlabel(exog_name)

    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')
    #ax.set_xlabel(exog_name)
    #ax.set_ylabel("Fitted values + resids")

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig
Example #17
0
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    x_var : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if not y_true is None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1,
            results.fittedvalues[x1_argsort],
            'D',
            color='r',
            label='fitted',
            **kwargs)
    ax.vlines(x1,
              iv_l[x1_argsort],
              iv_u[x1_argsort],
              linewidth=1,
              color='k',
              alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best')

    return fig
Example #18
0
def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int or str
        Name or index of regressor in exog matrix
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : matplotlib figure instance

    Examples
    --------
    Load the Statewide Crime data set and build a model with regressors
    including the rate of high school graduation (hs_grad), population in urban
    areas (urban), households below poverty line (poverty), and single person
    households (single).  Outcome variable is the muder rate (murder).

    Build a 2 by 2 figure based on poverty showing fitted versus actual murder
    rate, residuals versus the poverty rate, partial regression plot of poverty,
    and CCPR plot for poverty rate.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plot
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_regress_exog.py

    """

    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted', alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    from pandas import Series
    fig = plot_partregress(results.model.data.orig_endog,
                           Series(x1,
                                  name=exog_name,
                                  index=results.model.data.row_labels),
                           exog_others,
                           obs_labels=False,
                           ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')
    #ax.set_ylabel("Fitted values")
    #ax.set_xlabel(exog_name)

    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')
    #ax.set_xlabel(exog_name)
    #ax.set_ylabel("Fitted values + resids")

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig
    def __init__(
        self, results: Type[
            statsmodels.regression.linear_model.RegressionResultsWrapper]
    ) -> None:
        """
        For a linear regression model, generates following diagnostic plots:

        a. residual
        b. qq
        c. scale location and
        d. leverage

        and a table

        e. vif

        Args:
            results (Type[statsmodels.regression.linear_model.RegressionResultsWrapper]): 
                must be instance of statsmodels.regression.linear_model object

        Raises:
            TypeError: if instance does not belong to above object

        Example:
        >>> import numpy as np
        >>> import pandas as pd
        >>> import statsmodels.formula.api as smf
        >>> x = np.linspace(-np.pi, np.pi, 100)
        >>> y = 3*x + 8 + np.random.normal(0,1, 100)
        >>> df = pd.DataFrame({'x':x, 'y':y})
        >>> res = smf.ols(formula= "y ~ x", data=df).fit()
        >>> cls = Linear_Reg_Diagnostic(res)
        >>> cls(plot_context="seaborn-paper")     

        In case you do not need all plots you can also independently make an individual plot/table
        in following ways

        >>> cls = Linear_Reg_Diagnostic(res)
        >>> cls.residual_plot()
        >>> cls.qq_plot()
        >>> cls.scale_location_plot()
        >>> cls.leverage_plot()
        >>> cls.vif_table()
        """

        if isinstance(
                results, statsmodels.regression.linear_model.
                RegressionResultsWrapper) is False:
            raise TypeError(
                "result must be instance of statsmodels.regression.linear_model.RegressionResultsWrapper object"
            )

        self.results = maybe_unwrap_results(results)

        self.y_true = self.results.model.endog
        self.y_predict = self.results.fittedvalues
        self.xvar = self.results.model.exog
        self.xvar_names = self.results.model.exog_names

        self.residual = np.array(self.results.resid)
        influence = self.results.get_influence()
        self.residual_norm = influence.resid_studentized_internal
        self.leverage = influence.hat_matrix_diag
        self.cooks_distance = influence.cooks_distance[0]
        self.nparams = len(self.results.params)
Example #20
0
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    x_var : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty':

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py

    """

    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1,
            results.fittedvalues[x1_argsort],
            'D',
            color='r',
            label='fitted',
            **kwargs)
    ax.vlines(x1,
              iv_l[x1_argsort],
              iv_u[x1_argsort],
              linewidth=1,
              color='k',
              alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig
Example #21
0
def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int
        index of regressor in exog matrix
    fig : Matplotlib figure instance, optional
        If given, this figure is simply returned.  Otherwise a new figure is
        created.

    Returns
    -------
    fig : matplotlib figure instance
    """

    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted',
            alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    from pandas import Series
    fig = plot_partregress(results.model.data.orig_endog,
                           Series(x1, name=exog_name,
                                  index=results.model.data.row_labels),
                           exog_others, obs_labels=False, ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')
    #ax.set_ylabel("Fitted values")
    #ax.set_xlabel(exog_name)

    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')
    #ax.set_xlabel(exog_name)
    #ax.set_ylabel("Fitted values + resids")

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig