Beispiel #1
0
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1*results._results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, **dict(ax=ax))
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
def plot_added_variable(results, focus_exog, resid_type=None,
                        use_glm_weights=True, fit_kwargs=None, ax=None):
    # Docstring attached below

    model = results.model

    fig, ax = utils.create_mpl_ax(ax)

    endog_resid, focus_exog_resid =\
                 added_variable_resids(results, focus_exog,
                                       resid_type=resid_type,
                                       use_glm_weights=use_glm_weights,
                                       fit_kwargs=fit_kwargs)

    ax.plot(focus_exog_resid, endog_resid, 'o', alpha=0.6)

    ax.set_title('Added variable plot', fontsize='large')

    if type(focus_exog) is str:
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel(model.endog_names + " residuals", size=15)

    return fig
Beispiel #3
0
    def plot_rsquare(self, ncomp=None, ax=None):
        """
        Box plots of the individual series R-square against the number of PCs

        Parameters
        ----------
        ncomp : int, optional
            Number of components ot include in the plot.  If None, will
            plot the minimum of 10 or the number of computed components
        ax : Matplotlib axes instance, optional
            An axes on which to draw the graph.  If omitted, new a figure
            is created

        Returns
        -------
        fig : figure
            Handle to the figure
        """
        import statsmodels.graphics.utils as gutils

        fig, ax = gutils.create_mpl_ax(ax)

        ncomp = 10 if ncomp is None else ncomp
        ncomp = min(ncomp, self._ncomp)
        # R2s in rows, series in columns
        r2s = 1.0 - self._ess_indiv / self._tss_indiv
        r2s = r2s[1:]
        r2s = r2s[:ncomp]
        ax.boxplot(r2s.T)
        ax.set_title('Individual Input $R^2$')
        ax.set_ylabel('$R^2$')
        ax.set_xlabel('Number of Included Principal Components')

        return fig
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                         **kwargs):

    from scipy.stats import zscore, norm
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
    def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,**kwds):
        from statsmodels.graphics import utils
        fig, ax = utils.create_mpl_ax(ax)
        if title is None:
            title = "Index Plot"
        nobs = len(self.endog)
        index = np.arange(nobs)
        ax.scatter(index, y, **kwds)

        if threshold == 'all':
            large_points = np.ones(nobs, np.bool_)
        else:
            large_points = np.abs(y) > threshold
        psize = 3 * np.ones(nobs)
        # add point labels
        labels = self.results.model.data.row_labels
        if labels is None:
            labels = np.arange(nobs)
        ax = utils.annotate_axes(np.where(large_points)[0], labels,
                                 lzip(index, y),
                                 lzip(-psize, psize), "large",
                                 ax)

        font = {"fontsize" : 16, "color" : "black"}
        ax.set_ylabel(ylabel, **font)
        ax.set_xlabel("Observation", **font)
        ax.set_title(title, **font)
        return fig
Beispiel #6
0
def seasonal_plot(grouped_x, xticklabels, ylabel=None, ax=None):
    """
    Consider using one of month_plot or quarter_plot unless you need
    irregular plotting.

    Parameters
    ----------
    grouped_x : iterable of DataFrames
        Should be a GroupBy object (or similar pair of group_names and groups
        as DataFrames) with a DatetimeIndex or PeriodIndex
    """
    fig, ax = utils.create_mpl_ax(ax)
    start = 0
    ticks = []
    for season, df in grouped_x:
        df = df.copy() # or sort balks for series. may be better way
        sort_values(df, inplace=True)
        nobs = len(df)
        x_plot = np.arange(start, start + nobs)
        ticks.append(x_plot.mean())
        ax.plot(x_plot, df.values, 'k')
        ax.hlines(df.values.mean(), x_plot[0], x_plot[-1], colors='k')
        start += nobs

    ax.set_xticks(ticks)
    ax.set_xticklabels(xticklabels)
    ax.set_ylabel(ylabel)
    ax.margins(.1, .05)
    return fig
Beispiel #7
0
    def plot_scree(self, ncomp=None, log_scale=True,
                   cumulative=False, ax=None):
        """
        Plot of the ordered eigenvalues

        Parameters
        ----------
        ncomp : int, optional
            Number of components ot include in the plot.  If None, will
            included the same as the number of components computed
        log_scale : boot, optional
            Flag indicating whether ot use a log scale for the y-axis
        cumulative : bool, optional
            Flag indicating whether to plot the eigenvalues or cumulative
            eigenvalues
        ax : Matplotlib axes instance, optional
            An axes on which to draw the graph.  If omitted, new a figure
            is created

        Returns
        -------
        fig : figure
            Handle to the figure
        """
        import statsmodels.graphics.utils as gutils

        fig, ax = gutils.create_mpl_ax(ax)

        ncomp = self._ncomp if ncomp is None else ncomp
        vals = np.asarray(self.eigenvals)
        vals = vals[:self._ncomp]
        if cumulative:
            vals = np.cumsum(vals)

        if log_scale:
            ax.set_yscale('log')
        ax.plot(np.arange(ncomp), vals[: ncomp], 'bo')
        ax.autoscale(tight=True)
        xlim = np.array(ax.get_xlim())
        sp = xlim[1] - xlim[0]
        xlim += 0.02 * np.array([-sp, sp])
        ax.set_xlim(xlim)

        ylim = np.array(ax.get_ylim())
        scale = 0.02
        if log_scale:
            sp = np.log(ylim[1] / ylim[0])
            ylim = np.exp(np.array([np.log(ylim[0]) - scale * sp,
                                    np.log(ylim[1]) + scale * sp]))
        else:
            sp = ylim[1] - ylim[0]
            ylim += scale * np.array([-sp, sp])
        ax.set_ylim(ylim)
        ax.set_title('Scree Plot')
        ax.set_ylabel('Eigenvalue')
        ax.set_xlabel('Component Number')
        fig.tight_layout()

        return fig
Beispiel #8
0
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    x_var : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if not y_true is None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k',
            alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best')

    return fig
def plot_fit(res, exog_idx, exog_name='', y_true=None, ax=None, fontsize='small'):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    res : result instance
        result instance with resid, model.endog and model.exog as attributes
    exog_idx : int
        index of regressor in exog matrix
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Notes
    -----
    This is currently very simple, no options or varnames yet.

    """
    fig, ax = utils.create_mpl_ax(ax)

    if exog_name == '':
        exog_name = 'variable %d' % exog_idx

    #maybe add option for wendog, wexog
    y = res.model.endog
    x1 = res.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label='observed')
    if not y_true is None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='true')
        title = 'fitted versus regressor %s' % exog_name
    else:
        title = 'fitted versus regressor %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(res)
    ax.plot(x1, res.fittedvalues[x1_argsort], 'k-', label='fitted') #'k-o')
    #ax.plot(x1, iv_u, 'r--')
    #ax.plot(x1, iv_l, 'r--')
    ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, color='k')
    ax.set_title(title, fontsize=fontsize)

    return fig
    def plot_partial(self, smooth_index, plot_se=True, cpr=False,
                     include_constant=True, ax=None):
        """plot the contribution of a smooth term to the linear prediction

        Parameters
        ----------
        smooth_index : int
            index of the smooth term within list of smooth terms
        plot_se : book
            If plot_se is true, then the confidence interval for the linear
            prediction will be added to the plot.
        cpr : bool
            If cpr (component plus residual) is true, the a scatter plot of
            the partial working residuals will be added to the plot.
        include_constant : bool
            If true, then the estimated intercept is added to the prediction
            and its standard errors. This avoids that the confidence interval
            has zero width at the imposed identification constraint, e.g.
            either at a reference point or at the mean.
        ax : None or matplotlib axis instance
           If ax is not None, then the plot will be added to it.

        Returns
        -------
        fig : matplotlib Figure instance

        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_ax
        _import_mpl()

        variable = smooth_index
        y_est, se = self.partial_values(variable,
                                        include_constant=include_constant)
        smoother = self.model.smoother
        x = smoother.smoothers[variable].x
        sort_index = np.argsort(x)
        x = x[sort_index]
        y_est = y_est[sort_index]
        se = se[sort_index]

        fig, ax = create_mpl_ax(ax)
        ax.plot(x, y_est, c='blue', lw=2)
        if plot_se:
            ax.plot(x, y_est + 1.96 * se, '-', c='blue')
            ax.plot(x, y_est - 1.96 * se, '-', c='blue')
        if cpr:
            # TODO: resid_response doesn't make sense with nonlinear link
            # use resid_working ?
            cpr_ = y_est + self.resid_working
            ax.plot(x, cpr_, '.', lw=2)

        ax.set_xlabel(smoother.smoothers[variable].variable_name)

        return fig
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    **kwargs):
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized

    from scipy import stats

    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    #TODO: make configurable or let people do it ex-post?
    font = {"fontsize" : 16, "color" : "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
Beispiel #12
0
    def plot_contour(self, mu_low, mu_high, var_low, var_high, mu_step,
                        var_step,
                        levs=[.2, .1, .05, .01, .001]):
        """
        Returns a plot of the confidence region for a univariate
        mean and variance.

        Parameters
        ----------
        mu_low : float
            Lowest value of the mean to plot

        mu_high : float
            Highest value of the mean to plot

        var_low : float
            Lowest value of the variance to plot

        var_high : float
            Highest value of the variance to plot

        mu_step : float
            Increments to evaluate the mean

        var_step : float
            Increments to evaluate the mean

        levs : list
            Which values of significance the contour lines will be drawn.
            Default is [.2, .1, .05, .01, .001]

        Returns
        -------
        fig : matplotlib figure instance
            The contour plot
        """
        fig, ax = utils.create_mpl_ax()
        ax.set_ylabel('Variance')
        ax.set_xlabel('Mean')
        mu_vect = list(np.arange(mu_low, mu_high, mu_step))
        var_vect = list(np.arange(var_low, var_high, var_step))
        z = []
        for sig0 in var_vect:
            self.sig2_0 = sig0
            for mu0 in mu_vect:
                z.append(self._opt_var(mu0, pval=True))
        z = np.asarray(z).reshape(len(var_vect), len(mu_vect))
        ax.contour(mu_vect, var_vect, z, levels=levs)
        return fig
Beispiel #13
0
def plot_leverage_resid2(results, alpha=.05, label_kwargs={}, ax=None,
                         **kwargs):
    """
    Plots leverage statistics vs. normalized residuals squared

    Parameters
    ----------
    results : results instance
        A regression results instance
    alpha : float
        Specifies the cut-off for large-standardized residuals. Residuals
        are assumed to be distributed N(0, 1) with alpha=alpha.
    label_kwargs : dict
        The keywords to pass to annotate for the labels.
    ax : Axes instance
        Matplotlib Axes instance

    Returns
    -------
    fig : matplotlib Figure
        A matplotlib figure instance.
    """
    from scipy.stats import zscore, norm, t
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()
    leverage = infl.hat_matrix_diag
    resid = zscore(results.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1.-alpha/2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = range(results.nobs)
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, zip(resid**2, leverage),
                             [(0, 5)]*int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig
def plot_ccpr_ax(res, exog_idx=None, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    res : result instance
        uses exog and params of the result instance
    exog_idx : int
        (column) index of the exog used in the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr : Creates CCPR plot for multiple regressors in a plot grid.

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    """
    fig, ax = utils.create_mpl_ax(ax)

    x1 = res.model.exog[:,exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1*res.params[1]
    ax.plot(x1, x1beta + res.resid, 'o')
    ax.plot(x1, x1beta, '-')
    ax.set_title('X_%d beta_%d plus residuals versus exog (CCPR)' % \
                                                (exog_idx, exog_idx))

    return fig
def plot_partregress_ax(endog, exog_i, exog_others, varname='',
                        title_fontsize=None, ax=None):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray
       endogenous or response variable
    exog_i : ndarray
        exogenous, explanatory variable
    exog_others : ndarray
        other exogenous, explanatory variables, the effect of these variables
        will be removed by OLS regression
    varname : str
        name of the variable used in the title
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_partregress : Plot partial regression for a set of regressors.

    """
    fig, ax = utils.create_mpl_ax(ax)

    res1a = OLS(endog, exog_others).fit()
    res1b = OLS(exog_i, exog_others).fit()
    ax.plot(res1b.resid, res1a.resid, 'o')
    res1c = OLS(res1a.resid, res1b.resid).fit()
    ax.plot(res1b.resid, res1c.fittedvalues, '-', color='k')
    ax.set_title('Partial Regression plot %s' % varname,
                 fontsize=title_fontsize)# + namestr)

    return fig
def plot_ceres_residuals(results, focus_exog, frac=0.66, cond_means=None,
               ax=None):
    # Docstring attached below

    model = results.model

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    presid = ceres_resids(results, focus_exog, frac=frac,
                          cond_means=cond_means)

    focus_exog_vals = model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(focus_exog_vals, presid, 'o', alpha=0.6)

    ax.set_title('CERES residuals plot', fontsize='large')

    ax.set_xlabel(focus_exog, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig
def plot_partial_residuals(results, focus_exog, ax=None):
    # Docstring attached below

    model = results.model

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    pr = partial_resids(results, focus_exog)
    focus_exog_vals = results.model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(focus_exog_vals, pr, 'o', alpha=0.6)

    ax.set_title('Partial residuals plot', fontsize='large')

    if type(focus_exog) is str:
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig
Beispiel #18
0
def seasonal_plot(grouped_x, xticklabels, ylabel=None, ax=None):
    """
    Consider using one of month_plot or quarter_plot unless you need
    irregular plotting.

    Parameters
    ----------
    grouped_x : iterable of DataFrames
        Should be a GroupBy object (or similar pair of group_names and groups
        as DataFrames) with a DatetimeIndex or PeriodIndex
    xticklabels : list of str
        List of season labels, one for each group.
    ylabel : str
        Lable for y axis
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    """
    fig, ax = utils.create_mpl_ax(ax)
    start = 0
    ticks = []
    for season, df in grouped_x:
        df = df.copy()  # or sort balks for series. may be better way
        df.sort_index()
        nobs = len(df)
        x_plot = np.arange(start, start + nobs)
        ticks.append(x_plot.mean())
        ax.plot(x_plot, df.values, 'k')
        ax.hlines(df.values.mean(), x_plot[0], x_plot[-1], colors='r',
                  linewidth=3)
        start += nobs

    ax.set_xticks(ticks)
    ax.set_xticklabels(xticklabels)
    ax.set_ylabel(ylabel)
    ax.margins(.1, .05)
    return fig
Beispiel #19
0
def plot_acf(x,
             ax=None,
             lags=None,
             *,
             alpha=.05,
             use_vlines=True,
             adjusted=False,
             fft=False,
             missing='none',
             title='Autocorrelation',
             zero=True,
             vlines_kwargs=None,
             **kwargs):
    """
    Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    adjusted : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how
        the NaNs are to be treated.
    title : str, optional
        Title to place on plot.  Default is 'Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    kwargs is used to pass matplotlib optional arguments to both the line
    tracing the autocorrelations and for the horizontal line at 0. These
    options must be valid for a Line2D object.

    vlines_kwargs is used to pass additional optional arguments to the
    vertical lines connecting each autocorrelation to the axis.  These options
    must be valid for a LineCollection object.

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.sunspots.load_pandas().data
    >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    >>> del dta["YEAR"]
    >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)
    >>> plt.show()

    .. plot:: plots/graphics_tsa_plot_acf.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs

    confint = None
    # acf has different return type based on alpha
    acf_x = acf(x,
                nlags=nlags,
                alpha=alpha,
                fft=fft,
                adjusted=adjusted,
                missing=missing)
    if alpha is not None:
        acf_x, confint = acf_x

    _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines,
               vlines_kwargs, **kwargs)

    return fig
Beispiel #20
0
def interaction_plot(x,
                     trace,
                     response,
                     func=np.mean,
                     ax=None,
                     plottype='b',
                     xlabel=None,
                     ylabel=None,
                     colors=None,
                     markers=None,
                     linestyles=None,
                     legendloc='best',
                     legendtitle=None,
                     **kwargs):
    """
    Interaction plot for factor level statistics.

    Note. If categorial factors are supplied levels will be internally
    recoded to integers. This ensures matplotlib compatibility. Uses
    a DataFrame to calculate an `aggregate` statistic for each level of the
    factor or group given by `trace`.

    Parameters
    ----------
    x : array_like
        The `x` factor levels constitute the x-axis. If a `pandas.Series` is
        given its name will be used in `xlabel` if `xlabel` is None.
    trace : array_like
        The `trace` factor levels will be drawn as lines in the plot.
        If `trace` is a `pandas.Series` its name will be used as the
        `legendtitle` if `legendtitle` is None.
    response : array_like
        The reponse or dependent variable. If a `pandas.Series` is given
        its name will be used in `ylabel` if `ylabel` is None.
    func : function
        Anything accepted by `pandas.DataFrame.aggregate`. This is applied to
        the response variable grouped by the trace levels.
    ax : axes, optional
        Matplotlib axes instance
    plottype : str {'line', 'scatter', 'both'}, optional
        The type of plot to return. Can be 'l', 's', or 'b'
    xlabel : str, optional
        Label to use for `x`. Default is 'X'. If `x` is a `pandas.Series` it
        will use the series names.
    ylabel : str, optional
        Label to use for `response`. Default is 'func of response'. If
        `response` is a `pandas.Series` it will use the series names.
    colors : list, optional
        If given, must have length == number of levels in trace.
    markers : list, optional
        If given, must have length == number of levels in trace
    linestyles : list, optional
        If given, must have length == number of levels in trace.
    legendloc : {None, str, int}
        Location passed to the legend command.
    legendtitle : {None, str}
        Title of the legend.
    **kwargs
        These will be passed to the plot command used either plot or scatter.
        If you want to control the overall plotting options, use kwargs.

    Returns
    -------
    Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> np.random.seed(12345)
    >>> weight = np.random.randint(1,4,size=60)
    >>> duration = np.random.randint(1,3,size=60)
    >>> days = np.log(np.random.randint(1,30, size=60))
    >>> fig = interaction_plot(weight, duration, days,
    ...             colors=['red','blue'], markers=['D','^'], ms=10)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot::

       import numpy as np
       from statsmodels.graphics.factorplots import interaction_plot
       np.random.seed(12345)
       weight = np.random.randint(1,4,size=60)
       duration = np.random.randint(1,3,size=60)
       days = np.log(np.random.randint(1,30, size=60))
       fig = interaction_plot(weight, duration, days,
                   colors=['red','blue'], markers=['D','^'], ms=10)
       import matplotlib.pyplot as plt
       #plt.show()
    """

    from pandas import DataFrame
    fig, ax = utils.create_mpl_ax(ax)

    response_name = ylabel or getattr(response, 'name', 'response')
    ylabel = '%s of %s' % (func.__name__, response_name)
    xlabel = xlabel or getattr(x, 'name', 'X')
    legendtitle = legendtitle or getattr(trace, 'name', 'Trace')

    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    x_values = x_levels = None
    if isinstance(x[0], str):
        x_levels = [l for l in np.unique(x)]
        x_values = lrange(len(x_levels))
        x = _recode(x, dict(zip(x_levels, x_values)))

    data = DataFrame(dict(x=x, trace=trace, response=response))
    plot_data = data.groupby(['trace', 'x']).aggregate(func).reset_index()

    # return data
    # check plot args
    n_trace = len(plot_data['trace'].unique())

    linestyles = ['-'] * n_trace if linestyles is None else linestyles
    markers = ['.'] * n_trace if markers is None else markers
    colors = rainbow(n_trace) if colors is None else colors

    if len(linestyles) != n_trace:
        raise ValueError("Must be a linestyle for each trace level")
    if len(markers) != n_trace:
        raise ValueError("Must be a marker for each trace level")
    if len(colors) != n_trace:
        raise ValueError("Must be a color for each trace level")

    if plottype == 'both' or plottype == 'b':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.plot(group['x'],
                    group['response'],
                    color=colors[i],
                    marker=markers[i],
                    label=label,
                    linestyle=linestyles[i],
                    **kwargs)
    elif plottype == 'line' or plottype == 'l':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.plot(group['x'],
                    group['response'],
                    color=colors[i],
                    label=label,
                    linestyle=linestyles[i],
                    **kwargs)
    elif plottype == 'scatter' or plottype == 's':
        for i, (values, group) in enumerate(plot_data.groupby(['trace'])):
            # trace label
            label = str(group['trace'].values[0])
            ax.scatter(group['x'],
                       group['response'],
                       color=colors[i],
                       label=label,
                       marker=markers[i],
                       **kwargs)

    else:
        raise ValueError("Plot type %s not understood" % plottype)
    ax.legend(loc=legendloc, title=legendtitle)
    ax.margins(.1)

    if all([x_levels, x_values]):
        ax.set_xticks(x_values)
        ax.set_xticklabels(x_levels)
    return fig
Beispiel #21
0
def plot_acf(x,
             ax=None,
             lags=None,
             alpha=.05,
             use_vlines=True,
             unbiased=False,
             fft=False,
             **kwargs):
    """Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : array_like, optional
        Array of lag values, used on horizontal axis.
        If not given, ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    unbiased : bool
       If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    if lags is None:
        lags = np.arange(len(x))
        nlags = len(lags) - 1
    else:
        nlags = lags
        lags = np.arange(lags + 1)  # +1 for zero lag

    confint = None
    # acf has different return type based on alpha
    if alpha is None:
        acf_x = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased)
    else:
        acf_x, confint = acf(x,
                             nlags=nlags,
                             alpha=alpha,
                             fft=fft,
                             unbiased=unbiased)

    if use_vlines:
        ax.vlines(lags, [0], acf_x, **kwargs)
        ax.axhline(**kwargs)

    kwargs.setdefault('marker', 'o')
    kwargs.setdefault('markersize', 5)
    kwargs.setdefault('linestyle', 'None')
    ax.margins(.05)
    ax.plot(lags, acf_x, **kwargs)
    ax.set_title("Autocorrelation")

    if confint is not None:
        # center the confidence interval TODO: do in acf?
        ax.fill_between(lags,
                        confint[:, 0] - acf_x,
                        confint[:, 1] - acf_x,
                        alpha=.25)

    return fig
Beispiel #22
0
    def plot_fit_obs(self, col_name, lowess_args=None,
                     lowess_min_n=40, jitter=None,
                     plot_points=True, ax=None):
        """
        Plot fitted versus imputed or observed values as a scatterplot.

        Parameters
        ----------
        col_name : string
            The variable to be plotted on the horizontal axis.
        lowess_args : dict-like
            Keyword arguments passed to lowess fit.  A dictionary of
            dictionaries, keys are 'o' and 'i' denoting 'observed' and
            'imputed', respectively.
        lowess_min_n : integer
            Minimum sample size to plot a lowess fit
        jitter : float or tuple
            Standard deviation for jittering points in the plot.
            Either a single scalar applied to both axes, or a tuple
            containing x-axis jitter and y-axis jitter, respectively.
        plot_points : bool
            If True, the data points are plotted.
        ax : matplotlib axes object
            Axes on which to plot, created if not provided.

        Returns
        -------
        The matplotlib figure on which the plot is drawn.
        """

        from statsmodels.graphics import utils as gutils
        from statsmodels.nonparametric.smoothers_lowess import lowess

        if lowess_args is None:
            lowess_args = {}

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        ax.set_position([0.1, 0.1, 0.7, 0.8])

        ixi = self.ix_miss[col_name]
        ixo = self.ix_obs[col_name]

        vec1 = np.asarray(self.data[col_name])

        # Fitted values
        formula = self.conditional_formula[col_name]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")
        results = self.results[col_name]
        vec2 = results.predict(exog=exog)
        vec2 = self._get_predicted(vec2)

        if jitter is not None:
            if np.isscalar(jitter):
                jitter = (jitter, jitter)
            vec1 += jitter[0] * np.random.normal(size=len(vec1))
            vec2 += jitter[1] * np.random.normal(size=len(vec2))

        # Plot the points
        keys = ['o', 'i']
        ixs = {'o': ixo, 'i': ixi}
        lak = {'o': 'obs', 'i': 'imp'}
        color = {'o': 'orange', 'i': 'lime'}
        if plot_points:
            for ky in keys:
                ix = ixs[ky]
                ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky],
                        label=lak[ky], alpha=0.6)

        # Plot the lowess fits
        for ky in keys:
            ix = ixs[ky]
            if len(ix) < lowess_min_n:
                continue
            if ky in lowess_args:
                la = lowess_args[ky]
            else:
                la = {}
            ix = ixs[ky]
            lfit = lowess(vec2[ix], vec1[ix], **la)
            ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky],
                    alpha=0.6, lw=4, label=lak[ky])

        ha, la = ax.get_legend_handles_labels()
        leg = fig.legend(ha, la, 'center right', numpoints=1)
        leg.draw_frame(False)

        ax.set_xlabel(col_name + " observed or imputed")
        ax.set_ylabel(col_name + " fitted")

        return fig
Beispiel #23
0
    def plot_missing_pattern(self, ax=None, row_order="pattern",
                             column_order="pattern",
                             hide_complete_rows=False,
                             hide_complete_columns=False,
                             color_row_patterns=True):
        """
        Generate an image showing the missing data pattern.

        Parameters
        ----------
        ax : matplotlib axes
            Axes on which to draw the plot.
        row_order : string
            The method for ordering the rows.  Must be one of 'pattern',
            'proportion', or 'raw'.
        column_order : string
            The method for ordering the columns.  Must be one of 'pattern',
            'proportion', or 'raw'.
        hide_complete_rows : boolean
            If True, rows with no missing values are not drawn.
        hide_complete_columns : boolean
            If True, columns with no missing values are not drawn.
        color_row_patterns : boolean
            If True, color the unique row patterns, otherwise use grey
            and white as colors.

        Returns
        -------
        A figure containing a plot of the missing data pattern.
        """

        # Create an indicator matrix for missing values.
        miss = np.zeros(self.data.shape)
        cols = self.data.columns
        for j, col in enumerate(cols):
            ix = self.ix_miss[col]
            miss[ix, j] = 1

        # Order the columns as requested
        if column_order == "proportion":
            ix = np.argsort(miss.mean(0))
        elif column_order == "pattern":
            cv = np.cov(miss.T)
            u, s, vt = np.linalg.svd(cv, 0)
            ix = np.argsort(cv[:, 0])
        elif column_order == "raw":
            ix = np.arange(len(cols))
        else:
            raise ValueError(
                column_order + " is not an allowed value for `column_order`.")
        miss = miss[:, ix]
        cols = [cols[i] for i in ix]

        # Order the rows as requested
        if row_order == "proportion":
            ix = np.argsort(miss.mean(1))
        elif row_order == "pattern":
            x = 2**np.arange(miss.shape[1])
            rky = np.dot(miss, x)
            ix = np.argsort(rky)
        elif row_order == "raw":
            ix = np.arange(miss.shape[0])
        else:
            raise ValueError(
                row_order + " is not an allowed value for `row_order`.")
        miss = miss[ix, :]

        if hide_complete_rows:
            ix = np.flatnonzero((miss == 1).any(1))
            miss = miss[ix, :]

        if hide_complete_columns:
            ix = np.flatnonzero((miss == 1).any(0))
            miss = miss[:, ix]
            cols = [cols[i] for i in ix]

        from statsmodels.graphics import utils as gutils
        from matplotlib.colors import LinearSegmentedColormap

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        if color_row_patterns:
            x = 2**np.arange(miss.shape[1])
            rky = np.dot(miss, x)
            _, rcol = np.unique(rky, return_inverse=True)
            miss *= 1 + rcol[:, None]
            ax.imshow(miss, aspect="auto", interpolation="nearest",
                      cmap='gist_ncar_r')
        else:
            cmap = LinearSegmentedColormap.from_list("_",
                                                     ["white", "darkgrey"])
            ax.imshow(miss, aspect="auto", interpolation="nearest",
                      cmap=cmap)

        ax.set_ylabel("Cases")
        ax.set_xticks(range(len(cols)))
        ax.set_xticklabels(cols, rotation=90)

        return fig
Beispiel #24
0
def plot_predict(
    result,
    start=None,
    end=None,
    dynamic=False,
    alpha=0.05,
    ax=None,
    **predict_kwargs,
):
    """

    Parameters
    ----------
    result : Result
        Any model result supporting ``get_prediction``.
    start : int, str, or datetime, optional
        Zero-indexed observation number at which to start forecasting,
        i.e., the first forecast is start. Can also be a date string to
        parse or a datetime type. Default is the the zeroth observation.
    end : int, str, or datetime, optional
        Zero-indexed observation number at which to end forecasting, i.e.,
        the last forecast is end. Can also be a date string to
        parse or a datetime type. However, if the dates index does not
        have a fixed frequency, end must be an integer index if you
        want out of sample prediction. Default is the last observation in
        the sample.
    dynamic : bool, int, str, or datetime, optional
        Integer offset relative to `start` at which to begin dynamic
        prediction. Can also be an absolute date string to parse or a
        datetime type (these are not interpreted as offsets).
        Prior to this observation, true endogenous values will be used for
        prediction; starting with this observation and continuing through
        the end of prediction, forecasted endogenous values will be used
        instead.
    alpha : {float, None}
        The tail probability not covered by the confidence interval. Must
        be in (0, 1). Confidence interval is constructed assuming normally
        distributed shocks. If None, figure will not show the confidence
        interval.
    ax : AxesSubplot
        matplotlib Axes instance to use
    **predict_kwargs
        Any additional keyword arguments to pass to ``result.get_prediction``.

    Returns
    -------
    Figure
        matplotlib Figure containing the prediction plot
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_ax

    _ = _import_mpl()
    fig, ax = create_mpl_ax(ax)
    from statsmodels.tsa.base.prediction import PredictionResults

    # use predict so you set dates
    pred: PredictionResults = result.get_prediction(start=start,
                                                    end=end,
                                                    dynamic=dynamic,
                                                    **predict_kwargs)
    mean = pred.predicted_mean
    if isinstance(mean, (pd.Series, pd.DataFrame)):
        x = mean.index
        mean.plot(ax=ax, label="forecast")
    else:
        x = np.arange(mean.shape[0])
        ax.plot(x, mean)

    if alpha is not None:
        label = f"{1-alpha:.0%} confidence interval"
        ci = pred.conf_int(alpha)
        conf_int = np.asarray(ci)

        ax.fill_between(
            x,
            conf_int[:, 0],
            conf_int[:, 1],
            color="gray",
            alpha=0.5,
            label=label,
        )

    ax.legend(loc="best")

    return fig
Beispiel #25
0
def plot_fit(results, exog_idx, y_true=None, ax=None, vlines=True, **kwargs):
    """
    Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values
    compared to fitted values.

    Parameters
    ----------
    results : Results
        A result instance with resid, model.endog and model.exog as
        attributes.
    exog_idx : {int, str}
        Name or index of regressor in exog matrix.
    y_true : array_like. optional
        If this is not None, then the array is added to the plot.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    vlines : bool, optional
        If this not True, then the uncertainty of the fit is not
        plotted.
    **kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty':

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py
    """

    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    if vlines is True:
        _, iv_l, iv_u = wls_prediction_std(results)
        ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1,
                  color='k', alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig
Beispiel #26
0
    def plot_missing_pattern(self, ax=None, row_order="pattern",
                             column_order="pattern",
                             hide_complete_rows=False,
                             hide_complete_columns=False,
                             color_row_patterns=True):
        """
        Generate an image showing the missing data pattern.

        Parameters
        ----------
        ax : matplotlib axes
            Axes on which to draw the plot.
        row_order : string
            The method for ordering the rows.  Must be one of 'pattern',
            'proportion', or 'raw'.
        column_order : string
            The method for ordering the columns.  Must be one of 'pattern',
            'proportion', or 'raw'.
        hide_complete_rows : boolean
            If True, rows with no missing values are not drawn.
        hide_complete_columns : boolean
            If True, columns with no missing values are not drawn.
        color_row_patterns : boolean
            If True, color the unique row patterns, otherwise use grey
            and white as colors.

        Returns
        -------
        A figure containing a plot of the missing data pattern.
        """

        # Create an indicator matrix for missing values.
        miss = np.zeros(self.data.shape)
        cols = self.data.columns
        for j, col in enumerate(cols):
            ix = self.ix_miss[col]
            miss[ix, j] = 1

        # Order the columns as requested
        if column_order == "proportion":
            ix = np.argsort(miss.mean(0))
        elif column_order == "pattern":
            cv = np.cov(miss.T)
            u, s, vt = np.linalg.svd(cv, 0)
            ix = np.argsort(cv[:, 0])
        elif column_order == "raw":
            ix = np.arange(len(cols))
        else:
            raise ValueError(column_order + " is not an allowed value for `column_order`.")
        miss = miss[:, ix]
        cols = [cols[i] for i in ix]

        # Order the rows as requested
        if row_order == "proportion":
            ix = np.argsort(miss.mean(1))
        elif row_order == "pattern":
            x = 2**np.arange(miss.shape[1])
            rky = np.dot(miss, x)
            ix = np.argsort(rky)
        elif row_order == "raw":
            ix = np.arange(miss.shape[0])
        else:
            raise ValueError(row_order + " is not an allowed value for `row_order`.")
        miss = miss[ix, :]

        if hide_complete_rows:
            ix = np.flatnonzero((miss == 1).any(1))
            miss = miss[ix, :]

        if hide_complete_columns:
            ix = np.flatnonzero((miss == 1).any(0))
            miss = miss[:, ix]
            cols = [cols[i] for i in ix]

        from statsmodels.graphics import utils as gutils
        from matplotlib.colors import LinearSegmentedColormap

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        if color_row_patterns:
            x = 2**np.arange(miss.shape[1])
            rky = np.dot(miss, x)
            _, rcol = np.unique(rky, return_inverse=True)
            miss *= 1 + rcol[:, None]
            ax.imshow(miss, aspect="auto", interpolation="nearest",
                      cmap='gist_ncar_r')
        else:
            cmap = LinearSegmentedColormap.from_list("_",
                                         ["white", "darkgrey"])
            ax.imshow(miss, aspect="auto", interpolation="nearest",
                      cmap=cmap)

        ax.set_ylabel("Cases")
        ax.set_xticks(range(len(cols)))
        ax.set_xticklabels(cols, rotation=90)

        return fig
Beispiel #27
0
def plot_pacf(x,
              ax=None,
              lags=None,
              alpha=.05,
              method='ywunbiased',
              use_vlines=True,
              title='Partial Autocorrelation',
              zero=True,
              **kwargs):
    """
    Plot the partial autocorrelation function

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : int or array_like, optional
        int or Array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : float, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))
    method : {'ywunbiased', 'ywmle', 'ols'}
        Specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf. Default.
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    title : str, optional
        Title to place on plot.  Default is 'Partial Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Plots lags on the horizontal and the correlations on vertical axis.
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``
    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    confint = None
    if alpha is None:
        acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines,
               **kwargs)

    return fig
Beispiel #28
0
def plot_acf(x,
             ax=None,
             lags=None,
             alpha=.05,
             use_vlines=True,
             unbiased=False,
             fft=False,
             title='Autocorrelation',
             zero=True,
             **kwargs):
    """Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : int or array_like, optional
        int or Array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    unbiased : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    title : str, optional
        Title to place on plot.  Default is 'Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    confint = None
    # acf has different return type based on alpha
    if alpha is None:
        acf_x = acf(x, nlags=nlags, alpha=alpha, fft=fft, unbiased=unbiased)
    else:
        acf_x, confint = acf(x,
                             nlags=nlags,
                             alpha=alpha,
                             fft=fft,
                             unbiased=unbiased)

    _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines,
               **kwargs)

    return fig
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs):
    """Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values compared to
    fitted values.

    Parameters
    ----------
    results : result instance
        result instance with resid, model.endog and model.exog as attributes
    x_var : int or str
        Name or index of regressor in exog matrix.
    y_true : array_like
        (optional) If this is not None, then the array is added to the plot
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty':

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py

    """

    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if not y_true is None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    prstd, iv_l, iv_u = wls_prediction_std(results)
    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k',
              alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                    color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig
def plot_ccf(x,
             y,
             ax=None,
             lags=None,
             *,
             alpha=.05,
             use_vlines=True,
             unbiased=False,
             title='Cross-correlation',
             vlines_kwargs=None,
             **kwargs):
    """
    Plot the cross-correlation function
    Plots lags on the horizontal and the correlations on vertical axis.
    Parameters
    ----------
    x : array_like
        Array of time-series values
    y:  array_like
        Array of time-series values
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    unbiased : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    title : str, optional
        Title to place on plot.  Default is 'Cross-correlation'
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.
    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    Notes
    -----
    Adapted from 
    https://github.com/statsmodels/statsmodels/blob/master/statsmodels/graphics/tsaplots.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot_ccf(x, lags)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs

    confint = None
    ccf_data = ccf(y, x, lag_max=nlags)

    if alpha is not None:
        z = norm.ppf(1 - alpha / 2)
        sl = z / np.sqrt(len(x))
        confint = np.empty((2 * nlags + 1, 2))
        confint[:, 0] = ccf_data - sl
        confint[:, 1] = ccf_data + sl

    _plot_corr(ax, title, ccf_data, confint, lags, irregular, use_vlines,
               vlines_kwargs, **kwargs)

    return fig
Beispiel #31
0
    def mv_mean_contour(self,
                        mu1_low,
                        mu1_upp,
                        mu2_low,
                        mu2_upp,
                        step1,
                        step2,
                        levs=(.001, .01, .05, .1, .2),
                        var1_name=None,
                        var2_name=None,
                        plot_dta=False):
        """
        Creates a confidence region plot for the mean of bivariate data

        Parameters
        ----------
        m1_low : float
            Minimum value of the mean for variable 1
        m1_upp : float
            Maximum value of the mean for variable 1
        mu2_low : float
            Minimum value of the mean for variable 2
        mu2_upp : float
            Maximum value of the mean for variable 2
        step1 : float
            Increment of evaluations for variable 1
        step2 : float
            Increment of evaluations for variable 2
        levs : list
            Levels to be drawn on the contour plot.
            Default =  (.001, .01, .05, .1, .2)
        plot_dta : bool
            If True, makes a scatter plot of the data on
            top of the contour plot. Defaultis False.
        var1_name : str
            Name of variable 1 to be plotted on the x-axis
        var2_name : str
            Name of variable 2 to be plotted on the y-axis

        Notes
        -----
        The smaller the step size, the more accurate the intervals
        will be

        If the function returns optimization failed, consider narrowing
        the boundaries of the plot

        Examples
        --------
        >>> import statsmodels.api as sm
        >>> two_rvs = np.random.standard_normal((20,2))
        >>> el_analysis = sm.emplike.DescStat(two_rvs)
        >>> contourp = el_analysis.mv_mean_contour(-2, 2, -2, 2, .1, .1)
        >>> contourp.show()
        """
        if self.endog.shape[1] != 2:
            raise ValueError('Data must contain exactly two variables')
        fig, ax = utils.create_mpl_ax()
        if var2_name is None:
            ax.set_ylabel('Variable 2')
        else:
            ax.set_ylabel(var2_name)
        if var1_name is None:
            ax.set_xlabel('Variable 1')
        else:
            ax.set_xlabel(var1_name)
        x = np.arange(mu1_low, mu1_upp, step1)
        y = np.arange(mu2_low, mu2_upp, step2)
        pairs = itertools.product(x, y)
        z = []
        for i in pairs:
            z.append(self.mv_test_mean(np.asarray(i))[0])
        X, Y = np.meshgrid(x, y)
        z = np.asarray(z)
        z = z.reshape(X.shape[1], Y.shape[0])
        ax.contour(x, y, z.T, levels=levs)
        if plot_dta:
            ax.plot(self.endog[:, 0], self.endog[:, 1], 'bo')
        return fig
Beispiel #32
0
def _influence_plot(results,
                    influence,
                    external=True,
                    alpha=.05,
                    criterion="cooks",
                    size=48,
                    plot_alpha=.75,
                    ax=None,
                    leverage=None,
                    resid=None,
                    **kwargs):
    # leverage and resid kwds are used only internally for MLEInfluence
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range / old_range + 8**2

    if leverage is None:
        leverage = infl.hat_matrix_diag
    if resid is None:
        ylabel = "Studentized Residuals"
        if external:
            resid = infl.resid_studentized_external
        else:
            resid = infl.resid_studentized
    else:
        resid = np.asarray(resid)
        ylabel = "Residuals"

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resid) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resid, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resid))
    ax = utils.annotate_axes(
        np.where(large_points)[0], labels, lzip(leverage, resid),
        lzip(-(psize / 2)**.5, (psize / 2)**.5), "x-large", ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel(ylabel, **font)
    ax.set_xlabel("Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
Beispiel #33
0
def abline_plot(intercept=None, slope=None, horiz=None, vert=None,
                model_results=None, ax=None, **kwargs):
    """
    Plot a line given an intercept and slope.

    Parameters
    ----------
    intercept : float
        The intercept of the line.
    slope : float
        The slope of the line.
    horiz : float or array_like
        Data for horizontal lines on the y-axis.
    vert : array_like
        Data for verterical lines on the x-axis.
    model_results : statsmodels results instance
        Any object that has a two-value `params` attribute. Assumed that it
        is (intercept, slope).
    ax : axes, optional
        Matplotlib axes instance.
    **kwargs
        Options passed to matplotlib.pyplot.plt.

    Returns
    -------
    Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    >>> np.random.seed(12345)
    >>> X = sm.add_constant(np.random.normal(0, 20, size=30))
    >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30)
    >>> mod = sm.OLS(y,X).fit()
    >>> fig = sm.graphics.abline_plot(model_results=mod)
    >>> ax = fig.axes[0]
    >>> ax.scatter(X[:,1], y)
    >>> ax.margins(.1)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot:: plots/graphics_regression_abline.py
    """
    if ax is not None:  # get axis limits first thing, do not change these
        x = ax.get_xlim()
    else:
        x = None

    fig, ax = utils.create_mpl_ax(ax)

    if model_results:
        intercept, slope = model_results.params
        if x is None:
            x = [model_results.model.exog[:, 1].min(),
                 model_results.model.exog[:, 1].max()]
    else:
        if not (intercept is not None and slope is not None):
            raise ValueError("specify slope and intercepty or model_results")
        if x is None:
            x = ax.get_xlim()

    data_y = [x[0]*slope+intercept, x[1]*slope+intercept]
    ax.set_xlim(x)
    #ax.set_ylim(y)

    from matplotlib.lines import Line2D

    class ABLine2D(Line2D):
        def __init__(self, *args, **kwargs):
            super(ABLine2D, self).__init__(*args, **kwargs)
            self.id_xlim_callback = None
            self.id_ylim_callback = None

        def remove(self):
            ax = self.axes
            if self.id_xlim_callback:
                ax.callbacks.disconnect(self.id_xlim_callback)
            if self.id_ylim_callback:
                ax.callbacks.disconnect(self.id_ylim_callback)
            super(ABLine2D, self).remove()

        def update_datalim(self, ax):
            ax.set_autoscale_on(False)
            children = ax.get_children()
            ablines = [child for child in children if child is self]
            abline = ablines[0]
            x = ax.get_xlim()
            y = [x[0] * slope + intercept, x[1] * slope + intercept]
            abline.set_data(x, y)
            ax.figure.canvas.draw()

    # TODO: how to intercept something like a margins call and adjust?
    line = ABLine2D(x, data_y, **kwargs)
    ax.add_line(line)
    line.id_xlim_callback = ax.callbacks.connect('xlim_changed', line.update_datalim)
    line.id_ylim_callback = ax.callbacks.connect('ylim_changed', line.update_datalim)

    if horiz:
        ax.hline(horiz)
    if vert:
        ax.vline(vert)
    return fig
Beispiel #34
0
    def plot_fit_obs(self, col_name, lowess_args=None,
                     lowess_min_n=40, jitter=None,
                     plot_points=True, ax=None):
        """
        Plot fitted versus imputed or observed values as a scatterplot.

        Parameters
        ----------
        col_name : string
            The variable to be plotted on the horizontal axis.
        lowess_args : dict-like
            Keyword arguments passed to lowess fit.  A dictionary of
            dictionaries, keys are 'o' and 'i' denoting 'observed' and
            'imputed', respectively.
        lowess_min_n : integer
            Minimum sample size to plot a lowess fit
        jitter : float or tuple
            Standard deviation for jittering points in the plot.
            Either a single scalar applied to both axes, or a tuple
            containing x-axis jitter and y-axis jitter, respectively.
        plot_points : bool
            If True, the data points are plotted.
        ax : matplotlib axes object
            Axes on which to plot, created if not provided.

        Returns
        -------
        The matplotlib figure on which the plot is drawn.
        """

        from statsmodels.graphics import utils as gutils
        from statsmodels.nonparametric.smoothers_lowess import lowess
        import pandas as pd

        if lowess_args is None:
            lowess_args = {}

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        ax.set_position([0.1, 0.1, 0.7, 0.8])

        ixi = self.ix_miss[col_name]
        ixo = self.ix_obs[col_name]

        vec1 = np.asarray(self.data[col_name])

        # Fitted values
        formula = self.conditional_formula[col_name]
        endog, exog = patsy.dmatrices(formula, self.data,
                                      return_type="dataframe")
        results = self.results[col_name]
        vec2 = results.predict(exog=exog)
        vec2 = self._get_predicted(vec2)

        if jitter is not None:
            if np.isscalar(jitter):
                jitter = (jitter, jitter)
            vec1 += jitter[0] * np.random.normal(size=len(vec1))
            vec2 += jitter[1] * np.random.normal(size=len(vec2))

        # Plot the points
        keys = ['o', 'i']
        ixs = {'o': ixo, 'i': ixi}
        lak = {'o': 'obs', 'i': 'imp'}
        color = {'o': 'orange', 'i': 'lime'}
        if plot_points:
            for ky in keys:
                ix = ixs[ky]
                ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky],
                        label=lak[ky], alpha=0.6)

        # Plot the lowess fits
        for ky in keys:
            ix = ixs[ky]
            if len(ix) < lowess_min_n:
                continue
            if ky in lowess_args:
                la = lowess_args[ky]
            else:
                la = {}
            ix = ixs[ky]
            lfit = lowess(vec2[ix], vec1[ix], **la)
            ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky],
                    alpha=0.6, lw=4, label=lak[ky])

        ha, la = ax.get_legend_handles_labels()
        leg = fig.legend(ha, la, 'center right', numpoints=1)
        leg.draw_frame(False)

        ax.set_xlabel(col_name + " observed or imputed")
        ax.set_ylabel(col_name + " fitted")

        return fig
Beispiel #35
0
def split_plot3(grouped_x,
                keyOrder,
                ydim,
                labdim,
                xticklabels=None,
                ylabel=None,
                ax=None,
                label=False,
                keyHighlight=None,
                ylim=None):
    fig, ax = utils.create_mpl_ax(ax)
    start = 0
    ticks = []
    tmpxlabels = []
    for key in keyOrder:
        df = grouped_x.get_group(key)
        nobs = len(df)
        x_plot = arange(start, start + nobs)
        ticks.append(x_plot.mean())
        #Third parameter is color; 'k' is black
        ax.plot(x_plot, df[ydim], 'g', linestyle='--')

        #named colors: http://matplotlib.org/examples/color/named_colors.html
        highlightColors = [
            'mistyrose', 'lightsalmon', 'salmon', 'tomato', 'orangered'
        ]
        baseColors = [
            'silver', 'lightblue', 'paleturquoise', 'lightcyan', 'lightgreen'
        ]
        if key == keyHighlight:
            colors = highlightColors
        else:
            colors = baseColors
        for i in range(0, nobs):
            if ylim is not None and (df[ydim].iloc[i] <= ylim[0]
                                     or df[ydim].iloc[i] >= ylim[1]):
                continue
            if label:
                if nobs <= len(colors):
                    if keyHighlight == 'leader':
                        if int(df[labdim].iloc[i]) == 1:
                            color = highlightColors[i - nobs]
                        else:
                            color = baseColors[i - nobs]
                    else:
                        color = colors[i - nobs]
                else:
                    color = 'pink'
                ax.text(x_plot[i],
                        df[ydim].iloc[i],
                        int(df[labdim].iloc[i]),
                        bbox=dict(boxstyle='round,pad=0.3', color=color))
            #elif df[ydim].iloc[i]>self.RC1SIZE:
            #    ax.text(x_plot[i]-1, df[ydim].iloc[i], int(df[labdim].iloc[i]),
            #            bbox=dict(  boxstyle='round,pad=0.3',color='pink')) #facecolor='none',edgecolor='black',
            else:
                ax.plot(x_plot[i], df[labdim].iloc[i], 'or')
        #ax.hlines(df.values.mean(), x_plot[0], x_plot[-1], colors='k')
        start += nobs
        tmpxlabels.append(key)

    if xticklabels is None:
        xticklabels = tmpxlabels
    elif isinstance(xticklabels, dict):
        xticklabels = [
            xticklabels[x] if x in xticklabels else x for x in tmpxlabels
        ]
    ax.set_xticks(ticks)
    ax.set_xticklabels(xticklabels)
    if ylabel is not None: ax.set_ylabel(ylabel)
    ax.margins(.1, .05)
    return fig
Beispiel #36
0
def plot_survfunc(survfuncs, ax=None):
    """
    Plot one or more survivor functions.

    Parameters
    ----------
    survfuncs : object or array-like
        A single SurvfuncRight object, or a list or SurvfuncRight
        objects that are plotted together.

    Returns
    -------
    A figure instance on which the plot was drawn.

    Examples
    --------
    Add a legend:

    >>> import statsmodels.api as sm
    >>> from statsmodels.duration.survfunc import plot_survfunc
    >>> data = sm.datasets.get_rdataset("flchain", "survival").data
    >>> df = data.loc[data.sex == "F", :]
    >>> sf0 = sm.SurvfuncRight(df["futime"], df["death"])
    >>> sf1 = sm.SurvfuncRight(3.0 * df["futime"], df["death"])
    >>> fig = plot_survfunc([sf0, sf1])
    >>> ax = fig.get_axes()[0]
    >>> ax.set_position([0.1, 0.1, 0.64, 0.8])
    >>> ha, lb = ax.get_legend_handles_labels()
    >>> leg = fig.legend((ha[0], ha[1]), (lb[0], lb[1]), 'center right')

    Change the line colors:

    >>> fig = plot_survfunc([sf0, sf1])
    >>> ax = fig.get_axes()[0]
    >>> ax.set_position([0.1, 0.1, 0.64, 0.8])
    >>> ha, lb = ax.get_legend_handles_labels()
    >>> ha[0].set_color('purple')
    >>> ha[1].set_color('orange')
    """

    fig, ax = utils.create_mpl_ax(ax)

    # If we have only a single survival function to plot, put it into
    # a list.
    try:
        assert(type(survfuncs[0]) is SurvfuncRight)
    except:
        survfuncs = [survfuncs]

    for gx, sf in enumerate(survfuncs):

        # The estimated survival function does not include a point at
        # time 0, include it here for plotting.
        surv_times = np.concatenate(([0], sf.surv_times))
        surv_prob = np.concatenate(([1], sf.surv_prob))

        # If the final times are censoring times they are not included
        # in the survival function so we add them here
        mxt = max(sf.time)
        if mxt > surv_times[-1]:
            surv_times = np.concatenate((surv_times, [mxt]))
            surv_prob = np.concatenate((surv_prob, [surv_prob[-1]]))

        label = getattr(sf, "title", "Group %d" % (gx + 1))

        li, = ax.step(surv_times, surv_prob, '-', label=label, lw=2,
                      where='post')

        # Plot the censored points.
        ii = np.flatnonzero(np.logical_not(sf.status))
        ti = sf.time[ii]
        jj = np.searchsorted(surv_times, ti) - 1
        sp = surv_prob[jj]
        ax.plot(ti, sp, '+', ms=12, color=li.get_color(),
                label=label + " points")

    ax.set_ylim(0, 1.01)

    return fig
Beispiel #37
0
def plot_acf(
    x,
    ax=None,
    lags=None,
    *,
    alpha=0.05,
    use_vlines=True,
    adjusted=False,
    fft=False,
    missing="none",
    title="Autocorrelation",
    zero=True,
    auto_ylims=False,
    bartlett_confint=True,
    vlines_kwargs=None,
    **kwargs,
):
    """
    Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    adjusted : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    missing : str, optional
        A string in ['none', 'raise', 'conservative', 'drop'] specifying how
        the NaNs are to be treated.
    title : str, optional
        Title to place on plot.  Default is 'Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    auto_ylims : bool, optional
        If True, adjusts automatically the y-axis limits to ACF values.
    bartlett_confint : bool, default True
        Confidence intervals for ACF values are generally placed at 2
        standard errors around r_k. The formula used for standard error
        depends upon the situation. If the autocorrelations are being used
        to test for randomness of residuals as part of the ARIMA routine,
        the standard errors are determined assuming the residuals are white
        noise. The approximate formula for any lag is that standard error
        of each r_k = 1/sqrt(N). See section 9.4 of [1] for more details on
        the 1/sqrt(N) result. For more elementary discussion, see section
        5.3.2 in [2].
        For the ACF of raw data, the standard error at a lag k is
        found as if the right model was an MA(k-1). This allows the
        possible interpretation that if all autocorrelations past a
        certain lag are within the limits, the model might be an MA of
        order defined by the last significant autocorrelation. In this
        case, a moving average model is assumed for the data and the
        standard errors for the confidence intervals should be
        generated using Bartlett's formula. For more details on
        Bartlett formula result, see section 7.2 in [1].
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    kwargs is used to pass matplotlib optional arguments to both the line
    tracing the autocorrelations and for the horizontal line at 0. These
    options must be valid for a Line2D object.

    vlines_kwargs is used to pass additional optional arguments to the
    vertical lines connecting each autocorrelation to the axis.  These options
    must be valid for a LineCollection object.

    References
    ----------
    [1] Brockwell and Davis, 1987. Time Series Theory and Methods
    [2] Brockwell and Davis, 2010. Introduction to Time Series and
    Forecasting, 2nd edition.

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.sunspots.load_pandas().data
    >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    >>> del dta["YEAR"]
    >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)
    >>> plt.show()

    .. plot:: plots/graphics_tsa_plot_acf.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs

    confint = None
    # acf has different return type based on alpha
    acf_x = acf(
        x,
        nlags=nlags,
        alpha=alpha,
        fft=fft,
        bartlett_confint=bartlett_confint,
        adjusted=adjusted,
        missing=missing,
    )
    if alpha is not None:
        acf_x, confint = acf_x[:2]

    _plot_corr(
        ax,
        title,
        acf_x,
        confint,
        lags,
        irregular,
        use_vlines,
        vlines_kwargs,
        auto_ylims=auto_ylims,
        **kwargs,
    )

    return fig
Beispiel #38
0
def plot_acf(x, ax=None, lags=None, alpha=.05, use_vlines=True, unbiased=False,
            fft=False, **kwargs):
    """Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : array_like, optional
        Array of lag values, used on horizontal axis.
        If not given, ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    unbiased : bool
       If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    if lags is None:
        lags = np.arange(len(x))
        nlags = len(lags) - 1
    else:
        nlags = lags
        lags = np.arange(lags + 1) # +1 for zero lag

    acf_x, confint = acf(x, nlags=nlags, alpha=alpha, fft=fft,
                         unbiased=unbiased)

    if use_vlines:
        ax.vlines(lags, [0], acf_x, **kwargs)
        ax.axhline(**kwargs)

    # center the confidence interval TODO: do in acf?
    confint = confint - confint.mean(1)[:,None]
    kwargs.setdefault('marker', 'o')
    kwargs.setdefault('markersize', 5)
    kwargs.setdefault('linestyle', 'None')
    ax.margins(.05)
    ax.plot(lags, acf_x, **kwargs)
    ax.fill_between(lags, confint[:,0], confint[:,1], alpha=.25)
    ax.set_title("Autocorrelation")

    return fig
Beispiel #39
0
    def plot_bivariate(self, col1_name, col2_name,
                       lowess_args=None, lowess_min_n=40,
                       jitter=None, plot_points=True, ax=None):
        """
        Plot observed and imputed values for two variables.

        Displays a scatterplot of one variable against another.  The
        points are colored according to whether the values are
        observed or imputed.

        Parameters
        ----------
        col1_name : string
            The variable to be plotted on the horizontal axis.
        col2_name : string
            The variable to be plotted on the vertical axis.
        lowess_args : dictionary
            A dictionary of dictionaries, keys are 'ii', 'io', 'oi'
            and 'oo', where 'o' denotes 'observed' and 'i' denotes
            imputed.  See Notes for details.
        lowess_min_n : integer
            Minimum sample size to plot a lowess fit
        jitter : float or tuple
            Standard deviation for jittering points in the plot.
            Either a single scalar applied to both axes, or a tuple
            containing x-axis jitter and y-axis jitter, respectively.
        plot_points : bool
            If True, the data points are plotted.
        ax : matplotlib axes object
            Axes on which to plot, created if not provided.

        Returns
        -------
        The matplotlib figure on which the plot id drawn.
        """

        from statsmodels.graphics import utils as gutils
        from statsmodels.nonparametric.smoothers_lowess import lowess

        if lowess_args is None:
            lowess_args = {}

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        ax.set_position([0.1, 0.1, 0.7, 0.8])

        ix1i = self.ix_miss[col1_name]
        ix1o = self.ix_obs[col1_name]
        ix2i = self.ix_miss[col2_name]
        ix2o = self.ix_obs[col2_name]

        ix_ii = np.intersect1d(ix1i, ix2i)
        ix_io = np.intersect1d(ix1i, ix2o)
        ix_oi = np.intersect1d(ix1o, ix2i)
        ix_oo = np.intersect1d(ix1o, ix2o)

        vec1 = np.asarray(self.data[col1_name])
        vec2 = np.asarray(self.data[col2_name])

        if jitter is not None:
            if np.isscalar(jitter):
                jitter = (jitter, jitter)
            vec1 += jitter[0] * np.random.normal(size=len(vec1))
            vec2 += jitter[1] * np.random.normal(size=len(vec2))

        # Plot the points
        keys = ['oo', 'io', 'oi', 'ii']
        lak = {'i': 'imp', 'o': 'obs'}
        ixs = {'ii': ix_ii, 'io': ix_io, 'oi': ix_oi, 'oo': ix_oo}
        color = {'oo': 'grey', 'ii': 'red', 'io': 'orange',
                 'oi': 'lime'}
        if plot_points:
            for ky in keys:
                ix = ixs[ky]
                lab = lak[ky[0]] + "/" + lak[ky[1]]
                ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky],
                        label=lab, alpha=0.6)

        # Plot the lowess fits
        for ky in keys:
            ix = ixs[ky]
            if len(ix) < lowess_min_n:
                continue
            if ky in lowess_args:
                la = lowess_args[ky]
            else:
                la = {}
            ix = ixs[ky]
            lfit = lowess(vec2[ix], vec1[ix], **la)
            if plot_points:
                ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky],
                        alpha=0.6, lw=4)
            else:
                lab = lak[ky[0]] + "/" + lak[ky[1]]
                ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky],
                        alpha=0.6, lw=4, label=lab)

        ha, la = ax.get_legend_handles_labels()
        pad = 0.0001 if plot_points else 0.5
        leg = fig.legend(ha, la, 'center right', numpoints=1,
                         handletextpad=pad)
        leg.draw_frame(False)

        ax.set_xlabel(col1_name)
        ax.set_ylabel(col2_name)

        return fig
Beispiel #40
0
def mosaic(data, index=None, ax=None, horizontal=True, gap=0.005,
           properties=lambda key: None, labelizer=None,
           title='', statistic=False, axes_label=True,
           label_rotation=0.0):
    """Create a mosaic plot from a contingency table.

    It allows to visualize multivariate categorical data in a rigorous
    and informative way.

    Parameters
    ----------
    data : dict, pandas.Series, np.ndarray, pandas.DataFrame
        The contingency table that contains the data.
        Each category should contain a non-negative number
        with a tuple as index.  It expects that all the combination
        of keys to be representes; if that is not true, will
        automatically consider the missing values as 0.  The order
        of the keys will be the same as the one of insertion.
        If a dict of a Series (or any other dict like object)
        is used, it will take the keys as labels.  If a
        np.ndarray is provided, it will generate a simple
        numerical labels.
    index: list, optional
        Gives the preferred order for the category ordering. If not specified
        will default to the given order.  It doesn't support named indexes
        for hierarchical Series.  If a DataFrame is provided, it expects
        a list with the name of the columns.
    ax : matplotlib.Axes, optional
        The graph where display the mosaic. If not given, will
        create a new figure
    horizontal : bool, optional (default True)
        The starting direction of the split (by default along
        the horizontal axis)
    gap : float or array of floats
        The list of gaps to be applied on each subdivision.
        If the lenght of the given array is less of the number
        of subcategories (or if it's a single number) it will extend
        it with exponentially decreasing gaps
    labelizer : function (key) -> string, optional
        A function that generate the text to display at the center of
        each tile base on the key of that tile
    properties : function (key) -> dict, optional
        A function that for each tile in the mosaic take the key
        of the tile and returns the dictionary of properties
        of the generated Rectangle, like color, hatch or similar.
        A default properties set will be provided fot the keys whose
        color has not been defined, and will use color variation to help
        visually separates the various categories. It should return None
        to indicate that it should use the default property for the tile.
        A dictionary of the properties for each key can be passed,
        and it will be internally converted to the correct function
    statistic: bool, optional (default False)
        if true will use a crude statistical model to give colors to the plot.
        If the tile has a containt that is more than 2 standard deviation
        from the expected value under independence hipotesys, it will
        go from green to red (for positive deviations, blue otherwise) and
        will acquire an hatching when crosses the 3 sigma.
    title: string, optional
        The title of the axis
    axes_label: boolean, optional
        Show the name of each value of each category
        on the axis (default) or hide them.
    label_rotation: float or list of float
        the rotation of the axis label (if present). If a list is given
        each axis can have a different rotation

    Returns
    ----------
    fig : matplotlib.Figure
        The generate figure
    rects : dict
        A dictionary that has the same keys of the original
        dataset, that holds a reference to the coordinates of the
        tile and the Rectangle that represent it

    See Also
    ----------
    A Brief History of the Mosaic Display
        Michael Friendly, York University, Psychology Department
        Journal of Computational and Graphical Statistics, 2001

    Mosaic Displays for Loglinear Models.
        Michael Friendly, York University, Psychology Department
        Proceedings of the Statistical Graphics Section, 1992, 61-68.

    Mosaic displays for multi-way contingecy tables.
        Michael Friendly, York University, Psychology Department
        Journal of the american statistical association
        March 1994, Vol. 89, No. 425, Theory and Methods

    Examples
    ----------
    The most simple use case is to take a dictionary and plot the result

    >>> data = {'a': 10, 'b': 15, 'c': 16}
    >>> mosaic(data, title='basic dictionary')
    >>> pylab.show()

    A more useful example is given by a dictionary with multiple indices.
    In this case we use a wider gap to a better visual separation of the
    resulting plot

    >>> data = {('a', 'b'): 1, ('a', 'c'): 2, ('d', 'b'): 3, ('d', 'c'): 4}
    >>> mosaic(data, gap=0.05, title='complete dictionary')
    >>> pylab.show()

    The same data can be given as a simple or hierarchical indexed Series

    >>> rand = np.random.random
    >>> from itertools import product
    >>>
    >>> tuples = list(product(['bar', 'baz', 'foo', 'qux'], ['one', 'two']))
    >>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
    >>> data = pd.Series(rand(8), index=index)
    >>> mosaic(data, title='hierarchical index series')
    >>> pylab.show()

    The third accepted data structureis the np array, for which a
    very simple index will be created.

    >>> rand = np.random.random
    >>> data = 1+rand((2,2))
    >>> mosaic(data, title='random non-labeled array')
    >>> pylab.show()

    If you need to modify the labeling and the coloring you can give
    a function tocreate the labels and one with the graphical properties
    starting from the key tuple

    >>> data = {'a': 10, 'b': 15, 'c': 16}
    >>> props = lambda key: {'color': 'r' if 'a' in key else 'gray'}
    >>> labelizer = lambda k: {('a',): 'first', ('b',): 'second',
                               ('c',): 'third'}[k]
    >>> mosaic(data, title='colored dictionary',
                properties=props, labelizer=labelizer)
    >>> pylab.show()

    Using a DataFrame as source, specifying the name of the columns of interest
    >>> gender = ['male', 'male', 'male', 'female', 'female', 'female']
    >>> pet = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat']
    >>> data = pandas.DataFrame({'gender': gender, 'pet': pet})
    >>> mosaic(data, ['pet', 'gender'])
    >>> pylab.show()
    """
    from pylab import Rectangle
    fig, ax = utils.create_mpl_ax(ax)
    # normalize the data to a dict with tuple of strings as keys
    data = _normalize_data(data, index)
    # split the graph into different areas
    rects = _hierarchical_split(data, horizontal=horizontal, gap=gap)
    # if there is no specified way to create the labels
    # create a default one
    if labelizer is None:
        labelizer = lambda k: "\n".join(k)
    if statistic:
        default_props = _statistical_coloring(data)
    else:
        default_props = _create_default_properties(data)
    if isinstance(properties, dict):
        color_dict = properties
        properties = lambda key: color_dict.get(key, None)
    for k, v in list(rects.items()):
        # create each rectangle and put a label on it
        x, y, w, h = v
        conf = properties(k)
        props = conf if conf else default_props[k]
        text = labelizer(k)
        Rect = Rectangle((x, y), w, h, label=text, **props)
        ax.add_patch(Rect)
        ax.text(x + w / 2, y + h / 2, text, ha='center',
                 va='center', size='smaller')
    #creating the labels on the axis
    #o clearing it
    if axes_label:
        if np.iterable(label_rotation):
            rotation = label_rotation
        else:
            rotation = [label_rotation] * 4
        labels = _create_labels(rects, horizontal, ax, rotation)
    else:
        ax.set_xticks([])
        ax.set_xticklabels([])
        ax.set_yticks([])
        ax.set_yticklabels([])
    ax.set_title(title)
    return fig, rects
Beispiel #41
0
    def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None,
                          obs_hist_args=None, all_hist_args=None):
        """
        Display imputed values for one variable as a histogram.

        Parameters
        ----------
        col_name : string
            The name of the variable to be plotted.
        ax : matplotlib axes
            An axes on which to draw the histograms.  If not provided,
            one is created.
        imp_hist_args : dict
            Keyword arguments to be passed to pyplot.hist when
            creating the histogram for imputed values.
        obs_hist_args : dict
            Keyword arguments to be passed to pyplot.hist when
            creating the histogram for observed values.
        all_hist_args : dict
            Keyword arguments to be passed to pyplot.hist when
            creating the histogram for all values.

        Returns
        -------
        The matplotlib figure on which the histograms were drawn
        """

        from statsmodels.graphics import utils as gutils

        if imp_hist_args is None:
            imp_hist_args = {}
        if obs_hist_args is None:
            obs_hist_args = {}
        if all_hist_args is None:
            all_hist_args = {}

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        ax.set_position([0.1, 0.1, 0.7, 0.8])

        ixm = self.ix_miss[col_name]
        ixo = self.ix_obs[col_name]

        imp = self.data[col_name].iloc[ixm]
        obs = self.data[col_name].iloc[ixo]

        for di in imp_hist_args, obs_hist_args, all_hist_args:
            if 'histtype' not in di:
                di['histtype'] = 'step'

        ha, la = [], []
        if len(imp) > 0:
            h = ax.hist(np.asarray(imp), **imp_hist_args)
            ha.append(h[-1][0])
            la.append("Imp")
        h1 = ax.hist(np.asarray(obs), **obs_hist_args)
        h2 = ax.hist(np.asarray(self.data[col_name]), **all_hist_args)
        ha.extend([h1[-1][0], h2[-1][0]])
        la.extend(["Obs", "All"])

        leg = fig.legend(ha, la, 'center right', numpoints=1)
        leg.draw_frame(False)

        ax.set_xlabel(col_name)
        ax.set_ylabel("Frequency")

        return fig
Beispiel #42
0
def abline_plot(intercept=None, slope=None, horiz=None, vert=None,
                model_results=None, ax=None, **kwargs):
    """
    Plots a line given an intercept and slope.

    intercept : float
        The intercept of the line
    slope : float
        The slope of the line
    horiz : float or array-like
        Data for horizontal lines on the y-axis
    vert : array-like
        Data for verterical lines on the x-axis
    model_results : statsmodels results instance
        Any object that has a two-value `params` attribute. Assumed that it
        is (intercept, slope)
    ax : axes, optional
        Matplotlib axes instance
    kwargs
        Options passed to matplotlib.pyplot.plt

    Returns
    -------
    fig : Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> np.random.seed(12345)
    >>> X = sm.add_constant(np.random.normal(0, 20, size=30), prepend=True)
    >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30)
    >>> mod = sm.OLS(y,X).fit()
    >>> fig = abline_plot(model_results=mod)
    >>> ax = fig.axes
    >>> ax.scatter(X[:,1], y)
    >>> ax.margins(.1)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()
    """
    if ax is not None: # get axis limits first thing, don't change these
        x = ax.get_xlim()
        y = ax.get_ylim()
    else:
        x = None

    fig,ax = utils.create_mpl_ax(ax)

    if model_results:
        intercept, slope = model_results.params
        if x is None:
            x = [model_results.model.exog[:,1].min(),
                 model_results.model.exog[:,1].max()]
    else:
        if not (intercept is not None and slope is not None):
            raise ValueError("specify slope and intercepty or model_results")
        if x is None:
            x = ax.get_xlim()

    data_y = [x[0]*slope+intercept, x[1]*slope+intercept]
    ax.set_xlim(x)
    #ax.set_ylim(y)

    from matplotlib.lines import Line2D

    class ABLine2D(Line2D):

        def update_datalim(self, ax):
            ax.set_autoscale_on(False)

            children = ax.get_children()
            abline = [children[i] for i in range(len(children))
                       if isinstance(children[i], ABLine2D)][0]
            x = ax.get_xlim()
            y = [x[0]*slope+intercept, x[1]*slope+intercept]
            abline.set_data(x,y)
            ax.figure.canvas.draw()

    line = ABLine2D(x, data_y, **kwargs)
    ax.add_line(line)
    ax.callbacks.connect('xlim_changed', line.update_datalim)
    ax.callbacks.connect('ylim_changed', line.update_datalim)


    if horiz:
        ax.hline(horiz)
    if vert:
        ax.vline(vert)
    return fig
Beispiel #43
0
    def plot_power(self, dep_var='nobs', nobs=None, effect_size=None,
                   alpha=0.05, ax=None, title=None, plt_kwds=None, **kwds):
        '''plot power with number of observations or effect size on x-axis

        Parameters
        ----------
        dep_var : string in ['nobs', 'effect_size', 'alpha']
            This specifies which variable is used for the horizontal axis.
            If dep_var='nobs' (default), then one curve is created for each
            value of ``effect_size``. If dep_var='effect_size' or alpha, then
            one curve is created for each value of ``nobs``.
        nobs : scalar or array_like
            specifies the values of the number of observations in the plot
        effect_size : scalar or array_like
            specifies the values of the effect_size in the plot
        alpha : float or array_like
            The significance level (type I error) used in the power
            calculation. Can only be more than a scalar, if ``dep_var='alpha'``
        ax : None or axis instance
            If ax is None, than a matplotlib figure is created. If ax is a
            matplotlib axis instance, then it is reused, and the plot elements
            are created with it.
        title : string
            title for the axis. Use an empty string, ``''``, to avoid a title.
        plt_kwds : None or dict
            not used yet
        kwds : optional keywords for power function
            These remaining keyword arguments are used as arguments to the
            power function. Many power function support ``alternative`` as a
            keyword argument, two-sample test support ``ratio``.

        Returns
        -------
        fig : matplotlib figure instance

        Notes
        -----
        This works only for classes where the ``power`` method has
        ``effect_size``, ``nobs`` and ``alpha`` as the first three arguments.
        If the second argument is ``nobs1``, then the number of observations
        in the plot are those for the first sample.
        TODO: fix this for FTestPower and GofChisquarePower

        TODO: maybe add line variable, if we want more than nobs and effectsize
        '''
        #if pwr_kwds is None:
        #    pwr_kwds = {}
        from statsmodels.graphics import utils
        from statsmodels.graphics.plottools import rainbow
        fig, ax = utils.create_mpl_ax(ax)
        import matplotlib.pyplot as plt
        colormap = plt.cm.Dark2 #pylint: disable-msg=E1101
        plt_alpha = 1 #0.75
        lw = 2
        if dep_var == 'nobs':
            colors = rainbow(len(effect_size))
            colors = [colormap(i) for i in np.linspace(0, 0.9, len(effect_size))]
            for ii, es in enumerate(effect_size):
                power = self.power(es, nobs, alpha, **kwds)
                ax.plot(nobs, power, lw=lw, alpha=plt_alpha,
                        color=colors[ii], label='es=%4.2F' % es)
                xlabel = 'Number of Observations'
        elif dep_var in ['effect size', 'effect_size', 'es']:
            colors = rainbow(len(nobs))
            colors = [colormap(i) for i in np.linspace(0, 0.9, len(nobs))]
            for ii, n in enumerate(nobs):
                power = self.power(effect_size, n, alpha, **kwds)
                ax.plot(effect_size, power, lw=lw, alpha=plt_alpha,
                        color=colors[ii], label='N=%4.2F' % n)
                xlabel = 'Effect Size'
        elif dep_var in ['alpha']:
            # experimental nobs as defining separate lines
            colors = rainbow(len(nobs))

            for ii, n in enumerate(nobs):
                power = self.power(effect_size, n, alpha, **kwds)
                ax.plot(alpha, power, lw=lw, alpha=plt_alpha,
                        color=colors[ii], label='N=%4.2F' % n)
                xlabel = 'alpha'
        else:
            raise ValueError('depvar not implemented')

        if title is None:
            title = 'Power of Test'
        ax.set_xlabel(xlabel)
        ax.set_title(title)
        ax.legend(loc='lower right')
        return fig
yhat=phi1*np.hstack((0,y[0:(n-1)]))
sigma=np.var(y,ddof=1)
nu=np.hstack((sigma,sigma*(1-phi1**(2))[1:n]))
tau=nu/sigma
sigmahat=np.mean((y-yhat)**2/tau)
nuhat=sigmahat*(1-phi1**(2))
nuhat2=np.sqrt(nuhat)
ciar=(1-phi1**(2))
nuhat3=np.sqrt(sigma*ciar)

####Fitting Regular Models

arma_mod10 = sm.tsa.ARMA(y, (1,0)).fit(trend='nc')
#arma_mod10.summary()
syar=arma_mod10.sigma2/(1-arma_mod10.params[0]**2)
car=(1-arma_mod10.params[0]**2)
sear=np.sqrt(sigma*car)


pdf = matplotlib.backends.backend_pdf.PdfPages("outputAR.pdf")
ax=None
fig, ax = utils.create_mpl_ax(ax)
ax.vlines(sT[1:100], ymin=np.zeros(100), ymax=nuhat3[1:100])
ax.scatter(sT[1:100],nuhat3[1:100],c='green')
ax.axhline(np.sqrt(sigma),color='red')
ax.axhline(sear,color='blue')
ax.axhline(np.mean(nuhat3[1:100]),color='black')
ax.set_ylim([0, 1])
pdf.savefig(1)
pdf.close()
Beispiel #45
0
def plot_pacf(x,
              ax=None,
              lags=None,
              alpha=.05,
              method='ywadjusted',
              use_vlines=True,
              title='Partial Autocorrelation',
              zero=True,
              vlines_kwargs=None,
              **kwargs):
    """
    Plot the partial autocorrelation function

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : {int, array_like}, optional
        An int or array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : float, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))
    method : {'ywunbiased', 'ywmle', 'ols'}
        Specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf. Default.
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction

    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    title : str, optional
        Title to place on plot.  Default is 'Partial Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    vlines_kwargs : dict, optional
        Optional dictionary of keyword arguments that are passed to vlines.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr

    Notes
    -----
    Plots lags on the horizontal and the correlations on vertical axis.
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    kwargs is used to pass matplotlib optional arguments to both the line
    tracing the autocorrelations and for the horizontal line at 0. These
    options must be valid for a Line2D object.

    vlines_kwargs is used to pass additional optional arguments to the
    vertical lines connecting each autocorrelation to the axis.  These options
    must be valid for a LineCollection object.

    Examples
    --------
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm

    >>> dta = sm.datasets.sunspots.load_pandas().data
    >>> dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    >>> del dta["YEAR"]
    >>> sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)
    >>> plt.show()

    .. plot:: plots/graphics_tsa_plot_pacf.py
    """
    fig, ax = utils.create_mpl_ax(ax)
    vlines_kwargs = {} if vlines_kwargs is None else vlines_kwargs
    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    confint = None
    if alpha is None:
        acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines,
               vlines_kwargs, **kwargs)

    return fig
Beispiel #46
0
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, eval_env=1, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
       The endogenous or response variable. If string is given, you can use a
       arbitrary translations as with a formula.
    exog_i : {ndarray, str}
        The exogenous, explanatory variable. If string is given, you can use a
        arbitrary translations as with a formula.
    exog_others : {ndarray, list[str]}
        Any other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use a arbitrary
        translations as with a formula. The effect of these variables will be
        removed by OLS regression.
    data : {DataFrame, dict}
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : {bool, array_like}
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    eval_env : int
        Patsy eval environment if user functions and formulas are used in
        defining endog or exog.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Notes
    -----
    The slope of the fitted line is the that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate(murder).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty) , and in a single person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)
    print("eval_env:", eval_env)
    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data, eval_env=eval_env)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data, eval_env=eval_env)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data, eval_env=eval_env)
    else:
        RHS = exog_others
    RHS_isemtpy = False
    if isinstance(RHS, np.ndarray) and RHS.size==0:
        RHS_isemtpy = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isemtpy = True
    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data, eval_env=eval_env)

    # all arrays or pandas-like

    if RHS_isemtpy:
        endog = np.asarray(endog)
        exog_i = np.asarray(exog_i)
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
Beispiel #47
0
    def plot_power(self,
                   dep_var='nobs',
                   nobs=None,
                   effect_size=None,
                   alpha=0.05,
                   ax=None,
                   title=None,
                   plt_kwds=None,
                   **kwds):
        '''plot power with number of observations or effect size on x-axis

        Parameters
        ----------
        dep_var : string in ['nobs', 'effect_size', 'alpha']
            This specifies which variable is used for the horizontal axis.
            If dep_var='nobs' (default), then one curve is created for each
            value of ``effect_size``. If dep_var='effect_size' or alpha, then
            one curve is created for each value of ``nobs``.
        nobs : scalar or array_like
            specifies the values of the number of observations in the plot
        effect_size : scalar or array_like
            specifies the values of the effect_size in the plot
        alpha : float or array_like
            The significance level (type I error) used in the power
            calculation. Can only be more than a scalar, if ``dep_var='alpha'``
        ax : None or axis instance
            If ax is None, than a matplotlib figure is created. If ax is a
            matplotlib axis instance, then it is reused, and the plot elements
            are created with it.
        title : string
            title for the axis. Use an empty string, ``''``, to avoid a title.
        plt_kwds : None or dict
            not used yet
        kwds : optional keywords for power function
            These remaining keyword arguments are used as arguments to the
            power function. Many power function support ``alternative`` as a
            keyword argument, two-sample test support ``ratio``.

        Returns
        -------
        fig : matplotlib figure instance

        Notes
        -----
        This works only for classes where the ``power`` method has
        ``effect_size``, ``nobs`` and ``alpha`` as the first three arguments.
        If the second argument is ``nobs1``, then the number of observations
        in the plot are those for the first sample.
        TODO: fix this for FTestPower and GofChisquarePower

        TODO: maybe add line variable, if we want more than nobs and effectsize
        '''
        #if pwr_kwds is None:
        #    pwr_kwds = {}
        from statsmodels.graphics import utils
        from statsmodels.graphics.plottools import rainbow
        fig, ax = utils.create_mpl_ax(ax)
        import matplotlib.pyplot as plt
        colormap = plt.cm.Dark2  #pylint: disable-msg=E1101
        plt_alpha = 1  #0.75
        lw = 2
        if dep_var == 'nobs':
            colors = rainbow(len(effect_size))
            colors = [
                colormap(i) for i in np.linspace(0, 0.9, len(effect_size))
            ]
            for ii, es in enumerate(effect_size):
                power = self.power(es, nobs, alpha, **kwds)
                ax.plot(nobs,
                        power,
                        lw=lw,
                        alpha=plt_alpha,
                        color=colors[ii],
                        label='es=%4.2F' % es)
                xlabel = 'Number of Observations'
        elif dep_var in ['effect size', 'effect_size', 'es']:
            colors = rainbow(len(nobs))
            colors = [colormap(i) for i in np.linspace(0, 0.9, len(nobs))]
            for ii, n in enumerate(nobs):
                power = self.power(effect_size, n, alpha, **kwds)
                ax.plot(effect_size,
                        power,
                        lw=lw,
                        alpha=plt_alpha,
                        color=colors[ii],
                        label='N=%4.2F' % n)
                xlabel = 'Effect Size'
        elif dep_var in ['alpha']:
            # experimental nobs as defining separate lines
            colors = rainbow(len(nobs))

            for ii, n in enumerate(nobs):
                power = self.power(effect_size, n, alpha, **kwds)
                ax.plot(alpha,
                        power,
                        lw=lw,
                        alpha=plt_alpha,
                        color=colors[ii],
                        label='N=%4.2F' % n)
                xlabel = 'alpha'
        else:
            raise ValueError('depvar not implemented')

        if title is None:
            title = 'Power of Test'
        ax.set_xlabel(xlabel)
        ax.set_title(title)
        ax.legend(loc='lower right')
        return fig
Beispiel #48
0
def plot_ccpr(results, exog_idx, ax=None):
    """
    Plot CCPR against one regressor.

    Generates a component and component-plus-residual (CCPR) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : AxesSubplot, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and rate
    of poverty ('poverty').

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plot
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1*results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, **dict(ax=ax))
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
Beispiel #49
0
    def plot_bivariate(self, col1_name, col2_name,
                       lowess_args=None, lowess_min_n=40,
                       jitter=None, plot_points=True, ax=None):
        """
        Plot observed and imputed values for two variables.

        Displays a scatterplot of one variable against another.  The
        points are colored according to whether the values are
        observed or imputed.

        Parameters
        ----------
        col1_name : string
            The variable to be plotted on the horizontal axis.
        col2_name : string
            The variable to be plotted on the vertical axis.
        lowess_args : dictionary
            A dictionary of dictionaries, keys are 'ii', 'io', 'oi'
            and 'oo', where 'o' denotes 'observed' and 'i' denotes
            imputed.  See Notes for details.
        lowess_min_n : integer
            Minimum sample size to plot a lowess fit
        jitter : float or tuple
            Standard deviation for jittering points in the plot.
            Either a single scalar applied to both axes, or a tuple
            containing x-axis jitter and y-axis jitter, respectively.
        plot_points : bool
            If True, the data points are plotted.
        ax : matplotlib axes object
            Axes on which to plot, created if not provided.

        Returns
        -------
        The matplotlib figure on which the plot id drawn.
        """

        from statsmodels.graphics import utils as gutils
        from statsmodels.nonparametric.smoothers_lowess import lowess

        if lowess_args is None:
            lowess_args = {}

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        ax.set_position([0.1, 0.1, 0.7, 0.8])

        ix1i = self.ix_miss[col1_name]
        ix1o = self.ix_obs[col1_name]
        ix2i = self.ix_miss[col2_name]
        ix2o = self.ix_obs[col2_name]

        ix_ii = np.intersect1d(ix1i, ix2i)
        ix_io = np.intersect1d(ix1i, ix2o)
        ix_oi = np.intersect1d(ix1o, ix2i)
        ix_oo = np.intersect1d(ix1o, ix2o)

        vec1 = np.asarray(self.data[col1_name])
        vec2 = np.asarray(self.data[col2_name])

        if jitter is not None:
            if np.isscalar(jitter):
                jitter = (jitter, jitter)
            vec1 += jitter[0] * np.random.normal(size=len(vec1))
            vec2 += jitter[1] * np.random.normal(size=len(vec2))

        # Plot the points
        keys = ['oo', 'io', 'oi', 'ii']
        lak = {'i': 'imp', 'o': 'obs'}
        ixs = {'ii': ix_ii, 'io': ix_io, 'oi': ix_oi, 'oo': ix_oo}
        color = {'oo': 'grey', 'ii': 'red', 'io': 'orange',
                 'oi': 'lime'}
        if plot_points:
            for ky in keys:
                ix = ixs[ky]
                lab = lak[ky[0]] + "/" + lak[ky[1]]
                ax.plot(vec1[ix], vec2[ix], 'o', color=color[ky],
                        label=lab, alpha=0.6)

        # Plot the lowess fits
        for ky in keys:
            ix = ixs[ky]
            if len(ix) < lowess_min_n:
                continue
            if ky in lowess_args:
                la = lowess_args[ky]
            else:
                la = {}
            ix = ixs[ky]
            lfit = lowess(vec2[ix], vec1[ix], **la)
            if plot_points:
                ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky],
                        alpha=0.6, lw=4)
            else:
                lab = lak[ky[0]] + "/" + lak[ky[1]]
                ax.plot(lfit[:, 0], lfit[:, 1], '-', color=color[ky],
                        alpha=0.6, lw=4, label=lab)

        ha, la = ax.get_legend_handles_labels()
        pad = 0.0001 if plot_points else 0.5
        leg = fig.legend(ha, la, 'center right', numpoints=1,
                         handletextpad=pad)
        leg.draw_frame(False)

        ax.set_xlabel(col1_name)
        ax.set_ylabel(col2_name)

        return fig
Beispiel #50
0
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
       endogenous or response variable. If string is given, you can use a
       arbitrary translations as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use a
        arbitrary translations as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is given,
        each item is a term in formula. You can use a arbitrary translations
        as with a formula. The effect of these variables will be removed by
        OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the obseveration numbers.
    labels_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is the that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.
    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others

    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like
    res_yaxis = OLS(endog, RHS).fit()
    res_xaxis = OLS(exog_i, RHS).fit()

    ax.plot(res_xaxis.resid, res_yaxis.resid, 'o', **kwargs)
    fitted_line = OLS(res_yaxis.resid, res_xaxis.resid).fit()
    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)
    x_axis_endog_name = res_xaxis.model.endog_names
    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % res_yaxis.model.endog_names)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    #http://stackoverflow.com/questions/4652439/
    #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    #4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
Beispiel #51
0
    def plot_imputed_hist(self, col_name, ax=None, imp_hist_args=None,
                          obs_hist_args=None, all_hist_args=None):
        """
        Display imputed values for one variable as a histogram.

        Parameters
        ----------
        col_name : string
            The name of the variable to be plotted.
        ax : matplotlib axes
            An axes on which to draw the histograms.  If not provided,
            one is created.
        imp_hist_args : dict
            Keyword arguments to be passed to pyplot.hist when
            creating the histogram for imputed values.
        obs_hist_args : dict
            Keyword arguments to be passed to pyplot.hist when
            creating the histogram for observed values.
        all_hist_args : dict
            Keyword arguments to be passed to pyplot.hist when
            creating the histogram for all values.

        Returns
        -------
        The matplotlib figure on which the histograms were drawn
        """

        from statsmodels.graphics import utils as gutils
        from matplotlib.colors import LinearSegmentedColormap

        if imp_hist_args is None:
            imp_hist_args = {}
        if obs_hist_args is None:
            obs_hist_args = {}
        if all_hist_args is None:
            all_hist_args = {}

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        ax.set_position([0.1, 0.1, 0.7, 0.8])

        ixm = self.ix_miss[col_name]
        ixo = self.ix_obs[col_name]

        imp = self.data[col_name].iloc[ixm]
        obs = self.data[col_name].iloc[ixo]

        for di in imp_hist_args, obs_hist_args, all_hist_args:
            if 'histtype' not in di:
                di['histtype'] = 'step'

        ha, la = [], []
        if len(imp) > 0:
            h = ax.hist(np.asarray(imp), **imp_hist_args)
            ha.append(h[-1][0])
            la.append("Imp")
        h1 = ax.hist(np.asarray(obs), **obs_hist_args)
        h2 = ax.hist(np.asarray(self.data[col_name]), **all_hist_args)
        ha.extend([h1[-1][0], h2[-1][0]])
        la.extend(["Obs", "All"])

        leg = fig.legend(ha, la, 'center right', numpoints=1)
        leg.draw_frame(False)

        ax.set_xlabel(col_name)
        ax.set_ylabel("Frequency")

        return fig
Beispiel #52
0
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1*results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, **dict(ax=ax))
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
Beispiel #53
0
def plot_pacf(x,
              ax=None,
              lags=None,
              alpha=.05,
              method='ywm',
              use_vlines=True,
              **kwargs):
    """Plot the partial autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : array_like, optional
        Array of lag values, used on horizontal axis.
        If not given, ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))
    method : 'ywunbiased' (default) or 'ywmle' or 'ols'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction

    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    if lags is None:
        lags = np.arange(len(x))
        nlags = len(lags) - 1
    else:
        nlags = lags
        lags = np.arange(lags + 1)  # +1 for zero lag

    confint = None
    if alpha is None:
        acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    if use_vlines:
        ax.vlines(lags, [0], acf_x, **kwargs)
        ax.axhline(**kwargs)

    # center the confidence interval TODO: do in acf?
    kwargs.setdefault('marker', 'o')
    kwargs.setdefault('markersize', 5)
    kwargs.setdefault('linestyle', 'None')
    ax.margins(.05)
    ax.plot(lags, acf_x, **kwargs)
    ax.set_title("Partial Autocorrelation")

    if confint is not None:
        # center the confidence interval TODO: do in acf?
        ax.fill_between(lags,
                        confint[:, 0] - acf_x,
                        confint[:, 1] - acf_x,
                        alpha=.25)

    return fig
Beispiel #54
0
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):
    """
    Plot of influence in regression. Plots studentized resids vs. leverage.

    Parameters
    ----------
    results : results instance
        A fitted model.
    external : bool
        Whether to use externally or internally studentized residuals. It is
        recommended to leave external as True.
    alpha : float
        The alpha value to identify large studentized residuals. Large means
        abs(resid_studentized) > t.ppf(1-alpha/2, dof=results.df_resid)
    criterion : str {'DFFITS', 'Cooks'}
        Which criterion to base the size of the points on. Options are
        DFFITS or Cook's D.
    size : float
        The range of `criterion` is mapped to 10**2 - size**2 in points.
    plot_alpha : float
        The `alpha` of the plotted points.
    ax : matplotlib Axes instance
        An instance of a matplotlib Axes.

    Returns
    -------
    fig : matplotlib figure
        The matplotlib figure that contains the Axes.

    Notes
    -----
    Row labels for the observations in which the leverage, measured by the
    diagonal of the hat matrix, is high or the residuals are large, as the
    combination of large residuals and a high influence value indicates an
    influence point. The value of large residuals can be controlled using the
    `alpha` parameter. Large leverage points are identified as
    hat_i > 2 * (df_model + 1)/nobs.
    """
    fig, ax = utils.create_mpl_ax(ax)

    infl = results.get_influence()

    if criterion.lower().startswith('dff'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('coo'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range/old_range + 8**2

    leverage = infl.hat_matrix_diag
    if external:
        resids = infl.resid_studentized_external
    else:
        resids = infl.resid_studentized_internal

    from scipy import stats

    cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resids, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resids))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resids),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    #TODO: make configurable or let people do it ex-post?
    font = {"fontsize" : 16, "color" : "black"}
    ax.set_ylabel("Studentized Residuals", **font)
    ax.set_xlabel("H Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig
Beispiel #55
0
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm',
                use_vlines=True, **kwargs):
    """Plot the partial autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : array_like, optional
        Array of lag values, used on horizontal axis.
        If not given, ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))
    method : 'ywunbiased' (default) or 'ywmle' or 'ols'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction

    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    if lags is None:
        lags = np.arange(len(x))
        nlags = len(lags) - 1
    else:
        nlags = lags
        lags = np.arange(lags + 1) # +1 for zero lag

    acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    if use_vlines:
        ax.vlines(lags, [0], acf_x, **kwargs)
        ax.axhline(**kwargs)

    # center the confidence interval TODO: do in acf?
    confint = confint - confint.mean(1)[:,None]
    kwargs.setdefault('marker', 'o')
    kwargs.setdefault('markersize', 5)
    kwargs.setdefault('linestyle', 'None')
    ax.margins(.05)
    ax.plot(lags, acf_x, **kwargs)
    ax.fill_between(lags, confint[:,0], confint[:,1], alpha=.25)
    ax.set_title("Partial Autocorrelation")

    return fig
def mosaic(data,
           index=None,
           ax=None,
           horizontal=True,
           gap=0.005,
           properties=lambda key: None,
           labelizer=None,
           title='',
           statistic=False,
           axes_label=True,
           label_rotation=0.0):
    """Create a mosaic plot from a contingency table.

    It allows to visualize multivariate categorical data in a rigorous
    and informative way.

    Parameters
    ----------
    data : {dict, Series, ndarray, DataFrame}
        The contingency table that contains the data.
        Each category should contain a non-negative number
        with a tuple as index.  It expects that all the combination
        of keys to be represents; if that is not true, will
        automatically consider the missing values as 0.  The order
        of the keys will be the same as the one of insertion.
        If a dict of a Series (or any other dict like object)
        is used, it will take the keys as labels.  If a
        np.ndarray is provided, it will generate a simple
        numerical labels.
    index : list, optional
        Gives the preferred order for the category ordering. If not specified
        will default to the given order.  It does not support named indexes
        for hierarchical Series.  If a DataFrame is provided, it expects
        a list with the name of the columns.
    ax : Axes, optional
        The graph where display the mosaic. If not given, will
        create a new figure
    horizontal : bool, optional
        The starting direction of the split (by default along
        the horizontal axis)
    gap : {float, sequence[float]}
        The list of gaps to be applied on each subdivision.
        If the length of the given array is less of the number
        of subcategories (or if it's a single number) it will extend
        it with exponentially decreasing gaps
    properties : dict[str, callable], optional
        A function that for each tile in the mosaic take the key
        of the tile and returns the dictionary of properties
        of the generated Rectangle, like color, hatch or similar.
        A default properties set will be provided fot the keys whose
        color has not been defined, and will use color variation to help
        visually separates the various categories. It should return None
        to indicate that it should use the default property for the tile.
        A dictionary of the properties for each key can be passed,
        and it will be internally converted to the correct function
    labelizer : dict[str, callable], optional
        A function that generate the text to display at the center of
        each tile base on the key of that tile
    title : str, optional
        The title of the axis
    statistic : bool, optional
        If true will use a crude statistical model to give colors to the plot.
        If the tile has a constraint that is more than 2 standard deviation
        from the expected value under independence hypothesis, it will
        go from green to red (for positive deviations, blue otherwise) and
        will acquire an hatching when crosses the 3 sigma.
    axes_label : bool, optional
        Show the name of each value of each category
        on the axis (default) or hide them.
    label_rotation : {float, list[float]}
        The rotation of the axis label (if present). If a list is given
        each axis can have a different rotation

    Returns
    ---------
    fig : Figure
        The figure containing the plot.
    rects : dict
        A dictionary that has the same keys of the original
        dataset, that holds a reference to the coordinates of the
        tile and the Rectangle that represent it.

    References
    ----------
    A Brief History of the Mosaic Display
        Michael Friendly, York University, Psychology Department
        Journal of Computational and Graphical Statistics, 2001

    Mosaic Displays for Loglinear Models.
        Michael Friendly, York University, Psychology Department
        Proceedings of the Statistical Graphics Section, 1992, 61-68.

    Mosaic displays for multi-way contingency tables.
        Michael Friendly, York University, Psychology Department
        Journal of the american statistical association
        March 1994, Vol. 89, No. 425, Theory and Methods

    Examples
    ----------
    >>> import numpy as np
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.mosaicplot import mosaic

    The most simple use case is to take a dictionary and plot the result

    >>> data = {'a': 10, 'b': 15, 'c': 16}
    >>> mosaic(data, title='basic dictionary')
    >>> plt.show()

    A more useful example is given by a dictionary with multiple indices.
    In this case we use a wider gap to a better visual separation of the
    resulting plot

    >>> data = {('a', 'b'): 1, ('a', 'c'): 2, ('d', 'b'): 3, ('d', 'c'): 4}
    >>> mosaic(data, gap=0.05, title='complete dictionary')
    >>> plt.show()

    The same data can be given as a simple or hierarchical indexed Series

    >>> rand = np.random.random
    >>> from itertools import product
    >>> tuples = list(product(['bar', 'baz', 'foo', 'qux'], ['one', 'two']))
    >>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
    >>> data = pd.Series(rand(8), index=index)
    >>> mosaic(data, title='hierarchical index series')
    >>> plt.show()

    The third accepted data structure is the np array, for which a
    very simple index will be created.

    >>> rand = np.random.random
    >>> data = 1+rand((2,2))
    >>> mosaic(data, title='random non-labeled array')
    >>> plt.show()

    If you need to modify the labeling and the coloring you can give
    a function tocreate the labels and one with the graphical properties
    starting from the key tuple

    >>> data = {'a': 10, 'b': 15, 'c': 16}
    >>> props = lambda key: {'color': 'r' if 'a' in key else 'gray'}
    >>> labelizer = lambda k: {('a',): 'first', ('b',): 'second',
    ...                        ('c',): 'third'}[k]
    >>> mosaic(data, title='colored dictionary', properties=props,
    ...        labelizer=labelizer)
    >>> plt.show()

    Using a DataFrame as source, specifying the name of the columns of interest

    >>> gender = ['male', 'male', 'male', 'female', 'female', 'female']
    >>> pet = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat']
    >>> data = pd.DataFrame({'gender': gender, 'pet': pet})
    >>> mosaic(data, ['pet', 'gender'], title='DataFrame as Source')
    >>> plt.show()

    .. plot :: plots/graphics_mosaicplot_mosaic.py
    """
    if isinstance(data, DataFrame) and index is None:
        raise ValueError("You must pass an index if data is a DataFrame."
                         " See examples.")

    from matplotlib.patches import Rectangle
    #from pylab import Rectangle
    fig, ax = utils.create_mpl_ax(ax)
    # normalize the data to a dict with tuple of strings as keys
    data = _normalize_data(data, index)
    # split the graph into different areas
    rects = _hierarchical_split(data, horizontal=horizontal, gap=gap)
    # if there is no specified way to create the labels
    # create a default one
    if labelizer is None:
        labelizer = lambda k: "\n".join(k)
    if statistic:
        default_props = _statistical_coloring(data)
    else:
        default_props = _create_default_properties(data)
    if isinstance(properties, dict):
        color_dict = properties
        properties = lambda key: color_dict.get(key, None)
    for k, v in rects.items():
        # create each rectangle and put a label on it
        x, y, w, h = v
        conf = properties(k)
        props = conf if conf else default_props[k]
        text = labelizer(k)
        Rect = Rectangle((x, y), w, h, label=text, **props)
        ax.add_patch(Rect)
        ax.text(x + w / 2,
                y + h / 2,
                text,
                ha='center',
                va='center',
                size='smaller')
    #creating the labels on the axis
    #o clearing it
    if axes_label:
        if np.iterable(label_rotation):
            rotation = label_rotation
        else:
            rotation = [label_rotation] * 4
        labels = _create_labels(rects, horizontal, ax, rotation)
    else:
        ax.set_xticks([])
        ax.set_xticklabels([])
        ax.set_yticks([])
        ax.set_yticklabels([])
    ax.set_title(title)
    return fig, rects
Beispiel #57
0
def plot_pacf(x, ax=None, lags=None, alpha=.05, method='ywm', use_vlines=True,
              title='Partial Autocorrelation', zero=True, **kwargs):
    """Plot the partial autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : int or array_like, optional
        int or Array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        1/sqrt(len(x))
    method : 'ywunbiased' (default) or 'ywmle' or 'ols'
        specifies which method for the calculations to use:

        - yw or ywunbiased : yule walker with bias correction in denominator
          for acovf
        - ywm or ywmle : yule walker without bias correction
        - ols - regression of time series on lags of it and on constant
        - ld or ldunbiased : Levinson-Durbin recursion with bias correction
        - ldb or ldbiased : Levinson-Durbin recursion without bias correction
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    title : str, optional
        Title to place on plot.  Default is 'Partial Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    confint = None
    if alpha is None:
        acf_x = pacf(x, nlags=nlags, alpha=alpha, method=method)
    else:
        acf_x, confint = pacf(x, nlags=nlags, alpha=alpha, method=method)

    _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, **kwargs)

    return fig
    def plot_scree(self,
                   ncomp=None,
                   log_scale=True,
                   cumulative=False,
                   ax=None):
        """
        Plot of the ordered eigenvalues

        Parameters
        ----------
        ncomp : int, optional
            Number of components ot include in the plot.  If None, will
            included the same as the number of components computed
        log_scale : boot, optional
            Flag indicating whether ot use a log scale for the y-axis
        cumulative : bool, optional
            Flag indicating whether to plot the eigenvalues or cumulative
            eigenvalues
        ax : Matplotlib axes instance, optional
            An axes on which to draw the graph.  If omitted, new a figure
            is created

        Returns
        -------
        fig : figure
            Handle to the figure
        """
        import statsmodels.graphics.utils as gutils

        fig, ax = gutils.create_mpl_ax(ax)

        ncomp = self._ncomp if ncomp is None else ncomp
        vals = np.asarray(self.eigenvals)
        vals = vals[:self._ncomp]
        if cumulative:
            vals = np.cumsum(vals)

        if log_scale:
            ax.set_yscale('log')
        ax.plot(np.arange(ncomp), vals[:ncomp], 'bo')
        ax.autoscale(tight=True)
        xlim = np.array(ax.get_xlim())
        sp = xlim[1] - xlim[0]
        xlim += 0.02 * np.array([-sp, sp])
        ax.set_xlim(xlim)

        ylim = np.array(ax.get_ylim())
        scale = 0.02
        if log_scale:
            sp = np.log(ylim[1] / ylim[0])
            ylim = np.exp(
                np.array([
                    np.log(ylim[0]) - scale * sp,
                    np.log(ylim[1]) + scale * sp
                ]))
        else:
            sp = ylim[1] - ylim[0]
            ylim += scale * np.array([-sp, sp])
        ax.set_ylim(ylim)
        ax.set_title('Scree Plot')
        ax.set_ylabel('Eigenvalue')
        ax.set_xlabel('Component Number')
        fig.tight_layout()

        return fig
Beispiel #59
0
def plot_acf(x, ax=None, lags=None, alpha=.05, use_vlines=True, unbiased=False,
             fft=False, title='Autocorrelation', zero=True, **kwargs):
    """Plot the autocorrelation function

    Plots lags on the horizontal and the correlations on vertical axis.

    Parameters
    ----------
    x : array_like
        Array of time-series values
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    lags : int or array_like, optional
        int or Array of lag values, used on horizontal axis. Uses
        np.arange(lags) when lags is an int.  If not provided,
        ``lags=np.arange(len(corr))`` is used.
    alpha : scalar, optional
        If a number is given, the confidence intervals for the given level are
        returned. For instance if alpha=.05, 95 % confidence intervals are
        returned where the standard deviation is computed according to
        Bartlett's formula. If None, no confidence intervals are plotted.
    use_vlines : bool, optional
        If True, vertical lines and markers are plotted.
        If False, only markers are plotted.  The default marker is 'o'; it can
        be overridden with a ``marker`` kwarg.
    unbiased : bool
        If True, then denominators for autocovariance are n-k, otherwise n
    fft : bool, optional
        If True, computes the ACF via FFT.
    title : str, optional
        Title to place on plot.  Default is 'Autocorrelation'
    zero : bool, optional
        Flag indicating whether to include the 0-lag autocorrelation.
        Default is True.
    **kwargs : kwargs, optional
        Optional keyword arguments that are directly passed on to the
        Matplotlib ``plot`` and ``axhline`` functions.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    matplotlib.pyplot.xcorr
    matplotlib.pyplot.acorr
    mpl_examples/pylab_examples/xcorr_demo.py

    Notes
    -----
    Adapted from matplotlib's `xcorr`.

    Data are plotted as ``plot(lags, corr, **kwargs)``

    """
    fig, ax = utils.create_mpl_ax(ax)

    lags, nlags, irregular = _prepare_data_corr_plot(x, lags, zero)

    confint = None
    # acf has different return type based on alpha
    if alpha is None:
        acf_x = acf(x, nlags=nlags, alpha=alpha, fft=fft,
                    unbiased=unbiased)
    else:
        acf_x, confint = acf(x, nlags=nlags, alpha=alpha, fft=fft,
                             unbiased=unbiased)

    _plot_corr(ax, title, acf_x, confint, lags, irregular, use_vlines, **kwargs)

    return fig
Beispiel #60
0
def plot_survfunc(survfuncs, ax=None):
    """
    Plot one or more survivor functions.

    Parameters
    ----------
    survfuncs : object or array-like
        A single SurvfuncRight object, or a list or SurvfuncRight
        objects that are plotted together.

    Returns
    -------
    A figure instance on which the plot was drawn.

    Examples
    --------
    Add a legend:

    >>> import statsmodels.api as sm
    >>> from statsmodels.duration.survfunc import plot_survfunc
    >>> data = sm.datasets.get_rdataset("flchain", "survival").data
    >>> df = data.loc[data.sex == "F", :]
    >>> sf0 = sm.SurvfuncRight(df["futime"], df["death"])
    >>> sf1 = sm.SurvfuncRight(3.0 * df["futime"], df["death"])
    >>> fig = plot_survfunc([sf0, sf1])
    >>> ax = fig.get_axes()[0]
    >>> ax.set_position([0.1, 0.1, 0.64, 0.8])
    >>> ha, lb = ax.get_legend_handles_labels()
    >>> leg = fig.legend((ha[0], ha[1]), (lb[0], lb[1]), 'center right')

    Change the line colors:

    >>> fig = plot_survfunc([sf0, sf1])
    >>> ax = fig.get_axes()[0]
    >>> ax.set_position([0.1, 0.1, 0.64, 0.8])
    >>> ha, lb = ax.get_legend_handles_labels()
    >>> ha[0].set_color('purple')
    >>> ha[1].set_color('orange')
    """

    fig, ax = utils.create_mpl_ax(ax)

    # If we have only a single survival function to plot, put it into
    # a list.
    try:
        assert (type(survfuncs[0]) is SurvfuncRight)
    except:
        survfuncs = [survfuncs]

    for gx, sf in enumerate(survfuncs):

        # The estimated survival function does not include a point at
        # time 0, include it here for plotting.
        surv_times = np.concatenate(([0], sf.surv_times))
        surv_prob = np.concatenate(([1], sf.surv_prob))

        # If the final times are censoring times they are not included
        # in the survival function so we add them here
        mxt = max(sf.time)
        if mxt > surv_times[-1]:
            surv_times = np.concatenate((surv_times, [mxt]))
            surv_prob = np.concatenate((surv_prob, [surv_prob[-1]]))

        label = getattr(sf, "title", "Group %d" % (gx + 1))

        li, = ax.step(surv_times,
                      surv_prob,
                      '-',
                      label=label,
                      lw=2,
                      where='post')

        # Plot the censored points.
        ii = np.flatnonzero(np.logical_not(sf.status))
        ti = sf.time[ii]
        jj = np.searchsorted(surv_times, ti) - 1
        sp = surv_prob[jj]
        ax.plot(ti,
                sp,
                '+',
                ms=12,
                color=li.get_color(),
                label=label + " points")

    ax.set_ylim(0, 1.01)

    return fig