Exemple #1
0
    def plot_loadings(self, loading_pairs=None, plot_prerotated=False):
        """
        Draw 2-d plots of the factor loadings.

        Parameters
        ----------
        loading_pairs : None or a list of tuples
            Selects the plots. Every tuple (i, j) stands for one figure with
            loading i on the x-axis and loading j on the y-axis. If `None`,
            all combinations of the loadings will be plotted.
        plot_prerotated : True or False
            If True, the loadings before rotation are plotted, otherwise the
            rotated loadings. When no rotation method is set, the unrotated
            loadings are always used.

        Returns
        -------
        figs : a list of figure handles
        """
        _import_mpl()
        from .plots import plot_loadings as _draw_loadings

        # Without a rotation method only the unrotated loadings make sense,
        # so the flag is forced on in that case.
        use_prerotated = self.rotation_method is None or plot_prerotated

        if use_prerotated:
            loadings = self.loadings_no_rot
            title = 'Prerotated Factor Pattern'
        else:
            loadings = self.loadings
            title = '%s Rotated Factor Pattern' % (self.rotation_method)

        percent_variance = self.eigenvals / self.n_comp * 100

        plot_options = dict(loading_pairs=loading_pairs,
                            title=title,
                            row_names=self.endog_names,
                            percent_variance=percent_variance)
        return _draw_loadings(loadings, **plot_options)
    def plot_partial(self,
                     smooth_index,
                     plot_se=True,
                     cpr=False,
                     include_constant=True,
                     ax=None):
        """Plot the contribution of a smooth term to the linear prediction.

        Parameters
        ----------
        smooth_index : int
            index of the smooth term within list of smooth terms
        plot_se : bool
            If plot_se is true, then the confidence interval for the linear
            prediction will be added to the plot.
        cpr : bool
            If cpr (component plus residual) is true, then a scatter plot of
            the partial working residuals will be added to the plot.
        include_constant : bool
            If true, then the estimated intercept is added to the prediction
            and its standard errors. This avoids that the confidence interval
            has zero width at the imposed identification constraint, e.g.
            either at a reference point or at the mean.
        ax : None or matplotlib axis instance
           If ax is not None, then the plot will be added to it.

        Returns
        -------
        Figure
            If `ax` is None, the created figure. Otherwise the Figure to which
            `ax` is connected.
        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_ax
        _import_mpl()

        y_est, se = self.partial_values(smooth_index,
                                        include_constant=include_constant)
        smooth_term = self.model.smoother.smoothers[smooth_index]

        # Order everything by x so that the lines are drawn left to right.
        sort_index = np.argsort(smooth_term.x)
        x = smooth_term.x[sort_index]
        y_est = y_est[sort_index]
        se = se[sort_index]

        fig, ax = create_mpl_ax(ax)
        ax.plot(x, y_est, c='blue', lw=2)
        if plot_se:
            # Band of +/- 1.96 standard errors around the estimate.
            ax.plot(x, y_est + 1.96 * se, '-', c='blue')
            ax.plot(x, y_est - 1.96 * se, '-', c='blue')
        if cpr:
            # TODO: resid_response does not make sense with nonlinear link
            # use resid_working ?
            # The residuals are in the original observation order; they must
            # be permuted like x and y_est, otherwise each residual is added
            # to the prediction of a different observation.
            residual = np.asarray(self.resid_working)[sort_index]
            ax.plot(x, y_est + residual, '.', lw=2)

        ax.set_xlabel(smooth_term.variable_name)

        return fig
    def plot_cusum(self, alpha=0.05, legend_loc='upper left',
                   fig=None, figsize=None):
        r"""
        Plot the CUSUM statistic and significance bounds.

        Parameters
        ----------
        alpha : float, optional
            The plotted significance bounds are alpha %.
        legend_loc : string, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        Figure
            The figure that contains the plot.

        Notes
        -----
        Evidence of parameter instability may be found if the CUSUM statistic
        moves out of the significance bounds.

        References
        ----------
        .. [*] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
           "Techniques for Testing the Constancy of
           Regression Relationships over Time."
           Journal of the Royal Statistical Society.
           Series B (Methodological) 37 (2): 149-92.

        """
        # Create the plot
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        _import_mpl()
        fig = create_mpl_fig(fig, figsize)
        ax = fig.add_subplot(1, 1, 1)

        # Get dates, if applicable
        if hasattr(self.data, 'dates') and self.data.dates is not None:
            dates = self.data.dates._mpl_repr()
        else:
            dates = np.arange(self.nobs)
        # The plotted series starts after the first d observations.
        d = max(self.nobs_diffuse, self.loglikelihood_burn)
        endpoints = [dates[d], dates[-1]]

        # Plot cusum series and reference line
        ax.plot(dates[d:], self.cusum, label='CUSUM')
        ax.hlines(0, dates[d], dates[-1], color='k', alpha=0.3)

        # Plot significance bounds
        lower_line, upper_line = self._cusum_significance_bounds(alpha)
        # %g instead of %d: %d truncates the float, so e.g. alpha=0.29
        # (0.29 * 100 == 28.999999999999996) would be labelled "28%" and
        # alpha=0.025 would be labelled "2%".
        ax.plot(endpoints, upper_line, 'k--',
                label='%.3g%% significance' % (alpha * 100))
        ax.plot(endpoints, lower_line, 'k--')

        ax.legend(loc=legend_loc)

        return fig
Exemple #4
0
    def plot_cusum(self, alpha=0.05, legend_loc='upper left',
                   fig=None, figsize=None):
        r"""
        Plot the CUSUM statistic and significance bounds.

        Parameters
        ----------
        alpha : float, optional
            The plotted significance bounds are alpha %.
        legend_loc : string, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        Figure
            The figure that contains the plot.

        Notes
        -----
        Evidence of parameter instability may be found if the CUSUM statistic
        moves out of the significance bounds.

        References
        ----------
        .. [*] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
           "Techniques for Testing the Constancy of
           Regression Relationships over Time."
           Journal of the Royal Statistical Society.
           Series B (Methodological) 37 (2): 149-92.

        """
        # Create the plot
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        _import_mpl()
        fig = create_mpl_fig(fig, figsize)
        ax = fig.add_subplot(1, 1, 1)

        # Get dates, if applicable
        if hasattr(self.data, 'dates') and self.data.dates is not None:
            dates = self.data.dates._mpl_repr()
        else:
            dates = np.arange(self.nobs)
        # The plotted series starts after the first d observations.
        d = max(self.nobs_diffuse, self.loglikelihood_burn)
        endpoints = [dates[d], dates[-1]]

        # Plot cusum series and reference line
        ax.plot(dates[d:], self.cusum, label='CUSUM')
        ax.hlines(0, dates[d], dates[-1], color='k', alpha=0.3)

        # Plot significance bounds
        lower_line, upper_line = self._cusum_significance_bounds(alpha)
        # %g instead of %d: %d truncates the float, so e.g. alpha=0.29
        # (0.29 * 100 == 28.999999999999996) would be labelled "28%" and
        # alpha=0.025 would be labelled "2%".
        ax.plot(endpoints, upper_line, 'k--',
                label='%.3g%% significance' % (alpha * 100))
        ax.plot(endpoints, lower_line, 'k--')

        ax.legend(loc=legend_loc)

        return fig
    def plot_partial(self, smooth_index, plot_se=True, cpr=False,
                     include_constant=True, ax=None):
        """Plot the contribution of a smooth term to the linear prediction.

        Parameters
        ----------
        smooth_index : int
            index of the smooth term within list of smooth terms
        plot_se : bool
            If plot_se is true, then the confidence interval for the linear
            prediction will be added to the plot.
        cpr : bool
            If cpr (component plus residual) is true, then a scatter plot of
            the partial working residuals will be added to the plot.
        include_constant : bool
            If true, then the estimated intercept is added to the prediction
            and its standard errors. This avoids that the confidence interval
            has zero width at the imposed identification constraint, e.g.
            either at a reference point or at the mean.
        ax : None or matplotlib axis instance
           If ax is not None, then the plot will be added to it.

        Returns
        -------
        fig : matplotlib Figure instance

        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_ax
        _import_mpl()

        y_est, se = self.partial_values(smooth_index,
                                        include_constant=include_constant)
        smooth_term = self.model.smoother.smoothers[smooth_index]

        # Order everything by x so that the lines are drawn left to right.
        sort_index = np.argsort(smooth_term.x)
        x = smooth_term.x[sort_index]
        y_est = y_est[sort_index]
        se = se[sort_index]

        fig, ax = create_mpl_ax(ax)
        ax.plot(x, y_est, c='blue', lw=2)
        if plot_se:
            # Band of +/- 1.96 standard errors around the estimate.
            ax.plot(x, y_est + 1.96 * se, '-', c='blue')
            ax.plot(x, y_est - 1.96 * se, '-', c='blue')
        if cpr:
            # TODO: resid_response doesn't make sense with nonlinear link
            # use resid_working ?
            # The residuals are in the original observation order; they must
            # be permuted like x and y_est, otherwise each residual is added
            # to the prediction of a different observation.
            residual = np.asarray(self.resid_working)[sort_index]
            ax.plot(x, y_est + residual, '.', lw=2)

        ax.set_xlabel(smooth_term.variable_name)

        return fig
Exemple #6
0
    def plot(self):
        """
        Plot the observed, trend, seasonal and residual series.

        The four series are drawn in four stacked panels that share the
        x-axis. If ``self.observed`` has a ``plot`` method, every component
        is drawn through its own ``plot`` method; otherwise the components
        are passed to the matplotlib axes and the bottom panel additionally
        gets a 'Time' label and x-limits of ``(0, self.nobs)``.

        Returns
        -------
        Figure
            The figure holding the four panels.
        """
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()
        fig, axes = plt.subplots(4, 1, sharex=True)

        panels = (('observed', 'Observed'),
                  ('trend', 'Trend'),
                  ('seasonal', 'Seasonal'),
                  ('resid', 'Residual'))
        # got pandas use it
        has_own_plot = hasattr(self.observed, 'plot')

        for panel_ax, (attr, ylabel) in zip(axes, panels):
            component = getattr(self, attr)
            if has_own_plot:
                component.plot(ax=panel_ax, legend=False)
            else:
                panel_ax.plot(component)
            panel_ax.set_ylabel(ylabel)

        if not has_own_plot:
            # Plain arrays carry no index, so label and bound the x-axis here.
            axes[3].set_xlabel('Time')
            axes[3].set_xlim(0, self.nobs)

        fig.tight_layout()
        return fig
Exemple #7
0
    def plot(self):
        """
        Plot the observed, trend, seasonal and residual series.

        The four series are drawn in four stacked panels that share the
        x-axis. If ``self.observed`` has a ``plot`` method, every component
        is drawn through its own ``plot`` method; otherwise the components
        are passed to the matplotlib axes and the bottom panel additionally
        gets a 'Time' label and x-limits of ``(0, self.nobs)``.

        Returns
        -------
        Figure
            The figure holding the four panels.
        """
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()
        fig, axes = plt.subplots(4, 1, sharex=True)

        panels = (('observed', 'Observed'),
                  ('trend', 'Trend'),
                  ('seasonal', 'Seasonal'),
                  ('resid', 'Residual'))
        # got pandas use it
        has_own_plot = hasattr(self.observed, 'plot')

        for panel_ax, (attr, ylabel) in zip(axes, panels):
            component = getattr(self, attr)
            if has_own_plot:
                component.plot(ax=panel_ax, legend=False)
            else:
                panel_ax.plot(component)
            panel_ax.set_ylabel(ylabel)

        if not has_own_plot:
            # Plain arrays carry no index, so label and bound the x-axis here.
            axes[3].set_xlabel('Time')
            axes[3].set_xlim(0, self.nobs)

        fig.tight_layout()
        return fig
Exemple #8
0
 def test_baseline(self, close_figures):
     """Without ``ax``, ``_do_plot`` returns a figure/axes pair of its own."""
     mpl_plt = _import_mpl()
     result = gofplots._do_plot(self.x, self.y)
     new_fig, new_ax = result
     # Both returned objects have the expected matplotlib types ...
     assert isinstance(new_fig, mpl_plt.Figure)
     assert isinstance(new_ax, mpl_plt.Axes)
     # ... and neither is the object stored on the test instance.
     assert new_fig is not self.fig
     assert new_ax is not self.ax
Exemple #9
0
 def test_with_ax(self, close_figures):
     """With ``ax`` given, ``_do_plot`` returns that axes and its figure."""
     mpl_plt = _import_mpl()
     result = gofplots._do_plot(self.x, self.y, ax=self.ax)
     used_fig, used_ax = result
     # Both returned objects have the expected matplotlib types ...
     assert isinstance(used_fig, mpl_plt.Figure)
     assert isinstance(used_ax, mpl_plt.Axes)
     # ... and they are exactly the objects stored on the test instance.
     assert used_fig is self.fig
     assert used_ax is self.ax
Exemple #10
0
    def plot(self,
             observed=True,
             seasonal=True,
             trend=True,
             resid=True,
             weights=False):
        """
        Plot estimated components

        Parameters
        ----------
        observed : bool
            Include the observed series in the plot
        seasonal : bool
            Include the seasonal component in the plot
        trend : bool
            Include the trend component in the plot
        resid : bool
            Include the residual in the plot
        weights : bool
            Include the weights in the plot (if any)

        Returns
        -------
        matplotlib.figure.Figure
            The figure instance that contains the plot.

        Raises
        ------
        ValueError
            If no component is selected.
        """
        from statsmodels.graphics.utils import _import_mpl
        from pandas.plotting import register_matplotlib_converters
        plt = _import_mpl()
        register_matplotlib_converters()

        # (data, default name) for each requested panel, top to bottom.
        # Attributes are only read for components that were requested.
        panels = []
        if observed:
            panels.append((self._observed, 'Observed'))
        if trend:
            panels.append((self.trend, 'trend'))
        if seasonal:
            panels.append((self.seasonal, 'seasonal'))
        if resid:
            panels.append((self.resid, 'residual'))
        if weights:
            panels.append((self.weights, 'weights'))
        if not panels:
            # Fail before a figure is created; a grid with zero rows is
            # rejected by matplotlib anyway.
            raise ValueError('At least one component must be selected '
                             'for plotting.')

        if isinstance(self._observed, (pd.DataFrame, pd.Series)):
            nobs = self._observed.shape[0]
            xlim = self._observed.index[0], self._observed.index[nobs - 1]
        else:
            xlim = (0, self._observed.shape[0] - 1)

        # squeeze=False always returns a 2-d array of axes. With the default,
        # a single panel yields a bare Axes object that cannot be iterated.
        fig, axs = plt.subplots(len(panels), 1, squeeze=False)
        for i, (ax, (component, def_name)) in enumerate(zip(axs[:, 0],
                                                            panels)):
            if def_name != 'residual':
                ax.plot(component)
            else:
                # Residuals are drawn as points around a zero line.
                ax.plot(component, marker='o', linestyle='none')
                ax.plot(xlim, (0, 0), color='#000000', zorder=-3)
            name = getattr(component, 'name', def_name)
            if def_name != 'Observed':
                name = name.capitalize()
            # The observed series (top panel) is named in the title, every
            # other panel on its y-axis.
            title = ax.set_title if i == 0 and observed else ax.set_ylabel
            title(name)
            ax.set_xlim(xlim)

        fig.tight_layout()
        return fig
Exemple #11
0
    def plot_scree(self, ncomp=None):
        """
        Plot the ordered eigenvalues and variance explained for the loadings.

        Parameters
        ----------
        ncomp : int, optional
            Number of loadings to include in the plot.  If None, the maximum
            possible number of loadings is included.

        Returns
        -------
        Figure
            Handle to the figure.
        """
        _import_mpl()
        from .plots import plot_scree as _draw_scree

        # The plotting helper receives the eigenvalues, the number of
        # components stored on this object and the requested number to show.
        eigenvals = self.eigenvals
        total_ncomp = self.n_comp
        return _draw_scree(eigenvals, total_ncomp, ncomp)
Exemple #12
0
    def plot(self):
        """
        Plot the observed, seasonally adjusted, trend and irregular series.

        Each series is drawn through its own ``plot`` method in one of four
        stacked panels that share the x-axis.

        Returns
        -------
        Figure
            The figure holding the four panels.
        """
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()
        fig, axes = plt.subplots(4, 1, sharex=True)

        # Attribute holding the series and the y-label of its panel,
        # listed from the top panel to the bottom one.
        panels = (('observed', 'Observed'),
                  ('seasadj', 'Seas. Adjusted'),
                  ('trend', 'Trend'),
                  ('irregular', 'Irregular'))
        for panel_ax, (attr, ylabel) in zip(axes, panels):
            getattr(self, attr).plot(ax=panel_ax, legend=False)
            panel_ax.set_ylabel(ylabel)

        fig.tight_layout()
        return fig
Exemple #13
0
    def plot(self):
        """
        Plot the observed, seasonally adjusted, trend and irregular series.

        Each series is drawn through its own ``plot`` method in one of four
        stacked panels that share the x-axis.

        Returns
        -------
        Figure
            The figure holding the four panels.
        """
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()
        fig, axes = plt.subplots(4, 1, sharex=True)

        # Attribute holding the series and the y-label of its panel,
        # listed from the top panel to the bottom one.
        panels = (('observed', 'Observed'),
                  ('seasadj', 'Seas. Adjusted'),
                  ('trend', 'Trend'),
                  ('irregular', 'Irregular'))
        for panel_ax, (attr, ylabel) in zip(axes, panels):
            getattr(self, attr).plot(ax=panel_ax, legend=False)
            panel_ax.set_ylabel(ylabel)

        fig.tight_layout()
        return fig
Exemple #14
0
    def plot_path(self):
        """
        Plot ``cv_error`` against ``alphas`` with a +/- 1.96 ``cv_std`` band.

        The error curve is drawn in black and the two band curves in blue,
        first as lines and then again as 'o' markers. Drawing goes through
        the pyplot interface (``plt.plot``), and nothing is returned.
        """
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()

        # First pass: default line format; second pass: 'o' markers.
        for fmt in ((), ('o',)):
            plt.plot(self.alphas, self.cv_error, *fmt, c='black')
            plt.plot(self.alphas, self.cv_error + 1.96 * self.cv_std, *fmt,
                     c='blue')
            plt.plot(self.alphas, self.cv_error - 1.96 * self.cv_std, *fmt,
                     c='blue')

        return None
Exemple #15
0
    def plot_cusum_squares(self, alpha=0.05, legend_loc='upper left',
                           fig=None, figsize=None):
        r"""
        Plot the CUSUM of squares statistic and significance bounds.

        Parameters
        ----------
        alpha : float, optional
            The plotted significance bounds are alpha %.
        legend_loc : string, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        Figure
            The figure that contains the plot.

        Notes
        -----
        Evidence of parameter instability may be found if the CUSUM of squares
        statistic moves out of the significance bounds.

        Critical values used in creating the significance bounds are computed
        using the approximate formula of [2]_.

        References
        ----------
        .. [1] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
           "Techniques for Testing the Constancy of
           Regression Relationships over Time."
           Journal of the Royal Statistical Society.
           Series B (Methodological) 37 (2): 149-92.
        .. [2] Edgerton, David, and Curt Wells. 1994.
           "Critical Values for the Cusumsq Statistic
           in Medium and Large Sized Samples."
           Oxford Bulletin of Economics and Statistics 56 (3): 355-65.

        """
        # Create the plot
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        _import_mpl()
        fig = create_mpl_fig(fig, figsize)
        ax = fig.add_subplot(1, 1, 1)

        # Get dates, if applicable
        if hasattr(self.data, 'dates') and self.data.dates is not None:
            dates = self.data.dates._mpl_repr()
        else:
            dates = np.arange(self.nobs)
        # The plotted series starts after the first llb observations.
        llb = self.loglikelihood_burn
        endpoints = [dates[llb], dates[-1]]

        # Plot cusum series and reference line
        ax.plot(dates[llb:], self.cusum_squares, label='CUSUM of squares')
        # Straight reference line starting at 0 at the first plotted date.
        ref_line = (np.arange(llb, self.nobs) - llb) / (self.nobs - llb)
        ax.plot(dates[llb:], ref_line, 'k', alpha=0.3)

        # Plot significance bounds
        lower_line, upper_line = self._cusum_squares_significance_bounds(alpha)
        # %g instead of %d: %d truncates the float, so e.g. alpha=0.29
        # (0.29 * 100 == 28.999999999999996) would be labelled "28%" and
        # alpha=0.025 would be labelled "2%".
        ax.plot(endpoints, upper_line, 'k--',
                label='%.3g%% significance' % (alpha * 100))
        ax.plot(endpoints, lower_line, 'k--')

        ax.legend(loc=legend_loc)

        return fig
Exemple #16
0
    def plot_recursive_coefficient(
        self,
        variables=None,
        alpha=0.05,
        legend_loc="upper left",
        fig=None,
        figsize=None,
    ):
        r"""
        Plot the recursively estimated coefficients on a given variable

        Parameters
        ----------
        variables : {int, str, Iterable[int], Iterable[str], None}, optional
            Integer index or string name of the variables whose coefficients
            to plot. Can also be an iterable of integers or strings. Default
            plots all coefficients.
        alpha : float, optional
            The confidence intervals for the coefficient are (1 - alpha)%. Set
            to None to exclude confidence intervals.
        legend_loc : str, optional
            The location of the legend in the plot. Default is upper left.
        fig : Figure, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        Figure
            The matplotlib Figure object.

        Raises
        ------
        ValueError
            If an entry of `variables` is neither a known parameter name nor
            an integer.
        """
        import pandas as pd

        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig

        if alpha is not None:
            ci = self._conf_int(alpha, None)

        row_labels = self.model.data.row_labels
        if row_labels is None:
            row_labels = np.arange(self._params.shape[0])
        k_variables = self._params.shape[1]
        param_names = self.model.data.param_names
        if variables is None:
            variable_idx = list(range(k_variables))
        else:
            if isinstance(variables, (int, str)):
                variables = [variables]
            variable_idx = []
            # Iterate directly so that any iterable (set, generator, ...) is
            # accepted, not only sequences that support len() and indexing.
            for variable in variables:
                if variable in param_names:
                    variable_idx.append(param_names.index(variable))
                elif isinstance(variable, int):
                    variable_idx.append(variable)
                else:
                    msg = ("variable {0} is not an integer and was not found "
                           "in the list of variable "
                           "names: {1}".format(variable,
                                               ", ".join(param_names)))
                    raise ValueError(msg)

        _import_mpl()
        fig = create_mpl_fig(fig, figsize)

        if isinstance(row_labels, pd.PeriodIndex):
            row_labels = row_labels.to_timestamp()
        row_labels = np.asarray(row_labels)
        n_panels = len(variable_idx)
        for panel, i in enumerate(variable_idx):
            ax = fig.add_subplot(n_panels, 1, panel + 1)
            params = self._params[:, i]
            # Only rows with a non-NaN estimate are plotted.
            valid = ~np.isnan(params)
            row_lbl = row_labels[valid]
            ax.plot(row_lbl, params[valid])
            if alpha is not None:
                this_ci = np.reshape(ci[:, :, i], (-1, 2))
                if not np.all(np.isnan(this_ci)):
                    ax.plot(row_lbl,
                            this_ci[:, 0][valid],
                            "k:",
                            label="Lower CI")
                    ax.plot(row_lbl,
                            this_ci[:, 1][valid],
                            "k:",
                            label="Upper CI")
                    # Only the first panel carries the legend.
                    if panel == 0:
                        ax.legend(loc=legend_loc)
            # With no valid estimate there are no end points to index.
            if row_lbl.size:
                ax.set_xlim(row_lbl[0], row_lbl[-1])
            ax.set_title(param_names[i])

        fig.tight_layout()
        return fig
Exemple #17
0
    def plot(
        self,
        observed=True,
        seasonal=True,
        trend=True,
        resid=True,
        weights=False,
    ):
        """
        Plot estimated components

        Parameters
        ----------
        observed : bool
            Include the observed series in the plot
        seasonal : bool
            Include the seasonal component in the plot
        trend : bool
            Include the trend component in the plot
        resid : bool
            Include the residual in the plot
        weights : bool
            Include the weights in the plot (if any)

        Returns
        -------
        matplotlib.figure.Figure
            The figure instance that contains the plot.

        Raises
        ------
        ValueError
            If no component is selected.
        """
        from pandas.plotting import register_matplotlib_converters

        from statsmodels.graphics.utils import _import_mpl

        plt = _import_mpl()
        register_matplotlib_converters()

        # (data, default name) for each requested panel, top to bottom.
        # Attributes are only read for components that were requested.
        panels = []
        if observed:
            panels.append((self._observed, "Observed"))
        if trend:
            panels.append((self.trend, "trend"))
        if seasonal:
            if self.seasonal.ndim == 1:
                panels.append((self.seasonal, "seasonal"))
            elif self.seasonal.ndim > 1:
                # One panel per seasonal column.
                if isinstance(self.seasonal, pd.DataFrame):
                    panels.extend((self.seasonal[col], "seasonal")
                                  for col in self.seasonal.columns)
                else:
                    panels.extend((self.seasonal[:, i], "seasonal")
                                  for i in range(self.seasonal.shape[1]))
        if resid:
            panels.append((self.resid, "residual"))
        if weights:
            panels.append((self.weights, "weights"))
        if not panels:
            # Fail before a figure is created; a grid with zero rows is
            # rejected by matplotlib anyway.
            raise ValueError("At least one component must be selected "
                             "for plotting.")

        if isinstance(self._observed, (pd.DataFrame, pd.Series)):
            nobs = self._observed.shape[0]
            xlim = self._observed.index[0], self._observed.index[nobs - 1]
        else:
            xlim = (0, self._observed.shape[0] - 1)

        # squeeze=False always returns a 2-d array of axes. With the default,
        # a single panel yields a bare Axes object that cannot be iterated.
        fig, axs = plt.subplots(len(panels), 1, squeeze=False)
        for i, (ax, (component, def_name)) in enumerate(zip(axs[:, 0],
                                                            panels)):
            if def_name != "residual":
                ax.plot(component)
            else:
                # Residuals are drawn as points around a zero line.
                ax.plot(component, marker="o", linestyle="none")
                ax.plot(xlim, (0, 0), color="#000000", zorder=-3)
            name = getattr(component, "name", def_name)
            if def_name != "Observed":
                name = name.capitalize()
            # The observed series (top panel) is named in the title, every
            # other panel on its y-axis.
            title = ax.set_title if i == 0 and observed else ax.set_ylabel
            title(name)
            ax.set_xlim(xlim)

        fig.tight_layout()
        return fig
Exemple #18
0
    def plot_recursive_coefficient(self, variables=0, alpha=0.05,
                                   legend_loc='upper left', fig=None,
                                   figsize=None):
        r"""
        Plot the recursively estimated coefficients on a given variable

        Parameters
        ----------
        variables : int or str or iterable of int or string, optional
            Integer index or string name of the variable whose coefficient will
            be plotted. Can also be an iterable of integers or strings. Default
            is the first variable.
        alpha : float, optional
            The confidence intervals for the coefficient are (1 - alpha) %.
            If None, no confidence intervals are drawn.
        legend_loc : string, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        Figure
            The figure that contains the plots.

        Notes
        -----
        All plots contain (1 - `alpha`) %  confidence intervals.
        """
        # Get variables
        if isinstance(variables, (int, str)):
            variables = [variables]

        # If a string was given for `variable`, try to get it from exog names.
        # The indices go into a new list: the caller's object is not modified
        # and immutable or non-indexable iterables are accepted as well.
        exog_names = self.model.exog_names
        variables = [
            exog_names.index(variable) if isinstance(variable, str)
            else variable
            for variable in variables
        ]
        k_variables = len(variables)

        # Create the plot
        from scipy.stats import norm
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        plt = _import_mpl()
        fig = create_mpl_fig(fig, figsize)

        # Get dates, if applicable (identical for every subplot)
        if hasattr(self.data, 'dates') and self.data.dates is not None:
            dates = self.data.dates._mpl_repr()
        else:
            dates = np.arange(self.nobs)
        # The plotted series start after the first d observations.
        d = max(self.nobs_diffuse, self.loglikelihood_burn)
        coef = self.recursive_coefficients

        # Get the critical value for confidence intervals
        if alpha is not None:
            critical_value = norm.ppf(1 - alpha / 2.)
            ci_label = ('$%.3g \\%%$ confidence interval'
                        % ((1 - alpha)*100))

        for i, variable in enumerate(variables):
            ax = fig.add_subplot(k_variables, 1, i + 1)

            # Plot the coefficient
            ax.plot(dates[d:], coef.filtered[variable, d:],
                    label='Recursive estimates: %s' % exog_names[variable])

            # Legend
            handles, labels = ax.get_legend_handles_labels()

            if alpha is not None:
                # Plot confidence intervals
                std_errors = np.sqrt(coef.filtered_cov[variable, variable, :])
                ci_lower = (
                    coef.filtered[variable] - critical_value * std_errors)
                ci_upper = (
                    coef.filtered[variable] + critical_value * std_errors)
                ci_poly = ax.fill_between(
                    dates[d:], ci_lower[d:], ci_upper[d:], alpha=0.2
                )

                # Only add CI to legend for the first plot
                if i == 0:
                    # Proxy artist for fill_between legend entry
                    # See https://matplotlib.org/1.3.1/users/legend_guide.html
                    p = plt.Rectangle((0, 0), 1, 1,
                                      fc=ci_poly.get_facecolor()[0])

                    handles.append(p)
                    labels.append(ci_label)

            ax.legend(handles, labels, loc=legend_loc)

            # Remove xticks for all but the last plot
            if i < k_variables - 1:
                ax.xaxis.set_ticklabels([])

        fig.tight_layout()

        return fig
Exemple #19
0
def hdrboxplot(data,
               ncomp=2,
               alpha=None,
               threshold=0.95,
               bw=None,
               xdata=None,
               labels=None,
               ax=None,
               use_brute=False,
               seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns as many as the
        smaller of the number of rows or columns in data.
    alpha : iterable of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The object passed
        in is only read; it is never modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices (or with
        ``data.index`` when `data` has an ``index`` attribute).
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False. The brute force
        optimizer is also used when differential evolution is unavailable.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outlier curves in
            `data`.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r,
                                  bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # The levels are gathered in a fresh set so that the sequence supplied by
    # the caller is left untouched and duplicates collapse.
    quantile_levels = {threshold, 0.9, 0.5}
    if alpha is not None:
        quantile_levels.update(alpha)
    alpha = sorted(quantile_levels, reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is np.percentile's default; the keyword is left
    # out because its spelling differs between NumPy releases.
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find mean, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: -ks_gaussian.pdf(x),
                                        bounds=bounds,
                                        maxiter=5,
                                        seed=seed).x
    else:
        median = brute(lambda x: -ks_gaussian.pdf(x),
                       ranges=bounds,
                       finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        if len(band) > 1:
            max_pdf = pvalues[alpha.index(band[1])]
        else:
            # Single quantile: no upper constraint on the PDF value.
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        tasks = zip(
            range(dim),
            itertools.repeat(
                (pdf_band, pca, bounds, ks_gaussian, seed, use_brute)))
        pool = Pool()
        try:
            per_dim = pool.map(_min_max_band, tasks)
        finally:
            # Always release the worker pool, even if the optimisation raises.
            pool.terminate()
            pool.close()

        return list(zip(*per_dim))

    extra_alpha = [
        i for i in alpha if 0.5 != i and 0.9 != i and threshold != i
    ]
    extra_quantiles = [
        curve
        for level in extra_alpha
        for curve in _band_quantiles([level], use_brute=use_brute, seed=seed)
    ]

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T,
            data.T,
            c='c',
            alpha=.1,
            label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = [
        ax.fill_between(xdata,
                        *hdr_50,
                        color='gray',
                        alpha=.4,
                        label='50% HDR'),
        ax.fill_between(xdata,
                        *hdr_90,
                        color='gray',
                        alpha=.3,
                        label='90% HDR'),
    ]

    if extra_quantiles:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y',
                ls='-.',
                alpha=.4,
                label='Extra quantiles')

    # One dashed line per outlier, labelled with its identifier.
    for outlier_label, outlier in zip(labels_outlier, outliers):
        ax.plot(xdata, outlier, ls='--', alpha=0.7, label=str(outlier_label))

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See https://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1, fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    # Keying by label keeps a single legend entry per distinct label.
    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        # With outliers present the legend lists only the remaining entries.
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
Exemple #20
0
def plot_predict(
    result,
    start=None,
    end=None,
    dynamic=False,
    alpha=0.05,
    ax=None,
    **predict_kwargs,
):
    """
    Plot the predictions of a fitted model with an optional interval band.

    Parameters
    ----------
    result : Result
        Any model result supporting ``get_prediction``.
    start : int, str, or datetime, optional
        Zero-indexed observation number at which to start forecasting,
        i.e., the first forecast is start. Can also be a date string to
        parse or a datetime type. Default is the zeroth observation.
    end : int, str, or datetime, optional
        Zero-indexed observation number at which to end forecasting, i.e.,
        the last forecast is end. Can also be a date string to
        parse or a datetime type. However, if the dates index does not
        have a fixed frequency, end must be an integer index if you
        want out of sample prediction. Default is the last observation in
        the sample.
    dynamic : bool, int, str, or datetime, optional
        Integer offset relative to `start` at which to begin dynamic
        prediction. Can also be an absolute date string to parse or a
        datetime type (these are not interpreted as offsets).
        Prior to this observation, true endogenous values will be used for
        prediction; starting with this observation and continuing through
        the end of prediction, forecasted endogenous values will be used
        instead.
    alpha : {float, None}
        The tail probability not covered by the confidence interval. Must
        be in (0, 1). Confidence interval is constructed assuming normally
        distributed shocks. If None, figure will not show the confidence
        interval.
    ax : AxesSubplot
        matplotlib Axes instance to use
    **predict_kwargs
        Any additional keyword arguments to pass to ``result.get_prediction``.

    Returns
    -------
    Figure
        matplotlib Figure containing the prediction plot
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_ax

    _import_mpl()
    fig, ax = create_mpl_ax(ax)
    from statsmodels.tsa.base.prediction import PredictionResults

    # Going through get_prediction keeps any date index on the output.
    pred: PredictionResults = result.get_prediction(
        start=start, end=end, dynamic=dynamic, **predict_kwargs
    )
    forecast = pred.predicted_mean
    if not isinstance(forecast, (pd.Series, pd.DataFrame)):
        # Plain arrays carry no index: fall back to positional x values.
        x_values = np.arange(forecast.shape[0])
        ax.plot(x_values, forecast)
    else:
        x_values = forecast.index
        forecast.plot(ax=ax, label="forecast")

    if alpha is not None:
        band_label = f"{1-alpha:.0%} confidence interval"
        limits = np.asarray(pred.conf_int(alpha))
        lower, upper = limits[:, 0], limits[:, 1]
        ax.fill_between(
            x_values, lower, upper, color="gray", alpha=0.5, label=band_label
        )

    ax.legend(loc="best")

    return fig
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, returns as many as the
        smaller of the number of rows or columns in data.
    alpha : iterable of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The object passed
        in is only read; it is never modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices (or with
        ``data.index`` when `data` has an ``index`` attribute).
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outlier curves in
            `data`.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # The levels are gathered in a fresh set so that the sequence supplied by
    # the caller is left untouched and duplicates collapse.
    quantile_levels = {threshold, 0.9, 0.5}
    if alpha is not None:
        quantile_levels.update(alpha)
    alpha = sorted(quantile_levels, reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is np.percentile's default; the keyword is left
    # out because its spelling differs between NumPy releases.
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find mean, outliers curves
    if have_de_optim:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band):
        """Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        if len(band) > 1:
            max_pdf = pvalues[alpha.index(band[1])]
        else:
            # Single quantile: no upper constraint on the PDF value.
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        tasks = zip(range(dim),
                    itertools.repeat((pdf_band, pca, bounds, ks_gaussian)))
        pool = Pool()
        try:
            per_dim = pool.map(_min_max_band, tasks)
        finally:
            # Always release the worker pool, even if the optimisation raises.
            pool.terminate()
            pool.close()

        return list(zip(*per_dim))

    extra_alpha = [i for i in alpha
                   if 0.5 != i and 0.9 != i and threshold != i]
    extra_quantiles = [curve
                       for level in extra_alpha
                       for curve in _band_quantiles([level])]

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5])
    hdr_50 = _band_quantiles([0.5])

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = [
        ax.fill_between(xdata, *hdr_50, color='gray',
                        alpha=.4, label='50% HDR'),
        ax.fill_between(xdata, *hdr_90, color='gray',
                        alpha=.3, label='90% HDR'),
    ]

    if extra_quantiles:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    # One dashed line per outlier, labelled with its identifier.
    for outlier_label, outlier in zip(labels_outlier, outliers):
        ax.plot(xdata, outlier,
                ls='--', alpha=0.7, label=str(outlier_label))

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    # Keying by label keeps a single legend entry per distinct label.
    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        # With outliers present the legend lists only the remaining entries.
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
Exemple #22
0
    def plot_predict(
        self,
        steps: int = 1,
        theta: float = 2,
        alpha: Optional[float] = 0.05,
        in_sample: bool = False,
        fig: Optional["matplotlib.figure.Figure"] = None,
        figsize: Tuple[float, float] = None,
    ) -> "matplotlib.figure.Figure":
        r"""
        Draw the forecasts, optionally with intervals and the observed data

        Parameters
        ----------
        steps : int, default 1
            The number of steps ahead to compute the forecast components.
        theta : float, default 2
            The theta value to use when computing the weight to combine
            the trend and the SES forecasts.
        alpha : {float, None}, default 0.05
            The tail probability not covered by the confidence interval. Must
            be in (0, 1). Confidence interval is constructed assuming normally
            distributed shocks. If None, figure will not show the confidence
            interval.
        in_sample : bool, default False
            Flag indicating whether to include the in-sample period in the
            plot.
        fig : Figure, default None
            An existing figure handle. If not provided, a new figure is
            created.
        figsize: tuple[float, float], default None
            Tuple containing the figure size.

        Returns
        -------
        Figure
            Figure handle containing the plot.

        Notes
        -----
        The variance of the h-step forecast is assumed to follow from the
        integrated Moving Average structure of the Theta model, and so is
        :math:`\sigma^2(\alpha^2 + (h-1))`. The prediction interval assumes
        that innovations are normally distributed.
        """
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig

        _import_mpl()
        fig = create_mpl_fig(fig, figsize)
        assert fig is not None
        forecasts = self.forecast(steps, theta)
        forecast_index = forecasts.index

        ax = fig.add_subplot(111)
        n_observed = self.model.endog_orig.shape[0]
        # Positional index, replaced below when the data carry their own.
        sample_index = NumericIndex(np.arange(n_observed))
        if in_sample:
            observed = self.model.endog_orig
            if isinstance(observed, pd.Series):
                sample_index = observed.index
            ax.plot(sample_index, observed)
        ax.plot(forecast_index, forecasts)

        if alpha is not None:
            intervals = self.prediction_intervals(steps, theta, alpha)
            band_label = f"{1 - alpha:.0%} confidence interval"
            ax.fill_between(
                forecast_index,
                intervals["lower"],
                intervals["upper"],
                color="gray",
                alpha=0.5,
                label=band_label,
            )

        ax.legend(loc="best", frameon=False)
        fig.tight_layout(pad=1.0)

        return fig
Exemple #23
0
    def plot_cusum_squares(self,
                           alpha=0.05,
                           legend_loc='upper left',
                           fig=None,
                           figsize=None):
        r"""
        Plot the CUSUM of squares statistic and significance bounds.

        Parameters
        ----------
        alpha : float, optional
            The plotted significance bounds are alpha %.
        legend_loc : string, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        fig : Matplotlib Figure instance
            The figure containing the plot.

        Notes
        -----
        Evidence of parameter instability may be found if the CUSUM of squares
        statistic moves out of the significance bounds.

        Critical values used in creating the significance bounds are computed
        using the approximate formula of [2]_.

        References
        ----------
        .. [1] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
           "Techniques for Testing the Constancy of
           Regression Relationships over Time."
           Journal of the Royal Statistical Society.
           Series B (Methodological) 37 (2): 149-92.
        .. [2] Edgerton, David, and Curt Wells. 1994.
           "Critical Values for the Cusumsq Statistic
           in Medium and Large Sized Samples."
           Oxford Bulletin of Economics and Statistics 56 (3): 355-65.

        """
        # Create the plot
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        _import_mpl()
        fig = create_mpl_fig(fig, figsize)
        ax = fig.add_subplot(1, 1, 1)

        # Get dates, if applicable
        if hasattr(self.data, 'dates') and self.data.dates is not None:
            dates = self.data.dates._mpl_repr()
        else:
            dates = np.arange(self.nobs)
        llb = self.loglikelihood_burn

        # Plot cusum series and reference line
        ax.plot(dates[llb:], self.cusum_squares, label='CUSUM of squares')
        ref_line = (np.arange(llb, self.nobs) - llb) / (self.nobs - llb)
        ax.plot(dates[llb:], ref_line, 'k', alpha=0.3)

        # Plot significance bounds
        lower_line, upper_line = self._cusum_squares_significance_bounds(alpha)
        # %.3g rather than %d: %d truncates the float, so e.g. alpha=0.29
        # (0.29 * 100 == 28.999...) would be labelled "28%" and alpha=0.025
        # would be labelled "2%".
        significance_label = '%.3g%% significance' % (alpha * 100)
        ax.plot([dates[llb], dates[-1]],
                upper_line,
                'k--',
                label=significance_label)
        ax.plot([dates[llb], dates[-1]], lower_line, 'k--')

        ax.legend(loc=legend_loc)

        return fig
Exemple #24
0
def plot_diagnostics(residuals,
                     variable=0,
                     lags=40,
                     fig=None,
                     figsize=(15, 7),
                     savefig=False,
                     path=None):
    """
    Draw a 2x2 grid of diagnostic plots for a series of residuals.

    The residuals are standardized (NaN-aware mean and standard deviation)
    and then shown in four panels:

    - top-left: standardized residuals against their integer position,
      with a horizontal line at zero;
    - top-right: density-normalized histogram of the non-missing values,
      overlaid with a Gaussian kernel density estimate and the N(0,1)
      density;
    - bottom-left: normal Q-Q plot of the non-missing values;
    - bottom-right: partial autocorrelation function, with the y-axis
      limited to [-0.1, 0.1].

    Parameters
    ----------
    residuals : array_like
        Residual series to diagnose. It is converted to a float array; all
        values passed in are used (nothing is trimmed from the start).
    variable : int, optional
        Not used by this function; kept so existing callers that pass it
        keep working.
    lags : int, optional
        Forwarded to `plot_pacf` as the `lags` argument. Default is 40.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height). Default is (15, 7).
    savefig : bool, optional
        If true, a 'Residual diagnostic' super-title is added, the figure
        is written to `path` at 500 dpi and `fig.show()` is called.
    path : str or path-like, optional
        Destination passed to `fig.savefig`. Required when `savefig` is
        true.

    Returns
    -------
    fig : Matplotlib Figure instance
        The figure holding the four panels.

    Raises
    ------
    ValueError
        If `savefig` is true but no `path` was given.
    """
    # Fail fast: without a destination the save step at the end cannot
    # succeed, so reject the call before doing any plotting work.
    if savefig and path is None:
        raise ValueError('`path` must be provided when `savefig` is True.')

    # Local import, matching the other plotting routines in this file.
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    _import_mpl()
    fig = create_mpl_fig(fig, figsize)

    # Standardize residual
    # Source: https://alkaline-ml.com/pmdarima/1.1.1/_modules/pmdarima/arima/arima.html
    resid = np.asarray(residuals, dtype=float)
    resid = (resid - np.nanmean(resid)) / np.nanstd(resid)

    # Top-left: residuals vs time
    ax = fig.add_subplot(221)
    idx = np.arange(len(resid))
    ax.plot(idx, resid)
    ax.hlines(0, idx[0], idx[-1], alpha=0.5)
    ax.set_xlim(idx[0], idx[-1])
    ax.set_title('Standardized residual')

    # Top-right: histogram, Gaussian kernel density, Normal density
    # Can only do histogram and Gaussian kernel density on the non-null
    # elements
    resid_nonmissing = resid[~np.isnan(resid)]
    ax = fig.add_subplot(222)

    # gh5792: Remove  except after support for matplotlib>2.1 required
    try:
        ax.hist(resid_nonmissing, density=True, label='Hist')
    except AttributeError:
        ax.hist(resid_nonmissing, normed=True, label='Hist')

    from scipy.stats import gaussian_kde, norm
    kde = gaussian_kde(resid_nonmissing)
    xlim = (-1.96 * 2, 1.96 * 2)
    grid = np.linspace(xlim[0], xlim[1])
    ax.plot(grid, kde(grid), label='KDE')
    ax.plot(grid, norm.pdf(grid), label='N(0,1)')
    ax.set_xlim(xlim)
    ax.legend()
    ax.set_title('Histogram plus estimated density')

    # Bottom-left: QQ plot
    ax = fig.add_subplot(223)
    from statsmodels.graphics.gofplots import qqplot
    qqplot(resid_nonmissing, line='s', ax=ax)
    ax.set_title('Normal Q-Q')

    # Bottom-right: Correlogram
    ax = fig.add_subplot(224)
    from statsmodels.graphics.tsaplots import plot_pacf
    plot_pacf(resid, ax=ax, lags=lags)
    ax.set_title('Partial Autocorrelation function')

    ax.set_ylim(-0.1, 0.1)

    if savefig:
        fig.suptitle('Residual diagnostic', fontsize=20)
        fig.savefig(path, dpi=500)
        fig.show()
    return fig
    def plot_recursive_coefficient(self, variables=0, alpha=0.05,
                                   legend_loc='upper left', fig=None,
                                   figsize=None):
        r"""
        Plot the recursively estimated coefficients on a given variable

        Parameters
        ----------
        variables : int or str or iterable of int or string, optional
            Integer index or string name of the variable whose coefficient will
            be plotted. Can also be an iterable of integers or strings. Default
            is the first variable. The argument itself is never modified.
        alpha : float, optional
            The confidence intervals for the coefficient are (1 - alpha) %.
            If None, no confidence intervals are drawn.
        legend_loc : string, optional
            The location of the legend in the plot. Default is upper left.
        fig : Matplotlib Figure instance, optional
            If given, subplots are created in this figure instead of in a new
            figure. Note that the grid will be created in the provided
            figure using `fig.add_subplot()`.
        figsize : tuple, optional
            If a figure is created, this argument allows specifying a size.
            The tuple is (width, height).

        Returns
        -------
        fig : Matplotlib Figure instance
            Figure with one subplot per requested variable, stacked
            vertically.

        Notes
        -----
        All plots contain (1 - `alpha`) %  confidence intervals.
        """
        # Get variables
        if isinstance(variables, (int, str)):
            variables = [variables]

        # If a string was given for a variable, look it up in the exog names.
        # The indices go into a new list so the caller's sequence is left
        # untouched and immutable inputs (e.g. tuples) are accepted too.
        exog_names = self.model.exog_names
        variables = [
            exog_names.index(variable) if isinstance(variable, str)
            else variable
            for variable in variables
        ]
        k_variables = len(variables)

        # Create the plot
        from scipy.stats import norm
        from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
        plt = _import_mpl()
        fig = create_mpl_fig(fig, figsize)

        # The quantities below do not depend on the variable being plotted,
        # so compute them once rather than on every pass through the loop.
        # Get dates, if applicable
        if hasattr(self.data, 'dates') and self.data.dates is not None:
            dates = self.data.dates._mpl_repr()
        else:
            dates = np.arange(self.nobs)
        # Number of leading observations left out of every plot
        d = max(self.nobs_diffuse, self.loglikelihood_burn)
        coef = self.recursive_coefficients
        if alpha is not None:
            # Critical value and label for the confidence intervals
            critical_value = norm.ppf(1 - alpha / 2.)
            ci_label = ('$%.3g \\%%$ confidence interval'
                        % ((1 - alpha)*100))

        for i, variable in enumerate(variables):
            ax = fig.add_subplot(k_variables, 1, i + 1)

            # Plot the coefficient
            ax.plot(dates[d:], coef.filtered[variable, d:],
                    label='Recursive estimates: %s' % exog_names[variable])

            # Legend
            handles, labels = ax.get_legend_handles_labels()

            if alpha is not None:
                # Plot confidence intervals
                std_errors = np.sqrt(coef.filtered_cov[variable, variable, :])
                ci_lower = (
                    coef.filtered[variable] - critical_value * std_errors)
                ci_upper = (
                    coef.filtered[variable] + critical_value * std_errors)
                ci_poly = ax.fill_between(
                    dates[d:], ci_lower[d:], ci_upper[d:], alpha=0.2
                )

                # Only add CI to legend for the first plot
                if i == 0:
                    # Proxy artist for fill_between legend entry
                    # See http://matplotlib.org/1.3.1/users/legend_guide.html
                    p = plt.Rectangle((0, 0), 1, 1,
                                      fc=ci_poly.get_facecolor()[0])

                    handles.append(p)
                    labels.append(ci_label)

            ax.legend(handles, labels, loc=legend_loc)

            # Remove xticks for all but the last plot
            if i < k_variables - 1:
                ax.xaxis.set_ticklabels([])

        fig.tight_layout()

        return fig