def plot_loadings(self, loading_pairs=None, plot_prerotated=False):
    """
    Draw 2-d plots of the factor loadings.

    Parameters
    ----------
    loading_pairs : None or a list of tuples
        Selects the figures to draw. Each tuple (i, j) produces one
        figure with loading i on the x-axis and loading j on the y-axis.
        If `None`, all combinations of the loadings will be plotted.
    plot_prerotated : True or False
        If True, the loadings before rotation are plotted. If False, the
        rotated loadings are plotted. When no rotation method is set,
        the prerotated loadings are always used.

    Returns
    -------
    figs : a list of figure handles
    """
    _import_mpl()
    from .plots import plot_loadings

    # Without a rotation there is nothing but the prerotated loadings.
    use_prerotated = self.rotation_method is None or plot_prerotated
    if use_prerotated:
        loadings = self.loadings_no_rot
        title = 'Prerotated Factor Pattern'
    else:
        loadings = self.loadings
        title = '%s Rotated Factor Pattern' % (self.rotation_method)

    percent_variance = self.eigenvals / self.n_comp * 100
    return plot_loadings(
        loadings,
        loading_pairs=loading_pairs,
        title=title,
        row_names=self.endog_names,
        percent_variance=percent_variance,
    )
def plot_partial(self, smooth_index, plot_se=True, cpr=False,
                 include_constant=True, ax=None):
    """plot the contribution of a smooth term to the linear prediction

    Parameters
    ----------
    smooth_index : int
        index of the smooth term within list of smooth terms
    plot_se : bool
        If plot_se is true, then the confidence interval for the linear
        prediction will be added to the plot (drawn as the estimate
        plus/minus 1.96 standard errors).
    cpr : bool
        If cpr (component plus residual) is true, then a scatter plot of
        the partial working residuals will be added to the plot.
    include_constant : bool
        If true, then the estimated intercept is added to the prediction
        and its standard errors. This avoids that the confidence interval
        has zero width at the imposed identification constraint, e.g.
        either at a reference point or at the mean.
    ax : None or matplotlib axis instance
        If ax is not None, then the plot will be added to it.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the Figure to
        which `ax` is connected.
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_ax
    _import_mpl()

    variable = smooth_index
    y_est, se = self.partial_values(variable,
                                    include_constant=include_constant)
    smoother = self.model.smoother
    x = smoother.smoothers[variable].x

    # Everything drawn against x must be put in the same sorted order.
    sort_index = np.argsort(x)
    x = x[sort_index]
    y_est = y_est[sort_index]
    se = se[sort_index]

    fig, ax = create_mpl_ax(ax)
    ax.plot(x, y_est, c='blue', lw=2)
    if plot_se:
        ax.plot(x, y_est + 1.96 * se, '-', c='blue')
        ax.plot(x, y_est - 1.96 * se, '-', c='blue')
    if cpr:
        # TODO: resid_response does not make sense with nonlinear link
        # use resid_working ?
        # The residuals are in observation order; reorder them like x and
        # y_est, otherwise each residual is paired with the wrong point.
        residual = np.asarray(self.resid_working)[sort_index]
        cpr_ = y_est + residual
        ax.plot(x, cpr_, '.', lw=2)

    ax.set_xlabel(smoother.smoothers[variable].variable_name)

    return fig
def plot_cusum(self, alpha=0.05, legend_loc='upper left', fig=None,
               figsize=None):
    r"""
    Plot the CUSUM statistic together with its significance bounds.

    Parameters
    ----------
    alpha : float, optional
        The plotted significance bounds are alpha %.
    legend_loc : string, optional
        The location of the legend in the plot. Default is upper left.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Notes
    -----
    Evidence of parameter instability may be found if the CUSUM statistic
    moves out of the significance bounds.

    References
    ----------
    .. [*] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
       "Techniques for Testing the Constancy of
       Regression Relationships over Time."
       Journal of the Royal Statistical Society.
       Series B (Methodological) 37 (2): 149-92.
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    _import_mpl()
    fig = create_mpl_fig(fig, figsize)
    ax = fig.add_subplot(1, 1, 1)

    # x-axis values: the date index when one exists, else 0..nobs-1
    date_index = getattr(self.data, 'dates', None)
    if date_index is None:
        dates = np.arange(self.nobs)
    else:
        dates = date_index._mpl_repr()

    # Leading observations that are not plotted
    start = max(self.nobs_diffuse, self.loglikelihood_burn)

    # CUSUM series and the zero reference line
    ax.plot(dates[start:], self.cusum, label='CUSUM')
    endpoints = [dates[start], dates[-1]]
    ax.hlines(0, endpoints[0], endpoints[1], color='k', alpha=0.3)

    # Significance bounds, drawn between the two endpoints
    lower_line, upper_line = self._cusum_significance_bounds(alpha)
    bound_label = '%d%% significance' % (alpha * 100)
    ax.plot(endpoints, upper_line, 'k--', label=bound_label)
    ax.plot(endpoints, lower_line, 'k--')

    ax.legend(loc=legend_loc)

    return fig
def plot_cusum(self, alpha=0.05, legend_loc='upper left', fig=None,
               figsize=None):
    r"""
    Draw the CUSUM statistic and the corresponding significance bounds.

    Parameters
    ----------
    alpha : float, optional
        The plotted significance bounds are alpha %.
    legend_loc : string, optional
        The location of the legend in the plot. Default is upper left.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Notes
    -----
    Evidence of parameter instability may be found if the CUSUM statistic
    moves out of the significance bounds.

    References
    ----------
    .. [*] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
       "Techniques for Testing the Constancy of
       Regression Relationships over Time."
       Journal of the Royal Statistical Society.
       Series B (Methodological) 37 (2): 149-92.
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    _import_mpl()
    fig = create_mpl_fig(fig, figsize)
    ax = fig.add_subplot(1, 1, 1)

    # Use the date index for the x-axis when the data carries one
    has_dates = getattr(self.data, 'dates', None) is not None
    if has_dates:
        dates = self.data.dates._mpl_repr()
    else:
        dates = np.arange(self.nobs)

    # Number of initial observations excluded from the plot
    skip = max(self.nobs_diffuse, self.loglikelihood_burn)

    # Statistic and horizontal reference at zero
    ax.plot(dates[skip:], self.cusum, label='CUSUM')
    ax.hlines(0, dates[skip], dates[-1], color='k', alpha=0.3)

    # Upper bound carries the legend entry; the lower bound is unlabeled
    bounds = self._cusum_significance_bounds(alpha)
    lower_line, upper_line = bounds
    x_ends = [dates[skip], dates[-1]]
    ax.plot(x_ends, upper_line, 'k--',
            label='%d%% significance' % (alpha * 100))
    ax.plot(x_ends, lower_line, 'k--')

    ax.legend(loc=legend_loc)

    return fig
def plot_partial(self, smooth_index, plot_se=True, cpr=False,
                 include_constant=True, ax=None):
    """plot the contribution of a smooth term to the linear prediction

    Parameters
    ----------
    smooth_index : int
        index of the smooth term within list of smooth terms
    plot_se : bool
        If plot_se is true, then the confidence interval for the linear
        prediction will be added to the plot (drawn as the estimate
        plus/minus 1.96 standard errors).
    cpr : bool
        If cpr (component plus residual) is true, then a scatter plot of
        the partial working residuals will be added to the plot.
    include_constant : bool
        If true, then the estimated intercept is added to the prediction
        and its standard errors. This avoids that the confidence interval
        has zero width at the imposed identification constraint, e.g.
        either at a reference point or at the mean.
    ax : None or matplotlib axis instance
        If ax is not None, then the plot will be added to it.

    Returns
    -------
    fig : matplotlib Figure instance
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_ax
    _import_mpl()

    variable = smooth_index
    y_est, se = self.partial_values(variable,
                                    include_constant=include_constant)
    smoother = self.model.smoother
    x = smoother.smoothers[variable].x

    # All series plotted against x share this ordering.
    sort_index = np.argsort(x)
    x = x[sort_index]
    y_est = y_est[sort_index]
    se = se[sort_index]

    fig, ax = create_mpl_ax(ax)
    ax.plot(x, y_est, c='blue', lw=2)
    if plot_se:
        ax.plot(x, y_est + 1.96 * se, '-', c='blue')
        ax.plot(x, y_est - 1.96 * se, '-', c='blue')
    if cpr:
        # TODO: resid_response doesn't make sense with nonlinear link
        # use resid_working ?
        # Reorder the residuals like x and y_est; adding them in
        # observation order would attach them to the wrong points.
        residual = np.asarray(self.resid_working)[sort_index]
        cpr_ = y_est + residual
        ax.plot(x, cpr_, '.', lw=2)

    ax.set_xlabel(smoother.smoothers[variable].variable_name)

    return fig
def plot(self):
    """
    Plot the observed, trend, seasonal and residual series.

    The four series are drawn in four stacked panels that share the
    x-axis. Objects exposing a ``plot`` method draw themselves; anything
    else is passed to ``Axes.plot``.

    Returns
    -------
    Figure
        The figure holding the four panels.
    """
    from statsmodels.graphics.utils import _import_mpl
    plt = _import_mpl()
    fig, axes = plt.subplots(4, 1, sharex=True)

    # (attribute name, y-axis label) for each panel, top to bottom
    panels = (
        ('observed', 'Observed'),
        ('trend', 'Trend'),
        ('seasonal', 'Seasonal'),
        ('resid', 'Residual'),
    )

    if hasattr(self.observed, 'plot'):  # got pandas use it
        for ax, (attr, ylabel) in zip(axes, panels):
            getattr(self, attr).plot(ax=ax, legend=False)
            ax.set_ylabel(ylabel)
    else:
        for ax, (attr, ylabel) in zip(axes, panels):
            ax.plot(getattr(self, attr))
            ax.set_ylabel(ylabel)
        # Plain arrays have no index, so label and bound the axis here
        bottom = axes[3]
        bottom.set_xlabel('Time')
        bottom.set_xlim(0, self.nobs)

    fig.tight_layout()
    return fig
def plot(self):
    """
    Draw the decomposition as four stacked panels with a shared x-axis.

    The panels show, from top to bottom, the observed, trend, seasonal
    and residual series. If the observed series has a ``plot`` method,
    each series plots itself; otherwise ``Axes.plot`` is used.

    Returns
    -------
    Figure
        The created figure.
    """
    from statsmodels.graphics.utils import _import_mpl
    plt = _import_mpl()
    fig, axes = plt.subplots(4, 1, sharex=True)

    names = ('observed', 'trend', 'seasonal', 'resid')
    ylabels = ('Observed', 'Trend', 'Seasonal', 'Residual')
    self_plotting = hasattr(self.observed, 'plot')  # got pandas use it

    for ax, name, ylabel in zip(axes, names, ylabels):
        component = getattr(self, name)
        if self_plotting:
            component.plot(ax=ax, legend=False)
        else:
            ax.plot(component)
        ax.set_ylabel(ylabel)

    if not self_plotting:
        # Only the array path sets an explicit time axis
        axes[3].set_xlabel('Time')
        axes[3].set_xlim(0, self.nobs)

    fig.tight_layout()
    return fig
def test_baseline(self, close_figures):
    """Without an ``ax`` argument, ``_do_plot`` returns a new Figure/Axes."""
    plt = _import_mpl()
    result = gofplots._do_plot(self.x, self.y)
    new_fig, new_ax = result

    # Correct matplotlib types come back ...
    assert isinstance(new_fig, plt.Figure)
    assert isinstance(new_ax, plt.Axes)
    # ... and they are not the ones held by the test instance.
    assert self.fig is not new_fig
    assert self.ax is not new_ax
def test_with_ax(self, close_figures):
    """When ``ax`` is supplied, ``_do_plot`` reuses it and its figure."""
    plt = _import_mpl()
    result = gofplots._do_plot(self.x, self.y, ax=self.ax)
    used_fig, used_ax = result

    # Correct matplotlib types come back ...
    assert isinstance(used_fig, plt.Figure)
    assert isinstance(used_ax, plt.Axes)
    # ... and they are exactly the objects held by the test instance.
    assert self.fig is used_fig
    assert self.ax is used_ax
def plot(self, observed=True, seasonal=True, trend=True, resid=True,
         weights=False):
    """
    Plot estimated components

    Parameters
    ----------
    observed : bool
        Include the observed series in the plot
    seasonal : bool
        Include the seasonal component in the plot
    trend : bool
        Include the trend component in the plot
    resid : bool
        Include the residual in the plot
    weights : bool
        Include the weights in the plot (if any)

    Returns
    -------
    matplotlib.figure.Figure
        The figure instance containing the plot.

    Raises
    ------
    ValueError
        If no component is selected for plotting.
    """
    from pandas.plotting import register_matplotlib_converters

    from statsmodels.graphics.utils import _import_mpl
    plt = _import_mpl()
    register_matplotlib_converters()

    # (values, default label) for every requested panel, top to bottom
    components = []
    if observed:
        components.append((self._observed, 'Observed'))
    if trend:
        components.append((self.trend, 'trend'))
    if seasonal:
        components.append((self.seasonal, 'seasonal'))
    if resid:
        components.append((self.resid, 'residual'))
    if weights:
        components.append((self.weights, 'weights'))
    if not components:
        raise ValueError('At least one component must be selected.')

    if isinstance(self._observed, (pd.DataFrame, pd.Series)):
        nobs = self._observed.shape[0]
        xlim = self._observed.index[0], self._observed.index[nobs - 1]
    else:
        xlim = (0, self._observed.shape[0] - 1)

    # squeeze=False always yields a 2-d array of axes; the default would
    # hand back a bare Axes for a single panel, which cannot be iterated.
    fig, axs = plt.subplots(len(components), 1, squeeze=False)
    for i, (ax, (values, def_name)) in enumerate(zip(axs[:, 0],
                                                     components)):
        if def_name != 'residual':
            ax.plot(values)
        else:
            # Residuals are drawn as markers around a zero line
            ax.plot(values, marker='o', linestyle='none')
            ax.plot(xlim, (0, 0), color='#000000', zorder=-3)

        # Fall back to the default label when the series is unnamed
        name = getattr(values, 'name', None)
        if name is None:
            name = def_name
        if def_name != 'Observed':
            name = str(name).capitalize()

        # The observed panel is titled; every other panel gets a y-label
        title = ax.set_title if i == 0 and observed else ax.set_ylabel
        title(name)
        ax.set_xlim(xlim)

    fig.tight_layout()
    return fig
def plot_scree(self, ncomp=None):
    """
    Scree plot of the ordered eigenvalues and the variance explained

    Parameters
    ----------
    ncomp : int, optional
        Number of loadings to include in the plot. If None, the number
        of maximum possible loadings is used.

    Returns
    -------
    Figure
        Handle to the figure.
    """
    _import_mpl()
    from .plots import plot_scree

    eigenvalues = self.eigenvals
    total = self.n_comp
    return plot_scree(eigenvalues, total, ncomp)
def plot(self):
    """
    Plot the observed, seasonally adjusted, trend and irregular series.

    Each series draws itself through its own ``plot`` method into one of
    four stacked panels sharing the x-axis.

    Returns
    -------
    Figure
        The figure holding the four panels.
    """
    from statsmodels.graphics.utils import _import_mpl
    plt = _import_mpl()
    fig, axes = plt.subplots(4, 1, sharex=True)

    # (attribute name, y-axis label) for each panel, top to bottom
    panels = (
        ('observed', 'Observed'),
        ('seasadj', 'Seas. Adjusted'),
        ('trend', 'Trend'),
        ('irregular', 'Irregular'),
    )
    for ax, (attr, ylabel) in zip(axes, panels):
        getattr(self, attr).plot(ax=ax, legend=False)
        ax.set_ylabel(ylabel)

    fig.tight_layout()
    return fig
def plot(self):
    """
    Draw four stacked panels: observed, adjusted, trend and irregular.

    The panels share the x-axis and every series is drawn via its own
    ``plot`` method.

    Returns
    -------
    Figure
        The created figure.
    """
    from statsmodels.graphics.utils import _import_mpl
    plt = _import_mpl()
    fig, axes = plt.subplots(4, 1, sharex=True)

    attrs = ('observed', 'seasadj', 'trend', 'irregular')
    ylabels = ('Observed', 'Seas. Adjusted', 'Trend', 'Irregular')
    for row, attr in enumerate(attrs):
        panel = axes[row]
        getattr(self, attr).plot(ax=panel, legend=False)
        panel.set_ylabel(ylabels[row])

    fig.tight_layout()
    return fig
def plot_path(self):
    """
    Plot the cross-validation error against the penalty values.

    The error is drawn in black and the error plus/minus 1.96 times
    ``cv_std`` in blue, first as lines and then again as circle markers.
    Drawing goes through the pyplot interface; nothing is returned.
    """
    from statsmodels.graphics.utils import _import_mpl
    plt = _import_mpl()

    # First pass without a format string (lines), second pass with 'o'
    for fmt in ((), ('o',)):
        plt.plot(self.alphas, self.cv_error, *fmt, c='black')
        plt.plot(self.alphas, self.cv_error + 1.96 * self.cv_std, *fmt,
                 c='blue')
        plt.plot(self.alphas, self.cv_error - 1.96 * self.cv_std, *fmt,
                 c='blue')
def plot_cusum_squares(self, alpha=0.05, legend_loc='upper left',
                       fig=None, figsize=None):
    r"""
    Draw the CUSUM of squares statistic with its significance bounds.

    Parameters
    ----------
    alpha : float, optional
        The plotted significance bounds are alpha %.
    legend_loc : string, optional
        The location of the legend in the plot. Default is upper left.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Notes
    -----
    Evidence of parameter instability may be found if the CUSUM of squares
    statistic moves out of the significance bounds.

    Critical values used in creating the significance bounds are computed
    using the approximate formula of [2]_.

    References
    ----------
    .. [1] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
       "Techniques for Testing the Constancy of
       Regression Relationships over Time."
       Journal of the Royal Statistical Society.
       Series B (Methodological) 37 (2): 149-92.
    .. [2] Edgerton, David, and Curt Wells. 1994.
       "Critical Values for the Cusumsq Statistic
       in Medium and Large Sized Samples."
       Oxford Bulletin of Economics and Statistics 56 (3): 355-65.
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    _import_mpl()
    fig = create_mpl_fig(fig, figsize)
    ax = fig.add_subplot(1, 1, 1)

    # x-axis values: the date index when one exists, else 0..nobs-1
    date_index = getattr(self.data, 'dates', None)
    if date_index is None:
        dates = np.arange(self.nobs)
    else:
        dates = date_index._mpl_repr()

    burn = self.loglikelihood_burn
    plotted_dates = dates[burn:]

    # Statistic plus a reference line rising linearly from 0 after burn-in
    ax.plot(plotted_dates, self.cusum_squares, label='CUSUM of squares')
    steps = np.arange(burn, self.nobs) - burn
    ref_line = steps / (self.nobs - burn)
    ax.plot(plotted_dates, ref_line, 'k', alpha=0.3)

    # Significance bounds, drawn between the two endpoints
    lower_line, upper_line = self._cusum_squares_significance_bounds(alpha)
    endpoints = [dates[burn], dates[-1]]
    bound_label = '%d%% significance' % (alpha * 100)
    ax.plot(endpoints, upper_line, 'k--', label=bound_label)
    ax.plot(endpoints, lower_line, 'k--')

    ax.legend(loc=legend_loc)

    return fig
def plot_recursive_coefficient(
    self,
    variables=None,
    alpha=0.05,
    legend_loc="upper left",
    fig=None,
    figsize=None,
):
    r"""
    Plot the recursively estimated coefficients on a given variable

    Parameters
    ----------
    variables : {int, str, Iterable[int], Iterable[str], None}, optional
        Integer index or string name of the variables whose coefficients
        to plot. Can also be an iterable of integers or strings. Default
        plots all coefficients.
    alpha : float, optional
        The confidence intervals for the coefficient are (1 - alpha)%. Set
        to None to exclude confidence intervals.
    legend_loc : str, optional
        The location of the legend in the plot. Default is upper left.
    fig : Figure, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Returns
    -------
    Figure
        The matplotlib Figure object.

    Raises
    ------
    ValueError
        If an entry of `variables` is neither a known parameter name nor
        an integer.
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig

    if alpha is not None:
        ci = self._conf_int(alpha, None)

    row_labels = self.model.data.row_labels
    if row_labels is None:
        row_labels = np.arange(self._params.shape[0])
    k_variables = self._params.shape[1]
    param_names = self.model.data.param_names

    if variables is None:
        variable_idx = list(range(k_variables))
    else:
        if isinstance(variables, (int, str)):
            variables = [variables]
        variable_idx = []
        # Plain iteration accepts any iterable, as documented, not only
        # sequences that support len() and indexing.
        for variable in variables:
            # Names take precedence over positional interpretation
            if variable in param_names:
                variable_idx.append(param_names.index(variable))
            elif isinstance(variable, int):
                variable_idx.append(variable)
            else:
                msg = ("variable {0} is not an integer and was not found "
                       "in the list of variable "
                       "names: {1}".format(variable,
                                           ", ".join(param_names)))
                raise ValueError(msg)

    _import_mpl()
    fig = create_mpl_fig(fig, figsize)

    import pandas as pd

    # Period labels are converted to timestamps before plotting
    if isinstance(row_labels, pd.PeriodIndex):
        row_labels = row_labels.to_timestamp()
    row_labels = np.asarray(row_labels)

    n_panels = len(variable_idx)
    for loc, i in enumerate(variable_idx):
        ax = fig.add_subplot(n_panels, 1, loc + 1)
        params = self._params[:, i]
        # Rows without an estimate (NaN) are left out of the plot
        valid = ~np.isnan(params)
        row_lbl = row_labels[valid]
        ax.plot(row_lbl, params[valid])
        if alpha is not None:
            this_ci = np.reshape(ci[:, :, i], (-1, 2))
            if not np.all(np.isnan(this_ci)):
                ax.plot(row_lbl, this_ci[:, 0][valid], "k:",
                        label="Lower CI")
                ax.plot(row_lbl, this_ci[:, 1][valid], "k:",
                        label="Upper CI")
                # Only the first panel carries a legend
                if loc == 0:
                    ax.legend(loc=legend_loc)
        ax.set_xlim(row_lbl[0], row_lbl[-1])
        ax.set_title(param_names[i])

    fig.tight_layout()
    return fig
def plot(
    self,
    observed=True,
    seasonal=True,
    trend=True,
    resid=True,
    weights=False,
):
    """
    Plot estimated components

    Parameters
    ----------
    observed : bool
        Include the observed series in the plot
    seasonal : bool
        Include the seasonal component in the plot. When the seasonal
        component has more than one dimension, each column gets its own
        panel.
    trend : bool
        Include the trend component in the plot
    resid : bool
        Include the residual in the plot
    weights : bool
        Include the weights in the plot (if any)

    Returns
    -------
    matplotlib.figure.Figure
        The figure instance containing the plot.

    Raises
    ------
    ValueError
        If no component is selected for plotting.
    """
    from pandas.plotting import register_matplotlib_converters

    from statsmodels.graphics.utils import _import_mpl

    plt = _import_mpl()
    register_matplotlib_converters()

    # (values, default label) for every requested panel, top to bottom
    components = []
    if observed:
        components.append((self._observed, "Observed"))
    if trend:
        components.append((self.trend, "trend"))
    if seasonal:
        if self.seasonal.ndim == 1:
            components.append((self.seasonal, "seasonal"))
        elif self.seasonal.ndim > 1:
            if isinstance(self.seasonal, pd.DataFrame):
                for col in self.seasonal.columns:
                    components.append((self.seasonal[col], "seasonal"))
            else:
                for j in range(self.seasonal.shape[1]):
                    components.append((self.seasonal[:, j], "seasonal"))
    if resid:
        components.append((self.resid, "residual"))
    if weights:
        components.append((self.weights, "weights"))
    if not components:
        raise ValueError("At least one component must be selected.")

    if isinstance(self._observed, (pd.DataFrame, pd.Series)):
        nobs = self._observed.shape[0]
        xlim = self._observed.index[0], self._observed.index[nobs - 1]
    else:
        xlim = (0, self._observed.shape[0] - 1)

    # squeeze=False always yields a 2-d array of axes; the default would
    # hand back a bare Axes for a single panel, which cannot be iterated.
    fig, axs = plt.subplots(len(components), 1, squeeze=False)
    for i, (ax, (values, def_name)) in enumerate(zip(axs[:, 0],
                                                     components)):
        if def_name != "residual":
            ax.plot(values)
        else:
            # Residuals are drawn as markers around a zero line
            ax.plot(values, marker="o", linestyle="none")
            ax.plot(xlim, (0, 0), color="#000000", zorder=-3)

        # Fall back to the default label when the series is unnamed
        name = getattr(values, "name", None)
        if name is None:
            name = def_name
        if def_name != "Observed":
            name = str(name).capitalize()

        # The observed panel is titled; every other panel gets a y-label
        title = ax.set_title if i == 0 and observed else ax.set_ylabel
        title(name)
        ax.set_xlim(xlim)

    fig.tight_layout()
    return fig
def plot_recursive_coefficient(self, variables=0, alpha=0.05,
                               legend_loc='upper left', fig=None,
                               figsize=None):
    r"""
    Plot the recursively estimated coefficients on a given variable

    Parameters
    ----------
    variables : int or str or iterable of int or string, optional
        Integer index or string name of the variable whose coefficient will
        be plotted. Can also be an iterable of integers or strings. Default
        is the first variable. The argument is never modified.
    alpha : float, optional
        The confidence intervals for the coefficient are (1 - alpha) %.
        Set to None to omit the confidence intervals.
    legend_loc : string, optional
        The location of the legend in the plot. Default is upper left.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Returns
    -------
    Figure
        The figure with one subplot per requested variable.

    Notes
    -----
    All plots contain (1 - `alpha`) % confidence intervals.
    """
    # Get variables
    if isinstance(variables, (int, str)):
        variables = [variables]

    # Resolve names to positions in a fresh list: writing the indices
    # back in place would alter the caller's list and fail for tuples.
    exog_names = self.model.exog_names
    variables = [
        exog_names.index(variable) if isinstance(variable, str) else variable
        for variable in variables
    ]
    k_variables = len(variables)

    # Create the plot
    from scipy.stats import norm

    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    plt = _import_mpl()
    fig = create_mpl_fig(fig, figsize)

    # These do not depend on the variable, so compute them once.
    # Get dates, if applicable
    if hasattr(self.data, 'dates') and self.data.dates is not None:
        dates = self.data.dates._mpl_repr()
    else:
        dates = np.arange(self.nobs)
    d = max(self.nobs_diffuse, self.loglikelihood_burn)
    coef = self.recursive_coefficients
    # Critical value for two-sided normal confidence intervals
    if alpha is not None:
        critical_value = norm.ppf(1 - alpha / 2.)

    for i, variable in enumerate(variables):
        ax = fig.add_subplot(k_variables, 1, i + 1)

        # Plot the coefficient
        ax.plot(dates[d:], coef.filtered[variable, d:],
                label='Recursive estimates: %s' % exog_names[variable])

        # Legend
        handles, labels = ax.get_legend_handles_labels()

        if alpha is not None:
            # Plot confidence intervals
            std_errors = np.sqrt(coef.filtered_cov[variable, variable, :])
            ci_lower = (
                coef.filtered[variable] - critical_value * std_errors)
            ci_upper = (
                coef.filtered[variable] + critical_value * std_errors)
            ci_poly = ax.fill_between(
                dates[d:], ci_lower[d:], ci_upper[d:], alpha=0.2
            )
            ci_label = ('$%.3g \\%%$ confidence interval'
                        % ((1 - alpha)*100))

            # Only add CI to legend for the first plot
            if i == 0:
                # Proxy artist for fill_between legend entry
                # See https://matplotlib.org/1.3.1/users/legend_guide.html
                p = plt.Rectangle((0, 0), 1, 1,
                                  fc=ci_poly.get_facecolor()[0])

                handles.append(p)
                labels.append(ci_label)

        ax.legend(handles, labels, loc=legend_loc)

        # Remove xticks for all but the last plot
        if i < k_variables - 1:
            ax.xaxis.set_ticklabels([])

    fig.tight_layout()

    return fig
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None, use_brute=False,
               seed=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from. If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined. So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use. If None, returns as many as the
        smaller of the number of rows or columns in data.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The list is
        not modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    use_brute : bool
        Use the brute force optimizer instead of the default differential
        evolution to find the curves. Default is False.
    seed : {None, int, np.random.RandomState}
        Seed value to pass to scipy.optimize.differential_evolution. Can be an
        integer or RandomState instance. If None, then the default RandomState
        provided by np.random is used.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf]
            curves.
         - 'extra_quantiles', list of array. Extra quantile band.
            [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outliers in `data`.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels
        if hasattr(data, 'index'):
            labels = data.index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # A new list is built so that the caller's `alpha` is left untouched.
    if alpha is None:
        alpha = [threshold, 0.9, 0.5]
    else:
        alpha = list(set(alpha) | {threshold, 0.9, 0.5})
    alpha.sort(reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is the default of np.percentile; the deprecated
    # `interpolation` keyword is therefore not passed.
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find median, outliers curves
    if have_de_optim and not use_brute:
        median = differential_evolution(lambda x: -ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5,
                                        seed=seed).x
    else:
        median = brute(lambda x: -ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band, use_brute=use_brute, seed=seed):
        """
        Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``
        use_brute : bool
            Use the brute force optimizer instead of the default differential
            evolution to find the curves. Default is False.
        seed : {None, int, np.random.RandomState}
            Seed value to pass to scipy.optimize.differential_evolution. Can
            be an integer or RandomState instance. If None, then the default
            RandomState provided by np.random is used.

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        # A single-valued band has no upper PDF limit
        if len(band) > 1:
            max_pdf = pvalues[alpha.index(band[1])]
        else:
            max_pdf = 1E6
        band = [min_pdf, max_pdf]

        jobs = zip(
            range(dim),
            itertools.repeat(
                (band, pca, bounds, ks_gaussian, seed, use_brute)))
        # The context manager terminates the pool even if `map` raises
        with Pool() as pool:
            band_quantiles = pool.map(_min_max_band, jobs)

        band_quantiles = list(zip(*band_quantiles))

        return band_quantiles

    # Quantiles requested on top of the always-computed ones
    extra_alpha = [
        level for level in alpha if level not in (0.5, 0.9, threshold)
    ]
    extra_quantiles = []
    for level in extra_alpha:
        extra_quantiles.extend(
            _band_quantiles([level], use_brute=use_brute, seed=seed))

    # Inverse transform from n-variate plot to the dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5], use_brute=use_brute, seed=seed)
    hdr_50 = _band_quantiles([0.5], use_brute=use_brute, seed=seed)

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = []
    fill_betweens.append(
        ax.fill_between(xdata, *hdr_50, color='gray', alpha=.4,
                        label='50% HDR'))
    fill_betweens.append(
        ax.fill_between(xdata, *hdr_90, color='gray', alpha=.3,
                        label='90% HDR'))

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    # Each outlier curve is labeled with its own identifier
    for outlier, outlier_label in zip(outliers, labels_outlier):
        ax.plot(xdata, outlier, ls='--', alpha=0.7,
                label=str(outlier_label))

    handles, labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See https://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        p = plt.Rectangle((0, 0), 1, 1,
                          fc=fill_between.get_facecolor()[0])
        handles.append(p)
        labels.append(label)

    # Keyed by label so that repeated labels collapse to one legend entry
    by_label = OrderedDict(zip(labels, handles))
    if len(outliers) != 0:
        # With outliers present the legend lists only the remaining entries
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
def plot_predict(
    result,
    start=None,
    end=None,
    dynamic=False,
    alpha=0.05,
    ax=None,
    **predict_kwargs,
):
    """
    Plot the predictions of ``result.get_prediction`` and their interval.

    Parameters
    ----------
    result : Result
        Any model result supporting ``get_prediction``.
    start : int, str, or datetime, optional
        Zero-indexed observation number at which to start forecasting,
        i.e., the first forecast is start. Can also be a date string to
        parse or a datetime type. Default is the zeroth observation.
    end : int, str, or datetime, optional
        Zero-indexed observation number at which to end forecasting, i.e.,
        the last forecast is end. Can also be a date string to
        parse or a datetime type. However, if the dates index does not
        have a fixed frequency, end must be an integer index if you
        want out of sample prediction. Default is the last observation in
        the sample.
    dynamic : bool, int, str, or datetime, optional
        Integer offset relative to `start` at which to begin dynamic
        prediction. Can also be an absolute date string to parse or a
        datetime type (these are not interpreted as offsets).
        Prior to this observation, true endogenous values will be used for
        prediction; starting with this observation and continuing through
        the end of prediction, forecasted endogenous values will be used
        instead.
    alpha : {float, None}
        The tail probability not covered by the confidence interval. Must
        be in (0, 1). Confidence interval is constructed assuming normally
        distributed shocks. If None, figure will not show the confidence
        interval.
    ax : AxesSubplot
        matplotlib Axes instance to use
    **predict_kwargs
        Any additional keyword arguments to pass to
        ``result.get_prediction``.

    Returns
    -------
    Figure
        matplotlib Figure containing the prediction plot
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_ax

    # Called only to make sure matplotlib is importable.
    _import_mpl()
    fig, ax = create_mpl_ax(ax)
    from statsmodels.tsa.base.prediction import PredictionResults

    # use predict so you set dates
    pred: PredictionResults = result.get_prediction(
        start=start, end=end, dynamic=dynamic, **predict_kwargs
    )
    mean = pred.predicted_mean
    if isinstance(mean, (pd.Series, pd.DataFrame)):
        x = mean.index
        mean.plot(ax=ax, label="forecast")
    else:
        x = np.arange(mean.shape[0])
        # Label the line exactly like the pandas branch so the forecast
        # always shows up in the legend (and the legend is never empty
        # when ``alpha`` is None).
        ax.plot(x, mean, label="forecast")

    if alpha is not None:
        label = f"{1 - alpha:.0%} confidence interval"
        ci = pred.conf_int(alpha)
        # Column 0 is used as the lower bound, column 1 as the upper bound.
        conf_int = np.asarray(ci)

        ax.fill_between(
            x,
            conf_int[:, 0],
            conf_int[:, 1],
            color="gray",
            alpha=0.5,
            label=label,
        )

    ax.legend(loc="best")

    return fig
def hdrboxplot(data, ncomp=2, alpha=None, threshold=0.95, bw=None,
               xdata=None, labels=None, ax=None):
    """
    High Density Region boxplot

    Parameters
    ----------
    data : sequence of ndarrays or 2-D ndarray
        The vectors of functions to create a functional boxplot from.  If a
        sequence of 1-D arrays, these should all be the same size.
        The first axis is the function index, the second axis the one along
        which the function is defined.  So ``data[0, :]`` is the first
        functional curve.
    ncomp : int, optional
        Number of components to use.  If None, as many as the smaller of
        the number of rows or columns in data are used.
    alpha : list of floats between 0 and 1, optional
        Extra quantile values to compute. Default is None. The sequence
        passed in is not modified.
    threshold : float between 0 and 1, optional
        Percentile threshold value for outliers detection. High value means
        a lower sensitivity to outliers. Default is `0.95`.
    bw : array_like or str, optional
        If an array, it is a fixed user-specified bandwidth. If `None`, set to
        `normal_reference`. If a string, should be one of:

            - normal_reference: normal reference rule of thumb (default)
            - cv_ml: cross validation maximum likelihood
            - cv_ls: cross validation least squares

    xdata : ndarray, optional
        The independent variable for the data. If not given, it is assumed to
        be an array of integers 0..N-1 with N the length of the vectors in
        `data`.
    labels : sequence of scalar or str, optional
        The labels or identifiers of the curves in `data`. If not given,
        outliers are labeled in the plot with array indices.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    hdr_res : HdrResults instance
        An `HdrResults` instance with the following attributes:

         - 'median', array. Median curve.
         - 'hdr_50', array. 50% quantile band. [sup, inf] curves
         - 'hdr_90', list of array. 90% quantile band. [sup, inf] curves.
         - 'extra_quantiles', list of array. Extra quantile band.
           [sup, inf] curves.
         - 'outliers', ndarray. Outlier curves.
         - 'outliers_idx', ndarray. Row indices of the outlier curves in
           `data`.

    Notes
    -----
    The median curve is the curve with the highest probability on the reduced
    space of a Principal Component Analysis (PCA).

    Outliers are defined as curves that fall outside the band corresponding
    to the quantile given by `threshold`.

    The non-outlying region is defined as the band made up of all the
    non-outlying curves.

    Behind the scene, the dataset is represented as a matrix. Each line
    corresponding to a 1D curve. This matrix is then decomposed using Principal
    Components Analysis (PCA). This allows to represent the data using a finite
    number of modes, or components. This compression process allows to turn the
    functional representation into a scalar representation of the matrix. In
    other words, you can visualize each curve from its components. Each curve
    is thus a point in this reduced space. With 2 components, this is called a
    bivariate plot (2D plot).

    In this plot, if some points are adjacent (similar components), it means
    that back in the original space, the curves are similar. Then, finding the
    median curve means finding the higher density region (HDR) in the reduced
    space. Moreover, the more you get away from this HDR, the more the curve is
    unlikely to be similar to the other curves.

    Using a kernel smoothing technique, the probability density function (PDF)
    of the multivariate space can be recovered. From this PDF, it is possible
    to compute the density probability linked to the cluster of points and plot
    its contours.

    Finally, using these contours, the different quantiles can be extracted
    along with the median curve and the outliers.

    Steps to produce the HDR boxplot include:

    1. Compute a multivariate kernel density estimation
    2. Compute contour lines for quantiles 90%, 50% and `alpha` %
    3. Plot the bivariate plot
    4. Compute median curve along with quantiles and outliers curves.

    References
    ----------
    [1] R.J. Hyndman and H.L. Shang, "Rainbow Plots, Bagplots, and Boxplots for
        Functional Data", vol. 19, pp. 29-45, 2010.

    Examples
    --------
    Load the El Nino dataset.  Consists of 60 years worth of Pacific Ocean sea
    surface temperature data.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.elnino.load(as_pandas=False)

    Create a functional boxplot.  We see that the years 1982-83 and 1997-98 are
    outliers; these are the years where El Nino (a climate pattern
    characterized by warming up of the sea surface and higher air pressures)
    occurred with unusual intensity.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> res = sm.graphics.hdrboxplot(data.raw_data[:, 1:],
    ...                              labels=data.raw_data[:, 0].astype(int),
    ...                              ax=ax)

    >>> ax.set_xlabel("Month of the year")
    >>> ax.set_ylabel("Sea surface temperature (C)")
    >>> ax.set_xticks(np.arange(13, step=3) - 1)
    >>> ax.set_xticklabels(["", "Mar", "Jun", "Sep", "Dec"])
    >>> ax.set_xlim([-0.2, 11.2])

    >>> plt.show()

    .. plot:: plots/graphics_functional_hdrboxplot.py

    See Also
    --------
    banddepth, rainbowplot, fboxplot
    """
    fig, ax = utils.create_mpl_ax(ax)

    if labels is None:
        # For use with pandas, get the labels.  A plain list also has an
        # ``index`` attribute (the bound method ``list.index``), which is
        # not a usable set of labels, so callables are rejected here.
        index = getattr(data, 'index', None)
        if index is not None and not callable(index):
            labels = index
        else:
            labels = np.arange(len(data))

    data = np.asarray(data)
    if xdata is None:
        xdata = np.arange(data.shape[1])

    n_samples, dim = data.shape
    # PCA and bivariate plot
    pca = PCA(data, ncomp=ncomp)
    data_r = pca.factors

    # Create gaussian kernel
    ks_gaussian = KDEMultivariate(data_r, bw=bw,
                                  var_type='c' * data_r.shape[1])

    # Boundaries of the n-variate space
    bounds = np.array([data_r.min(axis=0), data_r.max(axis=0)]).T

    # Compute contour line of pvalue linked to a given probability level.
    # A new list is built so the caller's `alpha` is never mutated.
    requested = [] if alpha is None else list(alpha)
    alpha = sorted(set(requested + [threshold, 0.9, 0.5]), reverse=True)

    pdf_r = ks_gaussian.pdf(data_r).flatten()
    # Linear interpolation is np.percentile's default; the deprecated
    # ``interpolation`` keyword is deliberately not passed.
    pvalues = [np.percentile(pdf_r, (1 - level) * 100) for level in alpha]

    # Find mean, outliers curves
    if have_de_optim:
        median = differential_evolution(lambda x: - ks_gaussian.pdf(x),
                                        bounds=bounds, maxiter=5).x
    else:
        median = brute(lambda x: - ks_gaussian.pdf(x),
                       ranges=bounds, finish=fmin)

    outliers_idx = np.where(pdf_r < pvalues[alpha.index(threshold)])[0]
    labels_outlier = [labels[i] for i in outliers_idx]
    outliers = data[outliers_idx]

    # Find HDR given some quantiles

    def _band_quantiles(band):
        """Find extreme curves for a quantile band.

        From the `band` of quantiles, the associated PDF extrema values
        are computed. If `min_alpha` is not provided (single quantile value),
        `max_pdf` is set to `1E6` in order not to constrain the problem on high
        values.

        An optimization is performed per component in order to find the min and
        max curves. This is done by comparing the PDF value of a given curve
        with the band PDF.

        Parameters
        ----------
        band : array_like
            alpha values ``(max_alpha, min_alpha)`` ex: ``[0.9, 0.5]``

        Returns
        -------
        band_quantiles : list of 1-D array
            ``(max_quantile, min_quantile)`` (2, n_features)
        """
        min_pdf = pvalues[alpha.index(band[0])]
        try:
            max_pdf = pvalues[alpha.index(band[1])]
        except IndexError:
            max_pdf = 1E6
        pdf_band = [min_pdf, max_pdf]

        tasks = zip(range(dim),
                    itertools.repeat((pdf_band, pca, bounds, ks_gaussian)))
        # The context manager terminates the workers on exit, including
        # when ``pool.map`` raises.
        with Pool() as pool:
            per_component = pool.map(_min_max_band, tasks)

        return list(zip(*per_component))

    extra_alpha = [level for level in alpha
                   if level != 0.5 and level != 0.9 and level != threshold]
    # Empty `extra_alpha` naturally yields an empty list.
    extra_quantiles = [curve
                       for level in extra_alpha
                       for curve in _band_quantiles([level])]

    # Inverse transform from n-variate plot to dataset dataset's shape
    median = _inverse_transform(pca, median)[0]
    hdr_90 = _band_quantiles([0.9, 0.5])
    hdr_50 = _band_quantiles([0.5])

    hdr_res = HdrResults({
        "median": median,
        "hdr_50": hdr_50,
        "hdr_90": hdr_90,
        "extra_quantiles": extra_quantiles,
        "outliers": outliers,
        "outliers_idx": outliers_idx
    })

    # Plots
    ax.plot(np.array([xdata] * n_samples).T, data.T,
            c='c', alpha=.1, label=None)
    ax.plot(xdata, median, c='k', label='Median')
    fill_betweens = [
        ax.fill_between(xdata, *hdr_50, color='gray', alpha=.4,
                        label='50% HDR'),
        ax.fill_between(xdata, *hdr_90, color='gray', alpha=.3,
                        label='90% HDR'),
    ]

    if len(extra_quantiles) != 0:
        ax.plot(np.array([xdata] * len(extra_quantiles)).T,
                np.array(extra_quantiles).T,
                c='y', ls='-.', alpha=.4, label='Extra quantiles')

    # `labels_outlier` is always a list with one entry per outlier curve.
    for outlier_label, outlier in zip(labels_outlier, outliers):
        ax.plot(xdata, outlier, ls='--', alpha=0.7,
                label=str(outlier_label))

    handles, legend_labels = ax.get_legend_handles_labels()

    # Proxy artist for fill_between legend entry
    # See http://matplotlib.org/1.3.1/users/legend_guide.html
    plt = _import_mpl()
    for label, fill_between in zip(['50% HDR', '90% HDR'], fill_betweens):
        proxy = plt.Rectangle((0, 0), 1, 1,
                              fc=fill_between.get_facecolor()[0])
        handles.append(proxy)
        legend_labels.append(label)

    # De-duplicate legend entries by label.
    by_label = OrderedDict(zip(legend_labels, handles))
    if len(outliers) != 0:
        # With outliers present only they (and extra quantiles) are listed.
        by_label.pop('Median')
        by_label.pop('50% HDR')
        by_label.pop('90% HDR')

    ax.legend(by_label.values(), by_label.keys(), loc='best')

    return fig, hdr_res
def plot_predict(
    self,
    steps: int = 1,
    theta: float = 2,
    alpha: Optional[float] = 0.05,
    in_sample: bool = False,
    fig: Optional["matplotlib.figure.Figure"] = None,
    figsize: Optional[Tuple[float, float]] = None,
) -> "matplotlib.figure.Figure":
    r"""
    Plot forecasts, prediction intervals and in-sample values

    Parameters
    ----------
    steps : int, default 1
        The number of steps ahead to compute the forecast components.
    theta : float, default 2
        The theta value to use when computing the weight to combine
        the trend and the SES forecasts.
    alpha : {float, None}, default 0.05
        The tail probability not covered by the confidence interval. Must
        be in (0, 1). Confidence interval is constructed assuming normally
        distributed shocks. If None, figure will not show the confidence
        interval.
    in_sample : bool, default False
        Flag indicating whether to include the in-sample period in the
        plot.
    fig : Figure, default None
        An existing figure handle. If not provided, a new figure is
        created.
    figsize : tuple[float, float], default None
        Tuple containing the figure size.

    Returns
    -------
    Figure
        Figure handle containing the plot.

    Notes
    -----
    The variance of the h-step forecast is assumed to follow from the
    integrated Moving Average structure of the Theta model, and so is
    :math:`\sigma^2(\alpha^2 + (h-1))`. The prediction interval assumes
    that innovations are normally distributed.
    """
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig

    _import_mpl()
    fig = create_mpl_fig(fig, figsize)
    # Narrows the Optional type for static checkers.
    assert fig is not None
    predictions = self.forecast(steps, theta)
    pred_index = predictions.index

    ax = fig.add_subplot(111)
    if in_sample:
        endog = self.model.endog_orig
        if isinstance(endog, pd.Series):
            index = endog.index
        else:
            # Plain integer positions; ``pd.Index`` is used because the
            # dedicated numeric index classes no longer exist in pandas 2.
            index = pd.Index(np.arange(endog.shape[0]))
        ax.plot(index, endog)
    ax.plot(pred_index, predictions)
    if alpha is not None:
        pi = self.prediction_intervals(steps, theta, alpha)
        label = f"{1 - alpha:.0%} confidence interval"
        ax.fill_between(
            pred_index,
            pi["lower"],
            pi["upper"],
            color="gray",
            alpha=0.5,
            label=label,
        )

    ax.legend(loc="best", frameon=False)
    fig.tight_layout(pad=1.0)

    return fig
def plot_cusum_squares(self, alpha=0.05, legend_loc='upper left',
                       fig=None, figsize=None):
    r"""
    Plot the CUSUM of squares statistic and significance bounds.

    Parameters
    ----------
    alpha : float, optional
        The plotted significance bounds are alpha %.
    legend_loc : string, optional
        The location of the legend in the plot. Default is upper left.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Returns
    -------
    Figure
        The figure holding the plot (`fig` if it was given).

    Notes
    -----
    Evidence of parameter instability may be found if the CUSUM of squares
    statistic moves out of the significance bounds.

    Critical values used in creating the significance bounds are computed
    using the approximate formula of [2]_.

    References
    ----------
    .. [1] Brown, R. L., J. Durbin, and J. M. Evans. 1975.
       "Techniques for Testing the Constancy of
       Regression Relationships over Time."
       Journal of the Royal Statistical Society.
       Series B (Methodological) 37 (2): 149-92.
    .. [2] Edgerton, David, and Curt Wells. 1994.
       "Critical Values for the Cusumsq Statistic
       in Medium and Large Sized Samples."
       Oxford Bulletin of Economics and Statistics 56 (3): 355-65.
    """
    # Create the plot
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    _import_mpl()
    fig = create_mpl_fig(fig, figsize)
    ax = fig.add_subplot(1, 1, 1)

    # Get dates, if applicable
    if hasattr(self.data, 'dates') and self.data.dates is not None:
        dates = self.data.dates._mpl_repr()
    else:
        dates = np.arange(self.nobs)

    # The first `llb` observations are left out of the plot.
    llb = self.loglikelihood_burn

    # Plot cusum series and reference line
    ax.plot(dates[llb:], self.cusum_squares, label='CUSUM of squares')
    ref_line = (np.arange(llb, self.nobs) - llb) / (self.nobs - llb)
    ax.plot(dates[llb:], ref_line, 'k', alpha=0.3)

    # Plot significance bounds
    lower_line, upper_line = self._cusum_squares_significance_bounds(alpha)
    # '%.3g' rather than '%d': `alpha * 100` is a float, and integer
    # formatting truncates (e.g. 0.29 * 100 == 28.999... would read "28%").
    ax.plot([dates[llb], dates[-1]], upper_line, 'k--',
            label='%.3g%% significance' % (alpha * 100))
    ax.plot([dates[llb], dates[-1]], lower_line, 'k--')

    ax.legend(loc=legend_loc)

    return fig
def plot_diagnostics(residuals, variable=0, lags=40, fig=None,
                     figsize=(15, 7), savefig=False, path=None):
    """
    Draw a 2x2 grid of diagnostic plots for a series of residuals.

    The residuals are standardized (NaN-aware mean and standard deviation)
    and shown as: the series itself (top-left), a histogram with a Gaussian
    KDE and the N(0,1) density (top-right), a normal Q-Q plot (bottom-left)
    and the partial autocorrelation function (bottom-right).

    Parameters
    ----------
    residuals : array_like
        Residuals to analyse; converted to a float ndarray. NaN entries are
        ignored when standardizing and dropped for the histogram, KDE and
        Q-Q plot.
    variable : int, optional
        Not used by this function; kept so existing calls keep working.
    lags : int, optional
        Passed as `lags` to ``plot_pacf``. Default is 40.
    fig : Matplotlib Figure instance, optional
        Passed to ``create_mpl_fig``.
    figsize : tuple, optional
        Passed to ``create_mpl_fig``. Default is (15, 7).
    savefig : bool, optional
        If true, a 'Residual diagnostic' super-title is added and the
        figure is written to `path` at 500 dpi.
    path : str or path-like, optional
        Destination handed to ``fig.savefig``; required when `savefig` is
        true.

    Returns
    -------
    Figure
        The figure holding the four subplots.

    Raises
    ------
    ValueError
        If `savefig` is true but `path` is None.
    """
    if savefig and path is None:
        # Fail before any plotting work instead of deep inside savefig.
        raise ValueError("`path` must be provided when `savefig` is True")

    _import_mpl()
    fig = create_mpl_fig(fig, figsize)

    # Standardize residual
    # Source: https://alkaline-ml.com/pmdarima/1.1.1/_modules/pmdarima/arima/arima.html
    resid = np.asarray(residuals, dtype=float)
    resid = (resid - np.nanmean(resid)) / np.nanstd(resid)

    # Top-left: residuals vs time (plain observation numbers on the x-axis)
    ax = fig.add_subplot(221)
    x = np.arange(len(resid))
    ax.plot(x, resid)
    ax.hlines(0, x[0], x[-1], alpha=0.5)
    ax.set_xlim(x[0], x[-1])
    ax.set_title('Standardized residual')

    # Top-right: histogram, Gaussian kernel density, Normal density
    # Can only do histogram and Gaussian kernel density on the non-null
    # elements
    resid_nonmissing = resid[~np.isnan(resid)]
    ax = fig.add_subplot(222)

    # gh5792: Remove except after support for matplotlib>2.1 required
    try:
        ax.hist(resid_nonmissing, density=True, label='Hist')
    except AttributeError:
        ax.hist(resid_nonmissing, normed=True, label='Hist')

    from scipy.stats import gaussian_kde, norm
    kde = gaussian_kde(resid_nonmissing)
    xlim = (-1.96 * 2, 1.96 * 2)
    x = np.linspace(xlim[0], xlim[1])
    ax.plot(x, kde(x), label='KDE')
    ax.plot(x, norm.pdf(x), label='N(0,1)')
    ax.set_xlim(xlim)
    ax.legend()
    ax.set_title('Histogram plus estimated density')

    # Bottom-left: QQ plot
    ax = fig.add_subplot(223)
    from statsmodels.graphics.gofplots import qqplot
    qqplot(resid_nonmissing, line='s', ax=ax)
    ax.set_title('Normal Q-Q')

    # Bottom-right: partial autocorrelation
    ax = fig.add_subplot(224)
    from statsmodels.graphics.tsaplots import plot_pacf
    plot_pacf(resid, ax=ax, lags=lags)
    ax.set_title('Partial Autocorrelation function')
    ax.set_ylim(-0.1, 0.1)

    if savefig:
        fig.suptitle('Residual diagnostic', fontsize=20)
        fig.savefig(path, dpi=500)

    fig.show()
    return fig
def plot_recursive_coefficient(self, variables=0, alpha=0.05,
                               legend_loc='upper left', fig=None,
                               figsize=None):
    r"""
    Plot the recursively estimated coefficients on a given variable

    Parameters
    ----------
    variables : int or str or iterable of int or string, optional
        Integer index or string name of the variable whose coefficient will
        be plotted. Can also be an iterable of integers or strings. Default
        is the first variable. String names are looked up in the model's
        `exog_names`. The object passed in is not modified.
    alpha : float, optional
        The confidence intervals for the coefficient are (1 - alpha) %.
        If None, no confidence interval is drawn.
    legend_loc : string, optional
        The location of the legend in the plot. Default is upper left.
    fig : Matplotlib Figure instance, optional
        If given, subplots are created in this figure instead of in a new
        figure. Note that the grid will be created in the provided
        figure using `fig.add_subplot()`.
    figsize : tuple, optional
        If a figure is created, this argument allows specifying a size.
        The tuple is (width, height).

    Returns
    -------
    Figure
        The figure holding one subplot per requested variable.

    Notes
    -----
    All plots contain (1 - `alpha`) % confidence intervals.
    """
    exog_names = self.model.exog_names

    # Get variables as a fresh list of integer indices.  Building a new
    # list (instead of assigning into `variables`) keeps the caller's
    # object untouched and lets tuples and other iterables work.
    if isinstance(variables, (int, str)):
        variables = [variables]
    # If a string was given for `variable`, try to get it from exog names
    variables = [exog_names.index(name) if isinstance(name, str) else name
                 for name in variables]
    k_variables = len(variables)

    # Create the plot
    from scipy.stats import norm
    from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
    plt = _import_mpl()
    fig = create_mpl_fig(fig, figsize)

    # The quantities below do not depend on the variable being plotted.
    # Get dates, if applicable
    if hasattr(self.data, 'dates') and self.data.dates is not None:
        dates = self.data.dates._mpl_repr()
    else:
        dates = np.arange(self.nobs)
    # Leading observations left out of every subplot.
    d = max(self.nobs_diffuse, self.loglikelihood_burn)
    coef = self.recursive_coefficients

    for i, variable in enumerate(variables):
        ax = fig.add_subplot(k_variables, 1, i + 1)

        # Plot the coefficient
        ax.plot(dates[d:], coef.filtered[variable, d:],
                label='Recursive estimates: %s' % exog_names[variable])

        # Legend
        handles, labels = ax.get_legend_handles_labels()

        if alpha is not None:
            # Get the critical value for confidence intervals
            critical_value = norm.ppf(1 - alpha / 2.)

            # Plot confidence intervals
            std_errors = np.sqrt(coef.filtered_cov[variable, variable, :])
            half_width = critical_value * std_errors
            ci_lower = coef.filtered[variable] - half_width
            ci_upper = coef.filtered[variable] + half_width
            ci_poly = ax.fill_between(
                dates[d:], ci_lower[d:], ci_upper[d:], alpha=0.2
            )
            ci_label = ('$%.3g \\%%$ confidence interval'
                        % ((1 - alpha)*100))

            # Only add CI to legend for the first plot
            if i == 0:
                # Proxy artist for fill_between legend entry
                # See http://matplotlib.org/1.3.1/users/legend_guide.html
                p = plt.Rectangle((0, 0), 1, 1,
                                  fc=ci_poly.get_facecolor()[0])

                handles.append(p)
                labels.append(ci_label)

        ax.legend(handles, labels, loc=legend_loc)

        # Remove xticks for all but the last plot
        if i < k_variables - 1:
            ax.xaxis.set_ticklabels([])

    fig.tight_layout()

    return fig