def __init__(self, results, resid=None, endog=None, exog=None, hat_matrix_diag=None, cov_params=None, scale=None): # I'm not calling super for now, OLS attributes might not be available # check which model is allowed self.results = results = maybe_unwrap_results(results) # TODO: check for extra params in e.g. NegBin self.nobs, self.k_vars = results.model.exog.shape self.endog = endog if endog is not None else results.model.endog self.exog = exog if exog is not None else results.model.exog self.resid = resid if resid is not None else results.resid_pearson self.scale = scale if scale is not None else results.scale self.cov_params = (cov_params if cov_params is not None else results.cov_params()) self.model_class = results.model.__class__ self.hessian = self.results.model.hessian(self.results.params) self.score_obs = self.results.model.score_obs(self.results.params) if hat_matrix_diag is not None: self._hat_matrix_diag = hat_matrix_diag
def ccpr_data(results, exog_idx): exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] x1beta = x1*results.params[exog_idx] return x1, x1beta + results.resid
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs): """Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes x_var : int or str Name or index of regressor in exog matrix. y_true : array_like (optional) If this is not None, then the array is added to the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. kwargs The keyword arguments are passed to the plot command for the fitted values points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y = results.model.endog x1 = results.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label=results.model.endog_names) if not y_true is None: ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') title = 'Fitted values versus %s' % exog_name prstd, iv_l, iv_u = wls_prediction_std(results) ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', label='fitted', **kwargs) ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k', alpha=.7) #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, # color='k') ax.set_title(title) ax.set_xlabel(exog_name) ax.set_ylabel(results.model.endog_names) ax.legend(loc='best') return fig
def ccpr_data(results, exog_idx): exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] x1beta = x1 * results.params[exog_idx] return x1, x1beta + results.resid
def __init__(self, results): #check which model is allowed self.results = maybe_unwrap_results(results) self.nobs, self.k_vars = results.model.exog.shape self.endog = results.model.endog self.exog = results.model.exog self.model_class = results.model.__class__ self.sigma_est = np.sqrt(results.mse_resid) self.aux_regression_exog = {} self.aux_regression_endog = {}
def __init__(self, results, resid=None, endog=None, exog=None, hat_matrix_diag=None, cov_params=None, scale=None): # I'm not calling super for now, OLS attributes might not be available #check which model is allowed self.results = results = maybe_unwrap_results(results) # TODO: check for extra params in e.g. NegBin self.nobs, self.k_vars = results.model.exog.shape self.endog = endog if endog is not None else results.model.endog self.exog = exog if exog is not None else results.model.exog self.resid = resid if resid is not None else results.resid_pearson self.scale = scale if scale is not None else results.scale self.cov_params = (cov_params if cov_params is not None else results.cov_params()) self.model_class = results.model.__class__ self.hessian = self.results.model.hessian(self.results.params) self.score_obs = self.results.model.score_obs(self.results.params) if hat_matrix_diag is not None: self._hat_matrix_diag = hat_matrix_diag
def outlier_test(model_results, method='bonf', alpha=.05, labels=None, order=False): """ Outlier Tests for RegressionResults instances. Parameters ---------- model_results : RegressionResults instance Linear model results method : str - `bonferroni` : one-step correction - `sidak` : one-step correction - `holm-sidak` : - `holm` : - `simes-hochberg` : - `hommel` : - `fdr_bh` : Benjamini/Hochberg - `fdr_by` : Benjamini/Yekutieli See `statsmodels.stats.multitest.multipletests` for details. alpha : float familywise error rate order : bool Whether or not to order the results by the absolute value of the studentized residuals. If labels are provided they will also be sorted. Returns ------- table : ndarray or DataFrame Returns either an ndarray or a DataFrame if labels is not None. Will attempt to get labels from model_results if available. The columns are the Studentized residuals, the unadjusted p-value, and the corrected p-value according to method. Notes ----- The unadjusted p-value is stats.t.sf(abs(resid), df) where df = df_resid - 1. """ from scipy import stats # lazy import infl = getattr(model_results, 'get_influence', None) if infl is None: results = maybe_unwrap_results(model_results) raise AttributeError("model_results object %s does not have a " "get_influence method." % results.__class__.__name__) resid = infl().resid_studentized_external if order: idx = np.abs(resid).argsort()[::-1] resid = resid[idx] if labels is not None: labels = np.array(labels)[idx].tolist() df = model_results.df_resid - 1 unadj_p = stats.t.sf(np.abs(resid), df) * 2 adj_p = multipletests(unadj_p, alpha=alpha, method=method) data = np.c_[resid, unadj_p, adj_p[1]] if labels is None: labels = getattr(model_results.model.data, 'row_labels', None) if labels is not None: from pandas import DataFrame return DataFrame(data, columns=['student_resid', 'unadj_p', method+"(p)"], index=labels) return data
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs): """Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes x_var : int or str Name or index of regressor in exog matrix. y_true : array_like (optional) If this is not None, then the array is added to the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. kwargs The keyword arguments are passed to the plot command for the fitted values points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. Examples -------- Load the Statewide Crime data set and perform linear regression with `poverty` and `hs_grad` as variables and `murder` as the response >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plt >>> data = sm.datasets.statecrime.load_pandas().data >>> murder = data['murder'] >>> X = data[['poverty', 'hs_grad']] >>> X["constant"] = 1 >>> y = murder >>> model = sm.OLS(y, X) >>> results = model.fit() Create a plot just for the variable 'Poverty': >>> fig, ax = plt.subplots() >>> fig = sm.graphics.plot_fit(results, 0, ax=ax) >>> ax.set_ylabel("Murder Rate") >>> ax.set_xlabel("Poverty Level") >>> ax.set_title("Linear Regression") >>> plt.show() .. plot:: plots/graphics_plot_fit_ex.py """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y = results.model.endog x1 = results.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label=results.model.endog_names) if not y_true is None: ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') title = 'Fitted values versus %s' % exog_name prstd, iv_l, iv_u = wls_prediction_std(results) ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', label='fitted', **kwargs) ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k', alpha=.7) #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, # color='k') ax.set_title(title) ax.set_xlabel(exog_name) ax.set_ylabel(results.model.endog_names) ax.legend(loc='best', numpoints=1) return fig
def plot_ccpr(results, exog_idx, ax=None): """Plot CCPR against one regressor. Generates a CCPR (component and component-plus-residual) plot. Parameters ---------- results : result instance A regression results instance. exog_idx : int or string Exogenous, explanatory variable. If string is given, it should be the variable name that you want to use, and you can use arbitrary translations as with a formula. ax : Matplotlib AxesSubplot instance, optional If given, it is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. Notes ----- The CCPR plot provides a way to judge the effect of one regressor on the response variable by taking into account the effects of the other independent variables. The partial residuals plot is defined as Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus X_i to show where the fitted line would lie. Care should be taken if X_i is highly correlated with any of the other independent variables. If this is the case, the variance evident in the plot will be an underestimate of the true variance. References ---------- http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1*results.params[exog_idx] ax.plot(x1, x1beta + results.resid, 'o') from statsmodels.tools.tools import add_constant mod = OLS(x1beta, add_constant(x1)).fit() params = mod.params fig = abline_plot(*params, **dict(ax=ax)) #ax.plot(x1, x1beta, '-') ax.set_title('Component and component plus residual plot') ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) ax.set_xlabel("%s" % exog_name) return fig
def feature_regression_summary(df, feat_idx, target, model_fit_results, display_regress_diagnostics=False): feat = df.columns[feat_idx] v = vif(np.matrix(df), feat_idx) colinear = v > 10 if display_regress_diagnostics: # ‘endog versus exog’, ‘residuals versus exog’, ‘fitted versus exog’ and ‘fitted plus residual versus exog’ fig = plt.figure(constrained_layout=True, figsize=(2.25 * plot_edge, 4 * plot_edge)) fig = smgrpu.create_mpl_fig(fig) gs = GridSpec(3, 2, figure=fig) ax_fit = fig.add_subplot(gs[0, 0]) ax_partial_residuals = fig.add_subplot(gs[0, 1]) ax_partregress = fig.add_subplot(gs[1, 0]) ax_ccpr = fig.add_subplot(gs[1, 1]) ax_dist = fig.add_subplot(gs[2:4, 0:2]) exog_name, exog_idx = smgrpu.maybe_name_or_idx(feat, model_fit_results.model) smresults = smtt.maybe_unwrap_results(model_fit_results) y_name = smresults.model.endog_names x1 = smresults.model.exog[:, exog_idx] prstd, iv_l, iv_u = wls_prediction_std(smresults) # endog versus exog # use wrapper since it's availab;e! sm.graphics.plot_fit(model_fit_results, feat, ax=ax_fit) # residuals versus exog ax_partial_residuals.plot(x1, smresults.resid, 'o') ax_partial_residuals.axhline(y=0, color='black') ax_partial_residuals.set_title('Residuals versus %s' % exog_name, fontsize='large') ax_partial_residuals.set_xlabel(exog_name) ax_partial_residuals.set_ylabel("resid") # Partial Regression plot: fitted versus exog exog_noti = np.ones(smresults.model.exog.shape[1], bool) exog_noti[exog_idx] = False exog_others = smresults.model.exog[:, exog_noti] from pandas import Series smgrp.plot_partregress(smresults.model.data.orig_endog, Series(x1, name=exog_name, index=smresults.model.data.row_labels), exog_others, obs_labels=False, ax=ax_partregress) # CCPR: fitted plus residual versus exog # use wrapper since it's availab;e! sm.graphics.plot_ccpr(model_fit_results, feat, ax=ax_ccpr) ax_ccpr.set_title('CCPR Plot', fontsize='large') sns.distplot(df[feat], ax=ax_dist) fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large") #fig.tight_layout() #fig.subplots_adjust(top=.90) plt.show() display( HTML( "Variance Inflation Factor (<i>VIF</i>) for <b>{}</b>: <b>{}</b> {}" .format( feat, round(v, 2), "$\\le 10 \\iff$ low colinearity" if not colinear else "$> 10 \\iff$ <b>HIGH COLINEARITY</b>"))) display( HTML( "<b><i>p-value</i></b> (<i>VIF</i>) for <b>{}</b>: <b>{}</b><br><br>" .format(feat, model_fit_results.pvalues[feat_idx + 1]))) return v
def plot_ccpr(results, exog_idx, ax=None): """Plot CCPR against one regressor. Generates a CCPR (component and component-plus-residual) plot. Parameters ---------- results : result instance A regression results instance. exog_idx : int or string Exogenous, explanatory variable. If string is given, it should be the variable name that you want to use, and you can use arbitrary translations as with a formula. ax : Matplotlib AxesSubplot instance, optional If given, it is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. Notes ----- The CCPR plot provides a way to judge the effect of one regressor on the response variable by taking into account the effects of the other independent variables. The partial residuals plot is defined as Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus X_i to show where the fitted line would lie. Care should be taken if X_i is highly correlated with any of the other independent variables. If this is the case, the variance evident in the plot will be an underestimate of the true variance. Examples -------- Using the state crime dataset plot the effect of the rate of single households ('single') on the murder rate while accounting for high school graduation rate ('hs_grad'), percentage of people in an urban area, and rate of poverty ('poverty'). >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plot >>> import statsmodels.formula.api as smf >>> crime_data = sm.datasets.statecrime.load_pandas() >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', ... data=crime_data.data).fit() >>> sm.graphics.plot_ccpr(results, 'single') >>> plt.show() .. plot:: plots/graphics_regression_ccpr.py References ---------- http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1 * results.params[exog_idx] ax.plot(x1, x1beta + results.resid, 'o') from statsmodels.tools.tools import add_constant mod = OLS(x1beta, add_constant(x1)).fit() params = mod.params fig = abline_plot(*params, **dict(ax=ax)) #ax.plot(x1, x1beta, '-') ax.set_title('Component and component plus residual plot') ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) ax.set_xlabel("%s" % exog_name) return fig
def outlier_test(model_results, method='bonf', alpha=.05, labels=None, order=False, cutoff=None): """ Outlier Tests for RegressionResults instances. Parameters ---------- model_results : RegressionResults Linear model results method : str - `bonferroni` : one-step correction - `sidak` : one-step correction - `holm-sidak` : - `holm` : - `simes-hochberg` : - `hommel` : - `fdr_bh` : Benjamini/Hochberg - `fdr_by` : Benjamini/Yekutieli See `statsmodels.stats.multitest.multipletests` for details. alpha : float familywise error rate labels : None or array_like If `labels` is not None, then it will be used as index to the returned pandas DataFrame. See also Returns below order : bool Whether or not to order the results by the absolute value of the studentized residuals. If labels are provided they will also be sorted. cutoff : None or float in [0, 1] If cutoff is not None, then the return only includes observations with multiple testing corrected p-values strictly below the cutoff. The returned array or dataframe can be empty if there are no outlier candidates at the specified cutoff. Returns ------- table : ndarray or DataFrame Returns either an ndarray or a DataFrame if labels is not None. Will attempt to get labels from model_results if available. The columns are the Studentized residuals, the unadjusted p-value, and the corrected p-value according to method. Notes ----- The unadjusted p-value is stats.t.sf(abs(resid), df) where df = df_resid - 1. """ from scipy import stats # lazy import if labels is None: labels = getattr(model_results.model.data, 'row_labels', None) infl = getattr(model_results, 'get_influence', None) if infl is None: results = maybe_unwrap_results(model_results) raise AttributeError("model_results object %s does not have a " "get_influence " "method." % results.__class__.__name__) resid = infl().resid_studentized_external if order: idx = np.abs(resid).argsort()[::-1] resid = resid[idx] if labels is not None: labels = np.asarray(labels)[idx] df = model_results.df_resid - 1 unadj_p = stats.t.sf(np.abs(resid), df) * 2 adj_p = multipletests(unadj_p, alpha=alpha, method=method) data = np.c_[resid, unadj_p, adj_p[1]] if cutoff is not None: mask = data[:, -1] < cutoff data = data[mask] else: mask = slice(None) if labels is not None: from pandas import DataFrame return DataFrame(data, columns=['student_resid', 'unadj_p', method + "(p)"], index=np.asarray(labels)[mask]) return data
def plot_regress_exog(results, exog_idx, fig=None): """Plot regression results against one regressor. This plots four graphs in a 2 by 2 figure: 'endog versus exog', 'residuals versus exog', 'fitted versus exog' and 'fitted plus residual versus exog' Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes exog_idx : int or str Name or index of regressor in exog matrix fig : Matplotlib figure instance, optional If given, this figure is simply returned. Otherwise a new figure is created. Returns ------- fig : matplotlib figure instance Examples -------- Load the Statewide Crime data set and build a model with regressors including the rate of high school graduation (hs_grad), population in urban areas (urban), households below poverty line (poverty), and single person households (single). Outcome variable is the muder rate (murder). Build a 2 by 2 figure based on poverty showing fitted versus actual murder rate, residuals versus the poverty rate, partial regression plot of poverty, and CCPR plot for poverty rate. >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plot >>> import statsmodels.formula.api as smf >>> fig = plt.figure(figsize=(8, 6)) >>> crime_data = sm.datasets.statecrime.load_pandas() >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single', ... data=crime_data.data).fit() >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig) >>> plt.show() .. plot:: plots/graphics_regression_regress_exog.py """ fig = utils.create_mpl_fig(fig) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y_name = results.model.endog_names x1 = results.model.exog[:, exog_idx] prstd, iv_l, iv_u = wls_prediction_std(results) ax = fig.add_subplot(2, 2, 1) ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name) ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted', alpha=.5) ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7) ax.set_title('Y and Fitted vs. X', fontsize='large') ax.set_xlabel(exog_name) ax.set_ylabel(y_name) ax.legend(loc='best') ax = fig.add_subplot(2, 2, 2) ax.plot(x1, results.resid, 'o') ax.axhline(y=0, color='black') ax.set_title('Residuals versus %s' % exog_name, fontsize='large') ax.set_xlabel(exog_name) ax.set_ylabel("resid") ax = fig.add_subplot(2, 2, 3) exog_noti = np.ones(results.model.exog.shape[1], bool) exog_noti[exog_idx] = False exog_others = results.model.exog[:, exog_noti] from pandas import Series fig = plot_partregress(results.model.data.orig_endog, Series(x1, name=exog_name, index=results.model.data.row_labels), exog_others, obs_labels=False, ax=ax) ax.set_title('Partial regression plot', fontsize='large') #ax.set_ylabel("Fitted values") #ax.set_xlabel(exog_name) ax = fig.add_subplot(2, 2, 4) fig = plot_ccpr(results, exog_idx, ax=ax) ax.set_title('CCPR Plot', fontsize='large') #ax.set_xlabel(exog_name) #ax.set_ylabel("Fitted values + resids") fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large") fig.tight_layout() fig.subplots_adjust(top=.90) return fig
def plot_regress_exog(results, exog_idx, fig=None): """Plot regression #1lab_results against one regressor. This plots four graphs in a 2 by 2 figure: 'endog versus exog', 'residuals versus exog', 'fitted versus exog' and 'fitted plus residual versus exog' Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes exog_idx : int index of regressor in exog matrix fig : Matplotlib figure instance, optional If given, this figure is simply returned. Otherwise a new figure is created. Returns ------- fig : matplotlib figure instance """ fig = utils.create_mpl_fig(fig) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y_name = results.model.endog_names x1 = results.model.exog[:, exog_idx] prstd, iv_l, iv_u = wls_prediction_std(results) ax = fig.add_subplot(2, 2, 1) ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name) ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted', alpha=.5) ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7) ax.set_title('Y and Fitted vs. X', fontsize='large') ax.set_xlabel(exog_name) ax.set_ylabel(y_name) ax.legend(loc='best') ax = fig.add_subplot(2, 2, 2) ax.plot(x1, results.resid, 'o') ax.axhline(y=0, color='black') ax.set_title('Residuals versus %s' % exog_name, fontsize='large') ax.set_xlabel(exog_name) ax.set_ylabel("resid") ax = fig.add_subplot(2, 2, 3) exog_noti = np.ones(results.model.exog.shape[1], bool) exog_noti[exog_idx] = False exog_others = results.model.exog[:, exog_noti] from pandas import Series fig = plot_partregress(results.model.data.orig_endog, Series(x1, name=exog_name, index=results.model.data.row_labels), exog_others, obs_labels=False, ax=ax) ax.set_title('Partial regression plot', fontsize='large') #ax.set_ylabel("Fitted values") #ax.set_xlabel(exog_name) ax = fig.add_subplot(2, 2, 4) fig = plot_ccpr(results, exog_idx, ax=ax) ax.set_title('CCPR Plot', fontsize='large') #ax.set_xlabel(exog_name) #ax.set_ylabel("Fitted values + resids") fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large") fig.tight_layout() fig.subplots_adjust(top=.90) return fig
def __init__( self, results: Type[ statsmodels.regression.linear_model.RegressionResultsWrapper] ) -> None: """ For a linear regression model, generates following diagnostic plots: a. residual b. qq c. scale location and d. leverage and a table e. vif Args: results (Type[statsmodels.regression.linear_model.RegressionResultsWrapper]): must be instance of statsmodels.regression.linear_model object Raises: TypeError: if instance does not belong to above object Example: >>> import numpy as np >>> import pandas as pd >>> import statsmodels.formula.api as smf >>> x = np.linspace(-np.pi, np.pi, 100) >>> y = 3*x + 8 + np.random.normal(0,1, 100) >>> df = pd.DataFrame({'x':x, 'y':y}) >>> res = smf.ols(formula= "y ~ x", data=df).fit() >>> cls = Linear_Reg_Diagnostic(res) >>> cls(plot_context="seaborn-paper") In case you do not need all plots you can also independently make an individual plot/table in following ways >>> cls = Linear_Reg_Diagnostic(res) >>> cls.residual_plot() >>> cls.qq_plot() >>> cls.scale_location_plot() >>> cls.leverage_plot() >>> cls.vif_table() """ if isinstance( results, statsmodels.regression.linear_model. RegressionResultsWrapper) is False: raise TypeError( "result must be instance of statsmodels.regression.linear_model.RegressionResultsWrapper object" ) self.results = maybe_unwrap_results(results) self.y_true = self.results.model.endog self.y_predict = self.results.fittedvalues self.xvar = self.results.model.exog self.xvar_names = self.results.model.exog_names self.residual = np.array(self.results.resid) influence = self.results.get_influence() self.residual_norm = influence.resid_studentized_internal self.leverage = influence.hat_matrix_diag self.cooks_distance = influence.cooks_distance[0] self.nparams = len(self.results.params)
def plot_fit(results, exog_idx, y_true=None, ax=None, **kwargs): """Plot fit against one regressor. This creates one graph with the scatterplot of observed values compared to fitted values. Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes x_var : int or str Name or index of regressor in exog matrix. y_true : array_like (optional) If this is not None, then the array is added to the plot ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. kwargs The keyword arguments are passed to the plot command for the fitted values points. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. Examples -------- Load the Statewide Crime data set and perform linear regression with `poverty` and `hs_grad` as variables and `murder` as the response >>> import statsmodels.api as sm >>> import matplotlib.pyplot as plt >>> data = sm.datasets.statecrime.load_pandas().data >>> murder = data['murder'] >>> X = data[['poverty', 'hs_grad']] >>> X["constant"] = 1 >>> y = murder >>> model = sm.OLS(y, X) >>> results = model.fit() Create a plot just for the variable 'Poverty': >>> fig, ax = plt.subplots() >>> fig = sm.graphics.plot_fit(results, 0, ax=ax) >>> ax.set_ylabel("Murder Rate") >>> ax.set_xlabel("Poverty Level") >>> ax.set_title("Linear Regression") >>> plt.show() .. plot:: plots/graphics_plot_fit_ex.py """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y = results.model.endog x1 = results.model.exog[:, exog_idx] x1_argsort = np.argsort(x1) y = y[x1_argsort] x1 = x1[x1_argsort] ax.plot(x1, y, 'bo', label=results.model.endog_names) if y_true is not None: ax.plot(x1, y_true[x1_argsort], 'b-', label='True values') title = 'Fitted values versus %s' % exog_name prstd, iv_l, iv_u = wls_prediction_std(results) ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r', label='fitted', **kwargs) ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1, color='k', alpha=.7) #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1, # color='k') ax.set_title(title) ax.set_xlabel(exog_name) ax.set_ylabel(results.model.endog_names) ax.legend(loc='best', numpoints=1) return fig
def plot_regress_exog(results, exog_idx, fig=None): """Plot regression results against one regressor. This plots four graphs in a 2 by 2 figure: 'endog versus exog', 'residuals versus exog', 'fitted versus exog' and 'fitted plus residual versus exog' Parameters ---------- results : result instance result instance with resid, model.endog and model.exog as attributes exog_idx : int index of regressor in exog matrix fig : Matplotlib figure instance, optional If given, this figure is simply returned. Otherwise a new figure is created. Returns ------- fig : matplotlib figure instance """ fig = utils.create_mpl_fig(fig) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) #maybe add option for wendog, wexog y_name = results.model.endog_names x1 = results.model.exog[:, exog_idx] prstd, iv_l, iv_u = wls_prediction_std(results) ax = fig.add_subplot(2, 2, 1) ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name) ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted', alpha=.5) ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7) ax.set_title('Y and Fitted vs. X', fontsize='large') ax.set_xlabel(exog_name) ax.set_ylabel(y_name) ax.legend(loc='best') ax = fig.add_subplot(2, 2, 2) ax.plot(x1, results.resid, 'o') ax.axhline(y=0, color='black') ax.set_title('Residuals versus %s' % exog_name, fontsize='large') ax.set_xlabel(exog_name) ax.set_ylabel("resid") ax = fig.add_subplot(2, 2, 3) exog_noti = np.ones(results.model.exog.shape[1], bool) exog_noti[exog_idx] = False exog_others = results.model.exog[:, exog_noti] from pandas import Series fig = plot_partregress(results.model.data.orig_endog, Series(x1, name=exog_name, index=results.model.data.row_labels), exog_others, obs_labels=False, ax=ax) ax.set_title('Partial regression plot', fontsize='large') #ax.set_ylabel("Fitted values") #ax.set_xlabel(exog_name) ax = fig.add_subplot(2, 2, 4) fig = plot_ccpr(results, exog_idx, ax=ax) ax.set_title('CCPR Plot', fontsize='large') #ax.set_xlabel(exog_name) #ax.set_ylabel("Fitted values + resids") fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large") fig.tight_layout() fig.subplots_adjust(top=.90) return fig