def test_correct_labels(close_figures, reset_randomstate, line, x_size, y_size, labels):
    """Check the axis labels produced by ``qqplot_2samples`` for two samples.

    A normal sample of ``x_size`` points and a Student-t(3) sample of
    ``y_size`` points are drawn from a fixed-seed ``RandomState``.  Without
    ``labels`` the test looks for "1st"/"2nd" in the axis labels; with
    ``labels`` it looks for "X"/"Y".  When ``x_size <= y_size`` the two tags
    are expected on swapped axes.
    """
    rng = np.random.RandomState(9876554)
    sample_x = rng.normal(loc=0, scale=0.1, size=x_size)
    sample_y = rng.standard_t(3, size=y_size)
    first_plot = sm.ProbPlot(sample_x)
    second_plot = sm.ProbPlot(sample_y)

    figure = qqplot_2samples(first_plot, second_plot, line=line, **labels)
    axes = figure.get_axes()[0]
    x_label = axes.get_xlabel()
    y_label = axes.get_ylabel()

    # Tag expected for the first and the second sample respectively.
    first_tag, second_tag = ("X", "Y") if labels else ("1st", "2nd")
    if x_size <= y_size:
        expected_in_x, expected_in_y = second_tag, first_tag
    else:
        expected_in_x, expected_in_y = first_tag, second_tag

    assert expected_in_x in x_label
    assert expected_in_y in y_label
def plotDiagnostics(data, mu, xi, sigma, figfile):
    """
    Draw a 2x2 diagnostics figure for the fitted distribution and save it.

    The four panels are a probability plot, a quantile plot against
    ``genpareto``, a return level plot and a density plot.

    :param data: :class:`numpy.ndarray` of observed data values (in units
                 of metres/second).
    :param float mu: Selected threshold value.
    :param float xi: Fitted shape parameter.
    :param float sigma: Fitted scale parameter.
    :param str figfile: Path to store the file (includes image format)

    """
    LOG.info("Plotting diagnostics")
    fig, ax = plt.subplots(2, 2)
    prob_ax, quant_ax, level_ax, density_ax = ax.flatten()

    # Only the values above the threshold enter the probability plots.
    exceedances = np.sort(data[data > mu])
    gpdf = fittedPDF(data, mu, xi, sigma)

    # Panel 1: probability plot
    empirical_plot = sm.ProbPlot(exceedances)
    empirical_plot.ppplot(xlabel="Empirical", ylabel="Model",
                          ax=prob_ax, line='45')
    prob_ax.set_title("Probability plot")

    # Panel 2: quantile plot against the fitted generalised Pareto
    model_plot = sm.ProbPlot(exceedances, genpareto, distargs=(xi, ),
                             loc=mu, scale=sigma)
    model_plot.qqplot(xlabel="Model", ylabel="Empirical",
                      ax=quant_ax, line='45')
    quant_ax.set_title("Quantile plot")

    # Panel 3: fitted return levels versus empirical return periods
    rp = np.array(
        [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000])
    rate = float(len(exceedances)) / float(len(data))
    rval = returnLevels(rp, mu, xi, sigma, rate)
    emprp = empiricalReturnPeriod(np.sort(data))
    beyond_one = emprp > 1
    level_ax.semilogx(rp, rval, label="Fitted RP curve", color='r')
    level_ax.scatter(emprp[beyond_one], np.sort(data)[beyond_one],
                     color='b', label="Empirical RP", s=100)
    level_ax.legend(loc=2)
    level_ax.set_xlabel("Return period")
    level_ax.set_ylabel("Return level")
    level_ax.set_title("Return level plot")
    level_ax.grid(True)

    # Panel 4: histogram of the exceedances with the fitted PDF on top
    maxbin = 4 * np.ceil(np.floor(data.max() / 4) + 1)
    sns.distplot(exceedances, bins=np.arange(mu, maxbin, 2), hist=True,
                 axlabel='Wind speed (m/s)', ax=density_ax)
    density_ax.plot(exceedances, gpdf, color='r')
    density_ax.set_title("Density plot")

    plt.tight_layout()
    plt.savefig(figfile)
    plt.close()
def setup(self):
    """Fit OLS on the Longley data and build the ProbPlot fixtures.

    Sets ``data``, ``mod_fit``, ``res``, ``prbplt`` (residuals against a
    Student-t with 4 degrees of freedom), ``other_array`` (normal draws with
    the same shape as ``prbplt.data``) and ``other_prbplot``.
    """
    longley = sm.datasets.longley.load(as_pandas=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    self.data = longley

    ols_fit = sm.OLS(longley.endog, longley.exog).fit()
    self.mod_fit = ols_fit
    self.res = ols_fit.resid
    self.prbplt = sm.ProbPlot(ols_fit.resid, stats.t, distargs=(4,))

    comparison = np.random.normal(size=self.prbplt.data.shape)
    self.other_array = comparison
    self.other_prbplot = sm.ProbPlot(comparison)
def qq_plot_2samples(self):
    """
    Draw a two-sample Q-Q plot of ``self.column_data`` against
    ``self.var_data`` on a fresh subplot and redraw the canvas.

    :return: None; the plot is drawn on ``self.ax``.
    """
    self.ax = self.figure.add_subplot(111)
    # Axes.hold() was removed in Matplotlib 3.0, so calling it
    # unconditionally raises AttributeError there. Only call it on older
    # versions that still provide it.
    hold = getattr(self.ax, "hold", None)
    if callable(hold):
        hold(True)
    pp_x = sm.ProbPlot(self.column_data)
    pp_y = sm.ProbPlot(self.var_data)
    qqplot_2samples(pp_x, pp_y, ax=self.ax)
    self.canvas.draw()
def test_ProbPlot_comparison_arrays():
    """Smoke test: ``qqplot``/``ppplot`` accept a raw array as ``other``."""
    # two fake samples for comparison
    normal_args = dict(loc=8.25, scale=3.25, size=37)
    first = np.random.normal(**normal_args)
    second = np.random.normal(**normal_args)
    first_plot = sm.ProbPlot(first)
    sm.ProbPlot(second)  # constructed as before; only the raw array is compared

    # test `other` kwarg with array
    first_plot.qqplot(other=second)
    first_plot.ppplot(other=second)

    plt.close('all')
def test_ProbPlot_comparison():
    """Smoke test: ``qqplot``/``ppplot`` accept a ``ProbPlot`` as ``other``."""
    # two fake samples for comparison
    normal_args = dict(loc=8.25, scale=3.25, size=37)
    first_plot = sm.ProbPlot(np.random.normal(**normal_args))
    second_plot = sm.ProbPlot(np.random.normal(**normal_args))

    # test `other` kwarg with `ProbPlot` instance
    first_plot.qqplot(other=second_plot)
    first_plot.ppplot(other=second_plot)

    plt.close('all')
def test_qqplot_2samples_arrays():
    """Smoke test: ``sm.qqplot_2samples`` runs on raw arrays for each line option."""
    # just test that it runs
    normal_args = dict(loc=8.25, scale=3.25, size=37)
    first = np.random.normal(**normal_args)
    second = np.random.normal(**normal_args)
    sm.ProbPlot(first)
    sm.ProbPlot(second)

    # also tests all values for line
    line_options = ('r', 'q', '45', 's')
    for option in line_options:
        # test with arrays
        sm.qqplot_2samples(first, second, line=option)

    plt.close('all')
def normality_of_residuals_test(model):
    '''
    Draw the normal Q-Q plot of the residuals and print the results of four
    normality tests (Jarque-Bera, Shapiro-Wilk, Kolmogorov-Smirnov and
    Anderson-Darling).

    Arg:
    * model - fitted OLS models from statsmodels
    '''
    resid = model.resid
    sm.ProbPlot(resid).qqplot(line='s')
    plt.title('Q-Q Plot')

    jb = stats.jarque_bera(resid)
    sw = stats.shapiro(resid)
    ad = stats.anderson(resid, dist='norm')
    # kstest(..., 'norm') with no args compares against N(0, 1), which rejects
    # normal residuals whose scale is not 1. Compare against a normal with the
    # residuals' own location and scale instead. The parameters are estimated
    # from the same data, so the p-value is approximate.
    ks = stats.kstest(resid, 'norm', args=(resid.mean(), resid.std(ddof=1)))

    print(f'Jarque_Bera test ---- statistic: {jb[0]:.4f}, p-value: {jb[1]}')
    print(
        f'Shapiro_Wilk test ---- statistic: {sw[0]:.4f}, p-value: {sw[1]:.4f}')
    print(
        f'Kolmogorov_Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson_Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
def pp_plot(residual_values, *, ax=None):
    """Draw a P-P plot of residuals with a 45-degree reference line.

    The residuals are wrapped in ``sm.ProbPlot(..., fit=True)`` and its
    ``ppplot`` is drawn on the given axes, which then gets a light grid and
    the title "P-P Plot of Residuals".

    Args:
        residual_values (array): array of residuals from a model
        ax (Axes object): Matplotlib Axes object (optional); the current
            axes is used when omitted

    Returns:
        Figure object returned by ``ppplot``
    """
    target = plt.gca() if ax is None else ax

    figure = sm.ProbPlot(residual_values, fit=True).ppplot(
        ax=target, color='tab:blue', markersize=4, line='45')

    target.grid(True, linewidth=0.5)
    target.set_title("P-P Plot of Residuals")
    return figure
def garch_plot1(lh):
    """Show a 3x2 grid of time-series diagnostics for ``lh``.

    Row 1 (full width): the series itself.  Row 2: ACF and PACF stems (40
    lags) with +/-1.96/sqrt(n) reference lines.  Row 3: P-P and Q-Q plots
    from ``sm.ProbPlot(lh, dist='lognorm', fit=True)``.
    """
    grid = (3, 2)
    band = 1.96 / np.sqrt(len(lh))

    def draw_reference_lines():
        # zero line plus the lower and upper 1.96/sqrt(n) lines, on the current axes
        plt.axhline(y=0, linestyle='-', color='black')
        plt.axhline(y=-band, linestyle='--', color='gray')
        plt.axhline(y=band, linestyle='--', color='gray')

    # Plot figure with subplots of different sizes
    plt.figure(1)
    # set up subplot grid
    gridspec.GridSpec(*grid)

    # large subplot
    plt.subplot2grid(grid, (0, 0), colspan=2, rowspan=1)
    plt.title('Lean Hogs Time Series Analysis Plots')
    plt.plot(lh)

    # small subplot 1
    plt.subplot2grid(grid, (1, 0))
    plt.stem(acf(lh, nlags=40))
    draw_reference_lines()
    plt.ylabel('ACF')

    # small subplot 2
    plt.subplot2grid(grid, (1, 1))
    plt.stem(pacf(lh, nlags=40, method='ols'))
    draw_reference_lines()
    plt.ylabel('PACF')

    # small subplot 3
    pp_axes = plt.subplot2grid(grid, (2, 0))
    qq_axes = plt.subplot2grid(grid, (2, 1))
    fitted = sm.ProbPlot(lh, dist='lognorm', fit=True)
    fitted.ppplot(line='45', ax=pp_axes)
    fitted.qqplot(line='45', ax=qq_axes)
    pp_axes.set_title('P-P Plot')
    qq_axes.set_title('Q-Q Plot')

    plt.show()
def normality_of_residuals_test(model):
    '''
    Draw the normal Q-Q plot of the residuals and print the results of four
    normality tests (Jarque-Bera, Shapiro-Wilk, Kolmogorov-Smirnov and
    Anderson-Darling).

    Arg:
    * model - fitted OLS models from statsmodels
    '''
    resid = model.resid
    sm.ProbPlot(resid).qqplot(line='s')
    plt.title('Q-Q plot')

    jb = stats.jarque_bera(resid)
    sw = stats.shapiro(resid)
    ad = stats.anderson(resid, dist='norm')
    # kstest(..., 'norm') with no args compares against N(0, 1), which rejects
    # normal residuals whose scale is not 1. Compare against a normal with the
    # residuals' own location and scale instead. The parameters are estimated
    # from the same data, so the p-value is approximate.
    ks = stats.kstest(resid, 'norm', args=(resid.mean(), resid.std(ddof=1)))

    print(f'Jarque-Bera test ---- statistic: {jb[0]:.4f}, p-value: {jb[1]}')
    print(
        f'Shapiro-Wilk test ---- statistic: {sw[0]:.4f}, p-value: {sw[1]:.4f}')
    print(
        f'Kolmogorov-Smirnov test ---- statistic: {ks.statistic:.4f}, p-value: {ks.pvalue:.4f}'
    )
    print(
        f'Anderson-Darling test ---- statistic: {ad.statistic:.4f}, 5% critical value: {ad.critical_values[2]:.4f}'
    )
    print(
        'If the returned AD statistic is larger than the critical value, then for the 5% significance level, the null hypothesis that the data come from the Normal distribution should be rejected. '
    )
def norm_plot(self, x):
    '''Draw a Q-Q plot and a histogram of a variable side by side.

    Parameters:
    ----------
    x : array-like exposing ``reshape`` (e.g. numpy.ndarray)
        The variable to plot; it is flattened for the Q-Q plot and passed
        unchanged to the histogram

    Returns:
    -------
    ax1 : matplotlib.axes
        Axes holding the Q-Q plot of variable x
    ax2 : matplotlib.axes
        Axes holding the histogram of variable x

    Notes:
    -----
    None'''
    _, (qq_axes, hist_axes) = plt.subplots(1, 2, figsize=(8, 3))

    flattened = x.reshape(-1)
    qq_figure = sm.ProbPlot(flattened, fit=True).qqplot(
        marker='o', color='coral', ax=qq_axes)
    # 45-degree reference line on the first axes of the returned figure
    sm.qqline(qq_figure.axes[0], line='45', fmt='g--')

    hist_axes.hist(x, color='orange', alpha=.6)
    return qq_axes, hist_axes
def animate(i):
    """Redraw all dashboard axes from the current database contents.

    Uses the module-level ``ax0`` .. ``ax4`` axes and ``connection``.  Reads
    the ``means``, ``shap_p_values_text`` and ``main_data`` tables and draws:
    a histogram of the means (``ax0``), the latest p-value as text (``ax1``),
    a Q-Q plot of the means (``ax2``), the p-value series (``ax3``) and a
    histogram of the main values (``ax4``).

    :param i: frame argument; not used by the body.
    """
    # Clear every axes that is redrawn below. ax3 was previously not cleared,
    # so each frame stacked one more line on top of the earlier ones.
    ax0.cla()
    ax1.cla()
    ax2.cla()
    ax3.cla()
    ax4.cla()

    query = ('SELECT * FROM means')
    data = pd.read_sql_query(query, connection)
    x_random_mean = data.means
    ax0.hist(x_random_mean, bins=6)

    query2 = ('SELECT * FROM shap_p_values_text')
    data2 = pd.read_sql_query(query2, connection)
    x2 = data2.p_value.values
    ax1.text(0.01, 0.5, f"P values is: {x2[-1]}")
    ax1.axes.get_yaxis().set_visible(False)
    ax1.axis('off')

    sm.ProbPlot(x_random_mean).qqplot(line='s', ax=ax2)
    ax3.plot(data2.Id, x2)

    query4 = ('SELECT * FROM main_data')
    data4 = pd.read_sql_query(query4, connection)
    data41 = data4.main_value.values
    ax4.hist(data41, bins=6)
def qq_plot(std_residuals, ax=None):
    """
    Plot quantiles of the normalized residuals.

    The Q–Q plot is useful for regression under the assumption that the
    errors are normally distributed. If the normalized residuals do not
    follow a normal distribution, the interpretation may be affected and the
    model's inference weakened.

    The Q–Q plot depicts the standardized residuals (z-scores) against
    theoretical quantiles of the normal distribution. Ideally, the points
    should all lie near the 1:1 line (the diagonal line, intercept 0 and
    slope 1). If the pattern is S-shaped, banana-shaped, or too far off the
    diagonal line, you may need to fit a different model to the data.

    The three residuals with the largest absolute value are annotated with
    their positional index, next to their own point.

    Parameters
    ----------
    std_residuals : vector
        Vector of the standardized residuals (z-scores).
    ax : matplotlib, optional
        Plot into this axis; otherwise the first axes of the figure returned
        by ``ProbPlot.qqplot`` is used.

    See Also
    --------
    std_residual_hist : histogram of normalized residuals.

    References
    ----------
    Crawley (2007) The R Book (1st ed.). Wiley Publishing.

    James, Witten, Hastie & Tibshirani (2014) An Introduction to
    Statistical Learning: With Applications in R. Springer Publishing
    Company, Incorporated.

    """
    qq = sm.ProbPlot(std_residuals)
    fig = qq.qqplot(ax=ax, line='45', alpha=0.7, color='#4C72B0', lw=1,
                    markersize=3)
    if ax is None:
        ax = fig.axes[0]

    ax.set_title('Normal Q–Q')
    ax.set_xlabel('Theoretical quantiles')
    ax.set_ylabel('Standardized residuals')

    # Annotations
    values = np.asarray(std_residuals)
    # Theoretical quantiles are paired with the residuals in ascending order,
    # so a residual's x-coordinate is the quantile at its sorted rank. The
    # previous code used the three largest quantiles regardless of where the
    # residual sits, which put labels far from their points (for example a
    # large negative residual).
    ascending = np.argsort(values)
    rank_of = np.empty(len(values), dtype=int)
    rank_of[ascending] = np.arange(len(values))

    top_3 = np.argsort(np.abs(values))[::-1][:3]
    for i in top_3:
        ax.text(qq.theoretical_quantiles[rank_of[i]], values[i], i, size=8)
def test_invalid_dist_config(close_figures):
    """Too many ``distargs`` for ``stats.t`` must raise ``TypeError`` (GH 4226)."""
    # GH 4226
    np.random.seed(5)
    longley = sm.datasets.longley.load(as_pandas=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    ols_fit = sm.OLS(longley.endog, longley.exog).fit()

    expected_message = r'dist\(0, 1, 4, loc=0, scale=1\)'
    with pytest.raises(TypeError, match=expected_message):
        sm.ProbPlot(ols_fit.resid, stats.t, distargs=(0, 1, 4))
def setup(self):
    """Seed NumPy, fit OLS on the Longley data and build the t(4) ProbPlot.

    Sets ``data``, ``mod_fit``, ``prbplt`` and ``line`` before delegating to
    the parent ``setup``.
    """
    np.random.seed(5)

    longley = sm.datasets.longley.load(as_pandas=False)
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    self.data = longley

    ols_fit = sm.OLS(longley.endog, longley.exog).fit()
    self.mod_fit = ols_fit
    self.prbplt = sm.ProbPlot(ols_fit.resid, stats.t, distargs=(4,))
    self.line = 'r'

    super(TestProbPlotLongely, self).setup()
def plot_quant_probs(data, dist):
    """Draw a Q-Q plot (left) and a P-P plot (right) of ``data`` against ``dist``.

    The distribution is fitted to the data (``fit=True``) and both panels get
    a 45-degree reference line.  Nothing is returned.
    """
    fitted = sm.ProbPlot(data, dist=dist, fit=True)
    figure = plt.figure()

    left = figure.add_subplot(1, 2, 1)
    fitted.qqplot(ax=left, line='45')

    right = figure.add_subplot(1, 2, 2)
    fitted.ppplot(ax=right, line='45')
def setup(self):
    """Create the figure/axes (when matplotlib imports) and the comparison sample.

    ``self.prbplt`` must already exist: the comparison array is drawn with
    the same shape as ``self.prbplt.data``.
    """
    try:
        import matplotlib.pyplot as plt
        figure, axes = plt.subplots()
        self.fig, self.ax = figure, axes
    except ImportError:
        # Without matplotlib the fig/ax attributes are simply not set.
        pass

    comparison = np.random.normal(size=self.prbplt.data.shape)
    self.other_array = comparison
    self.other_prbplot = sm.ProbPlot(comparison)
def qqplot(self):
    """Save a normal Q-Q plot of ``self.data.band_filtered`` to ``qq_plot.png``.

    The compressed values are plotted with a standardized reference line
    (``line='s'``); the first line on the axes gets small markers, the second
    is drawn black and dashed.  The image is written under
    ``self.output_path``.
    """
    figure = plt.figure(figsize=(12, 8))
    values = self.data.band_filtered.compressed()
    axes = figure.gca()

    sm.ProbPlot(values).qqplot(ax=axes, line='s')

    drawn = axes.get_lines()
    drawn[0].set(markersize=1)
    drawn[1].set(color='black', dashes=[4, 1])

    axes.set_title('Normal Q-Q Plot')
    plt.savefig(self.output_path + '/qq_plot.png')
def error_distribution(model, clade, protein):
    '''Draw a Q-Q plot of the model residuals, save it, show it and close it.

    The image is written to ``error_dist_test_{clade}_{protein}.png``.  The
    residuals should be normally distributed.
    '''
    figure, axes = plt.subplots(1, 1)
    residual_plot = sm.ProbPlot(model.resid)
    residual_plot.qqplot(line='s', color='#1f77b4', ax=axes)
    axes.title.set_text('QQ Plot')

    target = f'error_dist_test_{clade}_{protein}.png'
    figure.savefig(target)
    plt.show()
    plt.close(figure)
    return
def pp_plot(self):
    """
    Draw a P-P plot of ``self.column_data`` with a 45-degree reference line
    on a fresh subplot and redraw the canvas.

    :return: None; the plot is drawn on ``self.ax``.
    """
    self.ax = self.figure.add_subplot(111)
    # Axes.hold() was removed in Matplotlib 3.0, so calling it
    # unconditionally raises AttributeError there. Only call it on older
    # versions that still provide it.
    hold = getattr(self.ax, "hold", None)
    if callable(hold):
        hold(True)
    probplot = sm.ProbPlot(self.column_data)
    probplot.ppplot(ax=self.ax, line='45')
    self.canvas.draw()
def compute_quantile_quantile_curve(x):
    """Draw a probability plot of ``x`` on a new numbered figure.

    ``defaults.figureNumber`` is incremented (starting from 0 when absent)
    and used as the figure number.  A ``ProbPlot`` fitted against Student-t
    is also built and printed.  Returns the result of ``stats.probplot``.
    """
    print('getting qqplot estimate')

    defaults.figureNumber = getattr(defaults, 'figureNumber', 0) + 1
    plt.figure(defaults.figureNumber)

    probplot_result = stats.probplot(x, plot=plt)
    t_fit = sm.ProbPlot(x, stats.t, fit=True)
    print(t_fit)
    return probplot_result
def qq_plot(residuals):
    """Draw a fitted normal Q-Q plot of ``residuals`` with an identity line.

    The identity line spans the x-limits found after plotting, and those
    limits are then re-applied.  Returns ``(fig, ax)``.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    sm.ProbPlot(residuals, fit=True).qqplot(color='#1F77B4', alpha=0.8, ax=ax)

    low, high = ax.get_xlim()
    ax.plot([low, high], [low, high], color='black', alpha=0.6)
    ax.set_xlim(low, high)

    ax.set_title('Normal Q-Q plot for the residuals', fontsize=12)
    return fig, ax
def qqplotResiduals(zHat, indVar, depVar):
    """Show a Q-Q plot of the residuals ``depVar - zHat`` against Student-t.

    The degrees of freedom are the first dimension of ``zHat`` minus the
    length of ``indVar[0]``.
    """
    residuals = depVar - zHat
    df = np.shape(zHat)[0] - len(indVar[0])

    sm.ProbPlot(residuals, t, distargs=(df,)).qqplot()
    show()
    return
def qqplot(self):
    """Save a normal Q-Q plot of the raster elevation differences.

    The compressed ``self.raster_difference.elevation`` values are plotted
    with a standardized reference line (``line='s'``); the first line on the
    axes gets small markers, the second is drawn black and dashed.  The
    image is written to ``qq_plot.png`` under ``self.output_path`` using
    ``self.output_defaults()``.
    """
    figure = plt.figure(figsize=(12, 8))
    values = self.raster_difference.elevation.compressed()
    axes = figure.gca()

    sm.ProbPlot(values).qqplot(ax=axes, line='s')

    drawn = axes.get_lines()
    drawn[0].set(markersize=1)
    drawn[1].set(color='black', dashes=[4, 1])

    axes.set_title('Normal Q-Q Plot', **self.title_opts())
    figure.tight_layout()
    plt.savefig(self.output_path + '/qq_plot.png', **self.output_defaults())
def probability_plot(col, df_origin, df_impute):
    '''
    Draw one P-P plot per column comparing the original values with the
    imputed ones, laid out on a grid four plots wide.

    Input:
        col: A list of columns that need to plot
        df_origin: The original dataframe
        df_impute: The dataframe after missing value imputation
    Output:
        A large graph containing the respective probability plots
        (origin vs. impute) of the required columns
    '''
    n_cols = 4
    # Ceiling division: the previous `len(col) // 4 + 1` allocated an empty
    # extra row whenever len(col) was a multiple of 4. Keep at least one row
    # so an empty column list still yields a valid figure size.
    n_rows = max(1, -(-len(col) // n_cols))
    fig = plt.figure(figsize=(n_cols * 8, n_rows * 8))

    for position, feature in enumerate(col, start=1):
        # Missing values are dropped from the original column only.
        pp_origin = sm.ProbPlot(df_origin[feature].dropna(), fit=True)
        pp_impute = sm.ProbPlot(df_impute[feature], fit=True)
        ax = fig.add_subplot(n_rows, n_cols, position)
        pp_origin.ppplot(line="45", other=pp_impute, ax=ax)
        ax.set_title(f"{feature}, origin vs. impute")

    fig.tight_layout()
def test_ProbPlot():
    """Smoke test: the three ProbPlot plot methods run on OLS residuals."""
    # just test that it runs
    longley = sm.datasets.longley.load()
    longley.exog = sm.add_constant(longley.exog)
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid
    t_plot = sm.ProbPlot(residuals, stats.t, distargs=(4, ))

    # basic tests modeled after example in docstring
    t_plot.qqplot(line='r')
    t_plot.ppplot(line='r')
    t_plot.probplot(line='r')

    plt.close('all')
def distribution_test(dataset, data_name, **kwargs):
    """Save a Q-Q plot of a ``.mat`` dataset against a fitted uniform distribution.

    The ``'data'`` entry of the MAT file is squeezed and plotted with a
    45-degree reference line.  The image is written to
    ``test_result/uniform/<data_name>.png`` next to this module.

    :param dataset: path passed to ``scipy.io.loadmat``.
    :param data_name: used as the plot title and the output file stem.
    :param kwargs: accepted but not used by the body.
    """
    path = os.path.dirname(os.path.abspath(__file__))
    mat_file = scipy.io.loadmat(dataset)
    data = mat_file['data'].squeeze()

    probplot = sm.ProbPlot(data, scipy.stats.uniform, fit=True)
    probplot.qqplot(line='45')
    # Set the title before saving: it was previously set after savefig and
    # so never appeared in the written image.
    plt.title(data_name)
    plt.savefig(
        os.path.join(path, 'test_result/uniform/{}.png'.format(data_name)))
    plt.clf()
def plot_normal_qq(model, ax):
    """Draw a normal Q-Q plot of ``model.resid_pearson`` on ``ax``.

    The quantiles come from ``sm.ProbPlot``; the points are drawn as hollow
    circles and a dotted diagonal spans the overall min/max of both axes.
    """
    # Use StatsModels ProbPlot to compute quantiles
    quantiles = sm.ProbPlot(model.resid_pearson)
    theoretical = quantiles.theoretical_quantiles
    sample = quantiles.sample_quantiles
    ax.plot(theoretical, sample, marker='o', markerfacecolor='none', ls='none')

    # Draw 45 degree dotted line
    lower = min(np.min(theoretical), np.min(sample))
    upper = max(np.max(theoretical), np.max(sample))
    ax.plot([lower, upper], [lower, upper], linestyle=':', color='C0')

    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Standardized Residuals')
    ax.set_title('Normal Q-Q')
def show_qqplot(k, lamda, theta, X_0, T, simulated):
    """Plot sample versus theoretical quantiles of ``simulated`` against ``ncx2``.

    The non-central chi-square uses ``df = 4*k*lamda/theta**2``,
    non-centrality ``2*c*X_0*exp(-k*T)`` and scale ``1/(2*c)``, where
    ``c = 2*k / ((1 - exp(-k*T)) * theta**2)``.  A dashed identity line is
    added and the axes are given an equal aspect ratio.
    """
    decay = np.exp(-k * T)
    c = (2 * k) / ((1 - decay) * theta**2)
    df = 4 * k * lamda / theta**2
    nc = 2 * c * X_0 * decay

    quantiles = sm.ProbPlot(simulated, ncx2, distargs=(df, nc), scale=1 / (2 * c))
    theoretical = quantiles.theoretical_quantiles
    sample = quantiles.sample_quantiles

    plt.plot(theoretical, sample, "bo")
    plt.title("Probability Plot")
    plt.xlabel("Theoretical quantiles")
    plt.ylabel("Sample quantiles")

    # Identity line from the smaller first quantile to the larger last one.
    start = min(theoretical[0], sample[0])
    stop = max(theoretical[-1], sample[-1])
    diagonal = np.linspace(start, stop, 2)
    plt.plot(diagonal, diagonal, "k--")
    plt.gca().set_aspect("equal")