def return_qqplot(data):
    """Draw a normal Q-Q plot of ``data['returns']``.

    The plot is drawn on a 9x5 inch figure with a standardized reference
    line (``line='s'``), a grid and labelled axes.

    Parameters
    ----------
    data : mapping or DataFrame
        Must support ``data['returns']``.
    """
    # sm.qqplot opens a figure of its own unless it is handed an axes, which
    # used to leave the sized figure empty. Create the axes first and draw
    # into it.
    _, ax = plt.subplots(figsize=(9, 5))
    sm.qqplot(data['returns'], line='s', ax=ax)
    ax.grid(True)
    ax.set_xlabel('theoretical quantiles')
    ax.set_ylabel('sample quantiles')
def plot_single_peak(peak, ff=False, num_bins=50, qq=scipy.stats.norm):
    """Plot the histogram of one peak plus a Q-Q plot against ``qq``.

    Prefer ``plot_simlist`` unless a single peak is to be inspected on its
    own or the histogram view is wanted.

    Parameters
    ----------
    peak : object or str
        Object exposing ``times`` and ``params``; when ``ff`` is true, the
        path of a pickle file containing such an object.
    ff : bool
        Load ``peak`` from file.
    num_bins : int
        Number of histogram bins.
    qq : scipy.stats distribution
        Reference distribution. Only ``scipy.stats.norm`` and
        ``scipy.stats.invgauss`` are handled.
    """
    data = peak
    if ff:
        # NOTE(review): pickle.load can execute arbitrary code; only feed
        # this trusted files.
        with open(peak, 'rb') as daten:
            data = pickle.load(daten)

    # Normalised histogram. ``density`` replaces the ``normed`` keyword that
    # matplotlib removed.
    plt.hist(data.times, num_bins, density=True, alpha=0.5)
    plt.suptitle("params:" + str(data.params))

    if qq == scipy.stats.invgauss:
        x = np.arange(1, 250, 0.5)
        mu, loc, scale = scipy.stats.invgauss.fit(data.times)
        logging.log(logging.INFO, "ig-paramss, %s, %s, %s",
                    str(mu), str(loc), str(scale))
        # Fitted density is overlaid on the histogram figure.
        plt.plot(x, scipy.stats.invgauss.pdf(x, mu, loc, scale))
        logging.log(logging.INFO, 'skew, %s',
                    str(scipy.stats.skew(data.times)))
        sm.qqplot(np.array(data.times), qq, distargs=(mu,), line='r')
        # The two titles were swapped between the branches; each now names
        # the distribution that is actually plotted.
        plt.suptitle("params:" + str(data.params)
                     + " qq-Plot mit Inverser Gauss Verteilung: ")
    elif qq == scipy.stats.norm:
        sm.qqplot(np.array(data.times), qq, line='r')
        plt.suptitle("params:" + str(data.params)
                     + " qq-Plot mit Normalverteilung")
    else:
        print("not yet implemented, distribution:", qq)
    plt.show()
def hist(request, sym):
    """Render a 2x3 diagnostic panel for ``sym`` and return it as a PNG.

    Panels: histogram with KDE overlay, box plot, KDE, Q-Q plot with
    regression line, 12-period rolling mean and standard deviation, and the
    cumulative sum of the series.

    Parameters
    ----------
    request : HttpRequest
        Unused; present for the view signature.
    sym : str
        Symbol passed to ``Data`` and used as column name.

    Returns
    -------
    HttpResponse
        Response with ``content_type='image/png'`` holding the figure.
    """
    data = Data(syms=[sym], start=start)
    r = data.panel.r.copy().dropna()

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 7))
    try:
        ax = axes[0, 0]
        ax.hist(r[sym].values, bins=30)
        r.plot(kind="kde", ax=ax, grid=True)
        r.boxplot(ax=axes[0, 1], grid=True)
        r.plot(kind="kde", ax=axes[1, 0], grid=True)
        sm.qqplot(r[sym], line='r', fit=True, ax=axes[1, 1])

        # pandas.rolling_mean / rolling_std were removed from pandas; the
        # .rolling() accessor is the replacement.
        window = r[sym].rolling(12)
        r['mean'] = window.mean()
        r['std'] = window.std()
        r['cum_ret'] = r[sym].cumsum()
        r[['mean', 'std']].plot(ax=axes[0, 2], grid=True, rot=45)
        r[['cum_ret']].plot(ax=axes[1, 2], grid=True, rot=45)

        fig.tight_layout()
        fig.set_facecolor((1, .8, .6, 0))
        canvas = FigureCanvas(fig)
        response = HttpResponse(content_type='image/png')
        canvas.print_png(response)
    finally:
        # pyplot keeps every figure alive until closed; without this each
        # request leaks one figure.
        plt.close(fig)
    return response
def plotFit(fit):
    """Draw a 2x2 diagnostics panel similar to R's ``plot(fit)``.

    Panels: residuals vs fitted, Q-Q plot of the residuals, standardized
    residuals vs fitted, and an influence plot using Cook's distance.
    """
    residuals = fit.resid
    center = residuals.mean()
    spread = residuals.std(axis=0)

    # Element-wise standardisation of the residuals.
    standardized = residuals.apply(lambda value: (value - center) / spread)

    _, panels = plt.subplots(2, 2, sharex='col', sharey='row')
    (fitted_ax, qq_ax), (scale_ax, influence_ax) = panels

    fitted_ax.scatter(fit.fittedvalues, fit.resid)
    fitted_ax.set_xlabel('Fitted Values')
    fitted_ax.set_ylabel('Residuals')
    fitted_ax.set_title('Residuals vs Fitted')

    sm.qqplot(fit.resid, ax=qq_ax)
    qq_ax.set_title('QQ plot')

    scale_ax.scatter(fit.fittedvalues, standardized)
    scale_ax.set_xlabel('Fitted Values')
    scale_ax.set_ylabel('Standardized Residuals')
    scale_ax.set_title('Scale-Location')

    sm.graphics.influence_plot(fit, ax=influence_ax, criterion="cooks")

    plt.show()
def plot(sim_liste, histogram_separate, histogram_spec, qq_Plot, fit_qq_Plot,
         num_bins=50, vergleich=scipy.stats.invgauss):
    """Plot histograms and Q-Q plots for a list of simulations.

    Parameters
    ----------
    sim_liste : sequence
        Simulation objects; the code reads ``times``, ``params``, ``mu``,
        ``length`` and ``number`` from them.
    histogram_separate : bool
        Draw one plain histogram per simulation.
    histogram_spec : bool
        Draw all simulations as overlaid normalised histograms in one axes.
    qq_Plot : bool
        Draw a Q-Q plot against the normal distribution per simulation.
    fit_qq_Plot : bool
        Draw a Q-Q plot against ``vergleich`` with ``distargs=(sim.mu,)``.
    num_bins : int
        Number of histogram bins.
    vergleich : scipy.stats distribution
        Reference distribution for ``fit_qq_Plot``.
    """
    # time.clock() no longer exists on Python >= 3.8; only fall back to it
    # on interpreters without perf_counter.
    timer = getattr(time, 'perf_counter', None) or time.clock
    startzeit = timer()

    if histogram_spec:
        print("Erstelle Spektrum")
        fig, ax = plt.subplots()
        # TODO: assumes all simulations share the same conditions, so the
        # first one is used for the caption. The original read ``number``
        # from element 1, which fails for a one-element list.
        fig.suptitle("Laenge: " + str(sim_liste[0].length)
                     + " Anz Teilchen: " + str(sim_liste[0].number))
        for sim in sim_liste:
            ax.hist(sim.times, num_bins, alpha=0.5, density=True,
                    label=str(sim.params))
        ax.legend(loc='upper right', shadow=True)

    # One window per simulation holding the selected panels: separate
    # histogram, normal Q-Q plot, Q-Q plot against ``vergleich``.
    number_stats = sum([histogram_separate, qq_Plot, fit_qq_Plot])
    print(number_stats)
    if histogram_separate or qq_Plot or fit_qq_Plot:
        print("Erstelle separate Dinge")
        for sim in sim_liste:
            fig = plt.figure(figsize=(4 * number_stats, 4))
            gs1 = gridspec.GridSpec(1, number_stats)
            ax_list = [fig.add_subplot(ss) for ss in gs1]
            akt = 0  # index of the next free panel
            fig.suptitle(
                "ps, pm" + str(sim.params)
                + str(round(sim.params[0] - sim.params[1], 5)),
                size=15)
            if histogram_separate:
                ax_list[akt].hist(sim.times, num_bins)
                ax_list[akt].set_title("Histogramm")
                akt += 1
            if qq_Plot:
                sm.qqplot(np.array(sim.times), scipy.stats.norm, line='r',
                          ax=ax_list[akt])
                # The source had a raw line break inside this literal; it is
                # written as an escape so the string stays on one line.
                ax_list[akt].set_title("qq-Plot; norm!! \nParams: 0.05")
                akt += 1
            if fit_qq_Plot:
                sm.qqplot(np.array(sim.times), vergleich,
                          distargs=(sim.mu,), line='r', ax=ax_list[akt])
                ax_list[akt].set_title("qq-Plot mit mu:" + str(sim.mu))
                akt += 1
            # Leave room for the suptitle above the panels.
            gs1.tight_layout(fig, rect=[0, 0.03, 1, 0.95])
    print(timer() - startzeit)
    plt.show()
# NOTE(review): the source continued here with an unterminated triple-quoted
# block starting "x = np.linspace(0, 2*np.pi, 400)"; its remainder is not
# visible, so it is kept only as this comment.
def plot_single_histqq_ff(datei, num_bins=50):
    """Load a pickled simulation and plot histogram, fitted pdf and Q-Q plot.

    Parameters
    ----------
    datei : str
        Path of a pickle file whose object exposes ``times``.
    num_bins : int
        Number of histogram bins.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only feed this
    # trusted files.
    with open(datei, 'rb') as daten:
        sim = pickle.load(daten)

    plt.hist(sim.times, num_bins, density=True, alpha=0.5)
    x = np.arange(50000, 250000, 100)

    # Fit once and reuse the result; the fit used to be run twice.
    mu, loc, scale = scipy.stats.invgauss.fit(sim.times)
    # %-formatting prints identically on Python 2 and 3.
    print("ig-params %s" % ((mu, loc, scale),))
    plt.plot(x, scipy.stats.invgauss.pdf(x, mu, loc, scale))
    print("skew %s" % (scipy.stats.skew(sim.times),))
    sm.qqplot(np.array(sim.times), scipy.stats.invgauss, distargs=(mu,),
              line='r')
def ts_diagnostics(y, lags=None, title='', filename=''):
    """Plot time-series diagnostics and print an Augmented Dickey-Fuller test.

    Draws the series with 12-observation rolling mean and standard
    deviation, ACF, PACF, a Q-Q plot and a histogram, then prints the ADF
    test statistic, p-value, lag count, observation count and critical
    values.

    Parameters
    ----------
    y : pd.Series or array-like
        Series to analyse; converted to ``pd.Series`` when necessary.
    lags : int or array-like, optional
        Passed to ``plot_acf`` / ``plot_pacf``.
    title : str
        Title of the time-series panel.
    filename : str
        Currently unused (saving is commented out).
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    # Rolling statistics over a 12-observation window. pd.rolling_mean and
    # pd.rolling_std were removed from pandas; .rolling() replaces them.
    window = y.rolling(window=12)
    rolling_mean = window.mean()
    rolling_std = window.std()

    plt.figure(figsize=(14, 12))
    layout = (3, 2)
    ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
    acf_ax = plt.subplot2grid(layout, (1, 0))
    pacf_ax = plt.subplot2grid(layout, (1, 1))
    qq_ax = plt.subplot2grid(layout, (2, 0))
    hist_ax = plt.subplot2grid(layout, (2, 1))

    # time series plot
    y.plot(ax=ts_ax)
    rolling_mean.plot(ax=ts_ax, color='crimson')
    rolling_std.plot(ax=ts_ax, color='darkslateblue')
    # Attach the legend to the series axes explicitly; plt.legend() acts on
    # the current axes, which is the last one created above.
    ts_ax.legend(loc='best')
    ts_ax.set_title(title, fontsize=24)

    # acf and pacf
    smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
    smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)

    # qq plot
    sm.qqplot(y, line='s', ax=qq_ax)
    qq_ax.set_title('QQ Plot')

    # hist plot
    y.plot(ax=hist_ax, kind='hist', bins=25)
    hist_ax.set_title('Histogram')
    plt.tight_layout()
    # plt.savefig('./img/{}.png'.format(filename))
    plt.show()

    # perform Augmented Dickey Fuller test
    print('Results of Dickey-Fuller test:')
    dftest = adfuller(y, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['test statistic', 'p-value', '# of lags',
               '# of observations'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
def plot_time_series(data, lags=None, title=None, filename=None):
    """
    Saves time series plot figure of the provided data in filename.

    Parameters
    ==========
    data : series
        One-dimensional ndarray with axis labels (including time series).
        Anything that is not a ``pd.Series`` is converted and has its NaNs
        dropped.
    lags : {int, array_like}
        An int or array of lag values, used on horizontal axis. Explicit
        x ticks (1, 3, 5, ...) are only set when ``lags`` is an int.
    title : string
        The title that will be set for the whole figure.
    filename : string
        File to save the plot result (lower-cased before use). When it is
        ``None`` nothing is written.
    """
    if not isinstance(data, pd.Series):
        data = pd.Series(data).dropna()

    with plt.style.context('bmh'):
        fig = plt.figure(figsize=(10, 8))
        layout = (3, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))

        data.plot(ax=ts_ax)
        ts_ax.set_title(title if title else 'Time Series Analysis Plots')
        smt.graphics.plot_acf(data, lags=lags, ax=acf_ax, alpha=0.5,
                              zero=False)
        smt.graphics.plot_pacf(data, lags=lags, ax=pacf_ax, alpha=0.5,
                               zero=False)
        sm.qqplot(data, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        scs.probplot(data, sparams=(data.mean(), data.std()), plot=pp_ax)

        # The tick range needs a scalar lag count; with the default
        # lags=None (or an array) ``lags + 1`` used to fail or be
        # meaningless, so the default ticks are kept in that case.
        if isinstance(lags, (int, np.integer)):
            ticks = np.arange(1, lags + 1, 2.0)
            acf_ax.set_xticks(ticks)
            pacf_ax.set_xticks(ticks)

        plt.tight_layout()
        # The default filename=None used to crash on ``.lower()``.
        if filename is not None:
            fig.savefig(filename.lower())
        plt.close(fig)
def qqplot(dataFrame, columns):
    """Draw Q-Q plots for ``columns``, paged into grid figures.

    Uses the module-level ``cell_size`` (panels per figure), ``col_size``
    and ``row_size`` (grid arguments of ``add_subplot``). Each finished page
    is saved as ``./output/qqplot<N>.png`` and then shown.
    """
    pages_written = 0
    total = len(columns)
    for position, name in enumerate(columns):
        slot = position % cell_size
        if slot == 0:
            # First panel of a page: start a new figure.
            fig = plt.figure(figsize=(15, 15))
        panel = fig.add_subplot(col_size, row_size, slot + 1)
        sm.qqplot(dataFrame[name], ax=panel)
        panel.set_title(name)

        page_full = (position + 1) % cell_size == 0
        is_last = position + 1 == total
        if not (page_full or is_last):
            continue
        pages_written += 1
        plt.subplots_adjust(wspace=0.3, hspace=0.3)
        plt.savefig('./output/qqplot' + str(pages_written) + '.png')
        plt.show()
def residual():
    """Regress ETF daily returns on three factors and summarise residuals.

    For each ETF code the daily return is regressed on ``Mkt_RF``, ``SMB``
    and ``HML`` (taken from the module-level ``ff.data``). The residuals are
    shown in a Q-Q plot, and their lag-1 dependence is estimated with
    ``stats.linregress``.

    Returns
    -------
    (DataFrame, DataFrame)
        ``result`` with columns ``Code``, ``E_Mean``, ``E_std`` and ``auto``
        with columns ``Code``, ``Alpha`` (slope of residual on lagged
        residual), ``P_Value``.
    """
    code_list = [
        'SPY', 'XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLU', 'XLV', 'XLY'
    ]
    resid = pd.DataFrame()
    resid_lag = pd.DataFrame()
    residual_mean, residual_std = [], []
    auto_alpha, auto_pvalue = [], []

    for code in code_list:
        etf = ETF(code, '2010-01-01', '2019-09-14')
        etf.price_acquire()
        etf.data['ETF_Daily_return'] = (
            etf.data['Close'] / etf.data['Close'].shift(1) - 1)
        # Dates become YYYYMMDD strings so they can be joined on 'Date'.
        etf.data['Date'] = etf.data['Date'].apply(
            lambda x: x.strftime("%Y%m%d"))
        data = pd.DataFrame.merge(etf.data, ff.data, how='left', on='Date')
        data = data.dropna(axis=0, how='any')

        model = OLS(y=data.ETF_Daily_return, x=data[['Mkt_RF', 'SMB', 'HML']])
        resid_col = '' + code + '_resids'
        lag_col = '' + code + '_resids_lag'
        resid[resid_col] = model.resids

        sm.qqplot(resid[resid_col], fit=True, line='45')
        plt.title('Normality test of daily residuals for ETF:' + code + '')
        plt.show()

        residual_mean.append(np.mean(resid[resid_col]))
        residual_std.append(np.std(resid[resid_col]))

        # Pair every residual with its predecessor for the lag-1 regression.
        resid_lag[lag_col] = resid[resid_col].shift(1)
        pairs = pd.concat([resid_lag[lag_col], resid[resid_col]],
                          axis=1).dropna()
        regress_result = stats.linregress(pairs.iloc[:, 0], pairs.iloc[:, 1])
        auto_alpha.append(regress_result.slope)
        auto_pvalue.append(regress_result.pvalue)

    result = pd.DataFrame({
        'Code': code_list,
        'E_Mean': residual_mean,
        'E_std': residual_std
    })
    auto = pd.DataFrame(
        {'Code': code_list, 'Alpha': auto_alpha, 'P_Value': auto_pvalue})
    return result, auto
def plot_ic_qq(ic, theoretical_dist=stats.norm, ax=None):
    """
    Plots Spearman Rank Information Coefficient "Q-Q" plot relative to
    a theoretical distribution.

    Parameters
    ----------
    ic : pd.DataFrame
        DataFrame indexed by date, with IC for each forward return.
    theoretical_dist : scipy.stats._continuous_distns
        Continuous distribution generator. scipy.stats.norm and
        scipy.stats.t are popular options.
    ax : matplotlib.Axes, optional
        Axes upon which to plot.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    ic = ic.copy()

    num_plots = len(ic.columns)
    # Three plots per row, rounded up.
    v_spaces = ((num_plots - 1) // 3) + 1

    if ax is None:
        f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6))
        ax = ax.flatten()

    if isinstance(theoretical_dist, stats.norm.__class__):
        dist_name = 'Normal'
    elif isinstance(theoretical_dist, stats.t.__class__):
        dist_name = 'T'
    else:
        dist_name = 'Theoretical'

    # DataFrame.iteritems() was removed in pandas 2.0; items() is available
    # in every pandas version. The loop variable no longer shadows ``ic``.
    for a, (period_num, period_ic) in zip(ax, ic.items()):
        # Missing ICs are plotted as 0.
        sm.qqplot(period_ic.replace(np.nan, 0.).values, theoretical_dist,
                  fit=True, line='45', ax=a)
        a.set(title="{} Period IC {} Dist. Q-Q".format(period_num, dist_name),
              ylabel='Observed Quantile',
              xlabel='{} Distribution Quantile'.format(dist_name))

    return ax
def simple_exponential_smoothing():
    """Run the simple-exponential-smoothing demo and show its diagnostics.

    Samples 200 observations via ``sample_gaussian_process(20, 5, N)``,
    plots the forecasts from ``ses``, then analyses the residuals of the
    rolling forecasts: mean, standard deviation, residual plot, histogram,
    autocorrelation and a Q-Q plot against the 45-degree line.
    """
    N = 200
    t = 160
    alpha = 0.5
    x0 = 20

    realisations = pd.Series(sample_gaussian_process(20, 5, N), range(N))
    plot(realisations, ses(realisations, alpha, x0, t), alpha)

    rolling_forecasts = ses_rolling(realisations, alpha, x0)
    res = residuals(realisations, rolling_forecasts)
    mean_text = str(statistics.mean(res))
    print("E[e_t] = " + mean_text)
    stdev_text = str(statistics.stdev(res))
    print("Stdev[e_t] = " + stdev_text)

    standardised_res = standardised_residuals(realisations, rolling_forecasts)
    residuals_plot(res)
    residuals_histogram(standardised_res)
    residuals_autocorrelation(res, None)
    sm.qqplot(standardised_res, line='45')
    py.show()
def QQplot(self, save=False):
    """Show one fitted Q-Q plot per entry of ``self.array``.

    Args:
        save: when equal to ``True`` each plot is also written to
            ``Q-Q_plot_<label>.png`` in the working directory (dpi 200).
    """
    for position, values in enumerate(self.array):
        label = self.labels[position]
        _, axis = plt.subplots(figsize=(7, 5))
        plt.title('Q-Q plot ({})'.format(label))
        sm.qqplot(np.array(values), line='45', fit=True, ax=axis)
        if save == True:
            plt.savefig('Q-Q_plot_{}.png'.format(label), dpi=200)
        plt.show()
def plot_qq_checkout():
    """Save one Q-Q plot per attribute into ``./qq_checkout``.

    Iterates the module-level ``number_attribute_remove_lost_arr`` mapping;
    every key becomes the plot title and file name (``<key>.png``), every
    value the sample.
    """
    path = './qq_checkout'
    if not os.path.exists(path):
        os.mkdir(path)

    # items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    # Reading a module-level name needs no ``global`` declaration.
    for k, v in number_attribute_remove_lost_arr.items():
        sm.qqplot(np.array(v), line='r')
        plt.title(k)
        plt.grid(True)
        plt.savefig(path + '/' + k + '.png')
        # Close the figure so they do not pile up across iterations.
        plt.close()
def regression(self):
    """Fit OLS of ``self.rate`` on ``self.rate2`` and show diagnostics.

    Prints the model summary, then shows residuals vs fitted values, a
    normal Q-Q plot of the Pearson residuals and a scale-location plot.
    """
    # Linear regression with an intercept.
    rate1 = self.rate
    rate2 = self.rate2
    model = sm.OLS(rate1, sm.add_constant(rate2)).fit()
    print(model.summary())

    # Residuals against fitted values.
    plt.scatter(model.fittedvalues, model.resid)
    plt.show()

    # Normality: if the dependent variable is normal, the residuals should
    # follow a zero-mean normal distribution.
    sm.qqplot(model.resid_pearson, stats.norm, line='45')
    # Show the Q-Q plot on its own; the scatter below used to be drawn on
    # top of it.
    plt.show()

    # Homoscedasticity (scale-location). The absolute value is needed
    # because the square root of a negative residual is NaN, which used to
    # drop those points from the plot.
    plt.scatter(model.fittedvalues, abs(model.resid_pearson) ** 0.5)
    plt.show()
def graphics(data):
    """Show histogram, box plot and Q-Q plot side by side.

    The histogram is drawn from ``data`` as a whole; the box plot and the
    Q-Q plot (quartile reference line) use ``data["Close"]``.
    """
    fig, panels = plt.subplots(nrows=1, ncols=3)
    hist_ax, box_ax, qq_ax = panels[0], panels[1], panels[2]
    fig.suptitle("Graphical Analysis")

    hist_ax.hist(data, bins=20, alpha=0.8)
    hist_ax.set_title("Histogram")

    sns.boxplot(y=data["Close"], ax=box_ax, orient="vertical")
    box_ax.set_title("Boxplot")

    sm.qqplot(data["Close"], ax=qq_ax, line="q")
    qq_ax.set_title("Q-Q Plot against a normal distribution")

    plt.show()
def qqplot(x, title='', path=None):
    """
    Q-Q plot

    Parameters
    ----------
    x : array_like
        sample to plot; converted with ``np.array``
    title : str
        only used as part of the output file name
    path : str, optional
        directory to save into; when given the figure is written to
        ``<path>/qqplot_<title>.png``

    Returns
    -------
    None
    """
    sm.qqplot(np.array(x), line='q')
    # Identity test instead of ``!= None``.
    if path is not None:
        plt.savefig(path + '/qqplot_' + title + '.png')
def sklearn_ols_regression(X, y, print_coefficients=True, print_resid=False,
                           plot_resid=False, qqplot_line='s'):
    """
    ols regression in sklearn

    print: coefficients (optional), regression metrics (optional),
           qqplot (optional)
    output: SKlearn LinearRegression object

    Parameters
    ----------
    X : DataFrame
        Features; ``X.columns`` is read when printing coefficients.
    y : array-like
        Target values.
    print_coefficients : bool
        Print feature names, coefficients and intercept.
    print_resid : bool
        Call ``regression_results(y, y_pred)``.
    plot_resid : bool
        Show a Q-Q plot of the residuals ``y - y_pred``.
    qqplot_line : str
        ``line`` option forwarded to ``sm.qqplot``.
    """
    # Fit the linear model and predict on the training data.
    linrig = LinearRegression()
    linrig.fit(X, y)
    y_pred = linrig.predict(X)

    if print_coefficients:
        print('Features: ', list(X.columns))
        print('Coefficients: ', linrig.coef_)
        print('y-intercept: ', np.round(linrig.intercept_, 3))
        print('\n')

    if print_resid:
        regression_results(y, y_pred)

    if plot_resid:
        # The residuals used to be wrapped in a one-element list, giving a
        # Series with a single object entry, and were made absolute. That
        # is why the Q-Q plot was reported as not working. Use the signed
        # residuals as a flat float array instead.
        sk_res = np.ravel(np.asarray(y) - np.asarray(y_pred))
        sm.qqplot(sk_res, line=qqplot_line)
        plt.show()

    return linrig
def check_residuals(resids, **plot_args):
    """Visual residual diagnostics.

    Args:
        resids: residuals, np.array or pd.Series
        **plot_args: forwarded to ``plt.subplots`` when building the figure

    Returns:
        plt.Figure
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, **plot_args)
    series_ax, hist_ax, qq_ax, acf_ax = axes.flatten()

    # Residuals in observation order.
    positions = range(len(resids))
    sns.lineplot(x=positions, y=resids, ax=series_ax).set(
        title="Residuals", xlabel="", ylabel="")

    # Histogram with KDE.
    sns.histplot(x=resids, kde=True, ax=hist_ax).set(
        title="Histogram", xlabel="", ylabel="")

    # QQ plot against the fitted normal.
    fig = sm.qqplot(resids, fit=True, line="45", ax=qq_ax)
    qq_ax.set(title="Normal QQ")

    # Autocorrelation.
    plot_acf(resids, ax=acf_ax, title="ACF")

    plt.tight_layout()
    return fig
def test_qqplot_pltkwargs(self):
    """Smoke test: qqplot accepts matplotlib marker keyword arguments."""
    marker_style = dict(
        marker='d',
        markerfacecolor='cornflowerblue',
        markeredgecolor='white',
        alpha=0.5,
    )
    fig = sm.qqplot(self.res, line='r', **marker_style)
def residual_plots():
    '''Plot squared OLS residuals vs the predictor and a QQ plot of residuals.

    Fits ``y ~ x`` on the data returned by ``load_concatenated_data``.
    Writes ``squared_residuals_scatterplot.pdf`` and
    ``residuals_QQplot.pdf`` and shows both figures.
    '''
    plt.style.use('ggplot')
    aggregated_data = load_concatenated_data()
    lm_ols = smf.ols(formula='y ~ x', data=aggregated_data).fit()  # OLS fit

    # Scatter plot of squared OLS residuals vs predictor.
    plt.scatter(aggregated_data['x'].values, lm_ols.resid**2)
    plt.xlabel('x')
    plt.ylabel('OLS squared residuals')
    plt.savefig('squared_residuals_scatterplot.pdf')
    plt.show()

    # QQ plot of residuals.
    fig = sm.qqplot(lm_ols.resid, line='s')
    fig.savefig('residuals_QQplot.pdf')
    # plt.show() takes no figure argument; passing one is rejected by
    # current matplotlib.
    plt.show()
def shapiro(data):
    '''Normality check: Q-Q plot, density histogram and Shapiro-Wilk test.

    Shows a Q-Q plot against the 45-degree line, draws a 40-bin density
    histogram and prints the result of ``stats.shapiro``.
    '''
    # Q-Q plot
    sample = np.array(data)
    sm.qqplot(sample, line='45')
    pylab.show()

    # Density histogram
    _, hist_ax = plt.subplots()
    hist_ax.hist(data, 40, density=1)

    # Shapiro-Wilk test
    outcome = stats.shapiro(data)
    print('shapiro test', outcome)
def arima_diag(resids, n_lags=40):
    """Draw a 2x2 diagnostic panel for model residuals.

    Panels: standardized residuals, their distribution against N(0,1),
    a Q-Q plot and the autocorrelation function.

    Parameters
    ----------
    resids : array-like
        Residuals; NaNs are ignored when standardising.
    n_lags : int
        Number of lags in the ACF plot.

    Returns
    -------
    matplotlib Figure
    """
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

    # Standardise with NaN-aware moments, then keep a NaN-free copy for the
    # plots that cannot handle missing values.
    r = resids
    resids = (r - np.nanmean(r)) / np.nanstd(r)
    resids_nonmissing = resids[~(np.isnan(resids))]

    sns.lineplot(x=np.arange(len(resids)), y=resids, ax=ax1)
    ax1.set_title('Standardized residuals')

    # Standard normal density over +/- 2 * 1.96 for comparison.
    x_lim = (-1.96 * 2, 1.96 * 2)
    r_range = np.linspace(x_lim[0], x_lim[1])
    norm_pdf = stats.norm.pdf(r_range)

    sns.distplot(resids_nonmissing, norm_hist=True, hist=True, kde=True,
                 ax=ax2)
    ax2.plot(r_range, norm_pdf, 'g', lw=2, label='N(0,1)')
    ax2.set_title('Distribution of standardized residuals')
    ax2.set_xlim(x_lim)
    ax2.legend()

    sm.qqplot(resids_nonmissing, line='s', ax=ax3)
    ax3.set_title('Q-Q plot')

    # Use the NaN-free residuals here as well: NaNs in the input propagate
    # through the autocorrelation estimate and leave the panel empty.
    plot_acf(resids_nonmissing, ax=ax4, lags=n_lags, alpha=0.05)
    ax4.set_title('ACF plot')

    return fig
def plot_QQ(model, fit=False, *args, **kwargs):
    """
    Plot the QQ plot.

    :param model: The statsmodels model
    :param fit: Forwarded to qqplot as ``fit``. Fitting is useful because a
                straight line other than y=x means that the distribution is
                probably the same but its parameters differ, for instance a
                Gaussian with a different mean or sigma.
    :param args: Positional parameters for the qqplot method from
                 statsmodels. The most important one is the first, which
                 selects a distribution other than Gaussian (use
                 scipy.stats.t for the t distribution and so on).
                 Check: http://www.statsmodels.org/dev/generated/statsmodels.graphics.gofplots.qqplot.html
    :param kwargs: Other parameters for qqplot. ``line`` defaults to ``'r'``
                   and may be overridden.
    :return: None
    """
    res = model.resid  # residuals
    xmin = np.min(res)
    xmax = np.max(res)

    kwargs['fit'] = fit
    # Passing line= used to raise "got multiple values for keyword
    # argument"; it is now only a default.
    kwargs.setdefault('line', 'r')

    sm.qqplot(res, *args, **kwargs)
    # y = x reference over the range of the residuals.
    plt.plot([xmin, xmax], [xmin, xmax], 'r')
    plt.show()
def create_qq_subplots(data, variables):
    """Draw one Q-Q plot per variable on a three-column grid.

    Parameters
    ----------
    data : DataFrame or mapping
        Must support ``data[name]`` for every entry of ``variables``.
    variables : sequence of str
        Columns to plot. Up to nine fit on the 3x3 grid; more variables add
        rows.

    Returns
    -------
    module
        ``matplotlib.pyplot``, as before, so callers can ``show``/``savefig``.
    """
    n_cols = 3
    # At least three rows, as before; grow instead of raising IndexError
    # when more than nine variables are requested.
    n_rows = max(3, -(-len(variables) // n_cols))
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(20, 5 * n_rows))

    for ax, col_name in zip(axes.flatten(), variables):
        # tick_params still applies to ticks created by the plotting call.
        ax.tick_params(axis='both', labelsize=12)
        sm.qqplot(data[col_name], marker='o', markerfacecolor='none',
                  markeredgecolor='k', alpha=0.5, ax=ax)
        ax.set_ylabel(col_name, fontsize=18)
        ax.set_xlabel("Theoretical Quantiles", fontsize=14)
    return plt
def statistic_plot(log_returns, stock_set):
    """Print statistics and draw a Q-Q plot of log returns per symbol.

    For every symbol in ``stock_set`` the NaN-free column of
    ``log_returns`` is passed to ``stc.print_statistics`` and plotted with
    a standardized reference line.
    """
    separator = 30 * "-"
    for sym in stock_set:
        print("\nResults for symbol %s" % sym)
        print(separator)
        cleaned = log_returns[sym].dropna()
        stc.print_statistics(np.array(cleaned))

        # Check the data with a quantile-quantile plot of the log returns.
        sm.qqplot(cleaned, line='s')
        plt.title(sym + 'qqplot')
        plt.grid(True)
        plt.xlabel('theoretical quantiles')
        plt.ylabel('sample quantiles')
def qq_plot(depend, features, df):
    """Draw Q-Q plots of single-predictor OLS residuals on a 4x2 grid.

    One model ``depend ~ feature`` is fitted for each of the first seven
    entries of ``features``; the last grid cell stays empty.
    """
    df_copy = df.copy()
    fig, ax = plt.subplots(4, 2, figsize=(30, 30))

    # Row-major list of the grid cells without the final one.
    cells = [ax[m][n] for m in range(4) for n in range(2)][:-1]
    for i, cell in enumerate(cells):
        f = '{}~{}'.format(depend, features[i])
        residuals = smf.ols(formula=f, data=df_copy).fit().resid
        sm.qqplot(residuals, dist=sp.stats.norm, line='45', fit=True, ax=cell)
        cell.set_title('{}'.format(features[i]))
def RunEstimation(self, request, tsmodelid, tsworkspaceid):
    """Estimate the time-series model, save diagnostic plots, render summary.

    Writes ``qqplot_resid.png``, ``resid.png`` and ``insample.png`` into
    ``files/<tsmodelid>/<tsworkspaceid>/`` and renders
    ``tsbuild/arimaSummary.html`` with the fit and its confidence bounds.
    """
    self.data = pandas.DataFrame()
    self.prepdata(tsmodelid)
    print(self.data)

    def _identity(x):
        # No transformation is applied to the series.
        return x

    tsmodel = modeler.ModelClass(
        data=self.data, startdate=self.startdate, enddate=self.enddate,
        dependent=self.depVar, exogenous=self.indepVar,
        transform=_identity, inverstransform=_identity)
    tsmodel.setmodel(AR=int(self.AR), I=int(self.I), MA=int(self.MA))
    tsmodel.estimate()
    self.fit = tsmodel.fit
    # print() call form: valid on Python 3 and, for a single argument,
    # identical on Python 2.
    print(tsmodel.fit.summary())

    # Compute the confidence interval once and split it.
    confint = self.fit.conf_int()
    confint0 = confint[0]
    confint1 = confint[1]
    self.SaveValues(tsmodelid, tsworkspaceid, tsmodel.fit)

    out_dir = 'files/%s/%s' % (tsmodelid, tsworkspaceid)

    # QQ plot of the residuals. The figure it creates is reused (after
    # clf) for the plots below and closed at the end.
    fig = sm.qqplot(tsmodel.fit.resid)
    plt.savefig(out_dir + '/qqplot_resid.png')
    plt.clf()

    # In-sample columns.
    self.data['resid'] = tsmodel.fit.resid
    self.data['%s_%s' % (self.depVar[0], 'hat')] = self.fit.fittedvalues

    # Residuals over time.
    print(tsmodel.fit.resid.index)
    print(tsmodel.fit.resid.values)
    plt.plot(tsmodel.fit.resid.index, tsmodel.fit.resid.values)
    plt.savefig(out_dir + '/resid.png')
    plt.clf()

    # Fitted values against the observed dependent variable.
    plt.plot(tsmodel.fit.fittedvalues.index, tsmodel.fit.fittedvalues.values)
    plt.plot(self.data[self.depVar[0]].index,
             self.data[self.depVar[0]].values)
    plt.savefig(out_dir + '/insample.png')
    plt.clf()
    # Without this every request leaves one figure registered in pyplot.
    plt.close(fig)

    return render(request, 'tsbuild/arimaSummary.html', {
        'fit': self.fit,
        'confint0': confint0,
        'confint1': confint1,
        'tsmodelid': tsmodelid,
        'tsworkspaceid': tsworkspaceid,
    })
def plot_ic_qq(ic, theoretical_dist=stats.norm, ax=None):
    """
    Plots Spearman Rank Information Coefficient "Q-Q" plot relative to
    a theoretical distribution.

    Parameters
    ----------
    ic : pd.DataFrame
        DataFrame indexed by date, with IC for each forward return.
    theoretical_dist : scipy.stats._continuous_distns
        Continuous distribution generator. scipy.stats.norm and
        scipy.stats.t are popular options.
    ax : matplotlib.Axes, optional
        Axes upon which to plot.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    ic = ic.copy()

    num_plots = len(ic.columns)
    # Three plots per row, rounded up.
    v_spaces = ((num_plots - 1) // 3) + 1

    if ax is None:
        f, ax = plt.subplots(v_spaces, 3, figsize=(18, v_spaces * 6))
        ax = ax.flatten()

    if isinstance(theoretical_dist, stats.norm.__class__):
        dist_name = 'Normal'
    elif isinstance(theoretical_dist, stats.t.__class__):
        dist_name = 'T'
    else:
        dist_name = 'Theoretical'

    # DataFrame.iteritems() was removed in pandas 2.0; items() is available
    # in every pandas version. The loop variable no longer shadows ``ic``.
    for a, (period_num, period_ic) in zip(ax, ic.items()):
        # Missing ICs are plotted as 0.
        sm.qqplot(period_ic.replace(np.nan, 0.).values, theoretical_dist,
                  fit=True, line='45', ax=a)
        a.set(title="{} Period IC {} Dist. Q-Q".format(
            period_num, dist_name),
            ylabel='Observed Quantile',
            xlabel='{} Distribution Quantile'.format(dist_name))

    return ax
def bootstrap_qqplot(data_directory_name: str, scenario: str, result_dict_name: str):
    """Plot uniform Q-Q plots of bootstrap p-values and save them as PNG.

    :param data_directory_name: sub-directory under ``results/result_dict``
        and ``results/plots``
    :param scenario: scenario tag in the pickle file name
    :param result_dict_name: result-dictionary tag in the file names
    :return: None
    """
    pickle_path = (
        f'results/result_dict/{data_directory_name}/bootstrap_refit_reduced_{result_dict_name}_{scenario}_'
        f'result_dict.p')
    with open(pickle_path, 'rb') as fp:
        bootstrap_result_dict = pickle.load(fp)

    # One inner list per sample size, one entry per trial.
    train_p_value_vet = []
    test_p_value_vet = []
    for trials in bootstrap_result_dict.values():
        train_p_value_vet.append(
            [trial["train_p_value"] for trial in trials.values()])
        test_p_value_vet.append(
            [trial["test_p_value"] for trial in trials.values()])

    plt.scatter(train_p_value_vet[1], test_p_value_vet[1])

    # NOTE(review): the figure titled "Train" is built from
    # test_p_value_vet[0], not from train_p_value_vet - confirm intent.
    fig_1 = sm.qqplot(data=np.array(test_p_value_vet[0]),
                      dist=dist.uniform,
                      line="45")
    plt.title("Train")
    fig_2 = sm.qqplot(data=np.array(test_p_value_vet[1]),
                      dist=dist.uniform,
                      line="45")
    plt.title("Test")

    plot_prefix = f"results/plots/{data_directory_name}/bootstrap_refit_reduced_{result_dict_name}"
    fig_1.savefig(plot_prefix + "_train.png")
    fig_2.savefig(plot_prefix + "_test.png")
def tsplot(y, lags=None, figsize=(10, 8), style='bmh', max_lag=10):
    """Show time-series diagnostics and return the significant ACF/PACF lags.

    Plots the series, ACF, PACF, a Q-Q plot and a probability plot, and
    annotates them with the ADF p-value, the Q-statistic p-value, the
    Jarque-Bera p-value and the kurtosis.

    Returns
    -------
    (ndarray, ndarray)
        Lags in ``0..max_lag`` whose ACF (first array) or PACF (second
        array) exceeds ``2 / sqrt(n)`` in absolute value.
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (3, 2)
        ts_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(grid, (1, 0))
        pacf_ax = plt.subplot2grid(grid, (1, 1))
        qq_ax = plt.subplot2grid(grid, (2, 0))
        pp_ax = plt.subplot2grid(grid, (2, 1))

        candidate_lags = range(0, max_lag + 1)

        dful_pvalue = np.around(smt.stattools.adfuller(y)[1], 3)

        # With qstat=True the result holds (acf, q-statistics, p-values).
        ACF = smt.stattools.acf(y, nlags=max_lag, qstat=True)
        ARord = np.array([
            lag for lag in candidate_lags
            if abs(ACF[0][lag]) > 2 / np.sqrt(y.shape[0])
        ])
        PACF = smt.stattools.pacf(y, nlags=max_lag)
        MAord = np.array([
            lag for lag in candidate_lags
            if abs(PACF[lag]) > 2 / np.sqrt(y.shape[0])
        ])
        Qstat_pvalue = np.around(ACF[2][max_lag - 1], 3)

        jb_result = sm.stats.stattools.jarque_bera(y)
        jb_pvalue = np.around(jb_result[1], 3)
        kurtosis = np.around(jb_result[3], 3)

        y.plot(ax=ts_ax)
        ts_ax.set_title(
            'Time Series Analysis Plots\nDickey-Fuller Test: {}'.format(
                dful_pvalue))
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)
        sm.qqplot(y, line='s', ax=qq_ax)
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)

        qq_ax.set_title('QQ Plot\nJarque-Bera Test: {}\nKurtosis: {}'.format(
            jb_pvalue, kurtosis))
        acf_ax.set_title(
            "Autocorrelation\nQ({}): {}\nLast Singf Lag: {}".format(
                max_lag, Qstat_pvalue, max(ARord)))
        pacf_ax.set_title("Partial Autocorrelation\nLast Singf Lag: {}".format(
            max(MAord)))

        plt.tight_layout()
        plt.show()
    return ARord, MAord
def qq_plot(diffs_mean, recall, rt=False):
    """Save Q-Q plots of the first three columns of ``diffs_mean``.

    The panels are titled Rest, Video and Game. The output file is
    ``qq-plot<tag>`` where ``<tag>`` is ``recall`` or ``recog`` (depending
    on ``recall``), followed by ``_rt`` when ``rt`` is true. Marker colours
    come from the module-level ``colors``.
    """
    sns.set_style('white')
    tag = ('recall' if recall else 'recog') + ('_rt' if rt else '')

    fig2, axs = plt.subplots(1, 3, sharex=False)
    # Invisible full-size axes that only carries the shared axis labels.
    fig2.add_subplot(111, frameon=False)
    plt.tick_params(labelcolor='none', top=False, bottom=False, left=False,
                    right=False)
    plt.xlabel("Theoretical Quantiles")
    plt.ylabel("Sample Quantiles")

    axs = axs.ravel()
    for i, name in enumerate(['Rest', 'Video', 'Game']):
        panel = axs[i]
        sm.qqplot(np.array(diffs_mean.iloc[:, i]), line='s', ax=panel)
        # First line: sample markers; second line: reference line.
        markers = panel.get_lines()[0]
        markers.set_markersize(5)
        markers.set_markeredgewidth(0.3)
        markers.set_markerfacecolor(colors[i])
        markers.set_markeredgecolor('gray')
        panel.get_lines()[1].set_color('gray')
        panel.set_xlabel('')
        panel.set_ylabel('')
        panel.set_title(name)
        panel.set_xlim(-2, 2)

    plt.tight_layout()
    plt.savefig(F'qq-plot{tag}', dpi=300)
def qqplotbags(dataframe, bagType, method, regionList):
    """Save and show a Q-Q plot of ``bagType`` demand for every region.

    Rows are selected where ``region`` equals the region and ``type``
    equals ``method``. Plots are written below
    ``Data Analysis/BAGS/QQplotsbags/<method><bagType>/``.
    """
    directory = "Data Analysis/BAGS/QQplotsbags/"
    target_dir = directory + method + bagType + "/"

    for reg in regionList:
        selected = (dataframe['region'] == reg) & (dataframe['type'] == method)
        df = dataframe.loc[selected]
        demandSorted = pd.DataFrame(sorted(df[bagType].tolist()))

        sm.qqplot(demandSorted, line='s', alpha=0.3)
        caption = reg + " " + method + " " + bagType
        plt.title(caption)

        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        plt.savefig(target_dir + caption)
        plt.show()
def q_q_plot(filepath, parameter):
    """Show a Q-Q plot of one CSV column against a fitted t distribution.

    Parameters
    ----------
    filepath : str
        CSV file read with ``pandas.read_csv``.
    parameter : str
        Column to plot.

    Plotting errors are reported on stdout instead of being raised.
    """
    df = pandas.read_csv(filepath)
    array = df[parameter]
    try:
        sm.qqplot(array, scipy.stats.t, fit=True, line='45')
        plt.show()
    except Exception as exc:
        # A bare ``except:`` also swallowed KeyboardInterrupt/SystemExit
        # and hid the cause; keep the best-effort behaviour but say what
        # went wrong.
        print("There was an error.")
        print(repr(exc))
def fourinone(res, y_pred2, x):
    """Show a four-panel residual summary.

    Panels: Q-Q plot, residual vs fitted value, histogram of residuals and
    residual vs observation order.

    Parameters
    ----------
    res : array-like
        Residuals.
    y_pred2 : array-like
        Fitted values; ``.min()`` / ``.max()`` are used.
    x : array-like
        Observation order; ``.min()`` / ``.max()`` are used.
    """
    def dashed_zero_line(axis, values):
        # Horizontal reference at 0 spanning the range of ``values``.
        zeros = np.array([0, 0])
        span = np.array([values.min(), values.max()])
        axis.plot(span, zeros, 'k--')

    fig = plt.figure()
    fig.suptitle('Residual Summary', fontsize=16)
    fig.set_facecolor('tan')

    # QQ-plot
    qq_ax = fig.add_subplot(2, 2, 1)
    sm.qqplot(res, line='s', ax=qq_ax)
    plt.title('QQ plot')

    # Residual vs fitted value
    fitted_ax = fig.add_subplot(2, 2, 2)
    dashed_zero_line(fitted_ax, y_pred2)
    fitted_ax.plot(y_pred2, res, 'o', label="data")
    fitted_ax.set_ylabel('Residual')
    fitted_ax.set_xlabel('Fitted Value')
    fitted_ax.set_title('Residual vs Fitted Value')

    # Histogram of residuals
    hist_ax = fig.add_subplot(2, 2, 3)
    bins = 12
    plt.hist(res, bins, edgecolor="k", alpha=1)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_xlabel('Residual')
    hist_ax.set_title('Histogram')

    # Residual vs observation order
    order_ax = fig.add_subplot(2, 2, 4)
    dashed_zero_line(order_ax, x)
    order_ax.plot(x, res, '-o', label="data")
    order_ax.set_ylabel('Residual')
    order_ax.set_xlabel('Observation Order')
    order_ax.set_title('Residual vs Observation Order')

    fig.tight_layout()
    fig.show()
def target_dist(self, bins=10, dist=stats.norm):
    """Show a histogram and a fitted Q-Q plot of ``self.target``.

    Parameters
    ----------
    bins : int
        Number of histogram bins.
    dist : scipy.stats distribution
        Reference distribution of the Q-Q plot (e.g. ``stats.beta``).
    """
    # Distribution histogram of the target variable.
    self.target.hist(bins=bins)
    plt.show()

    # Q-Q plot against ``dist`` with fitted parameters.
    sm.qqplot(self.target, dist, fit=True, line='45')
    plt.show()
def plot_model(prediction, y, x):
    """Draw a 2x2 panel comparing predictions with the true values.

    Panels: histogram of the residuals ``y - prediction``, Q-Q plot of the
    residuals, 2-D histogram of residuals vs prediction, and 2-D histogram
    of true value vs prediction.

    Parameters
    ----------
    prediction : array-like
        Predicted values.
    y : pd.Series
        True values; its index is reused for the residual series.
    x : any
        Unused.
    """
    # Current seaborn no longer exposes pyplot as ``sns.plt``; import
    # pyplot directly.
    import matplotlib.pyplot as plt

    fig, axs = plt.subplots(2, 2, figsize=(16, 10))
    axs = axs.flatten()

    resid = pd.Series(y - prediction, index=y.index, name='Residuals')
    resid.hist(bins=40, ax=axs[0])
    axs[0].set_xlabel('Residuals')

    sm.qqplot(resid, line='q', ax=axs[1])
    axs[1].set_xlabel('Residuals')

    tbpd.hist2d(resid, prediction, ax=axs[2], vlabel='Residuals',
                hlabel='Predicted value', integer_aligned_bins=True)
    tbpd.hist2d(y, prediction, ax=axs[3], vlabel='True value',
                hlabel='Predicted value', integer_aligned_bins=True,
                sqrt=True)

    fig.tight_layout()
def hyp_test_pic2(symbol, from_t, to_t):
    """
    Draw a plot for checking normality (method two).

    The x axis shows theoretical quantiles and the y axis sample quantiles;
    points that do not lie on a straight line indicate a deviation from the
    normal distribution.

    :param symbol: str
    :param from_t: str
    :param to_t: str
    :return: picture
    """
    # NOTE(review): the query is built by string interpolation; if any of
    # the arguments can come from untrusted input this is open to SQL
    # injection. query_dt's parameter support is not visible here.
    sql = (f"select * from stock_candles_day where symbol='{symbol}' "
           f"and dt>='{from_t}' and dt<='{to_t}' order by symbol,series")
    candles = query_dt(sql)

    # Log returns from the 'c' column.
    log_returns = np.log(candles['c'].pct_change() + 1)

    _, axis = plt.subplots(1, 1, figsize=(10, 12))
    sm.qqplot(log_returns.dropna(), line='s', ax=axis)
    # A Chinese title fails to render here, so the title is in English.
    axis.set_title("hypothesis testing")
    return plt.show()
def do_qqplot(data, data_type, d):
    """Save a Q-Q plot (45-degree line) of ``data`` as SVG and close it.

    The file name is built from ``data_type``, ``d`` and the module-level
    name ``n``.
    """
    figure = sm.qqplot(data, line='45')
    # NOTE(review): ``n`` is not a parameter; it must exist at module level.
    target = f"/home/vmargot/Documents/Jussieu/new/{data_type}_{n}_d={d}_qqplot"
    figure.savefig(target, format="svg", dpi=300)
    plt.close(figure)
def test_qqplot():
    """Smoke test: qqplot runs on OLS residuals of the Longley data."""
    longley = sm.datasets.longley.load()
    longley.exog = sm.add_constant(longley.exog)
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid
    figure = sm.qqplot(residuals)
    plt.close(figure)
def test_qqplot():
    """Smoke test: qqplot with a regression line runs on OLS residuals."""
    longley = sm.datasets.longley.load()
    longley.exog = sm.add_constant(longley.exog, prepend=False)
    residuals = sm.OLS(longley.endog, longley.exog).fit().resid
    fig = sm.qqplot(residuals, line='r')
    plt.close('all')
def plot(file_name, negative_control_gRNAs=None, wald_only=False):
    """Plot p-value Q-Q plots and a beta histogram from a gene summary file.

    The file is tab separated with one header line. Column 0 is the
    identifier and column 2 the beta value. With ``wald_only`` column 4 is
    the Wald p-value; otherwise column 4 is the permutation p-value and
    column 6 the Wald p-value. Three PNG files are written, named after the
    part of ``file_name`` before ``.gene_summary.txt``.

    Parameters
    ----------
    file_name : str
        Path containing ``.gene_summary.txt``.
    negative_control_gRNAs : container, optional
        Identifiers to keep out of the plotted lists.
    wald_only : bool
        Whether the file has no permutation p-value column.
    """
    short_file_name = file_name[:file_name.index(".gene_summary.txt")]
    has_controls = negative_control_gRNAs is not None

    permute_p_value_list = []
    wald_p_value_list = []
    beta_value_list = []
    negative_control_permute_p_value_list = []
    negative_control_wald_p_value_list = []
    negative_control_beta_value_list = []

    # The handle was never closed before; the with-block guarantees it.
    with open(file_name, 'rb') as data:
        data.readline()  # skip header
        for line in data:
            elements = line.decode().strip().split("\t")
            if has_controls and elements[0] in negative_control_gRNAs:
                betas = negative_control_beta_value_list
                permutes = negative_control_permute_p_value_list
                walds = negative_control_wald_p_value_list
            else:
                betas = beta_value_list
                permutes = permute_p_value_list
                walds = wald_p_value_list
            betas.append(float(elements[2]))
            if wald_only:
                walds.append(float(elements[4]))
            else:
                permutes.append(float(elements[4]))
                walds.append(float(elements[6]))

    # Drop NaNs; betas are additionally limited to |beta| < 3.
    beta_value_list = [x for x in beta_value_list
                       if str(x) != 'nan' and abs(x) < 3]
    wald_p_value_list = [x for x in wald_p_value_list if str(x) != 'nan']
    if has_controls:
        # These filters used to read from the non-control lists and so
        # overwrote the control values.
        # NOTE(review): the control lists are collected but not plotted.
        negative_control_beta_value_list = [
            x for x in negative_control_beta_value_list
            if str(x) != 'nan' and abs(x) < 3]
        negative_control_wald_p_value_list = [
            x for x in negative_control_wald_p_value_list
            if str(x) != 'nan']

    if not wald_only:
        permute_p_value_list = [x for x in permute_p_value_list
                                if str(x) != 'nan']
        stats.probplot(permute_p_value_list, dist="uniform", plot=pylab)
        pylab.savefig("QQplot of permute_p value %s.png" % short_file_name)
        pylab.close()

    pylab.hist(beta_value_list, bins=1000)
    pylab.savefig("Hist of beta value %s.png" % short_file_name)
    pylab.close()

    sm.qqplot(np.array(wald_p_value_list), stats.uniform, fit=True, line='45')
    pylab.xlim(0, 1)
    pylab.ylim(0, 1)
    pylab.savefig("QQplot of wald_p value %s.png" % short_file_name)
    pylab.close()
# NOTE(review): a dangling triple-quote delimiter followed this function in
# the source; its counterpart is not visible, so it is recorded only here.
def qqPlot(self):
    """Draw a Q-Q plot of this sample's probe signals and save it as a PNG.

    The signals are column ``2 + self.number`` of ``self.array.probes``; the
    image is written to ``<self.name>_qqprob.png``.
    """
    import matplotlib.pyplot as plt
    import statsmodels.api as sm  # pandas, patsy

    # add log2
    signal = self.array.probes[:, 2 + self.number]

    plt.figure(self.number)
    sm.qqplot(signal)
    plt.title('Probe intensities for %s' % self.name)
    plt.xlabel('Theoretical quantiles')
    plt.ylabel('Sample quantiles')

    target = "%s_qqprob.png" % self.name
    plt.savefig(target)
def print_qqplot_and_residuals_plot(model):
    """Show a Q-Q plot and a standardized-residual scatter for a fitted model.

    ``model`` must expose ``resid`` and ``resid_pearson``.  The two plots use
    the first two cells of a 1x3 subplot grid.
    """
    # First cell: Q-Q plot of the raw residuals with a regression line.
    qq_axes = plt.subplot(1, 3, 1)
    sm.qqplot(model.resid, ax=qq_axes, line='r')

    # Second cell: Pearson residuals by observation, with a zero reference.
    plt.subplot(1, 3, 2)
    pearson = pandas.DataFrame(model.resid_pearson)
    plt.plot(pearson, 'o', ls='None')
    plt.axhline(color='r', y=0)
    plt.xlabel('Observation Number')
    plt.ylabel('Standarized Residual')

    plt.show()
def tsplot(y, lags=None, figsize=(10, 8), style='bmh'):
    """Draw a 3x2 panel of diagnostic plots for a time series.

    Row 1 holds the series itself, row 2 its ACF and PACF, and row 3 a Q-Q
    plot and a probability plot parameterised by the sample mean and
    standard deviation.

    Args:
        y: the series; anything that is not a ``pd.Series`` is wrapped in one.
        lags: forwarded to the ACF / PACF plots.
        figsize: size of the figure.
        style: matplotlib style context used while drawing.
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)
        grid = (3, 2)
        ts_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_ax, pacf_ax = (plt.subplot2grid(grid, (1, col)) for col in (0, 1))
        qq_ax, pp_ax = (plt.subplot2grid(grid, (2, col)) for col in (0, 1))

        series.plot(ax=ts_ax)
        ts_ax.set_title('Time Series Analysis Plots')

        smt.graphics.plot_acf(series, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(series, lags=lags, ax=pacf_ax, alpha=0.5)

        sm.qqplot(series, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')

        location_scale = (series.mean(), series.std())
        scs.probplot(series, sparams=location_scale, plot=pp_ax)

        plt.tight_layout()
def mult_regression(wine_set):
    """Fit and diagnose an OLS model of ``quality`` on centered predictors.

    Every column except ``quality`` is mean-centered.  The formula depends on
    the number of rows: fewer than 2000 selects the red-wine predictors,
    otherwise the white-wine ones.  The summary is printed, followed by a
    Q-Q plot of the residuals, a scatter of the Pearson residuals and an
    influence (leverage) plot, each shown in its own window.
    """
    # Center the quantitative predictors but keep the response untouched.
    quality = wine_set['quality']
    wine_set = wine_set - wine_set.mean()
    wine_set['quality'] = quality

    print ("OLS multivariate regression model")

    # The predictors were picked as the most significant ones after a first
    # run with all columns, separately for each wine set.
    if len(wine_set) < 2000:
        # for red
        formula = "quality ~ volatile_acidity + chlorides + pH + sulphates + alcohol"
    else:
        # for white
        formula = "quality ~ volatile_acidity + density + pH + sulphates + alcohol"
    fitted = smf.ols(formula=formula, data=wine_set).fit()
    print(fitted.summary())

    # Normality check of the residuals.
    sm.qqplot(fitted.resid, line='r')
    plt.show()

    # Standardized residuals by observation with a zero reference line.
    pearson = pd.DataFrame(fitted.resid_pearson)
    plt.plot(pearson, 'o', ls='None')
    plt.axhline(color='r', y=0)
    plt.ylabel('Standardized redisual')
    plt.xlabel('Observation number')
    plt.show()

    # Per-regressor diagnostic plots (plot_regress_exog for "alcohol" and
    # "sulphates") were disabled in the original and stay disabled.

    # Leverage plot.
    sm.graphics.influence_plot(fitted, size=8)
    plt.show()
# Quadratic model: life expectancy on the centered breast-cancer rate and its
# square.
reg2 = smf.ols('lifeexpectancy ~ breastcancerper100th_c + I(breastcancerper100th_c**2)', data=sub1).fit()
print (reg2.summary())

####################################################################################
# EVALUATING MODEL FIT
####################################################################################

# adding alcohol consumption
# NOTE(review): the formula below repeats breastcancerper100th_c as its third
# term instead of adding an alcohol-consumption variable -- confirm the
# intended column name.
reg3 = smf.ols('lifeexpectancy ~ breastcancerper100th_c + I(breastcancerper100th_c**2) + breastcancerper100th_c', data=sub1).fit()
print (reg3.summary())

#Q-Q plot for normality
fig4=sm.qqplot(reg3.resid, line='r')

# simple plot of residuals
# NOTE(review): no new figure is opened here, so this scatter presumably lands
# on the axes of the Q-Q plot above -- confirm that is intended.
stdres=pandas.DataFrame(reg3.resid_pearson)
plt.plot(stdres, 'o', ls='None')
l = plt.axhline(y=0, color='r')
plt.ylabel('Standardized Residual')
plt.xlabel('Observation Number')

# additional regression diagnostic plots
fig2 = plt.figure(figsize=(12,8))
fig2 = sm.graphics.plot_regress_exog(reg3, "breastcancerper100th_c", fig=fig2)

# leverage plot
fig3=sm.graphics.influence_plot(reg3, size=8)
def plot_box_resids(fit_model, y_pred, subset=None):
    '''More than you ever wanted to know about your residuals.

    The residuals of ``fit_model`` are centered and divided by their standard
    deviation, then shown as: a scatter against the predictions with a
    least-squares line, box plots per "turnout bucket" (``floor(10 * y)``),
    a Q-Q plot, a histogram with a normal density overlay, and a histogram of
    the buckets.  Four normality tests (Shapiro-Wilk, D'Agostino-Pearson,
    Kolmogorov-Smirnov, Anderson-Darling) and the per-bucket counts are
    printed.

    Args:
        fit_model: fitted model exposing ``resid``.
        y_pred: predictions; if their minimum is below -1 they are
            exponentiated before bucketing and plotting.
        subset: optional fraction of the residuals to sample at random
            (without replacement).
    '''
    raw_resid = fit_model.resid
    # Scale by the standard deviation, not the variance, so the result has
    # unit spread -- the normality tests below compare against N(0, 1).
    s_resid = (raw_resid - np.mean(raw_resid)) / np.std(raw_resid)
    if subset:
        # NOTE(review): the sample is random while the join below aligns by
        # position, so sampled residuals are presumably no longer paired with
        # their own predictions -- confirm before relying on subset plots.
        s_resid = np.random.choice(
            s_resid, replace=False,
            size=int(math.floor(len(s_resid) * subset)))

    df = pd.DataFrame(s_resid, columns=['resids'])
    temp_df = pd.DataFrame(y_pred, columns=['target'])
    df = df.join(temp_df)

    if min(y_pred) < -1:
        df['turnout_bucket'] = df['target']\
            .apply(lambda x: int(math.floor(10 * np.exp(x))))
        y = df['target'].apply(lambda x: np.exp(x))
    else:
        df['turnout_bucket'] = df['target']\
            .apply(lambda x: int(math.floor(10 * x)))
        y = df['target']
    posit = sorted(df['turnout_bucket'].unique())

    # Residuals against predictions, with the best-fit line (fitted once).
    plt.scatter(y, s_resid, alpha=.2)
    coefficients = np.polyfit(y, s_resid, 1)
    slope = coefficients[0]
    plt.plot(y, np.poly1d(coefficients)(y))
    plt.title('Studentized Residuals vs Prediction')
    plt.xlabel('Predicted Value')
    plt.ylabel('Studentized Residual')
    print('Slope of best fit line: %s' % slope)
    plt.show()

    # Box plot of the residuals per bucket.
    df[['resids', 'turnout_bucket']]\
        .boxplot(by='turnout_bucket', positions=posit, widths=.5)
    plt.title('Residuals versus Turnout')
    plt.xlabel('Turnout Bucket')
    plt.ylabel('Studentized Residuals')
    plt.suptitle('')
    plt.show()

    sm.qqplot(s_resid, line='s')
    plt.title('Q-Q Plot')
    plt.show()

    # Normality tests.
    w, p_val = shapiro(s_resid)
    print('Shapiro-Wilk P_val is %s, larger the better' % p_val)
    k, p_val = normaltest(s_resid)
    print('D’Agostino and Pearson’s P_val is %s, larger the better' % p_val)
    k, p_val = kstest(s_resid, 'norm')
    print('Kolmogorov–Smirnov P_val is %s, larger the better' % p_val)
    A, critical, sig = anderson(s_resid)
    print('Anderson-Darling A2 is %s, smaller the better' % A)
    print(critical)
    print(sig)

    # Histogram of the residuals with a normal density on top.
    # NOTE(review): `normed` and `mlab.normpdf` no longer exist in recent
    # Matplotlib releases -- confirm the Matplotlib version this runs on.
    n, bins, patches = plt.hist(s_resid, 75, normed=1)
    mu = np.mean(s_resid)
    sigma = np.std(s_resid)
    plt.plot(bins, mlab.normpdf(bins, mu, sigma))
    plt.title('Residuals versus a Normal Dist')
    plt.show()

    # How many observations fall into each bucket.
    df['turnout_bucket'].hist(bins=posit, align='left', color='b')
    plt.title('Histogram of Turnout Bucket')
    plt.ylabel('Count')
    plt.xlim(-.5, - .5 + len(posit))
    temp = df[['resids', 'turnout_bucket']].groupby('turnout_bucket').count()
    temp.columns = ['Count']
    plt.show()
    print(temp)
def plot(self):
    """Show a Q-Q plot of ``self.data`` with a standardized reference line."""
    qq_options = {'fit': True, 'line': 's'}
    sm.qqplot(self.data, **qq_options)
    plt.show()
def azureml_main(BikeShare):
    """Plot residual diagnostics for scored bike-demand data.

    ``BikeShare`` is sorted in place by ``dayCount`` and gains a ``Resids``
    column (``Scored Label Mean`` minus ``cnt``).  The function then draws a
    residual-vs-demand scatter, actual-vs-predicted time series for selected
    hours, box plots of the residuals by ``hr`` and ``xformWorkHr``, a Q-Q
    plot and a histogram of the residuals.  Figures are written to PNG files
    only when the local ``Azure`` switch is set to True.

    Args:
        BikeShare: data frame with the columns ``dayCount``, ``cnt``,
            ``Scored Label Mean``, ``hr`` and ``xformWorkHr``.

    Returns:
        The same data frame, sorted and with the ``Resids`` column added.
    """
    import matplotlib
    matplotlib.use('agg')  # Set backend
    matplotlib.rcParams.update({'font.size': 20})

    import matplotlib.pyplot as plt
    import statsmodels.api as sm

    # Flip to True to write the figures to PNG files.
    Azure = False

    def _new_axes(figsize):
        # Open a cleared figure of the given size and return it with its axes.
        figure = plt.figure(figsize=figsize)
        figure.clf()
        return figure, figure.gca()

    ## Sort the data frame based on the dayCount.  DataFrame.sort was removed
    ## from pandas; fall back to it only where sort_values is unavailable.
    if hasattr(BikeShare, 'sort_values'):
        BikeShare.sort_values('dayCount', axis=0, inplace=True)
    else:
        BikeShare.sort('dayCount', axis=0, inplace=True)

    ## Compute the residuals.
    BikeShare['Resids'] = BikeShare['Scored Label Mean'] - BikeShare['cnt']

    ## Plot the residuals vs the label, the count of rented bikes.
    fig, ax = _new_axes((8, 6))
    BikeShare.plot(kind='scatter', x='cnt', y='Resids',
                   alpha=0.05, color='red', ax=ax)
    plt.xlabel("Bike demand")
    plt.ylabel("Residual")
    plt.title("Residuals vs demand")
    plt.show()
    if Azure:
        fig.savefig('scatter1.png')

    ## Make time series plots of actual bike demand and
    ## predicted demand by times of the day.
    times = [7, 9, 12, 15, 18, 20, 22]
    for tm in times:
        fig, ax = _new_axes((8, 6))
        BikeShare[BikeShare.hr == tm].plot(kind='line',
                                           x='dayCount', y='cnt',
                                           ax=ax)
        BikeShare[BikeShare.hr == tm].plot(kind='line',
                                           x='dayCount', y='Scored Label Mean',
                                           color='red', ax=ax)
        plt.xlabel("Days from start of plot")
        plt.ylabel("Count of bikes rented")
        plt.title("Bikes rented by days for hour = " + str(tm))
        plt.show()
        if Azure:
            fig.savefig('tsplot' + str(tm) + '.png')

    ## Boxplots for the residuals by hour and transformed hour.
    ## NOTE(review): the labels are paired with the columns but never applied
    ## to the plots -- confirm whether they were meant to be titles.
    labels = ["Box plots of residuals by hour of the day \n\n",
              "Box plots of residuals by transformed hour of the day \n\n"]
    xAxes = ["hr", "xformWorkHr"]
    for lab, xaxs in zip(labels, xAxes):
        fig, ax = _new_axes((12, 6))
        BikeShare.boxplot(column=['Resids'], by=[xaxs], ax=ax)
        plt.xlabel('')
        plt.ylabel('Residuals')
        plt.show()
        if Azure:
            fig.savefig('boxplot' + xaxs + '.png')

    ## QQ Normal plot of residuals
    fig, ax = _new_axes((6, 6))
    sm.qqplot(BikeShare['Resids'], ax=ax)
    ax.set_title('QQ Normal plot of residuals')
    if Azure:
        fig.savefig('QQ.png')
    if Azure:
        fig.savefig('QQ1.png')

    ## Histograms of the residuals
    fig, ax = _new_axes((8, 6))
    # Series.as_matrix was removed from pandas; .values returns the same array.
    ax.hist(BikeShare['Resids'].values, bins=40)
    ax.set_xlabel("Residuals")
    ax.set_ylabel("Density")
    ax.set_title("Histogram of residuals")
    if Azure:
        fig.savefig('hist.png')

    return BikeShare
comb.boxplot(column=[0]) ## Q-Q Plot ##### In statistics, a Q–Q plot ("Q" stands for quantile) is a probability plot, which is a graphical method for comparing two probability distributions by plotting their quantiles against each other. If the two distributions being compared are similar, the points in the Q–Q plot will approximately lie on the line y = x. If the distributions are linearly related, the points in the Q–Q plot will approximately lie on a line, but not necessarily on the line y = x. # In[266]: import statsmodels.api as sm # In[269]: sm.qqplot(comb[1],line='45') # In[275]: os.getcwd() # In[287]: for i in np.arange(0,40,1): pieces1='histograms/histogram',format(i),'.jpg' hist=comb[i].hist() fig = hist.get_figure() fig.savefig(''.join(pieces1)) fig.clear()
import numpy as np
from scipy.interpolate import interp1d
import sys


def load():
    """Read ``Analysis.xls`` without a header row and return the data frame."""
    return pd.read_excel("Analysis.xls", header=None)


data = load()

# Summary statistics of columns 0..16, ignoring missing values.
for i in range(17):
    print(data[i].dropna().describe())

# Histogram, Q-Q plot and box plot for columns 3..10.
for i in range(3, 11):
    plt.hist(data[i].dropna())
    plt.show()
    # Drop missing values here as well, matching the histogram and box plot
    # of the same column; NaNs would otherwise enter the quantiles.
    sm.qqplot(data[i].dropna(), line='q')
    plt.show()
    plt.boxplot(data[i].dropna())
    plt.show()

# Box plots of columns 11..17, split by the level stored in column 2.
for i in range(11, 18):
    h = data[data[2].isin(['high'])][i]
    m = data[data[2].isin(['medium'])][i]
    l = data[data[2].isin(['low'])][i]
    d = [np.asarray(h), np.asarray(m), np.asarray(l)]
    plt.boxplot(d)
    plt.show()


# NOTE(review): only the first two statements of this function are visible
# here; they are kept unchanged.
def getmaxcorr(dt,index):
    max = -1.0
    pos = 0;
# NOTE(review): this fragment reads `groups`, `structure`, `PATHS` and `utils`
# from an enclosing scope that is not visible here; `groups` is presumably a
# grouped data frame keyed by (speed, event) -- confirm against the caller.
# sig_marks is initialised here but not filled in the visible part.
sig_marks = {}
for speed, event in groups.groups.keys():
    group = groups.get_group((speed, event))
    # Column names and their one-sample t-test results, collected in parallel.
    index = []
    t_vals = []
    p_vals = []
    for col in group.columns:
        if col.startswith('k_'):
            # plot the quantiles plot to see if the data is normally distributed
            fig = qqplot(group[col], line='45')
            # One directory per event / structure / speed (one decimal place).
            plot_dir = os.path.join(PATHS['figures_dir'], 'quantile-plots', event, structure, '{:1.1f}'.format(speed))
            plot_dir = utils.mkdir(plot_dir)
            fig.savefig(os.path.join(plot_dir, '{}.png'.format(col)))
            plt.close(fig)
            # compute the t statistic to see if the value is significantly
            # different than zero
            t_stat, p_val = ttest_1samp(group[col], 0.0)
            index.append(col)
            t_vals.append(t_stat)
            p_vals.append(p_val)
    #mark = np.zeros((num_schedules, num_sensors, num_actuators), dtype=bool)
def _plot_qq_grid(samples, vergleich, first_message, **qq_options):
    # Draw one figure with a 2x2 grid of Q-Q plots for samples[0..3].
    print(first_message)
    fig = plt.figure()
    for position in range(4):
        if position:
            print("nr%d" % (position + 1))
        ax = fig.add_subplot(2, 2, position + 1)
        sm.qqplot(np.array(samples[position]), vergleich,
                  line='r', ax=ax, **qq_options)
    print("qqplot erstellt")


def plot_qq(datei, qq_Plot, fit_qq_Plot, vergleich=scipy.stats.invgauss):
    """Draw 2x2 grids of Q-Q plots for the first four rows of a data file.

    ``datei`` is read as ';'-separated text without quoting; every field of
    every row is converted to ``float``.  Rows 0-3 are each compared with the
    distribution ``vergleich``.

    Args:
        datei: path of the ';'-separated file.
        qq_Plot: if true, draw a grid using the fixed distribution argument
            ``distargs=(0.005,)``; the parameter may have to be adjusted by
            hand for other comparison distributions.
        fit_qq_Plot: if true, draw a grid in which the parameters of
            ``vergleich`` are fitted automatically (``fit=True``).
        vergleich: comparison distribution passed to ``sm.qqplot``.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
        IndexError: if a grid is requested and the file has fewer than four
            rows.

    Progress messages are printed to stdout, and the figures are shown once
    at the end if at least one grid was drawn.
    """
    # Text mode: csv.reader needs str rows, which a binary handle does not
    # provide on Python 3.
    with open(datei, 'r') as csvfile:
        myreader = csv.reader(csvfile, delimiter=";", quoting=csv.QUOTE_NONE)
        liste = [[float(r) for r in row] for row in myreader]

    # Q-Q plot with hand-set parameters of the comparison distribution.
    if qq_Plot:
        _plot_qq_grid(liste, vergleich, "erstelle qq-Plot",
                      distargs=(0.005,))

    # Q-Q plot with an automatic fit of the comparison distribution.
    if fit_qq_Plot:
        _plot_qq_grid(liste, vergleich, "erstelle fit-qq-plot", fit=True)

    if qq_Plot or fit_qq_Plot:
        plt.show()
#Adding employement rate print ("Association Between Urban Rate, Life Expectancy, Income, CO2 Emissions, Alcohol, Employment and Breast Cancers Rate") reg6 = smf.ols('breastcancer ~ urbanrate_c + lifeexpect_c + co2emissions_c + income_c + alcconsumption_c + employrate_c', data=gapmind1).fit() print (reg6.summary()) #%% #%% #Keep only significant variables in the model print ("Association Between Income, Alcohol and Breast Cancers Rate") reg7 = smf.ols('breastcancer ~ income_c + alcconsumption_c', data=gapmind1).fit() print (reg7.summary()) #################################################################################### # EVALUATING MODEL FIT #################################################################################### #%% #Q-Q plot for normality fig1=sm.qqplot(reg7.resid, line='r') #%% # simple plot of residuals stdres=pandas.DataFrame(reg7.resid_pearson) fig2 = plt.plot(stdres, 'o', ls='None') l = plt.axhline(y=0, color='r') plt.ylabel('Standardized Residual') plt.xlabel('Observation Number') print (fig2) #%% """ # additional regression diagnostic plots # For alcohol consumption fig3 = plt.figure(figsize=(12,8)) fig3 = sm.graphics.plot_regress_exog(reg7, 'alcconsumption_c', fig=fig3) #%%
from scipy import stats
import matplotlib.pyplot as plt

# Fit an F distribution to `sample` and print the fitted parameters.
params = stats.f.fit(sample)
print(params)

# Probability plot of the sample against the fitted F distribution.
fig = plt.figure(8, figsize=(10, 10))
ax = fig.add_subplot(111)
res = stats.probplot(sample, dist=stats.f, sparams=params, plot=ax)
plt.show()

# In[13]:

import statsmodels.api as sm

# Q-Q plots of the same sample against three candidate distributions, each
# fitted to the data (fit=True) and drawn with a 45-degree reference line.
fig = sm.qqplot(sample, stats.genextreme, fit=True, line='45')
plt.show()
fig = sm.qqplot(sample, stats.lognorm, fit=True, line='45')
plt.show()
fig = sm.qqplot(sample, stats.f, fit=True, line='45')
plt.show()

# In[16]:

# Computes the Kolmogorov-Smirnov statistic on 2 samples.
# This is a two-sided test for the null hypothesis that 2 independent samples
# are drawn from the same continuous distribution.
# Function that runs the cross-validation iterations
def itcrossval(kf, X, Y):
    """Return the mean validation MSE of a linear regression over ``kf`` folds.

    A ``LinearRegression`` without intercept is fitted on the training part
    of every fold and scored with the mean squared error on the held-out
    part; the fold errors are averaged.

    Args:
        kf: number of folds.
        X: feature array, indexable by the fold index arrays.
        Y: target array, indexable by the fold index arrays.
    """
    k_fold = cross_validation.KFold(len(X), kf)
    mse_cv = 0
    for train, val in k_fold:
        linreg = lm.LinearRegression(fit_intercept=False)
        linreg.fit(X[train], Y[train])
        yhat_val = linreg.predict(X[val])
        mse_cv += mean_squared_error(Y[val], yhat_val)
    return mse_cv / kf


# Cross-validation for k=5
print("mse para training con k=5:  %s" % (itcrossval(5, Xm, ym),))

# Cross-validation for k=10
print("mse para training con k=10:  %s" % (itcrossval(10, Xm, ym),))

######## Question (j) ############################################################

# Prediction error over the whole training set
errorp = ytrain - yhat_train
print("Error de prediccion sobre training set: \n%s" % (errorp,))

# Quantile-quantile plot of the prediction errors.
# NOTE(review): the plotted quantity has the opposite sign of errorp --
# confirm which orientation is intended.
graf = sm.qqplot(yhat_train - ytrain, fit=True, line='45')
plt.show()
def test_qqplot(self, close_figures):
    """Smoke test: a Q-Q plot of ``self.res`` with a regression line draws."""
    sample = self.res
    sm.qqplot(sample, line='r')
def test_qqplot_pltkwargs(self, close_figures):
    """Smoke test: extra marker keyword arguments are accepted by qqplot."""
    marker_style = dict(
        marker='d',
        markerfacecolor='cornflowerblue',
        markeredgecolor='white',
        alpha=0.5,
    )
    sm.qqplot(self.res, line='r', **marker_style)