def biplot(df, x_name, y_name):
    """Scatter ``df[x_name]`` against ``df[y_name]`` and overlay a line.

    The line's intercept and slope are read from ``p.Intercept`` and
    ``p.DEP_DELAY``.  ``p`` is not defined inside this function and must be
    available in the enclosing scope (presumably fitted regression
    parameters -- confirm against the caller).  Axis labels are fixed
    strings.  The figure is shown with ``plt.show()``; nothing is returned.
    """
    _, axes = plt.subplots()
    axes.grid(False)

    plt.scatter(df[x_name], df[y_name], c='blue', edgecolors='none', alpha=0.5)
    # The line is drawn on whatever axes pyplot currently considers active.
    abline_plot(
        intercept=p.Intercept,
        slope=p.DEP_DELAY,
        ax=plt.gca(),
        color="brown",
    )

    plt.xlabel("Departure delay")
    plt.ylabel("Arrival delay")
    plt.show()
def test_abline_remove(self, close_figures):
    """Draw two ablines on one axes, then remove the first line artist.

    ``close_figures`` is accepted but not referenced in the body
    (presumably a pytest fixture -- confirm).  The figure is finally handed
    to ``close_or_save`` together with the module-level ``pdf``.
    """
    intercept, slope = self.mod.params

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.scatter(self.X[:, 1], self.y)

    # Same intercept twice; the second line uses double the slope.
    for line_slope in (slope, 2 * slope):
        abline_plot(intercept=intercept, slope=line_slope, ax=axes)

    first_line = axes.get_lines().pop(0)
    first_line.remove()
    close_or_save(pdf, figure)
def test_abline_remove(self):
    """Check that a line added by ``abline_plot`` can be removed again.

    Two lines are drawn over a scatter of ``self.X[:, 1]`` vs ``self.y``;
    the first entry of ``ax.get_lines()`` is then removed and the figure is
    passed to ``close_or_save`` with the module-level ``pdf``.
    """
    params = self.mod.params
    intercept, slope = params

    canvas = plt.figure()
    axes = canvas.add_subplot(111)
    axes.scatter(self.X[:, 1], self.y)

    abline_plot(intercept=intercept, slope=slope, ax=axes)
    doubled_slope = 2 * slope
    abline_plot(intercept=intercept, slope=doubled_slope, ax=axes)

    drawn_lines = axes.get_lines()
    oldest = drawn_lines.pop(0)
    oldest.remove()

    close_or_save(pdf, canvas)
def plot_scatter_and_line(self, result):
    """(d)(f) Scatter ``self.x``/``self.y`` and overlay two labelled lines.

    Parameters
    ----------
    result
        Fitted model results, forwarded to ``abline_plot`` as
        ``model_results`` to draw the estimated regression line.

    Two lines are drawn on the same axes:

    * red   -- the fixed line with intercept -1 and slope 0.5.  These
      coefficients are hard-coded, not estimated, so this is the reference
      ("population") line.
    * green -- the line taken from ``result``, i.e. the model fit.

    The legend labels were previously attached the other way round (the
    hard-coded line was labelled "model fit"); they now match what each
    line actually is.  The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(self.x, self.y, c='w')
    ax.set_ylabel("y")
    ax.set_xlabel("x")

    # Hard-coded coefficients: the reference line, not a fit.
    rp.abline_plot(intercept=-1, slope=0.5, ax=ax, c='r',
                   label="pop.regression")
    # Coefficients come from the fitted results object.
    rp.abline_plot(model_results=result, ax=ax, c='g', label="model fit")

    plt.legend(loc='lower right', shadow=True, fontsize='medium')
    plt.show()
def calibrate(data, plot=True, pdf="rtcalibration.pdf"):
    """Fit the linear model ``irt ~ rt`` and save a diagnostic plot.

    Parameters
    ----------
    data
        Table with ``rt`` and ``irt`` columns; the value at row 0, column 0
        is used as the plot title / returned identifier.
    plot
        Accepted but never read by this function.
    pdf
        Destination passed to ``savefig`` (written in PDF format).

    Returns
    -------
    list
        ``[filename, intercept, rt_coefficient]`` from the fitted model.
    """
    filename = data.iloc[0, 0]
    res = sm.ols(formula='irt ~ rt', data=data).fit()

    # Scatter of the raw data, the fitted line, then the identity line.
    ax = data.plot(x='rt', y='irt', kind='scatter')
    abline_plot(model_results=res, ax=ax)
    figure = abline_plot(intercept=0, slope=1, ax=ax)

    heading = "%s \n R2: %s \n R2adj: %s" % (filename, res.rsquared, res.rsquared_adj)
    figure.suptitle(heading, fontsize=10)
    figure.savefig(pdf, format='pdf')
    matplotlib.pyplot.close()

    fitted = res.params
    return [filename, fitted.Intercept, fitted.rt]
def simple_regession(self):
    """Regress ``mpg`` on ``horsepower`` and plot the fit (exercise 03-08).

    Answers recorded by the original author:

    (a) (i)   Yes, from F-stat
        (ii)  Explain it from RSE and R^2 stat
        (iii) negative
        (iv)  Code, no prediction interval
    (b) Code
    (c) Residual/fitted: non-linearity

    The design matrix, the OLS summary and the prediction for
    ``horsepower == 98`` are printed; the fitted line is then drawn with
    the data scattered on top, and ``lrplot.plot_R_graphs`` is called on
    the results.

    The original used Python 2 ``print`` statements, which do not parse on
    Python 3.  They are now ``print()`` calls; on Python 3 the printed text
    is the same as the Python 2 statements produced.
    """
    y = self.df['mpg']
    X = sm.add_constant(self.df[['horsepower']])
    print(X)

    res = sm.OLS(y, X).fit()
    print(res.summary())
    # exog row is [constant, horsepower].
    print("The prediction is: ", res.predict(exog=[[1, 98]]))
    # Label only: no interval is computed (see (a)(iv) above).
    print("The prediction interval is: ")

    # abline_plot creates the figure; the scatter is added to its first axes.
    fig = rp.abline_plot(model_results=res)
    ax = fig.axes[0]
    ax.scatter(X['horsepower'], y, c='w')
    plt.show()

    lrplot.plot_R_graphs(res)
def test_abline_ab_ax(self):
    """Draw an abline from explicit coefficients onto an existing axes.

    The figure object returned by ``abline_plot`` is closed afterwards.
    """
    intercept, slope = self.mod.params

    canvas = plt.figure()
    axes = canvas.add_subplot(111)
    axes.scatter(self.X[:, 1], self.y)

    returned_fig = abline_plot(intercept=intercept, slope=slope, ax=axes)
    plt.close(returned_fig)
def test_abline_ab_ax(self):
    """Plot a line given intercept/slope on a caller-supplied axes.

    The object returned by ``abline_plot`` is passed to ``close_or_save``
    along with the module-level ``pdf``.
    """
    params = self.mod.params
    intercept, slope = params

    scatter_fig = plt.figure()
    scatter_ax = scatter_fig.add_subplot(111)
    scatter_ax.scatter(self.X[:, 1], self.y)

    line_fig = abline_plot(intercept=intercept, slope=slope, ax=scatter_ax)
    close_or_save(pdf, line_fig)
def create_scatter(df, module):
    """Save a scatter of one module's marks against each row's other marks.

    For every row of ``df`` the mean of all columns except ``module`` is
    computed (row sum minus the module's value, divided by the non-null
    count minus one) and plotted on the y axis against the module's own
    value.  An OLS fit line (red), the identity line (blue) and blue
    reference lines at 9, 12, 15 and 18 on both axes are overlaid.  The
    plot is written to ``<module>Scatter.png``.

    Parameters
    ----------
    df
        Table with one column per module.
    module
        Name of the column to plot on the x axis.
    """
    others_mean = (df.sum(axis=1) - df[module]) / (df.count(axis=1) - 1)
    df2 = pd.DataFrame({"avg": others_mean, module: df[module]})

    plt.clf()
    # sns.set() re-applies the default plotting context, so it has to run
    # *before* set_context(); in the opposite order font_scale=0.5 was
    # immediately reset and never took effect.
    sns.set(style="white")
    sns.set_context("notebook", font_scale=0.5)

    ax = df2.plot.scatter(x=module, y="avg", c='Black')
    model = sm.OLS(df2["avg"], sm.add_constant(df2[module]), missing='drop')
    abline_plot(model_results=model.fit(), ax=ax, c="Red")
    abline_plot(intercept=0, slope=1, ax=ax, c="Blue")

    ax.set(xlabel=module, ylabel="Average Mark")
    ax.set_xlim(0, 22)
    ax.set_ylim(0, 22)
    for level in (9, 12, 15, 18):
        ax.axhline(y=level, c='Blue')
        ax.axvline(x=level, c='Blue')

    plt.savefig(module + "Scatter.png")
def plot_all_experiments(datasets: list, experiment_names: list,
                         test_train_split: float = None,
                         savefig: str = "...", ols_line: bool = False,
                         legend_kwargs: dict = None) -> None:
    """Plot every dataset as a scatter in its own vertically stacked panel.

    Parameters
    ----------
    datasets
        Tables whose index is used for x and whose column ``0`` is used
        for y.
    experiment_names
        Titles, one per dataset (indexed in parallel with ``datasets``).
    test_train_split
        Optional fraction; when given, a dashed grey vertical line is drawn
        at ``int(len(dataset) * test_train_split)``.
    savefig
        Path handed to ``plt.savefig``; a falsy value skips saving.
    ols_line
        When true, an OLS line of column ``0`` on the index is overlaid.
    legend_kwargs
        When truthy, forwarded to ``axis.legend`` on every panel.

    The grid used to be fixed at 8 rows, which raised ``IndexError`` for
    more than 8 datasets and left blank panels for fewer.  The row count
    and figure height now follow ``len(datasets)`` (3 inches per panel, so
    8 datasets still give the original 6 x 24 inch figure).
    """
    n_panels = max(len(datasets), 1)  # plt.subplots needs at least one row
    # squeeze=False keeps a 2-D array of axes even for a single panel.
    fig, ax = plt.subplots(n_panels, 1, squeeze=False)
    fig.set_size_inches(6, 3 * n_panels)

    for i, dataset in enumerate(datasets):
        axis = ax[i, 0]
        axis.scatter(x=dataset.index, y=dataset[0], s=1)
        axis.set_title(f"Experiment {experiment_names[i]}")

        if test_train_split is not None:
            # plot vertical line
            test_train_split_index = int(len(dataset) * test_train_split)
            axis.axvline(x=test_train_split_index, color="grey",
                         linestyle="--", linewidth=1)

        if ols_line:
            x = sm.add_constant(dataset.index)
            y = dataset[0]
            abline_plot(model_results=sm.OLS(y, x).fit(), ax=axis,
                        color="black", linewidth=1)

        if legend_kwargs:
            axis.legend(**legend_kwargs)

    plt.tight_layout()
    if savefig:
        plt.savefig(savefig)
    plt.show()
def pltDfrXY(cDfr, dIPlt, pF='Hugo.pdf', cOff=0, pltAxXY=None, cModel=None):
    """Plot columns 1..n of ``cDfr`` against column 0 and hand off for saving.

    Parameters
    ----------
    cDfr
        Table with at least two columns; column 0 supplies x, every further
        column is drawn as its own line.
    dIPlt
        Dict of plot settings; the keys read here are ``title``, ``xLbl``,
        ``yLbl``, ``tpMark``, ``szMark``, ``ewMark``, ``styLn``, ``wdthLn``,
        ``lClr``, ``xLimB``, ``xLimT``, ``yLimB`` and ``yLimT``.
    pF
        Forwarded to ``decorateSaveFigLegOut`` (presumably the output file
        path -- confirm against that helper).
    cOff
        Offset into the colour list ``lClr`` for the first plotted column.
    pltAxXY
        Forwarded unchanged to ``decorateSaveFigLegOut``.
    cModel
        Optional model results; when given, the figure is created by
        ``regplt.abline_plot`` so its line is already on the axes.
    """
    # NOTE(review): the source of this block had lost its line breaks; the
    # nesting below (decorate/close once, after the loop) is reconstructed
    # and should be confirmed against the original file.
    assert cDfr.shape[1] > 1
    sTtl, xLbl, yLbl = dIPlt['title'], dIPlt['xLbl'], dIPlt['yLbl']
    tpMark, szMark, ewMark = dIPlt['tpMark'], dIPlt['szMark'], dIPlt['ewMark']
    styLn, wdthLn, lClr = dIPlt['styLn'], dIPlt['wdthLn'], dIPlt['lClr']
    xLim = (dIPlt['xLimB'], dIPlt['xLimT'])
    yLim = (dIPlt['yLimB'], dIPlt['yLimT'])
    if cModel is not None:
        # Start from the figure that already holds the model's line.
        cFig = regplt.abline_plot(model_results=cModel)
        cAx = cFig.axes[0]
    else:
        cFig, cAx = plt.subplots()
    for k in range(1, cDfr.shape[1]):
        # Cycle through the colour list, shifted by cOff.
        cClr = lClr[(cOff + k - 1) % len(lClr)]
        cAx.plot(cDfr.iloc[:, 0], cDfr.iloc[:, k], marker=tpMark, ms=szMark,
                 mew=ewMark, mec=cClr, mfc=cClr, ls=styLn, lw=wdthLn,
                 color=cClr)
    # k still holds the index of the last plotted column here.
    decorateSaveFigLegOut(pF, cFig, cDfr, sTtl, xLbl, yLbl, xLim, yLim,
                          nmCX=cDfr.columns[0], nmCY=cDfr.columns[k],
                          pltAxXY=pltAxXY)
    plt.close()
# Build, for every column c of df, a companion column "<c>y" holding each
# row's mean over all *other* columns (row total minus c, divided by the
# non-null count minus one).
plotDf = pandas.DataFrame()
plotDf['total'] = df.sum(axis=1)
for c in df.columns:
    plotDf[c + "y"] = (plotDf['total'] - df[c]) / (df.count(axis=1) - 1)
    plotDf[c] = df[c]

# Treat zeros and infinities as missing.  value=None does not mean "NaN" to
# DataFrame.replace (depending on the pandas version it pads from the
# previous row or inserts object-dtype None), so NaN is given explicitly.
plotDf = plotDf.replace(to_replace=[0, np.inf, -np.inf], value=np.nan)

for c in df.columns:
    ax = plotDf.plot.scatter(x=c, y=c + "y", c='Black')
    try:
        # handle case where LSF can't occur
        model = sm.OLS(plotDf[c + "y"], sm.add_constant(plotDf[c]),
                       missing='drop')
        abline_plot(model_results=model.fit(), ax=ax, c='Red')
        abline_plot(intercept=0, slope=1, ax=ax, c='Blue')
    except Exception as err:
        # Still best-effort, but no longer swallows KeyboardInterrupt /
        # SystemExit, and the failure is reported instead of hidden.
        print("Skipping regression lines for %s: %s" % (c, err))
    ax.set(xlabel=c, ylabel="Average Mark (CGS)")
    ax.set_xlim(0, 22.5)
    ax.set_ylim(0, 22.5)
    for level in (9, 12, 15, 18):
        # draw lines for first etc boundaries
        ax.axhline(y=level, c='Blue')
        ax.axvline(x=level, c='Blue')
    plt.savefig(c + "_CGS.pdf")
    plt.clf()

seaborn.set_context("notebook", font_scale=0.5)
seaborn.violinplot(data=df, cut=0, fontsize=8)
def test_abline_ab(self):
    """Draw an abline from explicit intercept and slope without an axes.

    ``self.mod.params`` is unpacked into exactly two values; the figure
    returned by ``abline_plot`` goes to ``close_or_save`` with the
    module-level ``pdf``.
    """
    coef_intercept, coef_slope = self.mod.params
    figure = abline_plot(intercept=coef_intercept, slope=coef_slope)
    close_or_save(pdf, figure)
def test_abline_model_ax(self):
    """Draw the line of ``self.mod`` onto an axes that already has a scatter.

    The object returned by ``abline_plot`` is passed to ``close_or_save``
    with the module-level ``pdf``.
    """
    canvas = plt.figure()
    axes = canvas.add_subplot(111)
    axes.scatter(self.X[:, 1], self.y)

    result_fig = abline_plot(model_results=self.mod, ax=axes)
    close_or_save(pdf, result_fig)
def test_abline_model(self):
    """Let ``abline_plot`` create the figure, then scatter onto its axes.

    The data (``self.X[:, 1]`` vs ``self.y``) is added to the first axes of
    the returned figure, which is then given to ``close_or_save`` with the
    module-level ``pdf``.
    """
    figure = abline_plot(model_results=self.mod)
    first_axes = figure.axes[0]
    first_axes.scatter(self.X[:, 1], self.y)
    close_or_save(pdf, figure)
plot_acf(value, lags=50) pcor = pacf(value, nlags=50) plt.plot(pcor) #plt.show() ################################# tendance (sert pas) from statsmodels.api import OLS from statsmodels.graphics.regressionplots import abline_plot X = np.ones((len(r_rachat_sous), 2)) X[:, 1] = np.arange(0, len(r_rachat_sous)) reg = OLS(r_rachat_sous, X) results = reg.fit() results.params fig = abline_plot(model_results=results) ax = fig.axes[0] ax.plot(X[:, 1], r_rachat_sous, 'r') ax.margins(.1) #plt.show() ############## test de Ljung Box - Bruit blanc import statsmodels as sm from statsmodels import * #on enlève rachat et souscription qui ne sont pas stationaires for key, value in element.items(): try: res = sm.tsa.arima_model.ARMA(value, (1, 1)).fit(disp=-1)
def plot_dosage_by_rsID(gene_reference, dos, cov_mat, counts, title=None, ax=None,
        additional_covar=None, adjx=True, adjy=True):
    """Scatter covariate-adjusted counts against covariate-adjusted dosage.

    Arguments:
    ---------
    gene_reference - object providing the ``rsID`` and ``gene`` attributes
    dos - dosage data, indexed first by ``rsID`` and ``cov_mat``'s index; if
        that lookup raises ``IndexingError`` it is indexed by ``cov_mat``'s
        index alone
    cov_mat - covariate matrix; needs a ``const`` column when ``adjy`` is true
    counts - counts, indexed by ``gene_reference.gene`` and ``cov_mat``'s index
    title - optional prefix for the axes title
    ax - optional axes to draw on; a new figure is created when falsy
    additional_covar - not used by this function
    adjx - when true, regress the dosages on the covariates and plot the
        remainder instead of the raw dosages
    adjy - same for the counts; the fitted ``const`` is added back for plotting

    Returns:
    -------
    ``(ax, test)`` when an axes was supplied, otherwise ``(fig, test)``;
    ``test`` is the OLS fit of the counts on the covariates plus the dosage.
    """
    # NOTE(review): ``.ix`` was removed in pandas 1.0 and the ``axisbg``
    # axes keyword in matplotlib 2.2 -- this block needs older versions, or
    # a careful migration (``.ix`` tolerated missing labels; ``.loc`` does not).
    gr = gene_reference
    cov_mat_t = cov_mat.copy(deep=True)
    try:
        geno = dos.ix[gr.rsID, cov_mat_t.index]
    except pd.core.indexing.IndexingError:
        geno = dos.ix[cov_mat_t.index]
    cov_mat_t[gr.rsID] = geno
    # Keep only rows that have a dosage value.
    cov_mat_t = cov_mat_t.ix[cov_mat_t[gr.rsID].notnull() , :]
    geno = geno[geno.notnull()]
    c = counts.ix[gr.gene, cov_mat_t.index]
    cov_mat = cov_mat.ix[cov_mat_t.index,:]
    if adjx:
        results = sm.OLS(geno, cov_mat).fit()
        adj_dos_mat = geno -\
            np.dot(results.params, cov_mat.T)
    else:
        adj_dos_mat = geno
    if adjy:
        results = sm.OLS(c, cov_mat).fit()
        adj_counts = c - np.dot(results.params, cov_mat.T)
        const = results.params.const
    else:
        adj_counts = c
        const = 0
    # Need to grab original genotypes
    colors = []
    # Make this into a function
    color_dict = np.linspace(0, 1, 3)
    # Three colour values, chosen by dosage thresholds 0.5 and 1.5.
    for i in geno:
        if i <= 0.5:
            colors.append(color_dict[0])
        elif i > 0.5 and i <= 1.5:
            colors.append(color_dict[1])
        else:
            colors.append(color_dict[2])
    # ax_orig records whether the caller supplied the axes; it decides the
    # first element of the return value.
    if ax:
        ax_orig = True
    else:
        ax_orig = None
        fig, ax = plt.subplots(nrows=1, ncols=1, sharey=False, sharex=False,
                               subplot_kw=dict(axisbg='#FFFFFF'))
    ax.scatter(adj_dos_mat, adj_counts + const, s=50, c=colors)
    xticks = ax.get_xticks()
    yticks = ax.get_yticks()
    # Keep every second tick.
    ax.set_xticks(xticks[1::2])
    ax.set_yticks(yticks[1::2])
    fitted_line = sm.OLS(adj_counts, adj_dos_mat).fit()
    # Positional arguments are intercept, then slope.
    abline_plot(const, fitted_line.params[0], color='k', ax=ax)
    test = sm.OLS(c, cov_mat_t).fit()
    # annot_y is used both as a tick index (-1: last, 1: second) and as the
    # sign of the offset applied to the text position.
    if test.params[gr.rsID] > 0:
        annot_y = - 1
    else:
        annot_y = 1
    yrange = yticks[-1] - yticks[0]
    ax.text(xticks[0] + 0.025, yticks[annot_y] + annot_y / 2 * yrange / 5,
            '$R^{2}$=%s' % str(test.rsquared)[0:4], style='italic')
    ax.set_ylabel('$log_{2}$ CPM')
    ax.set_xlabel('Fitted Dosages')
    if title:
        ax.set_title('%s partial regression\non %s' % (title, gr.rsID))
    else:
        pass
    if ax_orig:
        return(ax, test)
    else:
        return(fig, test)
def test_abline_ab(self):
    """Build an abline from explicit coefficients and close the figure.

    No axes is supplied, so ``abline_plot`` provides the figure that is
    then closed.
    """
    params = self.mod.params
    intercept, slope = params
    plt.close(abline_plot(intercept=intercept, slope=slope))
def test_abline_model(self):
    """Plot the line of ``self.mod`` first, then add the data points.

    The scatter goes onto the first axes of the figure returned by
    ``abline_plot``; the figure is closed at the end.
    """
    model_fig = abline_plot(model_results=self.mod)
    x_values = self.X[:, 1]
    model_fig.axes[0].scatter(x_values, self.y)
    plt.close(model_fig)
col_y = 'perMnd' #'mio' #'fasteUdg' #'perMnd' #ax=df.plot(x='m2',y='perMnd', kind='scatter',s=1.0) #0.5) ax=df.plot(x=col_x,y=col_y, kind='scatter',s=1.0) #0.5) #155=16.000, 200=17.700 #45m2 = 1700, 38kr/m2. 26m2 for 1000 kr mere. if 1: #X2 = sm.add_constant( df['m2'] ) X2 = sm.add_constant( df[col_x] ) mdl = sm.OLS(endog = df[col_y], exog = X2 ) result=mdl.fit() print(result.summary()) pprint.pprint(result.params) abline_plot(model_results=result, ax=ax) #plt.xlim(100,350) #600) ======= filename = 'C:\\Users\\jg.STATUSDK\\Downloads\\husudg - Sheet1.tsv' df = pd.read_csv( filename, sep='\t', #nrows=2 #chunksize=2, #iterator=True, ) #df = df.head(167)
def test_abline_model(self):
    """Draw an abline from model results with no pre-existing axes.

    The figure returned by ``abline_plot`` receives a scatter of
    ``self.X[:, 1]`` against ``self.y`` on its first axes and is then
    closed.
    """
    figure = abline_plot(model_results=self.mod)
    target_axes = figure.axes[0]
    target_axes.scatter(self.X[:, 1], self.y)
    plt.close(figure)
# ii) # start y = 39 steigung aber negativ = 0.158 # iii) fit.conf_int() # 0 1 # const 38.525212 41.346510 # horsepower -0.170517 -0.145172 # iv) fit.rsquared # c) ax = df.plot(kind="scatter", x="horsepower", y="mpg") abline_plot(model_results=fit, ax=ax, color="orange", linewidth=3) # Aufgabe 11.2 # a) df = pd.read_csv( r"C:\Users\freya\OneDrive\HSLU\6. Semester 2020FS\STAT\SW11\Übungen\Boston.csv", index_col=0) df.head() df.columns # b) # i) # medv = bo + b1 * lstat # ii) y = df["medv"]
def plot_dosage_by_rsID(gene_reference, dos, cov_mat, counts, title=None, ax=None,
        additional_covar=None, adjx=True, adjy=True):
    """Scatter covariate-adjusted counts against covariate-adjusted dosage.

    Arguments:
    ---------
    gene_reference - object providing the ``rsID`` and ``gene`` attributes
    dos - dosage data, indexed first by ``rsID`` and ``cov_mat``'s index; if
        that lookup raises ``IndexingError`` it is indexed by ``cov_mat``'s
        index alone
    cov_mat - covariate matrix; needs a ``const`` column when ``adjy`` is true
    counts - counts, indexed by ``gene_reference.gene`` and ``cov_mat``'s index
    title - optional prefix for the axes title
    ax - optional axes to draw on; a new figure is created when omitted
    additional_covar - not used by this function
    adjx - when true, regress the dosages on the covariates and plot the
        remainder instead of the raw dosages
    adjy - same for the counts; the fitted ``const`` is added back for plotting

    Returns:
    -------
    ``(ax, test)`` when an axes was supplied, otherwise ``(fig, test)``;
    ``test`` is the OLS fit of the counts on the covariates plus the dosage.

    The r^2 annotation is placed near the top tick for a positive dosage
    coefficient and near the bottom tick otherwise, then clamped into the
    visible y-range.  The upper clamp used to compare against the *lower*
    axis limit, which pushed every in-range position to the top; it now
    compares against the upper limit.
    """
    # NOTE(review): ``.ix`` was removed in pandas 1.0 and the ``axisbg``
    # axes keyword in matplotlib 2.2 -- this block needs older versions, or
    # a careful migration (``.ix`` tolerated missing labels; ``.loc`` does not).
    gr = gene_reference
    cov_mat_t = cov_mat.copy(deep=True)
    try:
        geno = dos.ix[gr.rsID, cov_mat_t.index]
    except pd.core.indexing.IndexingError:
        geno = dos.ix[cov_mat_t.index]
    cov_mat_t[gr.rsID] = geno
    # Keep only rows that have a dosage value.
    cov_mat_t = cov_mat_t.ix[cov_mat_t[gr.rsID].notnull(), :]
    geno = geno[geno.notnull()]
    c = counts.ix[gr.gene, cov_mat_t.index]
    cov_mat = cov_mat.ix[cov_mat_t.index, :]

    if adjx:
        results = sm.OLS(geno, cov_mat).fit()
        adj_dos_mat = geno - np.dot(results.params, cov_mat.T)
    else:
        adj_dos_mat = geno
    if adjy:
        results = sm.OLS(c, cov_mat).fit()
        adj_counts = c - np.dot(results.params, cov_mat.T)
        const = results.params.const
    else:
        adj_counts = c
        const = 0

    # Need to grab original genotypes
    # Three colour values, chosen by dosage thresholds 0.5 and 1.5.
    color_dict = np.linspace(0, 1, 3)
    colors = []
    for i in geno:
        if i <= 0.5:
            colors.append(color_dict[0])
        elif i <= 1.5:
            colors.append(color_dict[1])
        else:
            colors.append(color_dict[2])

    # Remember whether the caller supplied the axes: it decides the first
    # element of the return value.
    ax_orig = bool(ax)
    if not ax_orig:
        fig, ax = plt.subplots(nrows=1, ncols=1, sharey=False, sharex=False,
                               subplot_kw=dict(axisbg='#FFFFFF'))
    ax.scatter(adj_dos_mat, adj_counts + const, s=50, c=colors)
    xticks = ax.get_xticks()
    yticks = ax.get_yticks()
    # Keep every second tick.
    ax.set_xticks(xticks[1::2])
    ax.set_yticks(yticks[1::2])
    fitted_line = sm.OLS(adj_counts, adj_dos_mat).fit()
    # Positional arguments are intercept, then slope.
    abline_plot(const, fitted_line.params[0], color='k', ax=ax)

    test = sm.OLS(c, cov_mat_t).fit()
    if test.params[gr.rsID] > 0:
        r2_text_pos = yticks[-1] - (yticks[-1] - yticks[-2]) / 5
    else:
        r2_text_pos = yticks[0] + (yticks[1] - yticks[0]) / 5
    # Clamp the annotation into the visible y-range.
    ymin_, ymax_ = ax.get_ylim()
    if r2_text_pos < ymin_:
        r2_text_pos = ymin_ + (ymax_ - ymin_) / 12
    elif r2_text_pos > ymax_:
        r2_text_pos = ymax_ - (ymax_ - ymin_) / 12
    ax.text(xticks[0] + 0.025, r2_text_pos,
            '$r^{2}$=%s' % str(test.rsquared)[0:4], style='italic')
    ax.set_ylabel('$log_{2}$ CPM')
    ax.set_xlabel('Fitted Dosages')
    if title:
        ax.set_title('%s partial regression\non %s' % (title, gr.rsID))

    if ax_orig:
        return (ax, test)
    return (fig, test)
# OLS X = sm.add_constant(x) model = sm.OLS(y, X) results = model.fit() # Main sc = ax.scatter(x=x, y=y, fc='#3182bdcc', ec='#3182bd', s=8, zorder=5) ax.set_ylabel('Fertility rate') ax.set_xlabel(r'Log$_2$(FPKM+1)') ax.grid() ax.text(x=9, y=0.5, s=r"$R^2={rsquared:.2f}$".format(rsquared=results.rsquared)) abline_plot(model_results=results, color='#d62728', ax=ax, zorder=6) # regression line # Top xw = np.ones(shape=len(x)) / len(x) axt.hist(x, bins=22, color='#ff9896', weights=xw, zorder=5) #axt.axes.get_xaxis().set_visible(False) axt.set_xticklabels([]) axt.yaxis.set_major_formatter(mtick.PercentFormatter(1, decimals=0)) axt.set_ylabel('%') axt.grid() # Bottom yw = np.ones(shape=len(y)) / len(y) axb.hist(y, bins=22, color='#aec7e8',
def test_abline_model_ax(self, close_figures):
    """Overlay the line of ``self.mod`` on an axes holding a scatter.

    ``close_figures`` is accepted but not referenced in the body
    (presumably a pytest fixture -- confirm).  The object returned by
    ``abline_plot`` is passed to ``close_or_save`` with the module-level
    ``pdf``.
    """
    scatter_fig = plt.figure()
    scatter_ax = scatter_fig.add_subplot(111)
    scatter_ax.scatter(self.X[:, 1], self.y)

    line_fig = abline_plot(model_results=self.mod, ax=scatter_ax)
    close_or_save(pdf, line_fig)
def test_abline_model_ax(self):
    """Draw the line of ``self.mod`` on a supplied axes, then close.

    The figure object returned by ``abline_plot`` is the one that gets
    closed.
    """
    canvas = plt.figure()
    axes = canvas.add_subplot(111)
    x_values = self.X[:, 1]
    axes.scatter(x_values, self.y)

    plt.close(abline_plot(model_results=self.mod, ax=axes))
def test_abline_ab(self, close_figures):
    """Draw an abline from explicit intercept and slope on a new figure.

    ``close_figures`` is accepted but not referenced in the body
    (presumably a pytest fixture -- confirm).  The returned figure goes to
    ``close_or_save`` with the module-level ``pdf``.
    """
    params = self.mod.params
    intercept, slope = params
    line_fig = abline_plot(intercept=intercept, slope=slope)
    close_or_save(pdf, line_fig)
def test_abline_model(self, close_figures):
    """Let ``abline_plot`` create the figure from ``self.mod``, add data.

    ``close_figures`` is accepted but not referenced in the body
    (presumably a pytest fixture -- confirm).  The scatter is drawn on the
    first axes of the returned figure, which is then handed to
    ``close_or_save`` with the module-level ``pdf``.
    """
    model_fig = abline_plot(model_results=self.mod)
    model_fig.axes[0].scatter(self.X[:, 1], self.y)
    close_or_save(pdf, model_fig)
return df if __name__ == "__main__": training_file = "green_tripdata_2017-01.csv" testing_file = "green_tripdata_2017-06.csv" df_train = preprocess(training_file) lm = sm.OLS(df_train['fare_amount'], df_train.drop('fare_amount', axis=1)).fit() y_train = lm.predict(df_train.drop('fare_amount', axis=1)) rtrain = pearsonr(y_train, df_train['fare_amount']) df_test = preprocess(testing_file) y_test = lm.predict(df_test.drop('fare_amount', axis=1)) rtest = pearsonr(y_test, df_test['fare_amount']) print(lm.summary()) print("rtrain: {:.4f} , rtest: {:.4f}".format(rtrain[0], rtest[0])) # scatter-plot data ax = df_train.plot(x='trip_distance', y='fare_amount', kind='scatter', s=1) ax.set_ylim(0, 250) ax.set_xlim(0, 80) # plot regression line abline_plot(model_results=lm, ax=ax, markersize=1) plt.show()
]].fillna(0) # Dependent variable. Y = high_thc["cannabinoid_d9_thca_percent"].fillna(0) # Fit a regression model. X = sm.add_constant(X) model = sm.OLS(Y, X) regression_results = model.fit() print(regression_results.summary()) # Plot the regression ax = high_thc.plot(x='cannabinoid_cbda_percent', y='cannabinoid_d9_thca_percent', kind='scatter') abline_plot(model_results=regression_results, ax=ax) #-----------------------------------------------------------------------------# # Trend an analyte (butane) over time. # https://stackoverflow.com/questions/36410075/select-rows-from-a-dataframe-based-on-multiple-values-in-a-column-in-pandas # https://stackoverflow.com/questions/17706109/summing-the-number-of-occurrences-per-day-pandas # https://apps.leg.wa.gov/wac/default.aspx?cite=314-55-102 # https://stackoverflow.com/questions/10998621/rotate-axis-text-in-python-matplotlib # https://stackoverflow.com/questions/7917107/add-footnote-under-the-x-axis-using-matplotlib #-----------------------------------------------------------------------------# concentrate_types = [ "hydrocarbon_concentrate", "concentrate_for_inhalation", "non-solvent_based_concentrate", "co2_concentrate", "food_grade_solvent_concentrate",
// Horsepower ////////// """) regY = data.get('mpg').get_values() model = sm.OLS(regY, sm.add_constant(data['horsepower'])) results = model.fit() p = results.params print(results.summary()) # scatter-plot data ax = data.plot(x='horsepower', y='mpg', kind='scatter') # plot regression line abline_plot(model_results=results, ax=ax) #plt.show() print(""" ////////// // All features ////////// """) df = data[values] model = sm.OLS(regY, sm.add_constant(df)) results = model.fit() print(results.summary()) #plot_data()