def make_plots(groups):
    sns.stripplot("ammo", "moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")
    plt.clf()

    sns.boxplot("ammo", "moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")
    plt.clf()

    sns.barplot("ammo", "mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")
    plt.clf()

    std = groups["standard"]
    std = std[std.notnull()]
    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
def univariance(self, var):
    print(self._df[var].describe())
    # StandardScaler expects a 2-D array, so add a trailing axis
    scaled = StandardScaler().fit_transform(self._df[var].values[:, np.newaxis])
    low_range = scaled[scaled[:, 0].argsort()][:10]
    high_range = scaled[scaled[:, 0].argsort()][-10:]
    print('outer range (low) of the dist')
    print(low_range)
    print('outer range (high) of the dist')
    print(high_range)
    # show the distribution against a fitted normal
    fig = plt.figure()
    plt.title('dist before log-adjust (skewness=%.3f kurtosis=%.3f)'
              % (self._df[var].skew(), self._df[var].kurt()))
    sns.distplot(self._df[var], fit=norm)
    fig = plt.figure()
    stats.probplot(self._df[var], plot=plt)
    # apply log to pull the distribution toward normal:
    # with positive skewness, a log transform compresses the right tail
    self._df[var + 'LOG'] = np.log(self._df[var])
    fig = plt.figure()
    plt.title('dist after log-adjust (skewness=%.3f kurtosis=%.3f)'
              % (self._df[var + "LOG"].skew(), self._df[var + "LOG"].kurt()))
    sns.distplot(self._df[var + 'LOG'], fit=norm)
    fig = plt.figure()
    stats.probplot(self._df[var + 'LOG'], plot=plt)
    plt.show()
    return
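# A minimal sketch of the same log trick, assuming only that the column is
# non-negative: np.log fails on zeros, so np.log1p (log(1 + x)) is the safer
# variant when the variable can be 0. Data and names here are illustrative.
import numpy as np
import pandas as pd
from scipy.stats import skew

s = pd.Series([0, 1, 2, 3, 50, 400])   # hypothetical right-skewed data
print(skew(s))                          # strongly positive skewness
print(skew(np.log1p(s)))                # much closer to 0 after log1p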
def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and
    the predicted values). Try different binwidths for your histogram.

    Based on this residual histogram, do you have any insight into how our
    model performed? Reading a bit on this webpage might be useful:
    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''
    plt.figure()
    # plt.xlabel('Residuals')
    # plt.ylabel('Frequency')
    # plt.title('Histogram of the residuals')
    # residual = turnstile_weather['ENTRIESn_hourly'] - predictions
    # plt.xlabel('Fitted Value')
    # plt.ylabel('Residual')
    # plt.title('Residuals versus fits')
    # plt.plot(predictions, residual, 'ro')
    # plt.plot(turnstile_weather['ENTRIESn_hourly'], predictions, 'b')
    # (turnstile_weather['ENTRIESn_hourly'] - predictions).hist(bins=100)
    # plt.plot(turnstile_weather['ENTRIESn_hourly'] - predictions, 'b')
    # plt.plot(predictions, 'b')

    # Probability plot of the residuals
    stats.probplot(turnstile_weather['ENTRIESn_hourly'] - predictions, plot=plt)
    plt.show()
    return plt
def statsAnalysis(self):
    ge_arr = numpy.array(list(self.geneexpdict.values()))
    # descriptive statistics for the log fold change values:
    # size of array, (min, max), mean, var, skewness, kurtosis
    descstats = stats.describe(ge_arr)
    print(descstats)
    raw_avg_logfc = numpy.mean(ge_arr)
    raw_stdev_logfc = numpy.std(ge_arr)
    print("raw mean and sd: ", raw_avg_logfc, raw_stdev_logfc)
    stats.probplot(ge_arr, plot=matplotlib.pyplot)
    matplotlib.pyplot.savefig('qqplot_raw.png')
    matplotlib.pyplot.close()
    # If the distribution is not centered, the n and nn labels could be
    # assigned to genes with > 0 log(fc). To avoid this, convert gene
    # expression values to z-scores and recalculate mean and sd.
    for k in self.geneexpdict.keys():
        v = self.geneexpdict[k]
        zscore = (v - raw_avg_logfc) / float(raw_stdev_logfc)
        self.geneexpdict[k] = zscore
    # recompute distribution parameters
    ge_arr = numpy.array(list(self.geneexpdict.values()))
    descstats = stats.describe(ge_arr)
    print(descstats)
    self.avg_logfoldchange = numpy.mean(ge_arr)
    self.stdev_logfoldchange = numpy.std(ge_arr)
    print("centralized mean and sd: ", self.avg_logfoldchange, self.stdev_logfoldchange)
    stats.probplot(ge_arr, plot=matplotlib.pyplot)
    matplotlib.pyplot.savefig('qqplot_centralized.png')
    matplotlib.pyplot.close()
def testGeweke(totalSamples=400, plot=False, observeToPredict=False, poisson=False):
    ripl = mk_p_ripl()
    assumes = [('mu', '(normal 0 1)')]
    observes = [('(normal mu .1)', '.5')]
    if poisson:
        assumes = [('mu', '(poisson 30)')]
        observes = [('(normal mu 1)', '6')]
    nameToSeries = geweke(ripl, assumes, observes, totalSamples, stepSize=5)
    mean = np.mean(nameToSeries['mu'])
    std = abs(np.std(nameToSeries['mu']))
    print('testGeweke: observeToPredict=%s' % observeToPredict)
    print('mu=(normal 0 1), infer mu: (mean,std)=', mean, std)
    # assert .8 > abs(mean) and .8 > abs(std - 1)
    if plot:
        dict2 = {'mu': np.random.normal(0, 1, totalSamples)}
        label2 = 'np.random.normal(0,1)'
        if poisson:
            dict2 = {'mu': np.random.poisson(30, totalSamples)}
            label2 = 'np.random.poisson(30)'
        qqPlotAll((nameToSeries, dict2), ('Geweke', label2))
        plt.figure()
        probplot(nameToSeries['mu'], dist='norm', plot=plt)
        plt.show()
    return nameToSeries
def plot_qq_cca(tvalues):
    stats.probplot(tvalues, dist="norm", plot=Plot)
    Plot.title('Q-Q plot for CCA')
    # probplot puts the theoretical quantiles on x and the ordered data on y
    Plot.xlabel('Theoretical Quantiles')
    Plot.ylabel('Actual Quantiles')
    Plot.savefig('qq-cca.png')
    Plot.show()
def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and
    the predicted values).

    Based on this residual histogram, do you have any insight into how our
    model performed? Reading a bit on this webpage might be useful:
    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''
    turnstile_weather['Residual'] = (turnstile_weather['ENTRIESn_hourly'] - predictions)
    '''
    plot = (ggplot(turnstile_weather, aes(x='Residual')) +
            geom_histogram(color='green', binwidth=1) +
            ggtitle('Hourly entries residuals') +
            xlab('Difference between original hourly entries and predicted values') +
            ylab('Frequency'))
    return plot
    '''
    stats.probplot(turnstile_weather['Residual'], dist="norm", plot=plt)
    plt.title('Normal probability plot of hourly entries residuals')
    plt.xlabel('Theoretical normal quantiles')
    plt.ylabel('Residual quantiles')
    return plt
def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and
    the predicted values). Try different binwidths for your histogram.

    Based on this residual histogram, do you have any insight into how our
    model performed? Reading a bit on this webpage might be useful:
    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''
    plt.figure()
    # use the arguments passed in; the original mixed up the names
    # turnstile_weather/weather_turnstile and predictions/predictedVal
    residuals = turnstile_weather['ENTRIESn_hourly'] - predictions
    # Histogram of the residuals
    residuals.hist(bins=100)
    # Line chart of the residuals
    plt.plot(residuals)
    plt.ylabel('Residuals')
    plt.show()
    # Q-Q plot of the residuals to test for normality
    stats.probplot(residuals, dist="norm", plot=pylab)
    pylab.show()
    return plt
def check_weight_samples(test_model, samples):
    mu_w = test_model.network.Mu
    Sigma_w = test_model.network.Sigma
    rho = test_model.network.P

    A_samples = np.array([s.weight_model.A for s in samples])
    W_samples = np.array([s.weight_model.W for s in samples])

    # Check that A's mean is about p
    A_mean = A_samples.mean(0)
    print("P: ", rho)
    print("Mean A: \n", A_mean)

    # Get the samples where A is nonzero
    # assert test_model.N == 1
    n_pre = n_post = b = 1
    w_samples = np.array(W_samples[:, n_pre, n_post, b])[A_samples[:, n_pre, n_post] > 0, ...]
    w_mean = w_samples.mean(0)
    w_std = w_samples.std(0)
    print("Mean w: \n", w_mean, " +- ", w_std)

    # Make Q-Q plots against the prior over the weights
    fig = plt.figure()
    w_ax = fig.add_subplot(121)
    w_dist = norm(mu_w[n_pre, n_post, b], np.sqrt(Sigma_w[n_pre, n_post, b, b]))
    probplot(w_samples, dist=w_dist, plot=w_ax)

    fig.add_subplot(122)
    _, bins, _ = plt.hist(w_samples, 20, density=True, alpha=0.2)
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    plt.plot(bincenters, w_dist.pdf(bincenters), 'r--', linewidth=1)
    plt.show()
def check_bias_samples(test_model, samples):
    """
    Check that the bias samples match the prior

    :param test_model:
    :param samples:
    :return:
    """
    mu_bias = test_model.bias_model.mu_0
    sigma_bias = test_model.bias_model.sigma_0

    # Convert samples to arrays
    bias_samples = np.array([s.bias_model.b for s in samples])
    bias_mean = bias_samples.mean(0)
    bias_std = bias_samples.std(0)
    bias_dist = norm(mu_bias, np.sqrt(sigma_bias))
    print("Mean bias: ", bias_mean, " +- ", bias_std)

    # Make Q-Q plots against the prior over the bias
    fig = plt.figure()
    bias_ax = fig.add_subplot(121)
    probplot(bias_samples[:, 0], dist=bias_dist, plot=bias_ax)

    fig.add_subplot(122)
    _, bins, _ = plt.hist(bias_samples[:, 0], 20, density=True, alpha=0.2)
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    plt.plot(bincenters, bias_dist.pdf(bincenters), 'r--', linewidth=1)
    plt.show()
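# probplot accepts any object with a ppf method as `dist`, which is why the
# two checks above can Q-Q against a frozen scipy distribution (the prior)
# instead of the standard normal. A minimal self-contained sketch:
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot

prior = norm(2.0, 0.5)                      # frozen N(2, 0.5^2)
draws = prior.rvs(size=500, random_state=0)
probplot(draws, dist=prior, plot=plt)       # points should hug the fitted line
plt.show()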
def draw_qq_plot(dataset, xlabel="", ylabel="", title=""):
    plt.figure()
    stats.probplot(dataset, dist="norm", plot=plt)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
def test_sample_nig():
    mu_0 = 0.0
    lmbda_0 = 10.0
    alpha_0 = 10.0
    beta_0 = 10.0

    # Directly sample the NIG and look at the marginals
    from pyhawkes.utils.utils import sample_nig
    mu_samples = np.array([sample_nig(mu_0, lmbda_0, alpha_0, beta_0)[0]
                           for _ in range(10000)])

    # Plot the histogram of impulse means against the Student-t marginal
    plt.figure()
    p_mu = t(df=2 * alpha_0, loc=mu_0, scale=np.sqrt(beta_0 / (alpha_0 * lmbda_0)))
    _, bins, _ = plt.hist(mu_samples, bins=50, alpha=0.5, density=True)
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    plt.plot(bincenters, p_mu.pdf(bincenters), "r--", linewidth=1)
    plt.xlabel("mu")
    plt.ylabel("p(mu)")

    plt.figure()
    probplot(mu_samples, dist=p_mu, plot=plt.gca())
    plt.show()
def reconstruction_plots(test_nonoise, test_dataset, test_tilde, test_pca_tilde):
    import pylab as plt
    vmin = np.amin(test_nonoise)
    vmax = np.amax(test_nonoise)
    npix = int(np.sqrt(np.shape(test_nonoise)[1]))
    for ii in range(10):
        residues = test_tilde[ii] - test_dataset[ii]
        f = plt.figure(figsize=(10, 10))
        plt.subplot(3, 4, 1)
        plt.imshow(test_nonoise[ii].reshape(npix, npix), interpolation='None', vmin=vmin, vmax=vmax)
        plt.title('No noise img')
        plt.subplot(3, 4, 2)
        plt.imshow((test_dataset[ii]).reshape(npix, npix), interpolation='None', vmin=vmin, vmax=vmax)
        plt.title(r'$N(x)$')
        stats.probplot((test_tilde[ii] - test_pca_tilde[ii]), plot=plt.subplot(3, 4, 3), dist='norm', fit=True)
        # stats.probplot(test_tilde[ii] - test_nonoise[ii], plot=plt.subplot(3, 4, 3), dist='norm', fit=True)
        plt.title("Normal Q-Q PCA")
        stats.probplot(test_tilde[ii] - test_nonoise[ii], plot=plt.subplot(3, 4, 4), dist='norm', fit=True)
        plt.title("Normal Q-Q AE")
        plt.subplot(3, 4, 5)
        plt.imshow(test_tilde[ii].reshape(npix, npix), interpolation='None', vmin=vmin, vmax=vmax)
        plt.title('Reconstructed')
        plt.subplot(3, 4, 6)
        plt.imshow(residues.reshape(npix, npix), interpolation='None')
        plt.title(r'$\widetilde{N(x)} - N(x)$')
        plt.subplot(3, 4, 7)
        plt.imshow((test_tilde[ii] - test_nonoise[ii]).reshape(npix, npix), interpolation='None')  # , vmin=vmin, vmax=vmax
        plt.title(r'$\widetilde{N(x)} - x$')
        plt.subplot(3, 4, 8)
        plt.imshow((test_tilde[ii] - test_pca_tilde[ii]).reshape(npix, npix), interpolation='None')
        plt.title(r'$\widetilde{N(x)} - PCA({N(x)})$')
        plt.subplot(3, 4, 9)
        plt.imshow(test_pca_tilde[ii].reshape(npix, npix), interpolation='None', vmin=vmin, vmax=vmax)
        plt.title('PCA Reconstructed')
        plt.subplot(3, 4, 10)
        plt.title(r'$\Delta$')
        plt.imshow((test_pca_tilde[ii] - test_dataset[ii]).reshape(npix, npix), interpolation='None')
        plt.subplot(3, 4, 11)
        plt.imshow((test_pca_tilde[ii] - test_nonoise[ii]).reshape(npix, npix), interpolation='None')  # , vmin=vmin, vmax=vmax
        plt.title(r'$PCA({N(x)}) - x$')
        plt.subplot(3, 4, 12)
        plt.imshow((test_tilde[ii] / test_pca_tilde[ii]).reshape(npix, npix), interpolation='None')
        plt.title(r'$\widetilde{N(x)} / PCA({N(x)})$')
        plt.show()
def get_residuals_ab(self): # For histogram import pylab import scipy.stats as stats measurements = np.random.normal(loc = 20, scale = 5, size=100) stats.probplot(measurements, dist="norm", plot=pylab) pylab.show()
def main():
    # parse in the data file, list of floats
    input_file_name = "data.txt"
    data = parse(input_file_name)
    # print(len(data))

    # plot a histogram of the data
    n, bins, patches = plt.hist(data, bins=40)
    plt.title('Plot of Mixture of Two Different Normal Distributions')
    plt.xlabel('x-values')
    plt.ylabel('y-values')
    plt.show()

    # plot a qq plot of the data
    probplot(data, plot=plt, dist="norm")
    plt.title('QQ Plot of the Data Against the Standard Normal Distribution')
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Ordered Data Quantiles')
    plt.show()

    # calculate the two-sided p-values for the data
    abs_values = sorted(abs(n) for n in data)
    p = [2.0 * norm.cdf(n * -1) for n in abs_values]
    # print(p)

    # plot a histogram of the p-values
    n, bins, patches = plt.hist(p, bins=40)
    plt.title('Histogram of the P-Values')
    plt.xlabel('P-Value')
    plt.ylabel('Frequency')
    plt.show()

    # calculate the false discovery rate & the p-value at the FDR cutoff
    p.sort()
    fdr_cutoff = 0.05
    r, p_value = FDR(p, fdr_cutoff)
    print("The p_value at the FDR cutoff of", fdr_cutoff, "is", p_value, ".")
    print("There are", r, "p-values less than the pvalue calculated at the FDR cutoff.")

    # plot the raw data values with FDR cutoff
    x = range(len(p))
    plt.scatter(x[:r], p[:r], c='r', edgecolor='r', label='Below FDR cutoff')
    plt.scatter(x[r:], p[r:], c='b', edgecolor='b', label='Above FDR cutoff')
    # plt.scatter(list(range(len(p_list1), len(p_list1) + len(p_list2))), p_list1)
    plt.plot([0, len(p)], [0.05, 0.05], 'k-', lw=2, label="pvalue below 0.05")
    plt.legend(loc='upper left')
    plt.xlim(0, len(p))
    plt.ylim(-.05, 1.05)
    plt.xlabel('R - Value')
    plt.ylabel('P - Value')
    plt.title('False Positives of the P-Values')
    plt.show()
    return 0
def pplot(path, x):
    x = sorted([float(item) for item in x])
    y = [100.0 * ((j - 0.5) / float(len(x))) for j in range(1, len(x) + 1)]  # plotting positions (unused)
    plt.clf()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        stats.probplot(x, dist='norm', plot=plt)
    plt.savefig(path, format='png')
    plt.clf()
def normal_probability_plot(path, tables, conf):
    fname = path + '.png'
    treenums = tables['tree_number']
    x = sorted([float(dec(num).log10()) for ast, num in treenums])
    y = [100.0 * ((j - 0.5) / float(len(x))) for j in range(1, len(x) + 1)]  # plotting positions (unused)
    plt.clf()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        stats.probplot(x, dist='norm', plot=plt)
    plt.savefig(fname, format='png')
def QQ_plot(self):
    """ returns the QQ-plot with normal distribution """
    plt.figure(figsize=(12, 8), facecolor="w", edgecolor="k",
               linewidth=2.0, frameon=True)
    stats.probplot(self.scvr, dist="norm", plot=plt)
    # theoretical quantiles go on x; the SCVR values are plotted on y
    plt.xlabel("Standard quantile")
    plt.ylabel("SCVR")
    plt.show()
def plot(file_name, negative_control_gRNAs=None, wald_only=False):
    data = open(file_name, 'rb')
    short_file_name = file_name[:file_name.index(".gene_summary.txt")]
    data.readline()
    permute_p_value_list = []
    wald_p_value_list = []
    beta_value_list = []
    if negative_control_gRNAs != None:
        negative_control_permute_p_value_list = []
        negative_control_wald_p_value_list = []
        negative_control_beta_value_list = []
    for line in data:
        elements = line.decode().strip().split("\t")
        if negative_control_gRNAs != None and elements[0] in negative_control_gRNAs:
            negative_control_beta_value_list.append(float(elements[2]))
            if wald_only == True:
                negative_control_wald_p_value_list.append(float(elements[4]))
            else:
                negative_control_permute_p_value_list.append(float(elements[4]))
                negative_control_wald_p_value_list.append(float(elements[6]))
        else:
            beta_value_list.append(float(elements[2]))
            if wald_only == True:
                wald_p_value_list.append(float(elements[4]))
            else:
                permute_p_value_list.append(float(elements[4]))
                wald_p_value_list.append(float(elements[6]))
    beta_value_list = [x for x in beta_value_list if str(x) != 'nan' and abs(x) < 3]
    wald_p_value_list = [x for x in wald_p_value_list if str(x) != 'nan']
    if negative_control_gRNAs != None:
        # filter the negative-control lists (the original filtered the
        # gene-level lists here by mistake)
        negative_control_beta_value_list = [x for x in negative_control_beta_value_list
                                            if str(x) != 'nan' and abs(x) < 3]
        negative_control_wald_p_value_list = [x for x in negative_control_wald_p_value_list
                                              if str(x) != 'nan']
    if wald_only != True:
        permute_p_value_list = [x for x in permute_p_value_list if str(x) != 'nan']
        stats.probplot(permute_p_value_list, dist="uniform", plot=pylab)
        pylab.savefig("QQplot of permute_p value %s.png" % short_file_name)
        pylab.close()
    pylab.hist(beta_value_list, bins=1000)
    pylab.savefig("Hist of beta value %s.png" % short_file_name)
    pylab.close()
    # stats.probplot(wald_p_value_list, dist="uniform", plot=pylab)
    fig = sm.qqplot(np.array(wald_p_value_list), stats.uniform, fit=True, line='45')
    pylab.xlim(0, 1)
    pylab.ylim(0, 1)
    # fig.set_xlim(0,1)
    pylab.savefig("QQplot of wald_p value %s.png" % short_file_name)
    pylab.close()
def is_normal(self):
    '''A series of normality tests including qqplot, histogram and Shapiro-Wilk'''
    stats.probplot(self.residuals, dist='norm', plot=plt)
    plt.show()
    plt.hist(self.residuals)
    plt.show()
    print("The Shapiro-Wilk p-value is {}\n".format(stats.shapiro(self.residuals)[1]))
def _residPlot(self, results):
    res = results.resid
    fig, axes = plt.subplots(nrows=2, ncols=2)
    # pass the Axes objects directly; plt.subplot() does not accept an Axes
    stats.probplot(res, plot=axes[0, 0])                  # QQ plot
    sns.distplot(res, ax=axes[1, 0])                      # Histogram
    sns.regplot(results.predict(), res, lowess=True,
                ax=axes[0, 1], line_kws={"color": "black"})
    res.plot(ax=axes[1, 1])                               # Time series (residual v order)
def drawProbPlot(self, model, ax):
    x = pd.Series(self.dist)
    stats.probplot(x, dist=model, plot=ax)
    # relabel the y-axis with the data's own quantile levels
    q = np.arange(0, 1.0001, 0.05)
    vals = x.quantile(q)
    ax.set_yticks(vals, minor=True)
    ax.set_yticklabels(q, minor=True)
    ax.set_yticks([])
    ax.yaxis.grid(True, 'both')
    ax.set_ylim(vals.min(), vals.max())
    ax.set_xlabel("X")
    ax.set_ylabel("Quantile")
def residual_analysis(self):
    dropnan_DF = self.dropnan_DF
    model = sm.ols(formula='ob_value ~ sm_value', data=dropnan_DF)
    fitted = model.fit()
    fittedvalues = np.array(fitted.fittedvalues)
    residual = fittedvalues - np.array(dropnan_DF.ob_value)
    norm_residual = fitted.resid_pearson

    figure = plt.figure(facecolor='white')
    subplot1 = figure.add_subplot(2, 2, 1)
    subplot1.scatter(fittedvalues, residual)
    subplot1.set_xlabel("Fitted values")
    subplot1.set_ylabel("Residuals")
    subplot1.set_title("Residuals vs Fitted")

    subplot2 = figure.add_subplot(2, 2, 2)
    probplot(norm_residual, plot=subplot2)
    subplot2.set_title("Normal Q-Q")
    subplot2.set_ylabel("Standardized residuals")
    subplot2.set_xlabel("Theoretical Quantiles")

    subplot3 = figure.add_subplot(2, 2, 3)
    subplot3.scatter(fittedvalues, np.sqrt(np.abs(residual)))
    subplot3.set_title("Scale-Location")
    subplot3.set_ylabel(r'$\sqrt{\mathrm{|Standardized\/residuals|}}$')
    subplot3.set_xlabel("Fitted values")

    subplot4 = figure.add_subplot(2, 2, 4)
    norm_residual = (np.matrix(norm_residual)).T
    H = norm_residual * (norm_residual.T * norm_residual).I * norm_residual.T
    h = H.diagonal()
    subplot4.scatter(np.array(h), np.array(norm_residual.T))
    subplot4.set_title("Residuals vs Leverage")
    subplot4.set_ylabel("Standardized residuals")
    subplot4.set_xlabel("Leverage")
    subplot4.xaxis.set_major_locator(MaxNLocator(6))
    figure.tight_layout()

    # Render the figure into a base64-encoded PNG string
    import io
    import base64
    buffer = io.BytesIO()
    canvas = pylab.get_current_fig_manager().canvas
    canvas.draw()
    pilImage = PIL.Image.frombytes("RGB", canvas.get_width_height(), canvas.tostring_rgb())
    pilImage.save(buffer, "PNG")
    pylab.close()
    img = base64.b64encode(buffer.getvalue()).decode()
    return img
def predictions(dataframe):
    # dataframe['ENTRIESn_hourly'] = np.log1p(dataframe.ENTRIESn_hourly)  # log transformation
    # features = dataframe[['meantempi']]
    features = dataframe[['rain']]
    # dummy_rain = pd.get_dummies(dataframe['rain'], prefix='rain')
    dummy_unit = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    dummy_hour = pd.get_dummies(dataframe['hour'], prefix='hour')
    dummy_day_week = pd.get_dummies(dataframe['day_week'], prefix='day_week')
    features = features.join(dummy_hour).join(dummy_day_week).join(dummy_unit)  # .join(dummy_rain)

    # removing one dummy from each group to reduce multicollinearity
    features.drop(['unit_R003'], axis=1, inplace=True)
    features.drop(['hour_0'], axis=1, inplace=True)
    features.drop(['day_week_0'], axis=1, inplace=True)
    # features.drop(['rain_0'], axis=1, inplace=True)

    # values_log = dataframe['log_ENTRIESn_hourly']
    values = dataframe['ENTRIESn_hourly']

    # Perform linear regression
    # intercept, params = linear_regression(features, values_log)
    intercept, params = linear_regression(features, values)
    predictions = intercept + np.dot(features, params)
    # log_predictions[log_predictions < 0] = 1
    # predictions = np.expm1(log_predictions)  # inverse log transformation to produce ENTRIESn_hourly
    residuals = values - predictions

    print(predictions[:5])
    print(values[:5])

    '''
    plt.figure()
    residuals.hist(alpha=1, bins=100, label='ENTRIESn_hourly residuals')
    plt.title("Residuals Histogram")  # add a title
    plt.ylabel("Frequency")  # add a label to the y-axis
    plt.xlabel("ENTRIESn_hourly residuals")
    # plt.legend()  # add a legend
    plt.show()
    '''

    # print('log linear QQ plot')
    # sns.residplot(values_nl, predictions, lowess=True, color="navy")

    # plot qq plot
    stats.probplot(residuals, dist="norm", plot=pylab)
    # residuals.hist(alpha=1, bins=100, label='ENTRIESn_hourly residuals')
    pylab.show()
    return predictions
def normplot(e):
    """
    parameters
    ----------
    e: error of a single voxel through time

    Returns
    -------
    a Q-Q plot
    """
    stats.probplot(e, dist="norm", plot=plt)
    plt.title("Normal Q-Q plot")
    # save before show: show() clears the current figure, so saving
    # afterwards writes out a blank image
    plt.savefig('../../../data/normal_assumption.png')
    plt.show()
def test_sparams_keyword(self):
    np.random.seed(123456)
    x = stats.norm.rvs(size=100)
    # Check that None, () and 0 (loc=0, for normal distribution) all work
    # and give the same results
    osm1, osr1 = stats.probplot(x, sparams=None, fit=False)
    osm2, osr2 = stats.probplot(x, sparams=0, fit=False)
    osm3, osr3 = stats.probplot(x, sparams=(), fit=False)
    assert_allclose(osm1, osm2)
    assert_allclose(osm1, osm3)
    assert_allclose(osr1, osr2)
    assert_allclose(osr1, osr3)
    # Check giving (loc, scale) params for normal distribution
    osm, osr = stats.probplot(x, sparams=(), fit=False)
def qqplot(self, x, prefix='qq'):
    """Show qq plots compared to normal before and after the transform."""
    from matplotlib import pylab
    from scipy.stats import probplot
    y = self.transform(x)
    for i, (x_i, y_i) in enumerate(zip(x.T, y.T)):
        probplot(x_i, dist="norm", plot=pylab)
        pylab.savefig(prefix + '_%d_before.png' % i)
        pylab.clf()
        probplot(y_i, dist="norm", plot=pylab)
        pylab.savefig(prefix + '_%d_after.png' % i)
        pylab.clf()
def plot_qq(file):
    tvalues = []
    for line in open(file):
        line = line.split(' ')
        line = [i for i in line if i != '']
        tvalues.append(math.fabs(float(line[3])))
    tvalues = sorted(tvalues)
    tvalues = tvalues[:len(tvalues) - 1]
    stats.probplot(tvalues, dist="norm", plot=Plot)
    Plot.title('Q-Q plot for Bag of Words')
    # probplot puts the theoretical quantiles on x and the ordered data on y
    Plot.xlabel('Theoretical Quantiles')
    Plot.ylabel('Actual Quantiles')
    Plot.savefig('qq-bag-of-words.png')
    Plot.show()
def test_basic(self):
    np.random.seed(12345)
    x = stats.norm.rvs(size=20)
    osm, osr = stats.probplot(x, fit=False)
    osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575,
                    -0.73908135, -0.5857176, -0.44506467, -0.31273668,
                    -0.18568928, -0.06158146, 0.06158146, 0.18568928,
                    0.31273668, 0.44506467, 0.5857176, 0.73908135,
                    0.91222575, 1.11829229, 1.38768012, 1.8241636]
    assert_allclose(osr, np.sort(x))
    assert_allclose(osm, osm_expected)

    res, res_fit = stats.probplot(x, fit=True)
    res_fit_expected = [1.05361841, 0.31297795, 0.98741609]
    assert_allclose(res_fit, res_fit_expected)
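# As the test above relies on, probplot(x, fit=True) returns two tuples:
# (osm, osr) with the theoretical and ordered sample quantiles, and
# (slope, intercept, r) from the least-squares line through the Q-Q points.
# A minimal sketch of reading those values (illustrative data only):
from scipy import stats

x = stats.norm.rvs(size=200, random_state=0)
(osm, osr), (slope, intercept, r) = stats.probplot(x, fit=True)
print(slope, intercept)   # roughly the sample scale and location
print(r)                  # r near 1 means the data track the normal line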
def create_qqplots(data):
    plt.figure()  # New figure
    test_data = np.random.normal(size=1000)
    graph1 = stats.probplot(test_data, dist="norm", plot=plt)
    plt.savefig("unit2_2_1_qq_normalplot.png")  # this will generate the first graph

    plt.figure()
    test_data2 = np.random.uniform(size=1000)
    graph2 = stats.probplot(test_data2, dist="norm", plot=plt)
    plt.savefig("unit2_2_1_qq_uniformplot.png")  # this will generate the second graph

    # Using the data to see if it looks like any of the distributions above
    plt.figure()
    graph2 = stats.probplot(data, dist="norm", plot=plt)
    plt.savefig("unit2_2_1_qq_data.png")  # this will generate the third graph
plt.hist(dataset['Petal.Width'], color='purple');plt.title('Histogram of Petal Width');plt.xlabel('Petal Width');plt.ylabel('Frequency')

# Barplot
import seaborn as sns
sns.countplot(dataset['Species']).set_title('Count of Species')

# Normal Q-Q plot
plt.plot(dataset.drop('Species', axis=1));plt.legend(list(dataset.columns))

sl = np.array(dataset['Sepal.Length'])
sw = np.array(dataset['Sepal.Width'])
pl = np.array(dataset['Petal.Length'])
pw = np.array(dataset['Petal.Width'])

from scipy import stats
stats.probplot(sl, dist='norm', plot=plt);plt.title('Probability plot of Sepal Length')
stats.probplot(sw, dist='norm', plot=plt);plt.title('Probability plot of Sepal Width')
stats.probplot(pl, dist='norm', plot=plt);plt.title('Probability plot of Petal Length')
stats.probplot(pw, dist='norm', plot=plt);plt.title('Probability plot of Petal Width')

# Normal Probability Distribution
x_sl = np.linspace(np.min(sl), np.max(sl))
y_sl = stats.norm.pdf(x_sl, np.median(x_sl), np.std(x_sl))
plt.plot(x_sl, y_sl);plt.xlim(np.min(sl), np.max(sl));plt.title('Normal Probability Distribution of Sepal Length');plt.xlabel('Sepal Length');plt.ylabel('Probability')

x_sw = np.linspace(np.min(sw), np.max(sw))
y_sw = stats.norm.pdf(x_sw, np.median(x_sw), np.std(x_sw))
plt.plot(x_sw, y_sw);plt.xlim(np.min(sw), np.max(sw));plt.title('Normal Probability Distribution of Sepal Width');plt.xlabel('Sepal Width');plt.ylabel('Probability')

x_pl = np.linspace(np.min(pl), np.max(pl))
y_pl = stats.norm.pdf(x_pl, np.median(x_pl), np.std(x_pl))
# observed value vs fitted value
plt.scatter(Startup_50.Profit, profit_pred, c="r");plt.xlabel("observed value");plt.ylabel("fitted value")

# residuals vs fitted value
plt.scatter(profit_pred, model_5.resid_pearson, c="r"), plt.axhline(y=0, color='blue');plt.xlabel('fitted value');plt.ylabel('residuals')

# normality plot for residuals
# histogram
plt.hist(model_5.resid_pearson)

# qq plot for residuals
import pylab
import scipy.stats as st
# checking residuals are normally distributed
st.probplot(model_5.resid_pearson, dist="norm", plot=pylab)

# get a list of columns
cols = list(Startup_50)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('Profit')))
cols
# use loc to reorder (.ix has been removed from recent pandas)
Startup_50 = Startup_50.loc[:, cols]
Startup_50

### Splitting the data into train and test data
from sklearn.model_selection import train_test_split
startup_train, startup_test = train_test_split(Startup_50, test_size=0.3)  # 30% test data
startup_train
startup_test
import numpy as np
import pylab
import scipy.stats as stats
from pandas import read_csv, qcut, DataFrame
from pandas.plotting import scatter_matrix
from ExploratoryAnalysis import remove_border, hexbin

measurements = np.random.normal(loc=20, scale=5, size=100)
stats.probplot(measurements, dist='norm', plot=pylab)
pylab.show()

pylab.figure()
pData = read_csv('E:/GitHub/DataAnalysis/data/ss06pid.csv')
pData['AGEP'].plot(kind='kde', linewidth=3)
pData['AGEP'][pData['SEX'] == 1].plot(kind='kde', linewidth=3, color='orange')

# scatterplot -- size matters
pData.plot(x='JWMNP', y='WAGP', style='o', markersize=3)
pylab.show()

df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
pylab.show()

x = np.random.normal(size=10500)
y = np.random.normal(size=10500)
pylab.plot(x, y, 'o')
pylab.show()
model2.fit(X=wcat.iloc[:, [0, 2]], y=wcat.AT)
pred2 = model2.predict(wcat.iloc[:, [0, 2]])
# Adjusted R-Squared value
model2.score(wcat.iloc[:, [0, 2]], wcat.AT)  # 0.67791
rmse2 = np.sqrt(np.mean((pred2 - wcat.AT)**2))  # 32.366
model2.coef_
model2.intercept_

#### Residuals Vs Fitted values
import matplotlib.pyplot as plt
plt.scatter(pred2, (pred2 - wcat.AT), c="r")
plt.hlines(y=0, xmin=0, xmax=200)

# Checking normal distribution
plt.hist(pred2 - wcat.AT)
import pylab
import scipy.stats as st
st.probplot(pred2 - wcat.AT, dist="norm", plot=pylab)

# Let us prepare a model by applying transformation on dependent variable
wcat["AT_sqrt"] = np.sqrt(wcat.AT)

model3 = LinearRegression()
model3.fit(X=wcat.iloc[:, [0, 2]], y=wcat.AT_sqrt)
pred3 = model3.predict(wcat.iloc[:, [0, 2]])
# Adjusted R-Squared value
model3.score(wcat.iloc[:, [0, 2]], wcat.AT_sqrt)  # 0.74051
rmse3 = np.sqrt(np.mean(((pred3)**2 - wcat.AT)**2))  # 32.0507
model3.coef_
model3.intercept_

#### Residuals Vs Fitted values
import matplotlib.pyplot as plt
plt.scatter((pred3)**2, ((pred3)**2 - wcat.AT), c="r")
    kor_result
)  # Ttest_1sampResult(statistic=-1.3321801667713213, pvalue=0.19856051824785262)
# 📌 Since p-value 0.1985 > 0.05, we fail to reject the null hypothesis == the Korean-language mean is 80.

'''
Exercise 2) One-sample mean test of newborn girls' birth weight
babyboom.csv
Newborn girls' weight has been known to average 2800 (g), but a claim emerged
that it is larger than this. A sample of 18 girls was weighed; let's test
whether the new claim is correct.
Null       : the mean weight of newborn girls is 2800 g
Alternative: ---------------------- it is not 2800 g
'''
data2 = pd.read_csv('../testdata/babyboom.csv')
# print(data2)  # time gender[1 girl, 2 boy] weight minutes
fdata = data2[data2.gender == 1]  # keep only the girls' rows
print(np.mean(fdata.weight))  # 3132.4

# Visualization to check normality
sns.distplot(fdata.iloc[:, 2], fit=stats.norm)  # histogram-like plot with fitted normal
plt.show()
stats.probplot(fdata.iloc[:, 2], plot=plt)  # Q-Q plot: fitted line vs actual data
plt.show()
print(
    stats.shapiro(fdata.iloc[:, 2])
)  # Shapiro-Wilk normality check: (test statistic, p=0.01798); since p < 0.05 the data do not follow a normal distribution
# Strictly, a t-test should not be used when normality fails; the usual
# alternatives are the Wilcoxon or Mann-Whitney tests.
# However, with only one group the Wilcoxon test cannot be applied here.
baby_result = stats.ttest_1samp(fdata.weight, popmean=2800)
print(
    baby_result
)  # Ttest_1sampResult(statistic=2.233187669387536, pvalue=0.03926844173060218)
# 📌 Since p-value 0.0392 < 0.05, we reject the null hypothesis == the girls' mean weight is not 2800 g (it is larger).
else:
    pass

plt.subplots(figsize=(12, 9))
sns.distplot(train['Price'], fit=stats.norm)

# Get the fitted parameters used by the function
(mu, sigma) = stats.norm.fit(train['Price'])
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')

fig = plt.figure()
stats.probplot(train['Price'], plot=plt)
plt.show()

# Let's check if the data set has any missing values.
train.columns[train.isnull().any()]

# plot of missing value attributes
plt.figure(figsize=(12, 6))
sns.heatmap(train.isnull())
plt.show()

train_corr = train.select_dtypes(include=[np.number])
train_corr.shape

# Correlation plot
corr = train_corr.corr()
for s in range(nsubint):
    for i in range(nchan):
        if is_off_pulse:
            binmin1 = np.argmin(data[s, i, :nphbin // 2])
            binmin2 = np.argmin(data[s, i, nphbin // 2:])
            off_bins = (list(range(binmin1 - noff // 2, binmin1 + noff // 2)) +
                        list(range(binmin2 - noff // 2, binmin2 + noff // 2)))
            mean = np.median(data[s, i, off_bins])
            rms = np.std(data[s, i, off_bins])
        else:
            # estimate the baseline mean/rms from the central part of the
            # Q-Q plot, where the noise dominates over the pulse
            osm, osr = sc.probplot(data[s, i], sparams=(), dist='norm', fit=0)
            q_max = np.min(np.where(osm > 1.0))
            q_min = np.max(np.where(osm < -1.0))
            rms, mean = np.polyfit(osm[q_min:q_max], osr[q_min:q_max], 1)
        data[s, i] -= mean
        if rms == 0.0:
            data[s, i] = 0.0
        else:
            data[s, i] /= rms
        crit = np.isfinite(data[s, i])
        data[s, i][~crit] = 0.0
        sp[s, i] = np.max(data[s, i])
else:
    scr = np.sum(data, axis=0)
    scr /= nsubint
    for i in range(nchan):
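# A minimal standalone sketch of the baseline trick used in the else-branch
# above, assuming only numpy/scipy: fit a line through the central quantiles
# of a Q-Q plot so outliers in the tails do not bias the mean/rms estimate.
# Data below are illustrative.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(5.0, 2.0, 1000),
                    rng.normal(50.0, 1.0, 20)])   # noise plus a "pulse"
osm, osr = stats.probplot(x, dist='norm', fit=False)
core = (osm > -1.0) & (osm < 1.0)                 # keep the central quantiles
scale, loc = np.polyfit(osm[core], osr[core], 1)  # slope ~ rms, intercept ~ mean
print(loc, scale)                                 # close to 5.0 and 2.0 despite outliers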
# Program file Pex4_12.py
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot

a = np.loadtxt("Pdata4_6_2.txt")
h = a[:, ::2]
h = h.flatten()
mu = np.mean(h)
s = np.std(h)
print([mu, s])
sh = np.sort(h)  # sort in ascending order
n = len(sh)
xi = (np.arange(1, n + 1) - 1 / 2) / n
yi = norm.ppf(xi, mu, s)
plt.rc('font', size=16)
plt.rc('font', family='SimHei')
plt.rc('axes', unicode_minus=False)  # display minus signs correctly
plt.subplot(121)
plt.plot(yi, sh, 'o', label='QQ plot')
plt.plot([155, 185], [155, 185], 'r-', label='reference line')
plt.legend()
plt.subplot(122)
res = probplot(h, plot=plt)
plt.savefig("figure4_12.png", dpi=500)
plt.show()
ax2.set(xlabel='Price Doc', ylabel='Year', title="Box Plot On Price Doc Across Year")
ax3.set(xlabel='Month', ylabel='Count', title="Box Plot On Price Doc Across Month")

# # Univariate Analysis
# #
# # - Price Doc
# # - Build Year

# ## Price Doc Distribution ##

# In[ ]:

fig, axes = plt.subplots(ncols=2)
fig.set_size_inches(20, 10)
stats.probplot(train["price_doc"], dist='norm', fit=True, plot=axes[0])
stats.probplot(np.log1p(train["price_doc"]), dist='norm', fit=True, plot=axes[1])

# ## Build Year ##

# In[ ]:

fig, ax = plt.subplots()
fig.set_size_inches(20, 8)
trainBuild = train.dropna()
trainBuild["yearbuilt"] = trainBuild["build_year"].map(lambda x: str(x).split(".")[0])
sn.countplot(data=trainBuild, x="yearbuilt", ax=ax)
ax.set(xlabel='Build Year', ylabel='Count', title="No of Buildings Across Year", label='big')
plt.xticks(rotation=90)
from scipy.io import arff
import pandas as pd
import pylab
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

data = arff.loadarff('baseball.arff')
df = pd.DataFrame(data[0])
columns = list(df.columns[4:])
train = df[df.columns[4:8]]
test = df[df.columns[8]]

lr = LinearRegression()
lr.fit(train, test)
preds = lr.predict(train)
residuals = test - preds
df['Residual'] = residuals

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df[columns] = scaler.fit_transform(df[columns].to_numpy())

# stats.probplot(df['number_of_wins_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['number_of_losses_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['attendance_for_home_games_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['attendance_for_away_games_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['1987_average_salary'], dist="norm", plot=pylab)
stats.probplot(df['Residual'], dist="norm", plot=pylab)
pylab.show()
def data_process():
    rcParams['figure.figsize'] = (12.0, 6.0)
    df_train = pd.read_csv('./data/train.csv')
    df_test = pd.read_csv('./data/test.csv')

    # describe data type (count, mean, std, min, 25%, 50%, 75%, max)
    print(f"numerical feature: {df_train.describe().shape}")
    print(df_train.describe())

    df_train['source'] = 'train'
    df_test['source'] = 'test'
    df_train.drop('building_id', axis=1, inplace=True)
    df_test.drop('building_id', axis=1, inplace=True)

    # kernel density plot
    sns.distplot(df_train.total_price, fit=norm)
    plt.ylabel('Frequency')
    plt.xlabel('total_price')
    (mu, sigma) = norm.fit(df_train['total_price'])

    fig = plt.figure()
    res = stats.probplot(df_train['total_price'], plot=plt)
    plt.show()

    print("skewness: %f" % df_train['total_price'].skew())
    print("kurtosis: %f" % df_train['total_price'].kurt())

    # log transform the target
    df_train['total_price'] = np.log1p(df_train['total_price'])

    # Kernel Density plot
    sns.distplot(df_train.total_price, fit=norm)
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')  # plt.title is a function, not an attribute

    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(df_train['total_price'])

    # QQ plot
    fig = plt.figure()
    res = stats.probplot(df_train['total_price'], plot=plt)
    plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['parking_price'], y=df_train['total_price'])
    # plt.xlabel('parking_price')
    # plt.ylabel('total_price')
    # plt.show()
    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['XIII_5000'], y=df_train['total_price'])
    # plt.xlabel('XIII_5000')
    # plt.ylabel('total_price')
    # plt.show()
    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['XIII_10000'], y=df_train['total_price'])
    # plt.xlabel('XIII_10000')
    # plt.ylabel('total_price')
    # plt.show()
    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['VII_10000'], y=df_train['total_price'])
    # plt.xlabel('VII_10000')
    # plt.ylabel('total_price')
    # plt.show()
    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['IX_10000'], y=df_train['total_price'])
    # plt.xlabel('IX_10000')
    # plt.ylabel('total_price')
    # plt.show()
    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['V_10000'], y=df_train['total_price'])
    # plt.xlabel('V_10000')
    # plt.ylabel('total_price')
    # plt.show()

    # # outlier deletion
    df_train = df_train.drop(df_train[(df_train['parking_price'] > 800000)].index)
    fig, ax = plt.subplots()
    ax.scatter(df_train['parking_price'], df_train['total_price'])
    plt.xlabel('parking_price')
    plt.ylabel('total_price')
    plt.show()

    # # combine data
    y_train = df_train.total_price.values
    total1 = pd.concat([df_train, df_test], axis=0, join='outer', ignore_index=True)
    total1.drop(['total_price'], axis=1, inplace=True)
    # print(total1.shape)

    # # correlation matrix
    # corrmat = df_train.corr()
    # f, ax = plt.subplots(figsize=(12, 9))
    # sns.heatmap(corrmat, vmax=0.9, square=True)
    # plt.show()

    # # get the top 10 most correlated features
    # cols = corrmat.nlargest(10, 'total_price')['total_price'].index
    # cm = np.corrcoef(df_train[cols].values.T)
    # plt.subplots(figsize=(12, 9))
    # sns.set(font_scale=1.25)
    # hm = sns.heatmap(
    #     cm,
    #     cbar=True,
    #     annot=True,
    #     square=True,
    #     fmt='.2f',
    #     annot_kws={'size': 10},
    #     yticklabels=cols.values,
    #     xticklabels=cols.values)
    # plt.yticks(rotation=0)
    # plt.xticks(rotation=90)
    # plt.show()

    # sns.set()
    # cols = [
    #     'total_price', 'parking_price', 'XIII_5000', 'jobschool_rate',
    #     'bachelor_rate', 'XIII_10000', 'VII_10000', 'IX_10000', 'V_10000',
    #     'master_rate'
    # ]
    # sns.pairplot(df_train[cols], size=1.25)
    # plt.show()

    # process missing data
    missing_data = total1.isnull().sum().sort_values(ascending=False)
    missing_precent = ((total1.isnull().sum()) /
                       (total1.isnull().count())).sort_values(ascending=False)
    missing_type = total1.dtypes
    missing_all = pd.concat(
        [missing_data, missing_precent, missing_type],
        axis=1,
        keys=['missing_data', 'missing_precent', 'missing_type'])
    missing_all.drop(missing_all[missing_data == 0].index, inplace=True)
    missing_all.sort_values(by='missing_data', ascending=False)
    print(missing_all)

    total1.drop(missing_all[missing_data > 10000].index, axis=1, inplace=True)
    total1['village_income_median'] = total1['village_income_median'].fillna(
        total1['village_income_median'].mean())

    missing_data = total1.isnull().sum().sort_values(ascending=False)
    missing_precent = ((total1.isnull().sum()) /
                       (total1.isnull().count())).sort_values(ascending=False)
    missing_type = total1.dtypes
    missing_all = pd.concat(
        [missing_data, missing_precent, missing_type],
        axis=1,
        keys=['missing_data', 'missing_precent', 'missing_type'])
    missing_all.sort_values(by='missing_data', ascending=False)
    print(missing_all)

    cols = total1.columns
    num_cols = total1._get_numeric_data().columns
    cate = list(set(cols) - set(num_cols))  # only building_id is categorical

    numer_feat = total1.dtypes[total1.dtypes != 'object'].index
    skewed_feat = total1[numer_feat].apply(lambda x: (x.dropna()).skew())
    skewed_feat = skewed_feat.sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew': skewed_feat})
    skewness = skewness[abs(skewness['Skew']) > 0.75]

    from scipy.special import boxcox1p
    skewness_feature = skewness.index
    lam = 0.15
    for i in skewness_feature:
        total1[i] = boxcox1p(total1[i], lam)
    print(skewness.head(10))

    # separate train and test data
    train = total1[total1['source'] == 'train'].copy()
    test = total1[total1['source'] == 'test'].copy()
    train.drop(['source'], axis=1, inplace=True)
    test.drop(['source'], axis=1, inplace=True)

    print('###########')
    missing_data = train.isnull().sum().sort_values(ascending=False)
    missing_precent = ((train.isnull().sum()) /
                       (train.isnull().count())).sort_values(ascending=False)
    missing_type = train.dtypes
    missing_all = pd.concat(
        [missing_data, missing_precent, missing_type],
        axis=1,
        keys=['missing_data', 'missing_precent', 'missing_type'])
    missing_all.drop(missing_all[missing_data == 0].index, inplace=True)
    missing_all.sort_values(by='missing_data', ascending=False)
    print(missing_all)
    return (train, y_train)
# and append all the samples to one list
all_cut_samp = list()
samp_id = 1
for sublist in decid_samp_cut_list:
    formatted_samp = reformat_samples(sublist)
    cleaned_samp_list = md_clean(formatted_samp, md_bandnames)
    for elem in cleaned_samp_list:
        elem['id'] = samp_id
        all_cut_samp.append(elem)
        samp_id += 1

out_decid = list(site_samp['tc_value'] for site_samp in all_cut_samp)

# fit a uniform distribution to the given distribution
resp, fit_stats = stats.probplot(np.array(tc_value_list), dist='uniform')

# calculate quantiles for QQ plot
theo_quantiles = list(
    np.quantile(resp[0], q) for q in Sublist.frange(0.0, 1.0, step))
actual_quantiles = list(
    np.quantile(resp[1], q) for q in Sublist.frange(0.0, 1.0, step))

print('R-sq before removal: {}'.format(str(fit_stats[2]**2 * 100.0)))

fig1, ax1 = plt.subplots()
ax1.plot(theo_quantiles,
         actual_quantiles,
         '.',
         markersize=15,
# H0: homoscedasticity
# H1: heteroscedasticity
# return values of the Breusch-Pagan test:
# lagrange_multiplier, pvalue, fscore, fp-value
# parameters: [residuals, x-array]
pval = sms.het_breuschpagan(m1.resid, m1.model.exog)[1]
if pval < 0.05:
    print("Reject H0. Model is Heteroscedastic")
else:
    print("FTR H0. Model is Homoscedastic")

# iii) Residuals have a normal distribution
stats.probplot(m1.resid, dist='norm', plot=pylab)
pylab.show()

# iv) rows > columns
prot.shape

# k-Fold Cross-Validation
folds = 5
cv_mse = []
X = trainx.values
Y = trainy.values
kf = KFold(folds)
# kf.get_n_splits(X)
# Homoscedasticity: the error term has the same variance across all values
# of the independent variables
# Fitted Values Vs Residuals
mlpt.figure(figsize=(15, 10))
mlpt.scatter(preds, regressor.resid_pearson, c="r"), mlpt.axhline(y=0, color='blue')
mlpt.xlabel("Fitted_Values")
mlpt.ylabel("Residuals")
mlpt.show()

# Normality Test for Residuals
# For a good model fit, it is important that the residuals follow a normal
# distribution pattern
# Normal Distribution Check using Q-Q plot
mlpt.figure(figsize=(20, 15))
st.probplot(regressor.resid_pearson, dist="norm",
            plot=pylab)  # residuals can be said to be nearly normally distributed
mlpt.show()

# To predict the mean values.
print('Mean Absolute Error by Stats Model:',
      metrics.mean_absolute_error(salary_dataframe['Salary'], preds))
print('Mean Squared Error by Stats Model:',
      metrics.mean_squared_error(salary_dataframe['Salary'], preds))
print('Root Mean Squared Error by Stats Model:',
      numpy.sqrt(metrics.mean_squared_error(salary_dataframe['Salary'], preds)))
print("R-Square Value of the Model (Measure of Fit): ", regressor.rsquared)

# Conclusion: Transforming the predictor variable with log/sqrt/normalize
# yields a lower R-squared value and a higher RMSE, so scale was used to
# standardize the predictor values instead, which yields a better R-squared
# value and a lower RMSE than the other transformations.
sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n") print(ptiles_vers) sys.stdout.write(" \n") plt.figure() x_vers, y_vers=ecdf(df.iloc[:,i]) plt.plot(x_vers, y_vers, '.') plt.xlabel(columns[i-13]) plt.ylabel('ECDF') plt.title("Percentiles ECDF of column "+str(columns[i-13])) plt.plot(ptiles_vers, percentiles/100, marker='D', color='red', linestyle='none') plt.show() #Quantile‐Quantile Plot for each feature/target column for i in range(13,27): plt.figure() stats.probplot(df.iloc[:,i], dist="norm",plot=plt) plt.show() #graphical summary of the relationships plt.figure() sns.pairplot(df[columns], size=2.5) plt.tight_layout() plt.show() cols=['ZN','CHAS','RM','DIS','B','MEDV'] plt.figure() sns.pairplot(df[cols], size=2.5) plt.tight_layout() plt.show() plt.figure()
xs = np.linspace(min(x), max(x), 100)
ys_unthinned = 0.5 * ys1 + 0.5 * ys2
plt.plot(xs, ys_unthinned, color='black', label='pdf_unthinned', alpha=0.3)
plt.xlabel('Values')
plt.legend()

plt.subplot(3, 4, 9)
plt.plot(np.sort(g), color='blue', label='g_sorted', alpha=0.3)
plt.plot(np.sort(x), color='green', label='x_sorted', alpha=0.3)
plt.title('Ordered comparison: X vs G')
plt.xlabel('Samples')
plt.ylabel('Values g, x')
plt.legend()

plt.subplot(3, 4, 10)
probplot(g, dist='norm', plot=pylab)
plt.title('QQ Plot: G')

plt.subplot(3, 4, 11)
probplot(x, dist='norm', plot=pylab)
plt.title('QQ Plot: X')

plt.subplot(3, 4, 12)
plt.scatter(z, g)
plt.title('Mapping from noise z to g')
plt.xlabel('Noise z')
plt.ylabel('Generated value')

filename = os.path.join(base_path, 'result_plot.png')
plt.savefig(filename)
def qq_residuals(error_residuals):
    return stats.probplot(error_residuals, dist="norm", plot=pylab)
def norm_plot(data_frame, var_name):
    sns.distplot(data_frame[var_name], fit=norm)
    fig = plt.figure()
    res = stats.probplot(data_frame[var_name], plot=plt)
    plt.show()
def qq_residuals(error_residuals):
    # measurements = np.random.normal(loc=20, scale=5, size=100)
    stats.probplot(error_residuals, dist='norm', plot=pylab)
    pylab.show()
def qqplot(df, player, dist_name, dfs_pts_col='PTS_DK'):
    pts = df.loc[player, dfs_pts_col]
    dist = getattr(stats, dist_name)
    stats.probplot(pts, sparams=dist.fit(pts), dist=dist_name, plot=pylab)
# drop unnecessary variables
vars_keep = ['Season', 'Team', 'wPCT', 'wOBA', 'FIP', 'Def', 'BsR']
team_df = team_df[vars_keep]

### 2. EDA ###
# data structure
print('Data Structure: {}'.format(team_df.shape))

# normality
# 'wPCT' normality
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
sns.histplot(team_df['wPCT'], kde=True, ax=axes[0])
axes[0].set_title('Team Winning Percentage Histogram')
stats.probplot(team_df['wPCT'], plot=axes[1])
axes[1].set_title('Team Winning Percentage Q-Q Plot')
plt.show()

# independent variables normality
ind_vars = ['wOBA', 'FIP', 'Def', 'BsR']
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for col, ax in zip(ind_vars, axes.flatten()):
    sns.histplot(team_df[col], kde=True, color='navy', ax=ax)
    ax.set_title('Team {} Histogram'.format(col))
plt.show()
def draw_qq(self, content, title, write_figure_path):
    content = content.dropna(subset=[title])
    stats.probplot(content[title], dist="norm", plot=plt)
    plt.title(title)
    plt.savefig(write_figure_path)
    plt.close()
from scipy.stats import norm
from scipy import stats
import matplotlib.pyplot as plt

# Objects in a basket: the mean weight is 8 kg and the standard deviation
# is 2 kg. What is the probability of picking an object weighing less than 6 kg?
norm.cdf(6, 8, 2)
# probability of picking an object heavier than 6 kg
norm.sf(6, 8, 2)
1 - norm.sf(6, 8, 2)  # complement: back to P(weight <= 6 kg)
# probability of picking an object lighter than 6 kg or heavier than 10 kg
norm.cdf(6, 8, 2) + norm.sf(10, 8, 2)
# probability of picking an object lighter than 10 kg and heavier than 8 kg
norm.cdf(10, 8, 2) - norm.cdf(8, 8, 2)

data = norm.rvs(size=100)
stats.probplot(data, plot=plt)
stats.shapiro(data)
catDict = dict(zip(list(unique), range(len(unique))))
catCount = [0] * 2
for elt in colData:
    catCount[catDict[elt]] += 1
sys.stdout.write("\nCounts for Each Value of Categorical Label \n")
print(list(unique))
print(catCount)

# Quantile-Quantile plot for 4th Rocks versus Mines attribute
col = 3
colData = []
for row in xList:
    colData.append(float(row[col]))
stats.probplot(colData, dist="norm", plot=pylab)
pylab.show()

# Listing 2-5: Using Python Pandas to Read and Summarize Data
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
from random import uniform
from math import sqrt
import sys

target_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-"
    "databases/undocumented/connectionist-bench/sonar/sonar.all-data")

# read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")

# print head and tail of data frame
def main():
    data = []
    workbook = xlsxwriter.Workbook('data/distances.xls')
    all_distances_sheet = workbook.add_worksheet('All Distances')
    min_distances_sheet = workbook.add_worksheet('Min Distances')

    for t in range(len(Utils.standingUpActions)):
        with open('data/state-space/state-space-t{}-0.pkl'.format(t), 'rb') as handle:
            data += pickle.load(handle)

    sn = StateNormalizer()
    sn.extend_bounds()

    distances = []
    for index, targetAction in enumerate(Utils.standingUpActions):
        action = Utils.vecToInt(targetAction)
        targetState = getState(data, index, action)
        targetState = sn.normalize(targetState)
        for i in range(Utils.N_ACTIONS):
            if i == Utils.NULL_ACTION or i == action:
                continue
            s = getState(data, index, i)
            if s is None or targetState is None:
                continue
            s = sn.normalize(s)
            d = euclidean(s, targetState)
            distances.append(d)

    print('mean: {}'.format(numpy.mean(distances)))
    print('var: {}'.format(numpy.var(distances)))
    print('median: {}'.format(numpy.median(distances)))
    print('max: {}'.format(numpy.max(distances)))
    print('min: {}'.format(numpy.min(distances)))

    all_distances_sheet.write(0, 0, 'Min')
    all_distances_sheet.write(0, 1, numpy.min(distances))
    all_distances_sheet.write(1, 0, 'Max')
    all_distances_sheet.write(1, 1, numpy.max(distances))
    all_distances_sheet.write(2, 0, 'Mean')
    all_distances_sheet.write(2, 1, numpy.mean(distances))
    all_distances_sheet.write(3, 0, 'Variance')
    all_distances_sheet.write(3, 1, numpy.var(distances))
    all_distances_sheet.write(4, 0, 'Median')
    all_distances_sheet.write(4, 1, numpy.median(distances))

    # measurements = numpy.random.normal(loc=20, scale=5, size=100)
    # probplot(measurements, dist="norm", plot=pylab)
    # pylab.show()
    probplot(distances, dist="norm", plot=pylab)
    pylab.show()
    # sm.qqplot(numpy.array(distances), line='45')
    # pylab.show()

    with open('data/state-space/state-space-all-0.pkl', 'rb') as handle:
        data = pickle.load(handle)
    for i in range(len(data)):
        data[i] = sn.normalize(data[i])

    kdtree = KDTree(data)
    min_dists = []
    for i in range(len(data)):
        dists, indexes = kdtree.query(data[i], 2)
        min_dists.append(dists[1])

    # numpy.set_printoptions(threshold=numpy.nan)
    # print(min_dists)
    print('-----------------')
    print('min: {}'.format(numpy.min(min_dists)))
    print('max: {}'.format(numpy.max(min_dists)))
    print('mean: {}'.format(numpy.mean(min_dists)))
    print('variance: {}'.format(numpy.var(min_dists)))
    print('median: {}'.format(numpy.median(min_dists)))

    min_distances_sheet.write(0, 0, 'Min')
    min_distances_sheet.write(0, 1, numpy.min(min_dists))
    min_distances_sheet.write(1, 0, 'Max')
    min_distances_sheet.write(1, 1, numpy.max(min_dists))
    min_distances_sheet.write(2, 0, 'Mean')
    min_distances_sheet.write(2, 1, numpy.mean(min_dists))
    min_distances_sheet.write(3, 0, 'Variance')
    min_distances_sheet.write(3, 1, numpy.var(min_dists))
    min_distances_sheet.write(4, 0, 'Median')
    min_distances_sheet.write(4, 1, numpy.median(min_dists))
    workbook.close()
def probplot(column):
    plt.figure()
    if type(column) == str:
        stats.probplot(all_data[column], plot=plt)
    else:
        stats.probplot(column, plot=plt)
from scipy.stats import norm, skew
import seaborn as sns
import matplotlib.pyplot as plt

(mu, sigma) = norm.fit(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'])
plt.figure(figsize=(14, 7))
sns.distplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], fit=norm)
plt.ylabel('Frequency')
plt.title('Length of Stay - Distribution')
plt.legend(['Normal Dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
quantile_plot = stats.probplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], plot=plt)

import numpy as np
df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'] = np.log1p(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'])

(mu, sigma) = norm.fit(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'])
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1)
sns.distplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], fit=norm)
plt.ylabel('Frequency')
plt.title('Length of Stay - Distribution')
plt.subplot(1, 2, 2)
quantile_plot = stats.probplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], plot=plt)

# Measuring the skewness of the data
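# When the target is modeled on the np.log1p scale, as above, predictions
# have to be mapped back with np.expm1, its exact inverse. A minimal sketch
# with illustrative values only:
import numpy as np

y = np.array([0.0, 2.0, 9.0, 99.0])
y_log = np.log1p(y)            # transform used for modeling
y_back = np.expm1(y_log)       # inverse transform recovers the original scale
assert np.allclose(y, y_back)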
def qq_residuals(error_residuals):
    stats.probplot(error_residuals, dist='norm', plot=pylab)
    pylab.show()
    return None
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend(
    ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
plt.show()

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# Applying log transformation
train['SalePrice'] = np.log(train['SalePrice'])

'''
Transformed histogram and normal probability plot
--------------------------------------------------
Note how we get more normal values for SalePrice: the majority of the values
are now fairly evenly distributed on the two sides of the mean.
'''

# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
def eda_numerical_variable(self, variable):
    """ This provides basic EDA of the numerical variable passed,
        - Basic statistics like count, data type, min, max, mean, median, etc.
        - Missing value count and missing percentage
        - Generates distribution plots: histogram and KDE plots
        - Skewness and kurtosis
        - Q-Q plot to check normality
        - Box plot to check the spread and outliers
        - Outliers using IQR
        - Various variable transformations

        Parameter :
        ----------
            variable: Pass the numerical variable for which EDA is required
    """
    c = variable
    s = self.__df__[variable]

    # 1. Basic Statistics
    print('Total Number of observations : ', len(s))
    print()
    print('Datatype :', (s.dtype))
    print()
    printmd('**<u>5 Point Summary :</u>**')
    print('  Minimum :\t\t', s.min(),
          '\n  25th Percentile :\t', s.quantile(0.25),
          '\n  Median :\t\t', s.median(),
          '\n  75th Percentile :\t', s.quantile(0.75),
          '\n  Maximum :\t\t', s.max())
    print()

    # 2. Missing values
    printmd('**<u>Missing Values :</u>**')
    print('  Number :', s.isnull().sum())
    print('  Percentage :', s.isnull().mean() * 100, '%')

    # 3. Histogram
    printmd('**<u>Variable distribution and Spread statistics :</u>**')
    sns.distplot(s.dropna(), hist=True, fit=norm, kde=True)
    plt.show()

    # 4. Spread Statistics
    print('Skewness :', s.skew())
    print('Kurtosis :', s.kurt())
    print()

    # 5. Q-Q plot
    printmd('**<u>Normality Check :</u>**')
    res = stats.probplot(s.dropna(), dist='norm', plot=plt)
    plt.show()

    # 6. Box plot to check the spread outliers
    print()
    printmd('**<u>Box Plot and Visual check for Outlier :</u>**')
    sns.boxplot(s.dropna(), orient='v')
    plt.show()

    # 7. Get outliers. Here distance could be a user defined parameter which defaults to 1.5
    print()
    printmd('**<u>Outliers (using IQR):</u>**')
    IQR = np.quantile(s, .75) - np.quantile(s, .25)
    upper_boundary = np.quantile(s, .75) + 1.5 * IQR
    lower_boundary = np.quantile(s, .25) - 1.5 * IQR
    print('  Right end outliers :', np.sum(s > upper_boundary))
    print('  Left end outliers :', np.sum(s < lower_boundary))

    # 8. Various Variable Transformations
    print()
    printmd(f'**<u>Explore various transformations for {c}</u>**')
    print()

    print('1. Logarithmic Transformation')
    s_log = np.log(s)
    normality_diagnostic(s_log)

    print('2. Exponential Transformation')
    s_exp = np.exp(s)
    normality_diagnostic(s_exp)

    print('3. Square Transformation')
    s_sqr = np.square(s)
    normality_diagnostic(s_sqr)

    print('4. Square-root Transformation')
    s_sqrt = np.sqrt(s)
    normality_diagnostic(s_sqrt)

    print('5. Box-Cox Transformation')
    s_boxcox, lambda_param = stats.boxcox(s)
    normality_diagnostic(s_boxcox)
    print('Optimal Lambda for Box-Cox transformation is :', lambda_param)
    print()

    print('6. Yeo Johnson Transformation')
    s = s.astype('float')
    s_yeojohnson, lambda_param = stats.yeojohnson(s)
    normality_diagnostic(s_yeojohnson)
    print('Optimal Lambda for Yeo Johnson transformation is :', lambda_param)
    print()
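# A hedged usage sketch for the method above. The wrapper class name
# (EDAHelper) and its constructor are hypothetical: the original only shows
# the method, which reads the frame from self.__df__ and relies on helpers
# (printmd, normality_diagnostic) defined elsewhere.
import numpy as np
import pandas as pd

class EDAHelper:                      # hypothetical host class
    def __init__(self, df):
        self.__df__ = df
    # eda_numerical_variable(self, variable) would be defined here as above

df = pd.DataFrame({'amount': np.random.lognormal(0.0, 1.0, 500)})  # illustrative data
# EDAHelper(df).eda_numerical_variable('amount')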