Example #1
def make_plots(groups):

    sns.stripplot("ammo", "moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")

    plt.clf()
    sns.boxplot("ammo", "moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")

    plt.clf()
    sns.barplot("ammo", "mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")

    plt.clf()
    std = groups["standard"]
    std = std[std.notnull()]

    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
Example #2
    def univariance(self, var):
        print(self._df[var].describe())

        # StandardScaler expects a 2-D array, so add a column axis
        scaled = StandardScaler().fit_transform(self._df[var].values[:, np.newaxis])
        low_range = scaled[scaled[:, 0].argsort()][:10]
        high_range = scaled[scaled[:, 0].argsort()][-10:]
        print('outer range (low) of the dist')
        print(low_range)
        print('outer range (high) of the dist')
        print(high_range)

        # show the distribution against a normal fit
        fig = plt.figure()
        plt.title('dist before log-adjust (skewness=%.3f kurtosis=%.3f)' % (self._df[var].skew(), self._df[var].kurt()))
        sns.distplot(self._df[var], fit=norm)
        fig = plt.figure()
        stats.probplot(self._df[var], plot=plt)
        # apply a log transform; with positive skewness this pulls the
        # distribution toward normal
        self._df[var + 'LOG'] = np.log(self._df[var])
        fig = plt.figure()
        plt.title('dist after log-adjust (skewness=%.3f kurtosis=%.3f)' % (self._df[var + "LOG"].skew(), self._df[var + "LOG"].kurt()))
        sns.distplot(self._df[var + 'LOG'], fit=norm)
        fig = plt.figure()
        stats.probplot(self._df[var + 'LOG'], plot=plt)
        plt.show()
        return
def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and the predicted values).
    Try different binwidths for your histogram.

    Based on this residual histogram, do you have any insight into how our model
    performed?  Reading a bit on this webpage might be useful:

    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''
    
    plt.figure()
    # plt.xlabel('Residuals')
    # plt.ylabel('Frequency')
    # plt.title('Histogram of the residuals')
    
    # residual = turnstile_weather['ENTRIESn_hourly'] - predictions
    # plt.xlabel('Fitted Value')
    # plt.ylabel('Residual')
    # plt.title('Residuals versus fits')
    # plt.plot(predictions, residual, 'ro')
    #plt.plot(turnstile_weather['ENTRIESn_hourly'],predictions, 'b')
    
    #(turnstile_weather['ENTRIESn_hourly'] - predictions).hist(bins=100)
    #plt.plot(turnstile_weather['ENTRIESn_hourly'] - predictions, 'b') 
    #plt.plot(predictions, 'b')
    

    
    # Probability Plot
    stats.probplot(turnstile_weather['ENTRIESn_hourly'] - predictions, plot=plt)
    plt.show()
    return plt
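The body above only draws the probability plot; a minimal sketch of the residual histogram the docstring asks for (names follow the snippet, and bins is the binwidth knob to vary) could be:

def residual_histogram(turnstile_weather, predictions, bins=100):
    # histogram of observed-minus-predicted hourly entries
    residuals = turnstile_weather['ENTRIESn_hourly'] - predictions
    plt.figure()
    plt.hist(residuals, bins=bins)  # try several bin counts
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Histogram of the residuals')
    return plt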
Example #4
    def statsAnalysis( self ):
        ge_arr = numpy.array( list(self.geneexpdict.values()) )
        descstats = stats.describe( ge_arr ) # descriptive statistics for the log fold change values: size of array, (min,max), mean, var, skewness, kurtosis
        print(descstats)
        raw_avg_logfc = numpy.mean( ge_arr )
        raw_stdev_logfc = numpy.std( ge_arr )
        print("raw mean and sd: ", raw_avg_logfc, raw_stdev_logfc)
        stats.probplot( ge_arr, plot=matplotlib.pyplot )
        matplotlib.pyplot.savefig('qqplot_raw.png')
        matplotlib.pyplot.close()

        # if the distribution is not central, the n and nn labels could be assigned to genes with > 0 log(fc). To avoid this, convert gene exp values to z-scores and recalculate mean and sd
        for k in self.geneexpdict.keys():
            v = self.geneexpdict[k]
            zscore = (v - raw_avg_logfc)/float(raw_stdev_logfc)
            self.geneexpdict[k] = zscore

        # recompute distribution parameters
        ge_arr = numpy.array( list(self.geneexpdict.values()) )
        descstats = stats.describe( ge_arr )
        print(descstats)
        self.avg_logfoldchange = numpy.mean( ge_arr )
        self.stdev_logfoldchange = numpy.std( ge_arr )
        print("centralized mean and sd: ", self.avg_logfoldchange, self.stdev_logfoldchange)
        stats.probplot( ge_arr, plot=matplotlib.pyplot )
        matplotlib.pyplot.savefig('qqplot_centralized.png')
        matplotlib.pyplot.close()
Example #5
def testGeweke(totalSamples = 400, plot=False,
               observeToPredict=False, poisson=False):
    ripl=mk_p_ripl()
    assumes=[('mu','(normal 0 1)')]
    observes=[('(normal mu .1)','.5')]
    if poisson:
        assumes=[('mu','(poisson 30)')]
        observes=[('(normal mu 1)','6')]
    nameToSeries = geweke(ripl,assumes,observes,totalSamples,stepSize=5)
    mean=np.mean(nameToSeries['mu'])
    std=abs(np.std(nameToSeries['mu']))
    print('testGeweke: observeToPredict=%s' % observeToPredict)
    print('mu=(normal 0 1), infer mu: (mean,std)=', mean, std)
    #assert .8 > abs(mean) and .8 > abs(std-1)
    if plot:
        dict2={'mu':np.random.normal(0,1,totalSamples)}
        label2='np.random.normal(0,1)'
        if poisson:
            dict2={'mu':np.random.poisson(30,totalSamples)}
            label2='np.random.poisson(30)'
        qqPlotAll((nameToSeries,dict2),('Geweke',label2))
        plt.figure()
        probplot(nameToSeries['mu'],dist='norm',plot=plt)
        plt.show()
    return nameToSeries
def plot_qq_cca(tvalues):
    stats.probplot(tvalues, dist="norm", plot=Plot)
    Plot.title('Q-Q plot for CCA')
    Plot.xlabel('Theoretical Quantiles')
    Plot.ylabel('Actual Quantiles')
    Plot.savefig('qq-cca.png')
    Plot.show()
def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and the predicted values).

    Based on this residual histogram, do you have any insight into how our model
    performed?  Reading a bit on this webpage might be useful:

    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''
    

    turnstile_weather['Residual'] = (turnstile_weather['ENTRIESn_hourly'] - predictions)

    '''
    plot = (ggplot(turnstile_weather, 
            aes(x='Residual')) + 
            geom_histogram(color='green', binwidth=1) +
            ggtitle('Hourly entries residuals') + 
            xlab('Difference between original hourly entries and predicted values') + 
            ylab('Frequency'))

    return plot
    '''

    stats.probplot((turnstile_weather
        ['Residual']), dist="norm", plot=plt)
    plt.title('Normal probability plot of hourly entries residuals')
    plt.xlabel('Theoretical normal quantiles')
    plt.ylabel('Residual quantiles')
    return plt
Example #8
def plot_residuals(turnstile_weather, predictions):
    '''
    Using the same methods that we used to plot a histogram of entries
    per hour for our data, why don't you make a histogram of the residuals
    (that is, the difference between the original hourly entry data and the predicted values).
    Try different binwidths for your histogram.

    Based on this residual histogram, do you have any insight into how our model
    performed?  Reading a bit on this webpage might be useful:

    http://www.itl.nist.gov/div898/handbook/pri/section2/pri24.htm
    '''

    plt.figure()
    (turnstile_weather['ENTRIESn_hourly'] - predictions).hist()


    # Histogram for residuals (finer binning than the default hist above)
    residuals = turnstile_weather['ENTRIESn_hourly'] - predictions
    residuals.hist(bins=100)

    ## line chart for residuals
    plt.plot(residuals)
    plt.ylabel('Residuals')
    plt.show()

    ## qqplot of the residuals to test for normality
    stats.probplot(residuals, dist="norm", plot=pylab)
    pylab.show()

    return plt
Example #9
def check_weight_samples(test_model, samples):
    mu_w = test_model.network.Mu
    Sigma_w = test_model.network.Sigma
    rho = test_model.network.P

    A_samples = np.array([s.weight_model.A for s in samples])
    W_samples = np.array([s.weight_model.W for s in samples])

    # Check that A's mean is about p
    A_mean = A_samples.mean(0)
    print "P:        ", rho
    print "Mean A: \n", A_mean

    # Get the samples where A is nonzero
    # assert test_model.N == 1
    n_pre = n_post = b = 1
    w_samples = np.array(W_samples[:,n_pre, n_post, b])[A_samples[:, n_pre, n_post] > 0, ...]
    w_mean = w_samples.mean(0)
    w_std = w_samples.std(0)
    print "Mean w: \n", w_mean, " +- ", w_std


    # Make Q-Q plots
    fig = plt.figure()
    w_ax = fig.add_subplot(121)
    w_dist = norm(mu_w[n_pre, n_post, b], np.sqrt(Sigma_w[n_pre, n_post, b, b]))
    probplot(w_samples, dist=w_dist, plot=w_ax)

    fig.add_subplot(122)
    _, bins, _ = plt.hist(w_samples, 20, density=True, alpha=0.2)
    bincenters = 0.5*(bins[1:]+bins[:-1])
    plt.plot(bincenters, w_dist.pdf(bincenters), 'r--', linewidth=1)
    plt.show()
Example #10
def check_bias_samples(test_model, samples):
    """
    Check that the bias samples match the prior
    :param test_model:
    :param samples:
    :return:
    """
    mu_bias = test_model.bias_model.mu_0
    sigma_bias = test_model.bias_model.sigma_0

    # Convert samples to arrays
    bias_samples = np.array([s.bias_model.b for s in samples])

    bias_mean = bias_samples.mean(0)
    bias_std = bias_samples.std(0)
    bias_dist = norm(mu_bias, np.sqrt(sigma_bias))
    print "Mean bias: ", bias_mean, " +- ", bias_std
    # Make Q-Q plots
    fig = plt.figure()
    bias_ax = fig.add_subplot(121)
    probplot(bias_samples[:,0], dist=bias_dist, plot=bias_ax)

    fig.add_subplot(122)
    _, bins, _ = plt.hist(bias_samples[:,0], 20, density=True, alpha=0.2)
    bincenters = 0.5*(bins[1:]+bins[:-1])
    plt.plot(bincenters, bias_dist.pdf(bincenters), 'r--', linewidth=1)
    plt.show()
def draw_qq_plot(dataset, xlabel="", ylabel="", title=""):
    plt.figure()
    stats.probplot(dataset, dist="norm", plot=plt)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
Example #12
def test_sample_nig():
    mu_0 = 0.0
    lmbda_0 = 10.0
    alpha_0 = 10.0
    beta_0 = 10.0

    # Directly sample nig and look at marginals
    from pyhawkes.utils.utils import sample_nig

    mu_samples = np.array([sample_nig(mu_0, lmbda_0, alpha_0, beta_0)[0] for _ in range(10000)])

    # Plot the histogram of impulse means
    plt.figure()
    p_mu = t(df=2 * alpha_0, loc=mu_0, scale=np.sqrt(beta_0 / (alpha_0 * lmbda_0)))

    _, bins, _ = plt.hist(mu_samples, bins=50, alpha=0.5, density=True)
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    plt.plot(bincenters, p_mu.pdf(bincenters), "r--", linewidth=1)
    plt.xlabel("mu")
    plt.ylabel("p(mu)")

    plt.figure()
    probplot(mu_samples, dist=p_mu, plot=plt.gca())

    plt.show()
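For reference, the t-distribution used above is the standard marginal of the mean under a normal-inverse-gamma prior: \mu \sim t_{2\alpha_0}\left(\mu_0, \sqrt{\beta_0/(\alpha_0\lambda_0)}\right), i.e. df = 2*alpha_0, loc = mu_0 and scale = sqrt(beta_0 / (alpha_0 * lmbda_0)), exactly the parameters passed to t(...) in the snippet.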
Example #13
def reconstruction_plots(test_nonoise, test_dataset, test_tilde, test_pca_tilde):
	import pylab as plt
	vmin = np.amin(test_nonoise)
	vmax = np.amax(test_nonoise)
	npix = int(np.sqrt(np.shape(test_nonoise)[1]))
	
	for ii in range(10):
		residues = test_tilde[ii] - test_dataset[ii]
		
		f = plt.figure(figsize=(10,10))
		plt.subplot(3,4,1)
		plt.imshow(test_nonoise[ii].reshape(npix,npix), interpolation='None', vmin=vmin, vmax=vmax)
		plt.title('No noise img')
		
		plt.subplot(3,4,2)
		plt.imshow((test_dataset[ii]).reshape(npix,npix), interpolation='None', vmin=vmin, vmax=vmax)
		plt.title(r'$N(x)$')	
		
		stats.probplot((test_tilde[ii] - test_pca_tilde[ii]), plot=plt.subplot(3,4,3), dist='norm', fit=True)
		#stats.probplot(test_tilde[ii] - test_nonoise[ii], plot=plt.subplot(3,4,3), dist='norm', fit=True)
		plt.title("Normal Q-Q PCA")
		
		stats.probplot(test_tilde[ii] - test_nonoise[ii], plot=plt.subplot(3,4,4), dist='norm', fit=True)
		plt.title("Normal Q-Q AE")
		
		plt.subplot(3,4,5)
		plt.imshow(test_tilde[ii].reshape(npix,npix), interpolation='None', vmin=vmin, vmax=vmax)
		plt.title('Reconstructed')
		
		plt.subplot(3,4,6)
		plt.imshow(residues.reshape(npix,npix), interpolation='None')
		plt.title(r'$\widetilde{N(x)} - N(x)$')
		
		plt.subplot(3,4,7)
		plt.imshow((test_tilde[ii] - test_nonoise[ii]).reshape(npix,npix), interpolation='None')#, vmin=vmin, vmax=vmax)
		plt.title(r'$\widetilde{N(x)} - x$')
		
		plt.subplot(3,4,8)
		plt.imshow((test_tilde[ii] - test_pca_tilde[ii]).reshape(npix,npix), interpolation='None')
		plt.title(r'$\widetilde{N(x)} - PCA({N(x)})$')
		
		plt.subplot(3,4,9)
		plt.imshow(test_pca_tilde[ii].reshape(npix,npix), interpolation='None', vmin=vmin, vmax=vmax)
		plt.title('PCA Reconstructed')
			
		plt.subplot(3,4,10)
		plt.title(r'$\Delta$')
		plt.imshow((test_pca_tilde[ii] - test_dataset[ii]).reshape(npix,npix), interpolation='None')
		
		plt.subplot(3,4,11)
		plt.imshow((test_pca_tilde[ii] - test_nonoise[ii]).reshape(npix,npix), interpolation='None')#, vmin=vmin, vmax=vmax)
		plt.title(r'$PCA({N(x)}) - x$')
		
		
		plt.subplot(3,4,12)
		plt.imshow((test_tilde[ii] / test_pca_tilde[ii]).reshape(npix,npix), interpolation='None')
		plt.title(r'$\widetilde{N(x)} / PCA({N(x)})$')

	plt.show()
Example #14
def get_residuals_ab(self):
    
    # Q-Q plot of synthetic, normally distributed measurements
    import pylab 
    import scipy.stats as stats
    
    measurements = np.random.normal(loc = 20, scale = 5, size=100)   
    stats.probplot(measurements, dist="norm", plot=pylab)
    pylab.show()
Example #15
File: hw8.py Project: mayc2/compbio
def main():

	#parse in the data file, list of floats
	input_file_name = "data.txt"
	data = parse(input_file_name)
	# print (len(data))

	#plot a histogram of the data
	n, bins, patches = plt.hist(data,bins=40)
	plt.title('Plot of Mixture of Two Different Normal Distributions')
	plt.xlabel('x-values')
	plt.ylabel('y-values')
	plt.show()

	#plot a qq plot of the data
	probplot(data, plot=plt, dist="norm")
	plt.title('QQ Plot of the Data Against the Standard Normal Distribution')
	plt.xlabel('Theoretical Quantiles')
	plt.ylabel('Normal Data Quantiles')
	plt.show()

	#calculate the p-values for the data
	abs_values = list(abs(n) for n in data)
	abs_values.sort()
	p = list(2.0 * norm.cdf(n * -1) for n in abs_values)
	# print(p)

	#plt a histogram of the p-values
	n, bins, patches = plt.hist(p,bins=40)
	plt.title('Histogram of the P-Values')
	plt.xlabel('P-Value')
	plt.ylabel('Frequency')
	plt.show()

	#calculate the false discovery rate & the p-value at the FDR cutoff
	p.sort()
	fdr_cutoff = 0.05
	r, p_value = FDR(p, fdr_cutoff)
	print("The p_value at the FDR cutoff of", fdr_cutoff, "is", p_value,".")
	print("There are", r, "p-values less than the pvalue calculated at the FDR cutoff.")

	#plot the raw data values with FDR cutoff
	x = range(len(p))
	plt.scatter(x[:r], p[:r], c='r', edgecolor='r',label='Below FDR cutoff')
	plt.scatter(x[r:], p[r:], c='b',edgecolor='b',label='Above FDR cutoff')
	# plt.scatter(list(range(len(p_list1),len(p_list1)+len(p_list2))),p_list1)
	plt.plot([0, len(p)], [0.05,0.05], 'k-', lw=2, label="pvalue below 0.05")
	plt.legend(loc='upper left')
	plt.xlim(0,len(p))
	plt.ylim(-.05,1.05)
	plt.xlabel('Rank')
	plt.ylabel('P - Value')
	plt.title('False Positives of the P-Values')
	plt.show()

	return 0
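The FDR helper called above is not shown in this file; a minimal Benjamini-Hochberg sketch that matches how main() uses it (sorted p-values in, count of rejections and the cutoff p-value out) is one plausible reading, not the original implementation:

def FDR(p_sorted, q):
	# Benjamini-Hochberg step-up: find the largest k with p_(k) <= (k/m) * q
	m = len(p_sorted)
	r, p_value = 0, 0.0
	for k, p_k in enumerate(p_sorted, start=1):
		if p_k <= (k / m) * q:
			r, p_value = k, p_k
	return r, p_value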
Example #16
def pplot(path, x):
    x = sorted([float(item) for item in x])
    y = [100.0*((j - 0.5)/float(len(x))) for j in range(1, len(x)+1)]  # plotting positions (computed but unused)
    plt.clf()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        stats.probplot(x, dist='norm', plot=plt)
    plt.savefig(path, format='png')

    plt.clf()
Example #17
def normal_probability_plot(path, tables, conf):
    fname = path + '.png'
    treenums = tables['tree_number']
    x = sorted([float(dec(num).log10()) for ast, num in treenums])
    y = [100.0*((j - 0.5)/float(len(x))) for j in range(1, len(x)+1)]  # plotting positions (computed but unused)
    plt.clf()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        stats.probplot(x, dist='norm', plot=plt)
    plt.savefig(fname, format='png')
Example #18
    def QQ_plot(self):
        """
        returns the QQ-plot with normal distribution

        """
        plt.figure(figsize=(12, 8), facecolor="w", edgecolor="k", linewidth=2.0, frameon=True)
        stats.probplot(self.scvr, dist="norm", plot=plt)
        plt.xlabel("SCVR")
        plt.ylabel("Standard quantile")
        plt.show()
Example #19
def plot(file_name,negative_control_gRNAs=None,wald_only=False):
    data=open(file_name,'rb')
    short_file_name=file_name[:file_name.index(".gene_summary.txt")]
    data.readline()
    permute_p_value_list=[]
    wald_p_value_list=[]
    beta_value_list=[]

    if negative_control_gRNAs!=None:
        negative_control_permute_p_value_list=[]
        negative_control_wald_p_value_list=[]
        negative_control_beta_value_list=[]


    for line in data:
        elements=line.decode().strip().split("\t")
        if negative_control_gRNAs!=None and elements[0] in negative_control_gRNAs:
            negative_control_beta_value_list.append(float(elements[2]))
            if wald_only==True:
                negative_control_wald_p_value_list.append(float(elements[4]))
            else:
                negative_control_permute_p_value_list.append(float(elements[4]))
                negative_control_wald_p_value_list.append(float(elements[6]))
        else:
            beta_value_list.append(float(elements[2]))
            if wald_only==True:
                wald_p_value_list.append(float(elements[4]))
            else:
                permute_p_value_list.append(float(elements[4]))
                wald_p_value_list.append(float(elements[6]))
    beta_value_list=[x for x in beta_value_list if str(x) != 'nan' and abs(x)<3]
    wald_p_value_list=[x for x in wald_p_value_list if str(x) != 'nan']
    if negative_control_gRNAs is not None:
        negative_control_beta_value_list=[x for x in negative_control_beta_value_list if str(x) != 'nan' and abs(x)<3]
        negative_control_wald_p_value_list=[x for x in negative_control_wald_p_value_list if str(x) != 'nan']

    if wald_only!=True:
        permute_p_value_list=[x for x in permute_p_value_list if str(x) != 'nan']
        stats.probplot(permute_p_value_list, dist="uniform",plot=pylab)
        pylab.savefig("QQplot of permute_p value %s.png" %short_file_name)
        pylab.close()

    pylab.hist(beta_value_list,bins=1000)
    pylab.savefig("Hist of beta value %s.png" %short_file_name)
    pylab.close()

    #stats.probplot(wald_p_value_list, dist="uniform",plot=pylab)
    fig=sm.qqplot(np.array(wald_p_value_list),stats.uniform,fit=True, line='45')
    pylab.xlim(0,1)
    pylab.ylim(0,1)
    #fig.set_xlim(0,1)
    pylab.savefig("QQplot of wald_p value %s.png" %short_file_name)
    pylab.close()
    def is_normal(self):
        '''A series of Normal tests including qqplot, histogram
        and Shapiro-Wilk'''

        stats.probplot(self.residuals, dist = 'norm', plot = plt)
        plt.show()

        plt.hist(self.residuals)
        plt.show()

        print "The Shapiro-Wilk p-value is {}\n".format(stats.shapiro(self.residuals)[1])
Example #21
File: Stats.py Project: alanhdu/Dex
 def _residPlot(self, results):
     res = results.resid
     fig, axes = plt.subplots(nrows=2, ncols=2)
     plt.sca(axes[0, 0])           # make the top-left axes current
     stats.probplot(res, plot=plt) # QQ plot
     plt.sca(axes[1, 0])
     sns.distplot(res)             # Histogram
     sns.regplot(x=results.predict(), y=res, lowess=True, ax=axes[0, 1],
             line_kws={"color": "black"})
     res.plot(ax=axes[1, 1]) # Time series (residual v order)
Example #22
	def drawProbPlot(self, model, ax):
		x = pd.Series(self.dist)
		stats.probplot(x, dist=model, plot=ax)
		q = np.arange(0, 1.0001, 0.05)
		vals = x.quantile(q)
		ax.set_yticks(vals, minor=True)
		ax.set_yticklabels(q, minor=True)
		ax.set_yticks([])
		ax.yaxis.grid(True, 'both')
		ax.set_ylim(vals.min(), vals.max())
		ax.set_xlabel("X")
		ax.set_ylabel("Quantile")
Example #23
 def residual_analysis(self):
     dropnan_DF = self.dropnan_DF
     model = sm.ols(formula='ob_value ~ sm_value', data=dropnan_DF)
     fitted = model.fit()
     fittedvalues =  np.array(fitted.fittedvalues)
     residual = fittedvalues - np.array(dropnan_DF.ob_value)
     norm_residual = fitted.resid_pearson
     
     ### 
     
     
     figure = plt.figure(facecolor='white')
     
     subplot1 = figure.add_subplot(2,2,1)
     subplot1.scatter(fittedvalues,residual)
     subplot1.set_xlabel("Fitted values")
     subplot1.set_ylabel("Residuals")
     subplot1.set_title("Residuals vs Fitted")
     
     subplot2 = figure.add_subplot(2,2,2)
     probplot(norm_residual,plot=subplot2)
     subplot2.set_title("Normal Q-Q")
     subplot2.set_ylabel("Standardized residuals")
     subplot2.set_xlabel("Theoretical Quantiles")
     
     
     subplot3 = figure.add_subplot(2,2,3)
     subplot3.scatter(fittedvalues,np.sqrt(np.abs(residual)))
     subplot3.set_title("Scale-Location")
     subplot3.set_ylabel(r'$\sqrt{\mathrm{|Standardized\/residuals|}}$')
     subplot3.set_xlabel("Fitted values")
     
     subplot4 = figure.add_subplot(2,2,4)
     norm_residual = (np.matrix(norm_residual)).T
     H = norm_residual*(norm_residual.T*norm_residual).I*norm_residual.T
     h = H.diagonal()
     subplot4.scatter(np.array(h),np.array(norm_residual.T))
     subplot4.set_title("Residuals vs Leverage")
     subplot4.set_ylabel("Standardized residuals")
     subplot4.set_xlabel("Leverage")
     subplot4.xaxis.set_major_locator(MaxNLocator(6))
     
     figure.tight_layout()
     # Store the image in a bytes buffer and base64-encode it (assumes io and base64 are imported)
     buffer = io.BytesIO()
     canvas = pylab.get_current_fig_manager().canvas
     canvas.draw()
     pilImage = PIL.Image.frombytes("RGB", canvas.get_width_height(), canvas.tostring_rgb())
     pilImage.save(buffer, "PNG")
     pylab.close()
     img = base64.b64encode(buffer.getvalue()).decode('ascii')
     
     return img 
Example #24
def predictions(dataframe):
    
#    dataframe['ENTRIESn_hourly'] = np.log1p(dataframe.ENTRIESn_hourly) # log transformation 
    
#    features = dataframe[['meantempi']]
    features = dataframe[['rain']]
#    dummy_rain = pd.get_dummies(dataframe['rain'], prefix='rain')
    dummy_unit = pd.get_dummies(dataframe['UNIT'], prefix='unit')
    dummy_hour = pd.get_dummies(dataframe['hour'], prefix='hour')
    dummy_day_week = pd.get_dummies(dataframe['day_week'], prefix='day_week')
    features = features.join(dummy_hour).join(dummy_day_week).join(dummy_unit) #join(dummy_rain).
    
#    removing one dummy from each group to reduce multicollinearity
    features.drop(['unit_R003'], axis = 1, inplace = True)
    features.drop(['hour_0'], axis = 1, inplace = True)
    features.drop(['day_week_0'], axis = 1, inplace = True)   
#    features.drop(['rain_0'], axis = 1, inplace = True)   
#    values_log = dataframe['log_ENTRIESn_hourly']
    values = dataframe['ENTRIESn_hourly']
    
#    Perform linear regression
#    intercept, params = linear_regression(features, values_log)    
    intercept, params = linear_regression(features, values)    
    predictions = intercept + np.dot(features, params)
#    log_predictions [log_predictions<0] = 1
#    predictions = np.expm1(log_predictions) # inverse logarithmic transformation to produce ENTRIESn_hourly   
#    residuals = values - predictions
    residuals = values - predictions
    
    
    print(predictions[:5])
    print(values[:5])

    '''
    plt.figure()
    residuals.hist(alpha=1, bins=100, label='ENTRIESn_hourly residuals')
    plt.title("Residuals Histogram") # add a title
    plt.ylabel("Frequency") # add a label to the y-axis
    plt.xlabel("ENTRIESn_hourly residuals") 
#    plt.legend() # add a legend
    plt.show()
    '''
    
#    print 'log linear QQ plot'
#    sns.residplot(values_nl, predictions, lowess=True, color="navy")
               
#    plot qq plot
    stats.probplot(residuals, dist="norm", plot=pylab)
#    residuals.hist(alpha=1, bins=100, label='ENTRIESn_hourly residuals')

    pylab.show()
    return predictions
Example #25
def normplot(e):
    """
    parameters
    ----------
    e: error of a single voxel through time

    Returns
    -------
    a Q-Q plot
    """
    stats.probplot(e, dist = "norm", plot = plt)
    plt.title("Normal Q-Q plot")
    plt.show()
    plt.savefig('../../../data/normal_assumption.png')
Example #26
 def test_sparams_keyword(self):
     np.random.seed(123456)
     x = stats.norm.rvs(size=100)
     # Check that None, () and 0 (loc=0, for normal distribution) all work
     # and give the same results
     osm1, osr1 = stats.probplot(x, sparams=None, fit=False)
     osm2, osr2 = stats.probplot(x, sparams=0, fit=False)
     osm3, osr3 = stats.probplot(x, sparams=(), fit=False)
     assert_allclose(osm1, osm2)
     assert_allclose(osm1, osm3)
     assert_allclose(osr1, osr2)
     assert_allclose(osr1, osr3)
     # Check giving (loc, scale) params for normal distribution
     osm, osr = stats.probplot(x, sparams=(0, 1), fit=False)
    def qqplot(self, x, prefix='qq'):
        """Show qq plots compared to normal before and after the transform."""
        from matplotlib import pylab
        from scipy.stats import probplot
        y = self.transform(x)

        for i, (x_i, y_i) in enumerate(zip(x.T, y.T)):
            probplot(x_i, dist="norm", plot=pylab)
            pylab.savefig(prefix + '_%d_before.png' % i)
            pylab.clf()

            probplot(y_i, dist="norm", plot=pylab)
            pylab.savefig(prefix + '_%d_after.png' % i)
            pylab.clf()
def plot_qq(file):
    tvalues = []
    for line in open(file):
        line = line.split(' ')
        line = [i for i in line if i != '']
        tvalues.append(math.fabs(float(line[3])))
    tvalues = sorted(tvalues)
    tvalues = tvalues[:len(tvalues)-1]
    stats.probplot(tvalues, dist="norm", plot=Plot)
    Plot.title('Q-Q plot for Bag of Words')
    Plot.xlabel('Theoretical Quantiles')
    Plot.ylabel('Actual Quantiles')
    Plot.savefig('qq-bag-of-words.png')
    Plot.show()
Example #29
    def test_basic(self):
        np.random.seed(12345)
        x = stats.norm.rvs(size=20)
        osm, osr = stats.probplot(x, fit=False)
        osm_expected = [-1.8241636, -1.38768012, -1.11829229, -0.91222575,
                        -0.73908135, -0.5857176, -0.44506467, -0.31273668,
                        -0.18568928, -0.06158146, 0.06158146, 0.18568928,
                        0.31273668, 0.44506467, 0.5857176, 0.73908135,
                        0.91222575, 1.11829229, 1.38768012, 1.8241636]
        assert_allclose(osr, np.sort(x))
        assert_allclose(osm, osm_expected)

        res, res_fit = stats.probplot(x, fit=True)
        res_fit_expected = [1.05361841, 0.31297795, 0.98741609]
        assert_allclose(res_fit, res_fit_expected)
Example #30
def create_qqplots(data):
    plt.figure() # New figure    
    test_data = np.random.normal(size=1000)   
    graph1 = stats.probplot(test_data, dist="norm", plot=plt)
    plt.savefig("unit2_2_1_qq_normalplot.png") #this will generate the first graph
    
    plt.figure()
    test_data2 = np.random.uniform(size=1000)   
    graph2 = stats.probplot(test_data2, dist="norm", plot=plt)
    plt.savefig("unit2_2_1_qq_uniformplot.png") #this will generate the second graph
    
    # Using the data to see if it looks like any of the distributions above
    plt.figure()  
    graph2 = stats.probplot(data, dist="norm", plot=plt)
    plt.savefig("unit2_2_1_qq_data.png") #this will generate the third graph
plt.hist(dataset['Petal.Width'], color='purple');plt.title('Histogram of Petal Width');plt.xlabel('Petal Width');plt.ylabel('Frequency')

# Barplot 
import seaborn as sns
sns.countplot(dataset['Species']).set_title('Count of Species')

# Line plot of all numeric features (the Q-Q plots follow below)
plt.plot(dataset.drop('Species', axis=1));plt.legend(list(dataset.columns))

sl = np.array(dataset['Sepal.Length'])
sw = np.array(dataset['Sepal.Width'])
pl = np.array(dataset['Petal.Length'])
pw = np.array(dataset['Petal.Width'])

from scipy import stats
stats.probplot(sl, dist='norm', plot=plt);plt.title('Probability plot of Sepal Length')
stats.probplot(sw, dist='norm', plot=plt);plt.title('Probability plot of Sepal Width')
stats.probplot(pl, dist='norm', plot=plt);plt.title('Probability plot of Petal Length')
stats.probplot(pw, dist='norm', plot=plt);plt.title('Probability plot of Petal Width')

# Normal Probability Distribution
x_sl = np.linspace(np.min(sl), np.max(sl))
y_sl = stats.norm.pdf(x_sl, np.mean(sl), np.std(sl))
plt.plot(x_sl, y_sl);plt.xlim(np.min(sl), np.max(sl));plt.title('Normal Probability Distribution of Sepal Length');plt.xlabel('Sepal Length');plt.ylabel('Probability')

x_sw = np.linspace(np.min(sw), np.max(sw))
y_sw = stats.norm.pdf(x_sw, np.mean(sw), np.std(sw))
plt.plot(x_sw, y_sw);plt.xlim(np.min(sw), np.max(sw));plt.title('Normal Probability Distribution of Sepal Width');plt.xlabel('Sepal Width');plt.ylabel('Probability')

x_pl = np.linspace(np.min(pl), np.max(pl))
y_pl = stats.norm.pdf(x_pl, np.mean(pl), np.std(pl))
Example #32
#observed value vs fitted value
plt.scatter(Startup_50.Profit,profit_pred,c="r");plt.xlabel("observed value");plt.ylabel("fitted value")

#residuals vs fitted value
plt.scatter(profit_pred,model_5.resid_pearson, c="r"),plt.axhline(y=0,color='blue');plt.xlabel('fitted value');plt.ylabel('residuals')

#normality plot for residuals
#histogram
plt.hist(model_5.resid_pearson)

#qq plot for residuals
import pylab
import scipy.stats as st

#checking residuals are normally distributed
st.probplot(model_5.resid_pearson, dist="norm",plot=pylab)

# get a list of columns
cols = list(Startup_50)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('Profit')))
cols
# use loc to reorder
Startup_50 = Startup_50.loc[:, cols]
Startup_50

### Splitting the data into train and test data 
from sklearn.model_selection import train_test_split
startup_train,startup_test  = train_test_split(Startup_50,test_size = 0.3) # 30% test data
startup_train
startup_test
Example #33
import numpy as np
import pylab
import scipy.stats as stats
from numpy.random import randn
from pandas import read_csv, qcut, DataFrame
from pandas.plotting import scatter_matrix

from ExploratoryAnalysis import remove_border, hexbin

measurements = np.random.normal(loc=20, scale=5, size=100)
stats.probplot(measurements, dist='norm', plot=pylab)
pylab.show()

pylab.figure()
pData = read_csv('E:/GitHub/DataAnalysis/data/ss06pid.csv')
pData['AGEP'].plot(kind='kde', linewidth=3)
pData['AGEP'][pData['SEX'] == 1].plot(kind='kde', linewidth=3, color='orange')

#scatterplot --size matters
pData.plot(x='JWMNP', y='WAGP', style='o', markersize=3)

pylab.show()

df = DataFrame(randn(1000, 4), columns=['a', 'b', 'c', 'd'])
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')

pylab.show()

x = np.random.normal(size=10500)
y = np.random.normal(size=10500)
pylab.plot(x, y, 'o')
pylab.show()
model2.fit(X = wcat.iloc[:,[0,2]],y=wcat.AT)
pred2 = model2.predict(wcat.iloc[:,[0,2]])
# Adjusted R-Squared value
model2.score(wcat.iloc[:,[0,2]],wcat.AT)# 0.67791
rmse2 = np.sqrt(np.mean((pred2-wcat.AT)**2)) # 32.366
model2.coef_
model2.intercept_
#### Residuals Vs Fitted values
import matplotlib.pyplot as plt
plt.scatter(pred2,(pred2-wcat.AT),c="r")
plt.hlines(y=0,xmin=0,xmax=200)  
# Checking normal distribution
plt.hist(pred2-wcat.AT)
import pylab
import scipy.stats as st
st.probplot(pred2-wcat.AT,dist="norm",plot=pylab)

# Let us prepare a model by applying transformation on dependent variable
wcat["AT_sqrt"] = np.sqrt(wcat.AT)

model3 = LinearRegression()
model3.fit(X = wcat.iloc[:,[0,2]],y=wcat.AT_sqrt)
pred3 = model3.predict(wcat.iloc[:,[0,2]])
# Adjusted R-Squared value
model3.score(wcat.iloc[:,[0,2]],wcat.AT_sqrt)# 0.74051
rmse3 = np.sqrt(np.mean(((pred3)**2-wcat.AT)**2)) # 32.0507
model3.coef_
model3.intercept_
#### Residuals Vs Fitted values
import matplotlib.pyplot as plt
plt.scatter((pred3)**2,((pred3)**2-wcat.AT),c="r")
Example #35
    kor_result
)  # Ttest_1sampResult(statistic=-1.3321801667713213, pvalue=0.19856051824785262)
# p-value 0.1985 > 0.05, so we fail to reject the null: the Korean-language mean is 80.
'''  Exercise 2)
    One-sample mean test on newborn girls' weights, babyboom.csv
    Newborn girls' weight has long been reported to average 2800(g), but a claim has been made that it is higher. A sample of 18 girls was weighed; test whether the new claim holds.
    Null : the mean weight of newborn girls is 2800g.
    Alternative : the mean weight of newborn girls is not 2800g.
'''
data2 = pd.read_csv('../testdata/babyboom.csv')
# print(data2)  # time  gender[1 = girl, 2 = boy]  weight  minutes
fdata = data2[data2.gender == 1]  # girls only
print(np.mean(fdata.weight))  # 3132.4

# visual check of normality
sns.distplot(fdata.iloc[:, 2], fit=stats.norm)  # histogram with a normal fit
plt.show()
stats.probplot(fdata.iloc[:, 2], plot=plt)  # Q-Q plot: fitted line vs actual data
plt.show()

print(
    stats.shapiro(fdata.iloc[:, 2])
)  # Shapiro-Wilk normality check: (test statistic, p=0.01798); p < 0.05, so the data are not normal
# Strictly, a t-test should not be used when normality fails; Wilcoxon or Mann-Whitney would be the usual alternatives.
# With a single group, however, the (rank-sum) Wilcoxon test is not applicable here.

baby_result = stats.ttest_1samp(fdata.weight, popmean=2800)
print(
    baby_result
)  # Ttest_1sampResult(statistic=2.233187669387536, pvalue=0.03926844173060218)
# p-value 0.0392 < 0.05, so reject the null: the mean weight of newborn girls is not 2800g (it is higher).
Example #36
    else:
        pass


plt.subplots(figsize=(12,9))
sns.distplot(train['Price'], fit=stats.norm)
# Get the fitted parameters used by the function

(mu, sigma) = stats.norm.fit(train['Price'])
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
fig = plt.figure()
stats.probplot(train['Price'], plot=plt)
plt.show()

#Let's check if the data set has any missing values. 

train.columns[train.isnull().any()]

#plot of missing value attributes
plt.figure(figsize=(12, 6))
sns.heatmap(train.isnull())
plt.show()

train_corr = train.select_dtypes(include=[np.number])
train_corr.shape
#Correlation plot
corr = train_corr.corr()
Example #37
     for s in range(nsubint):
         for i in range(nchan):
             if is_off_pulse:
                 binmin1 = np.argmin(data[s, i, :nphbin // 2])
                 binmin2 = np.argmin(data[s, i, nphbin // 2:])
                 offbins = (list(range(binmin1 - noff // 2, binmin1 + noff // 2)) +
                            list(range(binmin2 - noff // 2, binmin2 + noff // 2)))
                 mean = np.median(data[s, i, offbins])
                 rms = np.std(data[s, i, offbins])
             else:
                 osm, osr = sc.probplot(data[s, i],
                                        sparams=(),
                                        dist='norm',
                                        fit=0)
                 q_max = np.min(np.where(osm > 1.0))
                 q_min = np.max(np.where(osm < -1.0))
                 rms, mean = np.polyfit(osm[q_min:q_max],
                                        osr[q_min:q_max], 1)
             data[s, i] -= mean
             if rms == 0.0: data[s, i] = 0.0
             else: data[s, i] /= rms
             crit = np.isfinite(data[s, i])
             data[s, i][~crit] = 0.0
             sp[s, i] = np.max(data[s, i])
 else:
     scr = np.sum(data, axis=0)
     scr /= nsubint
     for i in range(nchan):
Example #38
#Program file Pex4_12.py
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, probplot

a = np.loadtxt("Pdata4_6_2.txt")
h = a[:, ::2]
h = h.flatten()
mu = np.mean(h)
s = np.std(h)
print([mu, s])
sh = np.sort(h)  #sort in ascending order
n = len(sh)
xi = (np.arange(1, n + 1) - 1 / 2) / n
yi = norm.ppf(xi, mu, s)
plt.rc('font', size=16)
plt.rc('font', family='SimHei')
plt.rc('axes', unicode_minus=False)  #render minus signs correctly
plt.subplot(121)
plt.plot(yi, sh, 'o', label='QQ plot')
plt.plot([155, 185], [155, 185], 'r-', label='reference line')
plt.legend()
plt.subplot(122)
res = probplot(h, plot=plt)
plt.savefig("figure4_12.png", dpi=500)
plt.show()
Example #39
ax2.set(xlabel='Price Doc', ylabel='Year',title="Box Plot On Price Doc Across Year")
ax3.set(xlabel='Month', ylabel='Count',title="Box Plot On Price Doc Across Month")

# # Univariate Analysis #
# 
#  - Price Doc
#  - Build Year

# ## Price Doc Distribution##



fig,axes = plt.subplots(ncols=2)
fig.set_size_inches(20, 10)
stats.probplot(train["price_doc"], dist='norm', fit=True, plot=axes[0])
stats.probplot(np.log1p(train["price_doc"]), dist='norm', fit=True, plot=axes[1])

# ## Build Year ##



fig,ax= plt.subplots()
fig.set_size_inches(20,8)
trainBuild = train.dropna()
trainBuild["yearbuilt"] = trainBuild["build_year"].map(lambda x:str(x).split(".")[0])
sn.countplot(data=trainBuild,x="yearbuilt",ax=ax)
ax.set(xlabel='Build Year', ylabel='Count',title="No of Buildings Across Year",label='big')
plt.xticks(rotation=90)
Example #40
from scipy.io import arff
import pandas as pd
import pylab
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

data = arff.loadarff('baseball.arff')
df = pd.DataFrame(data[0])
columns = list(df.columns[4:])
train = df[df.columns[4:8]]
test = df[df.columns[8]]
lr = LinearRegression()
lr.fit(train, test)
preds = lr.predict(train)

residuals = test - preds
df['Residual'] = residuals

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df[columns] = scaler.fit_transform(df[columns].to_numpy())
# stats.probplot(df['number_of_wins_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['number_of_losses_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['attendance_for_home_games_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['attendance_for_away_games_in_1986'], dist="norm", plot=pylab)
# stats.probplot(df['1987_average_salary'], dist="norm", plot=pylab)
stats.probplot(df['Residual'], dist="norm", plot=pylab)

pylab.show()
Example #41
def data_process():
    rcParams['figure.figsize'] = (12.0, 6.0)
    df_train = pd.read_csv('./data/train.csv')
    df_test = pd.read_csv('./data/test.csv')

    # describe data type (count, mean, std, min, 25%, 50%, 75%, max)
    print(f"numerical feature: {df_train.describe().shape}")
    print(df_train.describe())

    df_train['source'] = 'train'
    df_test['source'] = 'test'
    df_train.drop('building_id', axis=1, inplace=True)
    df_test.drop('building_id', axis=1, inplace=True)

    # kernel density plot
    sns.distplot(df_train.total_price, fit=norm)
    plt.ylabel('Frequency')
    plt.xlabel('total_price')
    (mu, sigma) = norm.fit(df_train['total_price'])
    fig = plt.figure()
    res = stats.probplot(df_train['total_price'], plot=plt)
    plt.show()
    print("skewness: %f" % df_train['total_price'].skew())
    print("kurtosis: %f" % df_train['total_price'].kurt())

    # log transform the target
    df_train['total_price'] = np.log1p(df_train['total_price'])

    # Kernel Density plot
    sns.distplot(df_train.total_price, fit=norm)
    plt.ylabel('Frequency')
    plt.title('total_price distribution')
    #Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(df_train['total_price'])
    # QQ plot
    fig = plt.figure()
    res = stats.probplot(df_train['total_price'], plot=plt)
    plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['parking_price'], y=df_train['total_price'])
    # plt.xlabel('parking_price')
    # plt.ylabel('total_price')
    # plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['XIII_5000'], y=df_train['total_price'])
    # plt.xlabel('XIII_5000')
    # plt.ylabel('total_price')
    # plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['XIII_10000'], y=df_train['total_price'])
    # plt.xlabel('XIII_10000')
    # plt.ylabel('total_price')
    # plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['VII_10000'], y=df_train['total_price'])
    # plt.xlabel('VII_10000')
    # plt.ylabel('total_price')
    # plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['IX_10000'], y=df_train['total_price'])
    # plt.xlabel('IX_10000')
    # plt.ylabel('total_price')
    # plt.show()

    # fig, ax = plt.subplots()
    # ax.scatter(x=df_train['V_10000'], y=df_train['total_price'])
    # plt.xlabel('V_10000')
    # plt.ylabel('total_price')
    # plt.show()

    # # outlier deletion

    df_train = df_train.drop(
        df_train[(df_train['parking_price'] > 800000)].index)
    fig, ax = plt.subplots()
    ax.scatter(df_train['parking_price'], df_train['total_price'])
    plt.xlabel('parking_price')
    plt.ylabel('total_price')
    plt.show()

    # # combine data
    y_train = df_train.total_price.values
    total1 = pd.concat([df_train, df_test],
                       axis=0,
                       join='outer',
                       ignore_index=True)
    total1.drop(['total_price'], axis=1, inplace=True)
    # print(total1.shape)

    # # correration matrix
    # corrmat = df_train.corr()
    # f, ax = plt.subplots(figsize=(12, 9))
    # sns.heatmap(corrmat, vmax=0.9, square=True)
    # plt.show()

    # # get the top 10 more correlative features
    # cols = corrmat.nlargest(10, 'total_price')['total_price'].index
    # cm = np.corrcoef(df_train[cols].values.T)
    # plt.subplots(figsize=(12, 9))
    # sns.set(font_scale=1.25)
    # hm = sns.heatmap(
    #     cm,
    #     cbar=True,
    #     annot=True,
    #     square=True,
    #     fmt='.2f',
    #     annot_kws={'size': 10},
    #     yticklabels=cols.values,
    #     xticklabels=cols.values)
    # plt.yticks(rotation=0)
    # plt.xticks(rotation=90)
    # plt.show()

    # sns.set()
    # cols = [
    #     'total_price', 'parking_price', 'XIII_5000', 'jobschool_rate',
    #     'bachelor_rate', 'XIII_10000', 'VII_10000', 'IX_10000', 'V_10000',
    #     'master_rate'
    # ]
    # sns.pairplot(df_train[cols], size=1.25)
    # plt.show()

    # process missing data
    missing_data = total1.isnull().sum().sort_values(ascending=False)
    missing_precent = ((total1.isnull().sum()) /
                       (total1.isnull().count())).sort_values(ascending=False)
    missing_type = total1.dtypes
    missing_all = pd.concat(
        [missing_data, missing_precent, missing_type],
        axis=1,
        keys=['missing_data', 'missing_precent', 'missing_type'])

    missing_all.drop(missing_all[missing_data == 0].index, inplace=True)
    missing_all.sort_values(by='missing_data', ascending=False)
    print(missing_all)

    total1.drop(missing_all[missing_data > 10000].index, axis=1, inplace=True)
    total1['village_income_median'] = total1['village_income_median'].fillna(
        total1['village_income_median'].mean())

    missing_data = total1.isnull().sum().sort_values(ascending=False)
    missing_precent = ((total1.isnull().sum()) /
                       (total1.isnull().count())).sort_values(ascending=False)
    missing_type = total1.dtypes
    missing_all = pd.concat(
        [missing_data, missing_precent, missing_type],
        axis=1,
        keys=['missing_data', 'missing_precent', 'missing_type'])
    missing_all.sort_values(by='missing_data', ascending=False)
    print(missing_all)

    cols = total1.columns
    num_cols = total1._get_numeric_data().columns
    cate = list(set(cols) - set(num_cols))  # only building_id is category

    numer_feat = total1.dtypes[total1.dtypes != 'object'].index
    skewed_feat = total1[numer_feat].apply(lambda x: (x.dropna()).skew())
    skewed_feat = skewed_feat.sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew': skewed_feat})

    skewness = skewness[abs(skewness['Skew']) > 0.75]
    from scipy.special import boxcox1p
    skewness_feature = skewness.index
    lam = 0.15
    for i in skewness_feature:
        total1[i] = boxcox1p(total1[i], lam)
    print(skewness.head(10))

    # separate train and test data
    train = total1[total1['source'] == 'train']
    test = total1[total1['source'] == 'test']
    train.drop(['source'], axis=1, inplace=True)
    test.drop(['source'], axis=1, inplace=True)

    print('###########')
    missing_data = train.isnull().sum().sort_values(ascending=False)
    missing_precent = ((train.isnull().sum()) /
                       (train.isnull().count())).sort_values(ascending=False)
    missing_type = train.dtypes
    missing_all = pd.concat(
        [missing_data, missing_precent, missing_type],
        axis=1,
        keys=['missing_data', 'missing_precent', 'missing_type'])
    missing_all.drop(missing_all[missing_data == 0].index, inplace=True)
    missing_all.sort_values(by='missing_data', ascending=False)

    print(missing_all)

    return (train, y_train)
Example #42
    # and append all the samples to one list
    all_cut_samp = list()
    samp_id = 1
    for sublist in decid_samp_cut_list:
        formatted_samp = reformat_samples(sublist)

        cleaned_samp_list = md_clean(formatted_samp, md_bandnames)
        for elem in cleaned_samp_list:
            elem['id'] = samp_id
            all_cut_samp.append(elem)
            samp_id += 1

    out_decid = list(site_samp['tc_value'] for site_samp in all_cut_samp)

    # fit a uniform distribution to the given sample
    resp, fit_stats = stats.probplot(np.array(tc_value_list), dist='uniform')

    # calculate quantiles for QQ plot
    theo_quantiles = list(
        np.quantile(resp[0], q) for q in Sublist.frange(0.0, 1.0, step))
    actual_quantiles = list(
        np.quantile(resp[1], q) for q in Sublist.frange(0.0, 1.0, step))

    print('R-sq before removal: {}'.format(str(fit_stats[2]**2 * 100.0)))

    fig1, ax1 = plt.subplots()

    ax1.plot(theo_quantiles,
             actual_quantiles,
             '.',
             markersize=15,
# H0: homoscedasticity
# H1: heteroscedasticity

# return value of breusch pagan test
# lagrange_multiplier, pvalue, fscore, fp-value

# parameters: [residuals, x-array]
pval = sms.het_breuschpagan(m1.resid, m1.model.exog)[1]

if pval < 0.05:
    print("Reject H0. Model is Heteroscedastic")
else:
    print("FTR H0. Model is Homoscedastic")

# iii) Residuals have a normal distribution
stats.probplot(m1.resid, dist='norm', plot=pylab)
pylab.show()

# iv) rows > columns
prot.shape

# k-Fold Cross-Validation

folds = 5
cv_mse = []

X = trainx.values
Y = trainy.values

kf = KFold(folds)
# kf.get_n_splits(X)
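The snippet breaks off before the cross-validation loop itself; a minimal completion (an assumption: the original loop is not shown, sklearn's LinearRegression stands in for the statsmodels fit above, and numpy is available as np) could be:

from sklearn.linear_model import LinearRegression

for train_idx, test_idx in kf.split(X):
    # fit on the training folds, accumulate MSE on the held-out fold
    m = LinearRegression().fit(X[train_idx], Y[train_idx])
    pred = m.predict(X[test_idx])
    cv_mse.append(np.mean((Y[test_idx] - pred) ** 2))

print('mean CV MSE:', np.mean(cv_mse))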
Example #44
# Homoscedasticity : Error term being same across all values of independent variables
# Fitted Values Vs Residuals
mlpt.figure(figsize=(15, 10))
mlpt.scatter(preds, regressor.resid_pearson, c="r"), mlpt.axhline(y=0,
                                                                  color='blue')
mlpt.xlabel("Fitted_Values")
mlpt.ylabel("Residuals")
mlpt.show()

#Normality Test for Residuals
#In Order to have a Model's good fit it is important to have the Residuals follow a Normal Distribution Pattern

# Normal Distribution Check using Q-Q plot
mlpt.figure(figsize=(20, 15))
st.probplot(
    regressor.resid_pearson, dist="norm",
    plot=pylab)  #Residuals can be said to be nearly Normally Distributed
mlpt.show()

#To predict the mean values.
print('Mean Absolute Error by Stats Model:',
      metrics.mean_absolute_error(salary_dataframe['Salary'], preds))
print('Mean Squared Error by Stats Model:',
      metrics.mean_squared_error(salary_dataframe['Salary'], preds))
print(
    'Root Mean Squared Error by Stats Model:',
    numpy.sqrt(metrics.mean_squared_error(salary_dataframe['Salary'], preds)))
print("R-Square Value of the Model(Measure of Fit): ", regressor.rsquared)

#Conclusion: transforming the predictor variable with log/sqrt/normalize yields a lower R-Squared value and a higher RMSE value, so
#scale was used instead to standardize the predictor values, which yields a better R-Squared value and a lower RMSE than the other transformations
    sys.stdout.write("\nBoundaries for 4 Equal Percentiles \n")
    print(ptiles_vers)
    sys.stdout.write(" \n")
    plt.figure()
    x_vers, y_vers=ecdf(df.iloc[:,i])
    plt.plot(x_vers, y_vers, '.')
    plt.xlabel(columns[i-13])
    plt.ylabel('ECDF')
    plt.title("Percentiles ECDF of column "+str(columns[i-13]))
    plt.plot(ptiles_vers, percentiles/100, marker='D', color='red', linestyle='none')
    plt.show()

#Quantile-Quantile Plot for each feature/target column
for i in range(13,27):
    plt.figure()
    stats.probplot(df.iloc[:,i], dist="norm",plot=plt)
    plt.show()

#graphical summary of the relationships
plt.figure()
sns.pairplot(df[columns], size=2.5)
plt.tight_layout()
plt.show()

cols=['ZN','CHAS','RM','DIS','B','MEDV']
plt.figure()
sns.pairplot(df[cols], size=2.5)
plt.tight_layout()
plt.show()

plt.figure()
Example #46
xs = np.linspace(min(x), max(x), 100)
ys_unthinned = 0.5 * ys1 + 0.5 * ys2
plt.plot(xs, ys_unthinned, color='black', label='pdf_unthinned', alpha=0.3)
plt.xlabel('Values')
plt.legend()

plt.subplot(3, 4, 9)
plt.plot(np.sort(g), color='blue', label='g_sorted', alpha=0.3)
plt.plot(np.sort(x), color='green', label='x_sorted', alpha=0.3)
plt.title('Ordered comparison: X vs G')
plt.xlabel('Samples')
plt.ylabel('Values g, x')
plt.legend()

plt.subplot(3, 4, 10)
probplot(g, dist='norm', plot=pylab)
plt.title('QQ Plot: G')

plt.subplot(3, 4, 11)
probplot(x, dist='norm', plot=pylab)
plt.title('QQ Plot: X')

plt.subplot(3, 4, 12)
plt.scatter(z, g)
plt.title('Mapping from noise z to g')
plt.xlabel('Noise z')
plt.ylabel('Generated value')

filename = os.path.join(base_path, 'result_plot.png')
plt.savefig(filename)
def qq_residuals(error_residuals):
    return stats.probplot(error_residuals, dist="norm", plot=pylab)
Example #48
def norm_plot(data_frame,var_name):
    sns.distplot(data_frame[var_name], fit=norm);
    fig = plt.figure()
    res = stats.probplot(data_frame[var_name], plot=plt)
    plt.show()
Example #49
def qq_residuals(error_residuals):
    #measurements = np.random.normal(loc = 20, scale = 5, size=100)
    stats.probplot(error_residuals, dist='norm', plot=pylab)
    pylab.show()
Example #50
def qqplot(df, player, dist_name, dfs_pts_col='PTS_DK'):
    pts = df.loc[player, dfs_pts_col]
    dist = getattr(stats, dist_name)
    stats.probplot(pts, sparams=dist.fit(pts), dist=dist_name, plot=pylab)
# drop unnecessary variables
vars_keep = ['Season', 'Team', 'wPCT', 'wOBA', 'FIP', 'Def', 'BsR']
team_df = team_df[vars_keep]

### 2. EDA ###
# data structure
print('Data Structure: {}'.format(team_df.shape))

# normality
# 'wPCT' normality
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

sns.histplot(team_df['wPCT'], kde=True, ax=axes[0])
axes[0].set_title('Team Winning Percentage Histogram')

stats.probplot(team_df['wPCT'], plot=axes[1])
axes[1].set_title('Team Winning Percentage Q-Q Plot')

plt.show()

# independent variables normality
ind_vars = ['wOBA', 'FIP', 'Def', 'BsR']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for col, ax in zip(ind_vars, axes.flatten()[:7]):
    sns.histplot(team_df[col], kde=True, color='navy', ax=ax)
    ax.set_title('Team {} Histogram'.format(col))

plt.show()
 def draw_qq(self, content, title, write_figure_path):
     content = content.dropna(subset=[title])
     stats.probplot(content[title], dist="norm", plot=plt)
     plt.title(title)
     plt.savefig(write_figure_path)
     plt.close()
from scipy.stats import norm
from scipy import stats
import matplotlib.pyplot as plt

# set of objects in a basket: the mean weight is 8 and the standard deviation
# is 2; what's the probability of picking an object that weighs less than 6 kg?

norm.cdf(6, 8, 2)

# probability of picking an object heavier than 6 kg
norm.sf(6, 8, 2)
1 - norm.sf(6, 8, 2)  # complement: back to P(weight <= 6 kg)

# probability of an object under 6 kg or over 10 kg
norm.cdf(6, 8, 2) + norm.sf(10, 8, 2)

# probability of an object over 8 kg and under 10 kg
norm.cdf(10, 8, 2) - norm.cdf(8, 8, 2)
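For reference, these calls can be checked by hand with z-scores (the approximate values below are added here, not part of the original):

# z = (6 - 8) / 2 = -1, so norm.cdf(6, 8, 2) = Phi(-1) ~ 0.1587
# norm.sf(6, 8, 2) = 1 - 0.1587 ~ 0.8413
# norm.cdf(6, 8, 2) + norm.sf(10, 8, 2) ~ 0.1587 + 0.1587 = 0.3173
# norm.cdf(10, 8, 2) - norm.cdf(8, 8, 2) = Phi(1) - Phi(0) ~ 0.3413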

data = norm.rvs(size=100)
stats.probplot(data, plot=plt)

stats.shapiro(data)
catDict = dict(zip(list(unique), range(len(unique))))
catCount = [0] * 2
for elt in colData:
    catCount[catDict[elt]] += 1

sys.stdout.write("\nCounts for Each Value of Categorical Label \n")
print(list(unique))
print(catCount)

#Quantile-Quantile Plot for 4th Rocks versus Mines Attribute
col = 3
colData = []
for row in xList:
    colData.append(float(row[col]))

stats.probplot(colData, dist="norm", plot=pylab)
pylab.show()

# list 2-5 Using Python Pandas to Read and Summarize Data
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plot
from random import uniform
from math import sqrt
import sys
target_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-"
    "databases/undocumented/connectionist-bench/sonar/sonar.all-data")
#read rocks versus mines data into pandas data frame
rocksVMines = pd.read_csv(target_url, header=None, prefix="V")
#print head and tail of data frame
Example #55
def main():

    data = []

    workbook = xlsxwriter.Workbook('data/distances.xls')
    all_distances_sheet = workbook.add_worksheet('All Distances')
    min_distances_sheet = workbook.add_worksheet('Min Distances')

    for t in range(len(Utils.standingUpActions)):
        with open('data/state-space/state-space-t{}-0.pkl'.format(t), 'rb') as handle:
            data += pickle.load(handle)

    sn = StateNormalizer()
    sn.extend_bounds()

    distances = []

    for index, targetAction in enumerate(Utils.standingUpActions):
        action = Utils.vecToInt(targetAction)
        targetState = getState(data, index, action)
        targetState = sn.normalize(targetState)
        for i in range(Utils.N_ACTIONS):
            if i == Utils.NULL_ACTION or i == action:
                continue
            s = getState(data, index, i)

            if s is None or targetState is None:
                continue
            s = sn.normalize(s)
            d = euclidean(s, targetState)
            distances.append(d)

    print('mean: {}'.format(numpy.mean(distances)))
    print('var: {}'.format(numpy.var(distances)))
    print('median: {}'.format(numpy.median(distances)))
    print('max: {}'.format(numpy.max(distances)))
    print('min: {}'.format(numpy.min(distances)))

    all_distances_sheet.write(0, 0, 'Min')
    all_distances_sheet.write(0, 1, numpy.min(distances))
    all_distances_sheet.write(1, 0, 'Max')
    all_distances_sheet.write(1, 1, numpy.max(distances))
    all_distances_sheet.write(2, 0, 'Mean')
    all_distances_sheet.write(2, 1, numpy.mean(distances))
    all_distances_sheet.write(3, 0, 'Variance')
    all_distances_sheet.write(3, 1, numpy.var(distances))
    all_distances_sheet.write(4, 0, 'Median')
    all_distances_sheet.write(4, 1, numpy.median(distances))

    # measurements = numpy.random.normal(loc = 20, scale = 5, size=100)
    # probplot(measurements, dist="norm", plot=pylab)
    # pylab.show()

    probplot(distances, dist="norm", plot=pylab)
    pylab.show()

    # sm.qqplot(numpy.array(distances), line='45')
    # pylab.show()

    with open('data/state-space/state-space-all-0.pkl', 'rb') as handle:
        data = pickle.load(handle)

    for i in range(len(data)):
        data[i] = sn.normalize(data[i])

    kdtree = KDTree(data)

    min_dists = []
    for i in range(len(data)):
        dists, indexes = kdtree.query(data[i], 2)
        min_dists.append(dists[1])

    #numpy.set_printoptions(threshold=numpy.nan)
    # print(min_dists)
    print('-----------------')
    print('min: {}'.format(numpy.min(min_dists)))
    print('max: {}'.format(numpy.max(min_dists)))
    print('mean: {}'.format(numpy.mean(min_dists)))
    print('variance: {}'.format(numpy.var(min_dists)))
    print('median: {}'.format(numpy.median(min_dists)))

    min_distances_sheet.write(0, 0, 'Min')
    min_distances_sheet.write(0, 1, numpy.min(min_dists))
    min_distances_sheet.write(1, 0, 'Max')
    min_distances_sheet.write(1, 1, numpy.max(min_dists))
    min_distances_sheet.write(2, 0, 'Mean')
    min_distances_sheet.write(2, 1, numpy.mean(min_dists))
    min_distances_sheet.write(3, 0, 'Variance')
    min_distances_sheet.write(3, 1, numpy.var(min_dists))
    min_distances_sheet.write(4, 0, 'Median')
    min_distances_sheet.write(4, 1, numpy.median(min_dists))

    workbook.close()
Example #56
def probplot(column):
    plt.figure()
    if type(column) == str:
        stats.probplot(all_data[column], plot=plt)
    else:
        stats.probplot(column, plot=plt)
from scipy.stats import norm, skew

import seaborn as sns

import matplotlib.pyplot as plt 

(mu, sigma) = norm.fit(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'])

plt.figure(figsize = (14, 7))
sns.distplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], fit = norm)
plt.ylabel('Frequency')
plt.title('Length of Stay (Tempo de Permanência) - Distribution')
plt.legend(['Normal Dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best')

quantile_plot = stats.probplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], plot = plt)

import numpy as np

df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'] = np.log1p(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'])

(mu, sigma) = norm.fit(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'])
plt.figure(figsize = (14, 7))
plt.subplot(1,2,1)
sns.distplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], fit = norm)
plt.ylabel('Frequency')
plt.title('Length of Stay (Tempo de Permanência) - Distribution')
plt.subplot(1, 2, 2)
quantile_plot = stats.probplot(df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'], plot = plt)

#Skewness calculation for the data
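The snippet ends at this comment; the natural next line (a hypothetical continuation, not in the original) would be:

df_julho_19_reg['TEMPO_PERM_INT_POSTERIOR'].skew()  # hypothetical continuation: skewness after log1p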
def qq_residuals(error_residuals):
    stats.probplot(error_residuals, dist='norm', plot=pylab)
    pylab.show()
    return None
Example #59
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend(
    ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
plt.show()

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# Applying log transformation
train['SalePrice'] = np.log(train['SalePrice'])
'''
#Transformed histogram and normal probability plot
--------------------------------------------------
We note how we got more normal values for SalePrice; the majority of the values are fairly distributed on the two sides of
the mean. 
'''
# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
Example #60
    def eda_numerical_variable(self, variable):
        """
        This provides basic EDA of the Numerical variable passed,
            - Basic Statistics like Count, Data Type, min, max, mean, median, etc., 
            - Missing Values count and missing percentages 
            - Generates distribution plots. Histogram and KDE Plots 
            - Skewness and Kurtosis
            - Q-Q plot to check Normality
            - Box plot to check the spread outliers
            - Outliers using IQR
            - Various variable transformations

        Parameter :
        ----------
            variable: Pass the Numerical variable for which EDA is required
        """
        c = variable
        s = self.__df__[variable]

        
        # 1. Basic Statistics

        print ('Total Number of observations : ', len(s))
        print ()

        print ('Datatype :', (s.dtype))
        print ()

        printmd ('**<u>5 Point Summary :</u>**')

        print ('  Minimum  :\t\t', s.min(), '\n  25th Percentile :\t', s.quantile(0.25), 
               '\n  Median :\t\t', s.median(), '\n  75th Percentile :\t', s.quantile(0.75), 
               '\n  Maximum  :\t\t', s.max())

        print ()

        # 2. Missing values

        printmd ('**<u>Missing Values :</u>**')

        print ('  Number :', s.isnull().sum())
        print ('  Percentage :', s.isnull().mean()*100, '%')

        # 3. Histogram
        
        printmd ('**<u>Variable distribution and Spread statistics :</u>**')

        sns.distplot(s.dropna(), hist = True, fit = norm, kde = True)
        plt.show()

        # 4. Spread Statistics

        print ('Skewness :' , s.skew())
        print ('Kurtosis :', s.kurt())
        print ()

        # 5. Q-Q plot
        printmd ('**<u>Normality Check :</u>**')
        res = stats.probplot(s.dropna(), dist = 'norm', plot = plt)
        plt.show()

        # 6. Box plot to check the spread outliers
        print ()
        printmd ('**<u>Box Plot and Visual check for Outlier  :</u>**')
        sns.boxplot(s.dropna(), orient = 'v')
        plt.show()

        # 7. Get outliers. Here distance could be a user defined parameter which defaults to 1.5

        print ()
        printmd ('**<u>Outliers (using IQR):</u>**')

        IQR = np.quantile(s, .75) - np.quantile(s, .25)
        upper_boundary = np.quantile(s, .75) + 1.5 * IQR
        lower_boundary = np.quantile(s, .25) - 1.5 * IQR

        print ('  Right end outliers :', np.sum(s>upper_boundary))
        print ('  Left end outliers :', np.sum(s < lower_boundary))

        # 8. Various Variable Transformations

        print ()
        printmd (f'**<u>Explore various transformations for {c}</u>**')
        print ()

        print ('1. Logarithmic Transformation')
        s_log = np.log(s)
        normality_diagnostic(s_log)

        print ('2. Exponential Transformation')
        s_exp = np.exp(s)
        normality_diagnostic(s_exp)

        print ('3. Square Transformation')
        s_sqr = np.square(s)
        normality_diagnostic(s_sqr)

        print ('4. Square-root Transformation')
        s_sqrt = np.sqrt(s)
        normality_diagnostic(s_sqrt)

        print ('5. Box-Cox Transformation')
        s_boxcox, lambda_param = stats.boxcox(s)
        normality_diagnostic(s_boxcox)
        print ('Optimal Lambda for Box-Cox transformation is :', lambda_param )
        print ()

        print ('6. Yeo Johnson Transformation')
        s = s.astype('float')
        s_yeojohnson, lambda_param = stats.yeojohnson(s)
        normality_diagnostic(s_yeojohnson)
        print ('Optimal Lambda for Yeo Johnson transformation is :', lambda_param )
        print ()
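normality_diagnostic() is called throughout but never defined in this snippet; a minimal sketch consistent with the plots this class already draws (distribution plot with a normal fit, then a Q-Q plot; assumes the pd/sns/stats/norm imports used elsewhere in the example) could be:

def normality_diagnostic(s):
    # hypothetical helper: histogram/KDE with normal fit, then a Q-Q plot
    s = pd.Series(s).dropna()
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    sns.distplot(s, fit=norm)
    plt.subplot(1, 2, 2)
    stats.probplot(s, dist='norm', plot=plt)
    plt.show()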