def ks_metric(y_true, y_scores, bins, path):
    '''
    :param y_true: array-like of shape = (n_samples) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    :param y_scores: array-like of shape = (n_samples) or (n_samples, n_outputs)
        Estimated target values.
    :param bins: number of bins used for the score CDFs
    :param path: if path equals 0, show the KS plot; if path is a file path string,
        the KS plot is saved to that path.
    :return: KS value
    '''
    df = pd.DataFrame({'y': y_true, 'score': y_scores})
    cdf_data1 = df[df['y'] == 0]['score']
    cdf_data2 = df[df['y'] == 1]['score']
    # use the same bin limits for both classes so the two CDFs are comparable bin by bin
    limits = (df['score'].min(), df['score'].max())
    cdf1 = stats.cumfreq(cdf_data1, numbins=bins, defaultreallimits=limits)
    cdf2 = stats.cumfreq(cdf_data2, numbins=bins, defaultreallimits=limits)
    y_0 = cdf1[0] / cdf1[0][-1]
    y_1 = cdf2[0] / cdf2[0][-1]
    cdf_data = pd.DataFrame({'y_0': y_0, 'y_1': y_1})
    # plot
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(cdf_data)
    ax.legend(list(cdf_data.columns))
    plt.ylabel('Cumulative probability')
    plt.xlabel('Predicted score')
    if path == 0:
        plt.show()
    else:
        plt.savefig(path, dpi=150)
    # KS value: maximum absolute distance between the two class CDFs
    ks = np.max(np.abs(y_0 - y_1))
    return ks
def plotCDF(forecast, validation, title): ax1 = plt.figure(figsize=(7, 5)) vals, x1, x2, x3 = cumfreq(forecast['modelled'], len(forecast['modelled'])) ax1 = plt.plot(np.linspace(np.min(forecast['modelled']), np.max(forecast['modelled']), len(forecast['modelled'])), vals / len(forecast['modelled']), "r", label=str(config.get('Main options', 'RunName'))) vals, x1, x2, x3 = cumfreq(validation['modelled'], len(validation['modelled'])) ax2 = plt.plot(np.linspace(np.min(validation['modelled']), np.max(validation['modelled']), len(validation['modelled'])), vals / len(validation['modelled']), "b", label=str(config.get('Reference options', 'RunName'))) vals, x1, x2, x3 = cumfreq(validation['observations'], len(validation['observations'])) ax3 = plt.plot(np.linspace(np.min(validation['observations']), np.max(validation['observations']), len(validation['observations'])), vals / len(validation['observations']), "black", label="Observations") ax3 = plt.legend(prop={'size': 10}, loc=2) ax1 = plt.title(title) ax1 = plt.xlabel("Discharge (m3/s)") ax1 = plt.ylabel("ECDF") ax1 = plt.gcf().set_tight_layout(True) pdf.savefig() plt.clf()
def HM_color_transfer(style, content): """ Color transfer the content image to the style image using cumulative distribution of both images, Args: style: target style in RGB space. content: content image in RGB space. Returns: Color transfer image in RGB space. """ #copy style image and content image then convert them from 0:1 to 0:255 scale. transfered = np.copy(content) style = np.copy(style) transfered *=255 style *= 255 #calculate normalized cumulative histogram then update the content image based on the calculated values. for h in range (0,3): content_c = transfered[:,:,h] style_c = style[:,:,h] height , width = content_c.shape contentValues,_,_,_ = stats.cumfreq(content_c, numbins=256) contentValues /= contentValues[-1] styleValues,_,_,_ = stats.cumfreq(style_c, numbins=256) styleValues /= styleValues[-1] K=256 new_values=np.zeros((K)) for a in np.arange(K): j=K-1 while True: new_values[a]=j j=j-1 if j<0 or contentValues[a]>styleValues[j]: break for i in np.arange(height): for j in np.arange(width): a=content_c.item(i,j) b=new_values[int(a)] transfered[:,:,h].itemset((i,j),b) #transfered[:,:,h] = gaussian(transfered[:,:,h]) #return the image to 0:1 scale transfered = transfered /255 return transfered
def compare_cdfs(data, A, num_bins=100):
    cdfs = {}
    assert len(np.unique(A)) == 2

    limits = (min(data), max(data))
    s = 0.5 * (limits[1] - limits[0]) / (num_bins - 1)
    limits = (limits[0] - s, limits[1] + s)

    for a in np.unique(A):
        subset = data[A == a]
        cdfs[a] = cumfreq(subset, numbins=num_bins, defaultreallimits=limits)

    lower_limits = [v.lowerlimit for _, v in cdfs.items()]
    bin_sizes = [v.binsize for _, v in cdfs.items()]
    actual_num_bins = [v.cumcount.size for _, v in cdfs.items()]

    assert len(np.unique(lower_limits)) == 1
    assert len(np.unique(bin_sizes)) == 1
    assert np.all([num_bins == v.cumcount.size for _, v in cdfs.items()])

    xs = lower_limits[0] + np.linspace(0, bin_sizes[0] * num_bins, num_bins)

    disparities = np.zeros(num_bins)
    for i in range(num_bins):
        cdf_values = np.clip(
            [v.cumcount[i] / len(data[A == k]) for k, v in cdfs.items()], 0, 1)
        disparities[i] = max(cdf_values) - min(cdf_values)

    return xs, cdfs, disparities
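A minimal usage sketch for compare_cdfs, assuming numpy and cumfreq are imported as in the function above; the scores and group labels are synthetic and purely illustrative.

# Hypothetical usage sketch for compare_cdfs (synthetic data, illustrative only).
import numpy as np

rng = np.random.default_rng(0)
scores = np.concatenate([rng.normal(0.4, 0.1, 500), rng.normal(0.6, 0.1, 500)])
groups = np.array([0] * 500 + [1] * 500)

xs, cdfs, disparities = compare_cdfs(scores, groups, num_bins=100)
# the largest per-bin gap between the two group CDFs, a KS-style disparity measure
print("max CDF disparity:", disparities.max())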
def visualize_cumulative_sum():
    ssandtss = [ss_mins, tss_mins, ss3end_mins]
    plt.rcParams["font.size"] = 16
    for index in range(3):
        plt.figure()
        dists = []
        for dis in ssandtss[index]:
            if abs(int(dis)) > args.xlimit:
                dists.append(args.xlimit + 2)
                continue
            dists.append(abs(int(dis)))
        cums = stats.cumfreq(dists, numbins=args.xlimit + 2)
        plt.xlabel('distance [bp]')
        x = pd.Series(cums.cumcount)
        plt.xlim(0, args.xlimit)
        ax = sns.lineplot(data=x)
        if index == 0:
            plt.ylabel('Number of splice sites')
            plt.savefig("ss_cumulative_plot.png", dpi=500, bbox_inches='tight')
        elif index == 1:
            plt.ylabel('Number of TSSs')
            plt.savefig("tss_cumulative_plot.png", dpi=500, bbox_inches='tight')
        else:
            plt.ylabel('Number of 3-prime ends')
            plt.savefig("ss3end_cumulative_plot.png", dpi=500, bbox_inches='tight')
def create_cdf(X):
    """Create the cumulative density function of a continuous random variable,
    e.g. observed data.

    arguments
    X    -    Observed data from which a cdf should be constructed; please
              provide the data as a vector (Nx1 NumPy array or list).

    returns
    (bins, cdf)
    bins -    The sorted values in X, used as bins for the cdf.
    cdf  -    The cumulative density for each value in bins.
    """
    # convert the data to a NumPy array
    data = numpy.array(X)
    # the bins are the sorted values in the data
    bins = copy.deepcopy(data)
    bins.sort()
    # calculate the cumulative frequency for each value in the data
    cumfreq, lowerlim, binsize, extra = stats.cumfreq(data, numbins=len(bins))
    # transform the cumulative frequencies to a cdf
    cdf = cumfreq / numpy.max(cumfreq)

    return bins, cdf
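A minimal usage sketch for create_cdf, assuming the function's own imports (numpy, copy, scipy.stats) are available; the observed data are synthetic and purely illustrative.

# Hypothetical usage sketch for create_cdf (illustrative only).
import numpy
observed = numpy.random.randn(200)
bins, cdf = create_cdf(observed)
# cdf[i] is the cumulative density at the i-th sorted value in bins
print(bins[:3], cdf[:3])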
def cumdist(vec, nbins=100):
    # density=True replaces the deprecated normed=True histogram argument
    hist(vec, color='g', bins=nbins, density=True, align='mid')
    # hist(vec, bins=nbins, density=False, align='mid')
    # figure(2)
    disc = cumfreq(vec, numbins=nbins)
    plot(disc[0] / len(vec))
    show()
def plot_percent_percentile_plot(test_pred_do, test_y1): test_pred_do = np.swapaxes(test_pred_do, 0, 1) percentile = np.zeros((test_pred_do.shape[0], test_pred_do.shape[1])) z_score = np.zeros_like(percentile) for i in range(test_pred_do.shape[0]): for j in range(test_pred_do.shape[1]): if test_y1[i, j, 1] == 0: percentile[i, j] = np.nan z_score[i, j] = np.nan continue temp = np.append(test_pred_do[i, j, :], test_y1[i, j, 0]) temp = np.sort(temp) ix = np.where(temp == test_y1[i, j, 0]) percentile[i, j] = ix[0][0] / (len(temp) - 1) * 100 z_score[i, j] = (test_y1[i, j, 0] - np.mean( test_pred_do[i, j, :])) / np.std(test_pred_do[i, j, :]) mask = test_y1[:, :, 1].reshape((-1, )) ix = np.where(mask == 1) percentile = percentile.reshape((-1, ))[ix] #pyplot.figure(); #pyplot.hist(percentile,bins=100); res = stats.cumfreq(percentile, numbins=100) x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size, res.cumcount.size) pyplot.figure() pyplot.bar(x, res.cumcount / np.count_nonzero(mask) * 100, width=res.binsize) pyplot.plot(x, x, '-r', label='y=x') pyplot.show() return z_score
def KS_principle(inData):
    '''Show the principle of the Kolmogorov-Smirnov test.'''

    # CDF of normally distributed data
    nd = stats.norm()
    nd_x = np.linspace(-4, 4, 101)
    nd_y = nd.cdf(nd_x)

    # Empirical CDF of the sample data, which range from approximately 0 to 10
    numPts = 50
    lowerLim = 0
    upperLim = 10
    ecdf_x = np.linspace(lowerLim, upperLim, numPts)
    ecdf_y = stats.cumfreq(inData, numPts, (lowerLim, upperLim))[0] / len(inData)

    # Add zero-point by hand
    ecdf_x = np.hstack((0., ecdf_x))
    ecdf_y = np.hstack((0., ecdf_y))

    # Plot the data
    sns.set_style('ticks')
    sns.set_context('poster')
    setFonts(36)

    plt.plot(nd_x, nd_y, 'k--')
    plt.plot(ecdf_x, ecdf_y, color='k')
    plt.xlabel('X')
    plt.ylabel('Cumulative Probability')

    # For the arrow, find the start
    ecdf_startIndex = np.min(np.where(ecdf_x >= 2))
    arrowStart = np.array([ecdf_x[ecdf_startIndex], ecdf_y[ecdf_startIndex]])

    nd_startIndex = np.min(np.where(nd_x >= 2))
    arrowEnd = np.array([nd_x[nd_startIndex], nd_y[nd_startIndex]])
    arrowDelta = arrowEnd - arrowStart

    plt.arrow(arrowStart[0], arrowStart[1], 0, arrowDelta[1], width=0.05,
              length_includes_head=True, head_length=0.04, head_width=0.4, color='k')
    plt.arrow(arrowStart[0], arrowStart[1] + arrowDelta[1], 0, -arrowDelta[1], width=0.05,
              length_includes_head=True, head_length=0.04, head_width=0.4, color='k')

    outFile = 'KS_Example.png'
    showData(outFile)
def cdf_vals_from_data(data, numbins=None, maxbins=None):
    # make sure data is a numpy array
    data = numpy.array(data)

    # by default, use numbins equal to number of distinct values
    # TODO: shouldn't this be one per possible x val?
    if numbins is None:
        numbins = numpy.unique(data).size
    if maxbins is not None and numbins > maxbins:
        numbins = maxbins

    # bin the data and count fraction of points in each bin (for PDF)
    rel_bin_counts, min_bin_x, bin_size, _ =\
        stats.relfreq(data, numbins, (data.min(), data.max()))

    # bin the data and count each bin (cumulatively) (for CDF)
    cum_bin_counts, min_bin_x, bin_size, _ =\
        stats.cumfreq(data, numbins, (data.min(), data.max()))

    # normalize bin counts so rightmost count is 1
    cum_bin_counts /= cum_bin_counts.max()

    # make array of x-vals (lower end of each bin)
    x_vals = numpy.linspace(min_bin_x, min_bin_x + bin_size*numbins, numbins)

    # CDF always starts at y=0
    cum_bin_counts = numpy.insert(cum_bin_counts, 0, 0)  # y = 0
    cdf_x_vals = numpy.insert(x_vals, 0, x_vals[0])      # x = min x

    return cum_bin_counts, cdf_x_vals, rel_bin_counts, x_vals
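A minimal usage sketch for cdf_vals_from_data, assuming numpy and scipy.stats are imported as the function requires; the latency data are synthetic and purely illustrative.

# Hypothetical usage sketch for cdf_vals_from_data (illustrative only).
import numpy
latencies = numpy.random.exponential(scale=10.0, size=1000)
cdf_y, cdf_x, pdf_y, pdf_x = cdf_vals_from_data(latencies, numbins=50)
# cdf_x/cdf_y can be passed directly to a step or line plot of the empirical CDF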
def cdfs(valueses, xlabel='value', labels=None, title='CDF', n_bins=500):
    """
    Plot one or more cumulative density functions
    :param valueses: list of value arrays, one per CDF
    :param xlabel: label for the x axis
    :param labels: optional list of series labels
    :param title: plot title
    :param n_bins: number of bins used for each CDF
    :return: the multiline plot of all CDFs
    """
    x_valueses = []
    y_valueses = []
    logger.debug("cdfs")
    for values in valueses:
        freq = cumfreq(values, n_bins)
        x_values = [freq.lowerlimit + x * freq.binsize for x in range(0, n_bins)]
        y_values = freq.cumcount / len(values)
        logger.debug("binsize: %f" % freq.binsize)
        logger.debug("range: %f" % (freq.binsize * n_bins))
        logger.debug("y range: %f - %f" % (min(y_values), max(y_values)))
        x_valueses.append(x_values)
        y_valueses.append(y_values)
    return multiline(x_valueses, y_valueses, title=title, xlabel=xlabel,
                     ylabel='density', labels=labels)
def plotRECCurve(self, nbins=20, highlight_error=None, linestyle='-', linewidth=1.0):
    """
    Plot a Regression Error Characteristic (REC) curve.
    The resulting REC curve shows the cumulative distribution of errors over the
    dataset, where the error is measured as the distance of the mode of the mixture
    distribution from the target value, in standard deviations.
    TODO: Use the true mode rather than the kernel with the largest mixing coefficient.
    """
    if self.y is None:
        self.update()
    alpha, sigma2, mu = mdn.getMixtureParams(self.y, self.module.M, self.module.c)
    #maxidxs = np.argmax(alpha, axis=1)
    maxidxs = self.getMaxKernel(alpha, sigma2)
    N = len(mu)
    mu = mu[np.arange(0, N), maxidxs]
    sigma2 = sigma2[np.arange(0, N), maxidxs]
    dist = np.sum(np.abs(mu - self.tgts), axis=1)
    dist /= np.sqrt(sigma2)
    h, _, _, _ = cumfreq(dist, nbins, [0, 10])
    h /= N
    plt.plot(np.linspace(0, 10, nbins), h, linestyle, linewidth=linewidth)
    if highlight_error:
        plt.vlines(highlight_error, 0, 1, linestyles='-.')
    plt.xlabel(r'$\epsilon$ [n std deviations]')
    plt.ylabel('accuracy')
    return dist
def main(): # Univariate data ------------------------- # Generate data that are normally distributed x = randn(500) # Set the fonts the way I like them sns.set_context('poster') sns.set_style('ticks') #mystyle.set() # Scatter plot scatter(arange(len(x)), x) xlim([0, len(x)]) mystyle.printout('scatterPlot.png', xlabel='x', ylabel='y', title='Scatter') # Histogram hist(x) mystyle.printout('histogram_plain.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, default settings') hist(x,25) mystyle.printout('histogram.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, 25 bins') # Cumulative probability density numbins = 20 plot(stats.cumfreq(x,numbins)[0]) mystyle.printout('CumulativeFrequencyFunction.png', xlabel='Data Values', ylabel='Cumulative Frequency') # Boxplot # The ox consists of the first, second (middle) and third quartile boxplot(x, sym='*') mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot') boxplot(x, sym='*', vert=False) title('Boxplot, horizontal') xlabel('Values') show() # Errorbars x = arange(5) y = x**2 errorBar = x/2 errorbar(x,y, yerr=errorBar, fmt='o', capsize=5, capthick=3) xlim([-0.2, 4.2]) ylim([-0.2, 19]) mystyle.printout('Errorbars.png', xlabel='Data Values', ylabel='Measurements', title='Errorbars') # Violinplot nd = stats.norm data = nd.rvs(size=(100)) nd2 = stats.norm(loc = 3, scale = 1.5) data2 = nd2.rvs(size=(100)) # Use pandas and the seaborn package for the violin plot df = pd.DataFrame({'Girls':data, 'Boys':data2}) #sns.violinplot(df, color = ["#999999", "#DDDDDD"]) sns.violinplot(df) mystyle.printout('violinplot.png')
def main(): # Univariate data ------------------------- # Generate data that are normally distributed x = randn(500) # Set the fonts the way I like them sns.set_context('paper') sns.set_style('white') mystyle.set() # Scatter plot plot(x, '.') mystyle.printout('scatterPlot.png', xlabel='x', ylabel='y', title='Scatter') # Histogram hist(x, color='#999999') mystyle.printout('histogram_plain.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, default settings') hist(x, 25, color='#999999') mystyle.printout('histogram.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, 25 bins') # Cumulative probability density numbins = 20 plot(stats.cumfreq(x, numbins)[0]) mystyle.printout('CumulativeFrequencyFunction.png', xlabel='Data Values', ylabel='Cumulative Freuqency') # Boxplot # The ox consists of the first, second (middle) and third quartile boxplot(x, sym='*') mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot') boxplot(x, sym='*', vert=False) title('Boxplot, horizontal') xlabel('Values') show() # Violinplot nd = stats.norm data = nd.rvs(size=(100)) nd2 = stats.norm(loc=3, scale=1.5) data2 = nd2.rvs(size=(100)) # Use pandas and the seaborn package for the violin plot df = pd.DataFrame({'Girls': data, 'Boys': data2}) sns.violinplot(df, color=["#999999", "#DDDDDD"]) mystyle.printout('violinplot.png')
def hist_eq(b):
    bf = b.flatten()
    min_, max_ = nanmin(bf), nanmax(bf)
    cumfreqs, lowlim, binsize, extrapoints = cumfreq(
        bf, numbins=256, defaultreallimits=(min_, max_))
    cumfreqs = (255.99 * cumfreqs / cumfreqs[-1]).astype('u1')
    result = (255.99 * (b - min_) / (max_ - min_)).clip(0, 255).astype('u1')
    return cumfreqs[result]
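A minimal usage sketch for hist_eq, assuming its dependencies (numpy's nanmin/nanmax and scipy.stats.cumfreq) are imported in the defining module; the low-contrast image is synthetic and purely illustrative.

# Hypothetical usage sketch for hist_eq (illustrative only): equalize a synthetic,
# low-contrast image stored as floats.
import numpy as np
img = np.random.normal(loc=120, scale=10, size=(64, 64))
equalized = hist_eq(img)
print(equalized.dtype, equalized.min(), equalized.max())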
def __init__(self, data, numBins=None):
    if not numBins:
        numBins = int(len(data) / 5)
    res = stats.cumfreq(data, numbins=numBins)
    self.cdistr = res.cumcount / len(data)
    self.loLim = res.lowerlimit
    self.upLim = res.lowerlimit + res.binsize * res.cumcount.size
    self.binWidth = res.binsize
def getPercentile(self): self.percentile16 = [] self.percentile84 = [] for i in range(0, 6): res = stats.cumfreq(self.fileData[i], numbins=400) self.percentile16.append( self.findPercentile(res, 0.16, len(self.fileData[i]))) self.percentile84.append( self.findPercentile(res, 0.84, len(self.fileData[i])))
def frequency(self, frequencyMap):
    arrayOfkeys = []
    weights = []
    for key in frequencyMap:
        arrayOfkeys.append(int(key))
        weights.append(frequencyMap[key])
    cumcount, lowerlimit, binsize, extrapoints = stats.cumfreq(
        arrayOfkeys, numbins=10, weights=weights)
    return cumcount
def draw_cdf(e, cap, subplot): subplot.set_title(e['dist'].__name__ + ' capacity=' + str(cap)) samples = sorted(e['dist'](cap)) res = stats.cumfreq(samples, numbins=cap, defaultreallimits=e['section']) x = min(e['section']) + np.linspace(0, res.binsize * res.cumcount.size, res.cumcount.size) subplot.bar(x, res.cumcount / cap, width=res.binsize) subplot.set_ylim([0, 1.2]) subplot.set_xlim([min(e['section']) - 1, max(e['section']) + 1])
def main(filename):
    counts = get_data(filename)
    sorted_counts = sorted([v for v in counts.values()])
    cumfreqs, lowlim, binsize, extrapoints = cumfreq(sorted_counts, max(sorted_counts))
    norm_cumfreqs = cumfreqs / max(cumfreqs)

    plot.plot(norm_cumfreqs[:500], linewidth=1.5)
    plot.xlabel("mapped reads")
    plot.ylabel("splice junction")
    plot.show()
def plot_cdf(hist_list, bins, norm_factor, min_spike_threshold, max_spike_threshold, plt_handle): res1 = stats.cumfreq(hist_list, numbins=len(bins), defaultreallimits=(min_spike_threshold, max_spike_threshold)) total_count = res1.cumcount[-1] cum_count = total_count - res1.cumcount plt_handle.plot(bins, cum_count * norm_factor)
def plotCDF(forecast, validation, title, xlims=[-1, 1]): vals, x1, x2, x3 = cumfreq(forecast, len(forecast)) ax1 = plt.plot(np.linspace(np.min(forecast), np.max(forecast), len(forecast)), vals / len(forecast), label='Simulation') vals, x1, x2, x3 = cumfreq(validation, len(validation)) ax2 = plt.plot(np.linspace(np.min(validation), np.max(validation), len(validation)), vals / len(validation), label='Reference') ax2 = plt.legend(prop={'size': 10}) ax1 = plt.title(title) ax1 = plt.xlabel("Value") ax1 = plt.ylabel("ECDF") ax1 = plt.xlim(xlims[0], xlims[1]) ax1 = plt.ylim(0, 1) pdf.savefig() plt.clf()
def plot(self, weights, row, col, shape, ix): full_lstm = np.zeros(shape) ix_lstm = np.zeros(shape) full_lstm[(row, col)] = weights ix_lstm[(row, col)] = 1 plt.imshow(-ix_lstm, cmap=plt.get_cmap('binary')) plt.savefig('{}/{}.png'.format(self.Dir, ix)) plt.clf() ng = shape[-1] // self.nh ix_lstm_p = np.reshape(ix_lstm, [-1, self.nh, ng]) reduce_row = np.sum(ix_lstm_p, axis=(0, -1)) from scipy import stats cumcount, lower, binsize, _ = stats.cumfreq(reduce_row, numbins=30) x = lower + np.linspace(0, binsize * cumcount.size, cumcount.size) plt.bar(x, cumcount / (len(reduce_row)), width=binsize) plt.xlim(0, 400) plt.xlabel('Parameters per Neuron') plt.ylabel('Cumulative %') plt.savefig('{}/cml{}.png'.format(self.Dir, ix)) plt.clf() ng = shape[-1] // self.nh input_list = [] rec_list = [] for i in range(ng): input_list.append( np.sum(ix_lstm[:self.ni, i * self.nh:(i + 1) * self.nh])) rec_list.append( np.sum(ix_lstm[self.ni:, i * self.nh:(i + 1) * self.nh])) print("ratio:", np.sum(input_list) / np.sum(rec_list), "true_ratio:", shape[0] / self.nh - 1) inds = np.arange(ng) width = 0.35 p1 = plt.bar(inds, input_list, width) p2 = plt.bar(inds, rec_list, width, bottom=input_list) if ng == 3: plt.xticks(inds, ('R-gate', 'Z-gate', 'O-gate')) elif ng == 4: plt.xticks(inds, ('I-gate', 'J-gate', 'F-gate', 'O-gate')) plt.ylabel('Number of Connections') plt.title("Remaining Weights by Gate and Type") plt.legend((p1[0], p2[0]), ('Input Parameters', 'Recurrent Parameters')) plt.savefig('{}/bar{}.png'.format(self.Dir, ix))
def GetDistributions(sample1, sample2, nbins):
    # For consistency between CDFs
    lower_limit = min(min(sample1), min(sample2))
    upper_limit = max(max(sample1), max(sample2))

    # Create objects (H1, H2) that include the cumulative frequency and surrounding information
    H1 = stats.cumfreq(sample1, numbins=nbins, defaultreallimits=(lower_limit, upper_limit))
    H2 = stats.cumfreq(sample2, numbins=nbins, defaultreallimits=(lower_limit, upper_limit))

    cumdist1 = H1.cumcount
    cumdist2 = H2.cumcount
    binsize = H1.binsize

    return lower_limit, upper_limit, H1, H2, cumdist1, cumdist2, binsize
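A minimal usage sketch for GetDistributions, assuming numpy and scipy.stats are imported; the two samples are synthetic, and the KS-style distance derived from the returned counts is purely illustrative.

# Hypothetical usage sketch for GetDistributions (illustrative only).
import numpy as np
s1 = np.random.normal(0.0, 1.0, 400)
s2 = np.random.normal(0.3, 1.0, 400)
lo, hi, H1, H2, cum1, cum2, binsize = GetDistributions(s1, s2, nbins=50)
# normalize the cumulative counts and take the maximum gap between the two CDFs
ks_distance = np.max(np.abs(cum1 / len(s1) - cum2 / len(s2)))
print(ks_distance)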
def cumulativePlot(samples, save_file=None): #fig = plt.figure(figsize=(8, 6)) res = stats.cumfreq(samples, numbins=25) x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size, res.cumcount.size) plt.bar(x, res.cumcount / res.cumcount[-1], width=res.binsize) plt.title('Cumulative histogram') plt.xlim([x.min(), x.max()]) plt.xlabel("Wind Speed m/s") if save_file is not None: plt.savefig(save_file, dpi=300, pad_inches=0, bbox_inches='tight') plt.show()
def plot_cumulative_frequency(vul_list): percent_fixing_commits = sorted([vul['num_fix_commits']/vul['num_release_commits']*100 for vul in vul_list]) cumulative_frequency = stats.cumfreq(percent_fixing_commits, defaultreallimits = (-1, 101), numbins=len(percent_fixing_commits)) trace = go.Scatter( name = 'Fixing release', x = [0] + percent_fixing_commits, y = [0] + list(map(lambda w: w/len(percent_fixing_commits) * 100, cumulative_frequency.cumcount)) ) layout = go.Layout( showlegend = False, yaxis = dict( title = 'Cumulative Frequency Distribution<br>(Fixing releases)', titlefont = dict(size=16), range = [0, 100], ticksuffix = '%' ), xaxis = dict( title = 'Fixing commits (%)', titlefont = dict(size=16), range = [0, 20], ticksuffix = '%' ), shapes = [ dict( type = 'line', x0 = 14.28, x1 = 14.28, y0 = 0, y1 = 110, line = dict( color = 'black', dash = 'dash' ) ), dict( type = 'line', x0 = 0, x1 = 110, y0 = 91.77, y1 = 91.77, line = dict( color = 'black', dash = 'dash' ) ) ] ) fig = go.Figure(data=[trace], layout=layout) fig.write_html('cum_freq_dist.html') fig.write_image('cum_freq_dist.pdf', height=400, width=600)
def plotCDF(forecast, validation, title, xlims=[-1, 1]): forecast[forecast < -1.01] = -1.01 vals, x1, x2, x3 = cumfreq(forecast, len(forecast)) ax1 = plt.plot(np.linspace(np.min(forecast), np.max(forecast), len(forecast)), vals / len(forecast), label=str(config.get('Main options', 'RunName'))) validation[validation < -1.01] = -1.01 vals, x1, x2, x3 = cumfreq(validation, len(validation)) ax2 = plt.plot(np.linspace(np.min(validation), np.max(validation), len(validation)), vals / len(validation), label=str(config.get('Reference options', 'RunName'))) ax2 = plt.legend(prop={'size': 10}, loc=2) ax1 = plt.title(title) ax1 = plt.xlabel("Value") ax1 = plt.ylabel("ECDF") ax1 = plt.xlim(xlims[0], xlims[1]) ax1 = plt.ylim(0, 1) ax1 = plt.gcf().set_tight_layout(True) pdf.savefig() plt.clf()
def iecdf(x, p, nbins=10):
    """f = iecdf(x, p, nbins=10)

    returns the inverse of the empirical cumulative distribution function
    at ordinate p
    """
    # if (p > 1 or p < 0):
    #     print("Error : Percentile p must be between 0 and 1.")
    #     exit
    cum = stats.cumfreq(x, nbins)
    a = cum[0] / len(x)
    lowlim = cum[1]
    bsize = cum[2]
    uplim = lowlim + bsize * nbins
    bins = np.linspace(lowlim + bsize / 2, uplim - bsize / 2, nbins)
    freqs = interpolate.interp1d(a, bins)
    f = freqs(p)
    return f
def computeCDF(data, precision=1000):
    from scipy.stats import cumfreq, scoreatpercentile

    freqs, _, _, _ = cumfreq(data, precision)
    # normalize the cumulative counts by the total count so the CDF ends at 1
    freqsNormalized = [f / freqs[-1] for f in freqs]

    values = []
    step = 100. / precision
    scores = numpy.arange(0, 100 + step, step)
    for s in scores:
        values.append(scoreatpercentile(data, s))

    return values, freqs, freqsNormalized
def ecdfSyt(df, Group, Conc, threshold):
    tmp = df[(df['Conc'] == Conc) & (df['Group'] == Group)].copy()
    tmp['time'] = np.round(tmp['time'], 1)
    tmp = tmp.sort_values('time')
    nbins = np.unique(tmp['time']).size
    tmp1 = cumfreq(tmp['time'].values, numbins=nbins)[0]
    time = np.unique(tmp['time'])
    tmparray = np.zeros([nbins, 4])
    DF = pd.DataFrame(tmparray, columns=["Group", "Conc", "time", "cumfreq"])
    DF['Group'] = [Group] * nbins
    DF['Conc'] = [Conc] * nbins
    DF['time'] = time
    DF['cumfreq'] = tmp1
    DF['cumfreq'] = DF['cumfreq'] / DF['cumfreq'].max()
    DF = DF[DF['cumfreq'] > threshold]
    # min-max rescale the remaining cumulative frequencies to [0, 1]
    DF['cumfreq'] = (DF['cumfreq'] - DF['cumfreq'].min()) / (DF['cumfreq'].max() - DF['cumfreq'].min())
    DF['time'] = DF['time'] - DF['time'].min()
    return DF
def cumulative_histogram(X, axis, bins=30):
    info = calculate_descriptive_stats(X)
    res = sps.cumfreq(X, numbins=bins)
    x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size,
                                     res.cumcount.size)
    axis.bar(
        x,
        res.cumcount / np.max(res.cumcount),
        width=res.binsize,
        color='b',
        label="Count: %d\nMin: %.4f\nMax: %.4f\nMean: %.4f\nStd: %.4f\nSkew: %.4f\nKurt: %.4f" % info)
    axis.set_title('Cumulative Histogram of Data')
    axis.set_xlim([x.min(), x.max()])
    axis.legend(loc='lower right')
def make_cdf(img, n_bins=256):
    """
    Creates CDF of input image (map from [0,255] to [0,1])
    Inputs:
        - img: input image
        - n_bins: Number of bins used
    Output:
        - Dictionary containing cumulative frequencies (CDF) of pixel values,
          contained in array, and number of items (pixels) used to compute CDF
    """
    cdf = stats.cumfreq(img, n_bins, (0, 255))[0]
    cdf_ = {'cdf': np.array(cdf) / int(max(cdf)), 'n_items': int(max(cdf))}
    return cdf_
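A minimal usage sketch for make_cdf, assuming numpy and scipy.stats are imported as the function requires; the random 8-bit image is synthetic and purely illustrative.

# Hypothetical usage sketch for make_cdf (illustrative only).
import numpy as np
img = np.random.randint(0, 256, size=(32, 32))
cdf_ = make_cdf(img)
# number of pixels used, and the cumulative fraction of pixels up to roughly value 128
print(cdf_['n_items'], cdf_['cdf'][128])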
def histogram(self, data_dict, file_name=False, save=False, resolution=None): if type(data_dict) == dict: data = [] for i in data_dict: data.extend(data_dict[i]) elif type(data_dict) == list: data = data_dict else: print("Input must be dictionary or list") return for j, mape in enumerate(data): if mape > 50: data[j] = 50 res = stats.cumfreq(data, numbins=15, defaultreallimits=(0, 50)) x = res.lowerlimit + np.linspace(0, res.binsize * res.cumcount.size, res.cumcount.size) cum_y = [ i / (max(res.cumcount) - min(res.cumcount)) * 100 for i in res.cumcount ] fig = plt.figure(figsize=(10, 4)) ax1 = fig.add_subplot(1, 2, 1) ax2 = fig.add_subplot(1, 2, 2) ax1.hist(data, bins=15, histtype='bar', ec='black') ax1.set_title('Histogram') ax1.set_xlabel('MAPE(%)', fontsize=12) ax1.set_ylabel('Frequency (Days)', fontsize=12) # ax2.bar(x, res.cumcount, width=res.binsize) ax2.plot(x, cum_y, '-o') ax2.set_title('Cumulative Histogram') ax2.set_xlim([x.min(), x.max()]) ax2.set_xlabel('MAPE(%)', fontsize=12) ax2.set_ylabel('Dataset Percentage (%)', fontsize=12) if save is True: plt.savefig(file_name + '.jpg', format='jpg', dpi=resolution, bbox_inches='tight')
def get_null_reference_cdf( lowerlimit: np.float32, upperlimit: np.float32, numbins: int = 1000, ) -> ModifiedECDF: """ This function will return a CDF to be used as a null reference. :param lowerlimit: lower bound for the CDF :param upperlimit: upperbound for the CDF :param numbins: How many bins should be used for the reference :returns: ModifiedECDF of all zeros for the specified range """ return ModifiedECDF( stats.cumfreq([], numbins=numbins, defaultreallimits=(lowerlimit, upperlimit)))
def p(hinj=[], hrec=[], s=[], psrname='', detname='', style=sd.default_style, methods=[]): for method in methods: # First Calculate the interquartile range #(http://comments.gmane.org/gmane.comp.python.scientific.user/19755) data = np.sort(hrec) upperQuartile = stats.scoreatpercentile(data, .75) lowerQuartile = stats.scoreatpercentile(data, .25) IQR = upperQuartile - lowerQuartile # Get ideal bin size #(http://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule) # fdsize = 3.49*np.std(data)*len(data)**(-1./3.) fdsize = 2 * IQR * len(data)**(-1. / 3.) #Get number of bins #(http://stats.stackexchange.com/questions/798/calculating-optimal-number-of-bins-in-a-histogram-for-n-where-n-ranges-from-30) num_bins = int((np.amax(data) - np.amin(data)) / fdsize) cumfreqs, lowlim, binsize, _ = stats.cumfreq(data, num_bins) pv = [1. - cdf / max(cumfreqs) for cdf in cumfreqs] bins = np.linspace(lowlim, num_bins * binsize, num_bins) plt.plot(bins, pv, style, color=sd.sd.pltcolor[method], label=method) plt.yscale('log') plt.title(detname + ' PSR ' + psrname) plt.xlabel('$h_{rec}$') plt.ylabel('1 - CDF (log scale)') plt.legend(numpoints=1) plt.savefig('plots/p_' + detname + '_' + psrname, bbox_inches='tight') print 'Plotted and saved in: ', print 'plots/p_' + detname + '_' + psrname plt.close()
def p_original(detector, psr, location='files/remote/source/'): d = pd.HDFStore(location + 'dataPitkin_' + detector + '.hdf5', 'r') a = d[psr].tolist() b = [abs(x) for x in a] # First Calculate the interquartile range #(http://comments.gmane.org/gmane.comp.python.scientific.user/19755) data = np.sort(d[psr].tolist()) upperQuartile = stats.scoreatpercentile(data,.75) lowerQuartile = stats.scoreatpercentile(data,.25) IQR = upperQuartile - lowerQuartile # Get ideal bin size #(http://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule) # fdsize = 3.49*np.std(data)*len(data)**(-1./3.) fdsize = 2 * IQR * len(data)**(-1./3.) #Get number of bins #(http://stats.stackexchange.com/questions/798/calculating-optimal-number-of-bins-in-a-histogram-for-n-where-n-ranges-from-30) num_bins = int((np.amax(data) - np.amin(data))/fdsize) cumfreqs, lowlim, binsize, _ = stats.cumfreq(data, num_bins) pv = [1. - cdf/max(cumfreqs) for cdf in cumfreqs] bins = np.linspace(lowlim, num_bins*binsize, num_bins) plt.plot(bins, pv, style, color=sd.pltcolor[method], label=method) plt.yscale('log') plt.title(detname + ' PSR ' + psrname) plt.xlabel('$h_{rec}$') plt.ylabel('1 - CDF (log scale)') plt.legend(numpoints=1) plt.savefig('plots/p_' + detname + '_' + psrname, bbox_inches='tight') print 'Plotted and saved in: ', print 'plots/p_' + detname + '_' + psrname plt.close()
def plot_cdfs(benchmark, benchmark_experiments, os_type, results_dir, output_dir, output_extension): print "Parsing and plotting runtime results for %d %s experiments...\n" % (len(benchmark_experiments), benchmark) runtime_results = parse_runtime_results(benchmark, benchmark_experiments, os_type, aggregate=False) if len(runtime_results) == 0: print "Not enough results found for %s. Skipping..." % benchmark return keyed_by_mem_size = defaultdict(list) for jvm_count, memsize_to_results in sorted(runtime_results.iteritems(), key=lambda t: t[0]): for memsize, runtimes in memsize_to_results.iteritems(): keyed_by_mem_size[memsize].append((jvm_count, runtimes)) for mem_size, jvm_to_runtimes in sorted(keyed_by_mem_size.iteritems(), key=lambda t: t[0]): plt.clf() ax = plt.subplot(111) longest_time = max(reduce(lambda x,y: x + y, [t[1] for t in jvm_to_runtimes])) shortest_time = min(reduce(lambda x,y: x + y, [t[1] for t in jvm_to_runtimes])) for jvm_count, runtime_list in jvm_to_runtimes: cum_freqs, ll, binsize, xp = cumfreq(runtime_list, numbins=len(runtime_list)) normed_cum_freqs = map(lambda x: x/max(cum_freqs), cum_freqs) padded_x = [shortest_time*0.8, min(runtime_list)] + sorted(runtime_list) + [longest_time*1.1] padded_y = [0, 0] + normed_cum_freqs + [1] ax.plot(padded_x, padded_y, label="%d JVMs" % jvm_count) # Apply labels and bounds plt.title("%s Mean Iteration Runtime CDF (%d MB Heap)" % (benchmark, mem_size)) plt.ylabel("Fraction of Jobs Completed") plt.xlabel("Time (ms)") plt.xlim(shortest_time*0.8, longest_time*1.1) plt.ylim(-0.025, 1.025) # Move legend to the right box = ax.get_position() ax.set_position([box.x0, box.y0, box.width * 0.85, box.height]) ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) save_or_show_current(output_dir, 'cdfs', benchmark, output_extension, suffix='%03dMB' % mem_size)
def gmm_test(X,k0,k1,nboot): nsample = X.shape[0] gmm0 = mixture.GMM(n_components=k0, covariance_type='full') gmm0.fit(X) L0 = sum(gmm0.score(X)) gmm1 = mixture.GMM(n_components=k1, covariance_type='full') gmm1.fit(X) L1 = sum(gmm1.score(X)) LRstat = -2*(L1 - L0) LRstat0 = [] for i in range(nboot): Xboot = gmm0.sample(n_samples=nsample) gmm0_boot = mixture.GMM(n_components=k0, covariance_type = 'full') gmm0_boot.fit(Xboot) L0_boot = sum(gmm0_boot.score(Xboot)) gmm1_boot = mixture.GMM(n_components=k1, covariance_type = 'full') gmm1_boot.fit(Xboot) L1_boot = sum(gmm1_boot.score(Xboot)) LRstat0.append(-2*(L1_boot - L0_boot)) ecdf, lowlim, binsize, extrapoints = cumfreq(LRstat0) ecdf = ecdf/len(LRstat0) bin = np.mean([lowlim,lowlim+binsize]) bins = [] for i in range(len(ecdf)): bins.append(bin) bin = bin + binsize p = max(ecdf[bins<=LRstat]) return p
#------------------------------------------------------------------------------------------------- ################################################################################################## #load data data = array([4.92, 6.52,7.33, 5.75]) #bootstrapping, assumes that the data here completely describe (are completely representative thereof) the underlying distribution. sample_count = data.shape[0] variable_count = 1 jitter_count = 1000 replicates = tile(data,(jitter_count,1)) replicates += (random(replicates.shape)*(max(data)-min(data))+min(data)) map(shuffle,replicates) distribution = ravel(diff(replicates,axis=1)) cdf = cumfreq(distribution) overview = plt.figure(figsize =(8.27,11.69)) #Thus instructeth PDM ax = overview.add_subplot(111) xvals = linspace(cdf[1],cdf[1]+10*cdf[2],num=10) #By default cumfreq divides into 10 bins n,bins,patches=ax.hist(distribution, normed=True) plt.setp(patches, 'facecolor', 'k', 'alpha', 0.75) tech.adjust_spines(ax,['left','bottom']) overview.text(0.5, 0.08, r'Weekly Change in Length of Stay', ha='center', va='top', fontsize=30, weight='bold') #xlabel overview.text(0.02875, 0.5, r'Chance of Occurrence', ha='center', va='center', rotation='vertical', fontsize=30, weight='bold') ax.annotate(r'April 21, 2010',xy=(1.1,.18), xytext=(1.2,.2),arrowprops=dict(facecolor='black', shrink=0.05), fontsize=20) plt.subplots_adjust(top=0.95, bottom =0.18, left=0.15) plt.savefig('cdf_LOS.jpg',dpi=600)
def simplePlots(): '''Demonstrate the generation of different statistical standard plots''' # Univariate data ------------------------- # Make sure that always the same random numbers are generated np.random.seed(1234) # Generate data that are normally distributed x = np.random.randn(500) # Other graphics settings sns.set(context='poster', style='ticks', palette=sns.color_palette('muted')) # Set the fonts the way I like them setFonts(32) # Scatter plot plt.scatter(np.arange(len(x)), x) plt.xlim([0, len(x)]) # Save and show the data, in a systematic format printout('scatterPlot.png', xlabel='Datapoints', ylabel='Values', title='Scatter') # Histogram plt.hist(x) printout('histogram_plain.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, default settings') plt.hist(x,25) printout('histogram.png', xlabel='Data Values', ylabel='Frequency', title='Histogram, 25 bins') # Cumulative probability density numbins = 20 plt.plot(stats.cumfreq(x,numbins)[0]) printout('CumulativeFrequencyFunction.png', xlabel='Data Values', ylabel='CumFreq', title='Cumulative Frequency') # KDE-plot sns.kdeplot(x) printout('kde.png', xlabel='Data Values', ylabel='Density', title='KDE_plot') # Boxplot # The ox consists of the first, second (middle) and third quartile plt.boxplot(x, sym='*') printout('boxplot.png', xlabel='Values', title='Boxplot') plt.boxplot(x, sym='*', vert=False) plt.title('Boxplot, horizontal') plt.xlabel('Values') plt.show() # Errorbars x = np.arange(5) y = x**2 errorBar = x/2 plt.errorbar(x,y, yerr=errorBar, fmt='o', capsize=5, capthick=3) plt.xlim([-0.2, 4.2]) plt.ylim([-0.2, 19]) printout('Errorbars.png', xlabel='Data Values', ylabel='Measurements', title='Errorbars') # Violinplot nd = stats.norm data = nd.rvs(size=(100)) nd2 = stats.norm(loc = 3, scale = 1.5) data2 = nd2.rvs(size=(100)) # Use pandas and the seaborn package for the violin plot df = pd.DataFrame({'Girls':data, 'Boys':data2}) sns.violinplot(df) printout('violinplot.png', title='Violinplot') # Barplot # The font-size is set such that the legend does not overlap with the data np.random.seed(1234) setFonts(20) df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) df.plot(kind='bar', grid=False, color=sns.color_palette('muted')) showData('barplot.png') setFonts(28) # Bivariate Plots df2 = pd.DataFrame(np.random.rand(50, 3), columns=['a', 'b', 'c']) df2.plot(kind='scatter', x='a', y='b', s=df2['c']*500); plt.axhline(0, ls='--', color='#999999') plt.axvline(0, ls='--', color='#999999') printout('bivariate.png') # Grouped Boxplot sns.set_style('whitegrid') sns.boxplot(df) setFonts(28) printout('groupedBoxplot.png', title='sns.boxplot') sns.set_style('ticks') # Pieplot txtLabels = 'Cats', 'Dogs', 'Frogs', 'Others' fractions = [45, 30, 15, 10] offsets =(0, 0.05, 0, 0) plt.pie(fractions, explode=offsets, labels=txtLabels, autopct='%1.1f%%', shadow=True, startangle=90, colors=sns.color_palette('muted') ) plt.axis('equal') printout('piePlot.png', title=' ')
density = gaussian_kde(list_merged_by_ball_id) xs = numpy.linspace(0,8,200) density.covariance_factor = lambda : .25 density._compute_covariance() plt.plot(xs,density(xs)) plt.xlabel('KDE,number of appear time by blue ball number') plt.ylabel('KDE,counter of appear time by blue ball number') plt.show() ##CDF(The Cumulative Distribution Function from scipy.stats import cumfreq idx_max = max(dfs_blue_balls_count_values) hi = idx_max a = numpy.arange(hi) ** 2 # for nbins in ( 2, 20, 100 ): for nbins in dfs_blue_balls_count_values: cf = cumfreq(a, nbins) # bin values, lowerlimit, binsize, extrapoints w = hi / nbins x = numpy.linspace( w/2, hi - w/2, nbins ) # care # print x, cf plt.plot( x, cf[0], label=str(nbins) ) plt.legend() plt.xlabel('CDF,number of appear time by blue ball number') plt.ylabel('CDF,counter of appear time by blue ball number') plt.show() ###Optional: Comparing Distributions with Probability Plots and QQ Plots ###Quantile plot of the server data. A quantile plot is a graph of the CDF with the x and y axes interchanged. ###Probability plot for the data set shown,a standard normal distribution: ###@see: http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.probplot.html import scipy.stats as stats
# Histogram hist(x) xlabel('Data Values') ylabel('Frequency') title('Histogram, default settings') show() hist(x,25) xlabel('Data Values') ylabel('Frequency') title('Histogram, 25 bins') show() # Cumulative probability density numbins = 20 cdf = stats.cumfreq(x,numbins) plot(cdf[0]) xlabel('Data Values') ylabel('Cumulative Frequency') title('Cumulative probablity density function') show() # Boxplot # The error bars indiacte the range, and the box consists of the # first, second (middle) and third quartile boxplot(x) title('Boxplot') ylabel('Values') show() boxplot(x, vert=False)
def execute(self, sqr_nodes, connectivity, randomize_boot, sec_before_inject, sec_after_inject, inject_node, k, distance, filenamebase): print "="*40 print "Executing HistGraph:" print "filenamebase\t\t", filenamebase print "="*40 node_re = 'DEBUG \((\d+)\):' node_re_c = re.compile(node_re) time_re = '(\d+):(\d+):(\d+.\d+)' time_re_c = re.compile(time_re) consist = np.zeros((sqr_nodes, sqr_nodes)) f = open(filenamebase+".log", "r") for line in f: #print line, if line.find("inconsistent") >= 0: #print line, node_obj = node_re_c.search(line) node = int(node_obj.group(1)) time_obj = time_re_c.search(line) #print "\t", time_obj.group(0), t = Time(time_obj.group(1), time_obj.group(2), time_obj.group(3)) #print t.in_second() #print "id", node, (x, y) = id2xy(node, sqr_nodes) #print "->", x, y consist[x][y] = t.in_second() - sec_before_inject f.close() LOW_TIME = 0 HIGH_TIME = 50 BINS = 100 #print consist.flatten() cdf = stats.cumfreq(consist.flatten(), BINS, (LOW_TIME, HIGH_TIME)) #print cdf #, max(cdf[0]), cdf[0]/max(cdf[0]) #print floatRange(LOW_TIME, HIGH_TIME, cdf[2]) fig = plt.figure(figsize=(10, 8)) ax = fig.add_subplot(111) plt.plot(floatRange(LOW_TIME, HIGH_TIME, cdf[2]), cdf[0]/max(cdf[0]), ls='steps') # plt.hist(consist.flatten(), # bins = 100, # cumulative=True, # normed=True, # histtype='step') plt.grid() plt.title('Model Time to Consistency (cdf)') text = str(sqr_nodes) + "x" + str(sqr_nodes) + "\n" + \ "Distance: " + str(distance) + "\n" + \ "K: " + str(k) # "Connectivity: " + str(connectivity) + "\n" + \ plt.text(.5, .1, text, horizontalalignment='center', verticalalignment='center', transform = ax.transAxes, bbox=dict(facecolor='red', alpha=0.2)) plt.ylim(0, 1) #plt.xlim(0, # 50) plt.xlabel("Model Time [s]") plt.ylabel("Nodes consistent [%]") plt.savefig(filenamebase+"_hist.png")
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--infile", required=True, help="Tabular file.") parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.") parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;") parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help="Whether a continuity correction (1/2.) should be taken into account.", ) parser.add_argument( "--equal_var", action="store_true", default=False, help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.", ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values." ) parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used") parser.add_argument( "--bias", action="store_true", default=False, help="if false,then the calculations are corrected for statistical bias", ) parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored") parser.add_argument( "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored" ) parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored") parser.add_argument( "--printextras", action="store_true", default=False, help="If True, if there are extra points a warning is raised saying how many of those points there are", ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.", ) parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ") parser.add_argument( "--axis", type=int, default=0, help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)", ) parser.add_argument( "--n", type=int, default=0, help="the number of trials. This is ignored if x gives both the number of successes and failures", ) parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram") parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction") parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.") parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help="The hypothesized probability of success. 0 <= p <= 1. 
The default value is p = 0.5", ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds") parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end.", ) parser.add_argument( "--lambda_", type=float, default=1.0, help="lambda_ gives the power in the Cressie-Read power divergence statistic", ) parser.add_argument( "--imbda", type=float, default=0, help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.", ) parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e") parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, "w+") test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols != None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(";"): barlett_samples.append(map(int, sample.split(","))) if args.sample_one_cols != None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(",") if args.sample_two_cols != None: sample_two_cols = args.sample_two_cols.split(",") sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split("\t") if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(cols[int(index) - 1]) if sample2 == 1: for index in sample_two_cols: sample_two.append(cols[int(index) - 1]) if test_id.strip() == "describe": size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one)) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == "mode": vals, counts = stats.mode(map(float, sample_one)) cols.append(vals) cols.append(counts) elif test_id.strip() == "nanmean": m = stats.nanmean(map(float, sample_one)) cols.append(m) elif test_id.strip() == "nanmedian": m = stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "kurtosistest": z_value, p_value = stats.kurtosistest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "itemfreq": freq = stats.itemfreq(map(float, sample_one)) for list in freq: elements = ",".join(map(str, list)) cols.append(elements) elif test_id.strip() == "nanmedian": m = 
stats.nanmedian(map(float, sample_one)) cols.append(m) elif test_id.strip() == "variation": ra = stats.variation(map(float, sample_one)) cols.append(ra) elif test_id.strip() == "boxcox_llf": IIf = stats.boxcox_llf(imbda, map(float, sample_one)) cols.append(IIf) elif test_id.strip() == "tiecorrect": fa = stats.tiecorrect(map(float, sample_one)) cols.append(fa) elif test_id.strip() == "rankdata": r = stats.rankdata(map(float, sample_one), method=args.md) cols.append(r) elif test_id.strip() == "nanstd": s = stats.nanstd(map(float, sample_one), bias=args.bias) cols.append(s) elif test_id.strip() == "anderson": A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist) cols.append(A2) for list in critical: cols.append(list) cols.append(",") for list in sig: cols.append(list) elif test_id.strip() == "binom_test": p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p) cols.append(p_value) elif test_id.strip() == "gmean": gm = stats.gmean(map(float, sample_one), dtype=args.dtype) cols.append(gm) elif test_id.strip() == "hmean": hm = stats.hmean(map(float, sample_one), dtype=args.dtype) cols.append(hm) elif test_id.strip() == "kurtosis": k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias) cols.append(k) elif test_id.strip() == "moment": n_moment = stats.moment(map(float, sample_one), n=args.n) cols.append(n_moment) elif test_id.strip() == "normaltest": k2, p_value = stats.normaltest(map(float, sample_one)) cols.append(k2) cols.append(p_value) elif test_id.strip() == "skew": skewness = stats.skew(map(float, sample_one), bias=args.bias) cols.append(skewness) elif test_id.strip() == "skewtest": z_value, p_value = stats.skewtest(map(float, sample_one)) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "sem": s = stats.sem(map(float, sample_one), ddof=args.ddof) cols.append(s) elif test_id.strip() == "zscore": z = stats.zscore(map(float, sample_one), ddof=args.ddof) for list in z: cols.append(list) elif test_id.strip() == "signaltonoise": s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof) cols.append(s2n) elif test_id.strip() == "percentileofscore": p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind) cols.append(p) elif test_id.strip() == "bayes_mvs": c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha) cols.append(c_mean) cols.append(c_var) cols.append(c_std) elif test_id.strip() == "sigmaclip": c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n) cols.append(c) cols.append(c_low) cols.append(c_up) elif test_id.strip() == "kstest": d, p_value = stats.kstest( map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode ) cols.append(d) cols.append(p_value) elif test_id.strip() == "chi2_contingency": chi2, p, dof, ex = stats.chi2_contingency( map(float, sample_one), correction=args.correction, lambda_=args.lambda_ ) cols.append(chi2) cols.append(p) cols.append(dof) cols.append(ex) elif test_id.strip() == "tmean": if nf is 0 and mf is 0: mean = stats.tmean(map(float, sample_one)) else: mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2)) cols.append(mean) elif test_id.strip() == "tmin": if mf is 0: min = stats.tmin(map(float, sample_one)) else: min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive) cols.append(min) elif test_id.strip() == "tmax": if nf is 0: max = stats.tmax(map(float, sample_one)) else: max = 
stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
        cols.append(max)
    elif test_id.strip() == "tvar":
        if nf == 0 and mf == 0:
            var = stats.tvar(map(float, sample_one))
        else:
            var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
        cols.append(var)
    elif test_id.strip() == "tstd":
        if nf == 0 and mf == 0:
            std = stats.tstd(map(float, sample_one))
        else:
            std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
        cols.append(std)
    elif test_id.strip() == "tsem":
        if nf == 0 and mf == 0:
            s = stats.tsem(map(float, sample_one))
        else:
            s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
        cols.append(s)
    elif test_id.strip() == "scoreatpercentile":
        if nf == 0 and mf == 0:
            s = stats.scoreatpercentile(
                map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
            )
        else:
            s = stats.scoreatpercentile(
                map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
            )
        for item in s:
            cols.append(item)
    elif test_id.strip() == "relfreq":
        if nf == 0 and mf == 0:
            rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
        else:
            rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
        for item in rel:
            cols.append(item)
        cols.append(low_range)
        cols.append(binsize)
        cols.append(ex)
    elif test_id.strip() == "binned_statistic":
        if nf == 0 and mf == 0:
            st, b_edge, b_n = stats.binned_statistic(
                map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
            )
        else:
            st, b_edge, b_n = stats.binned_statistic(
                map(float, sample_one),
                map(float, sample_two),
                statistic=args.statistic,
                bins=args.b,
                range=(mf, nf),
            )
        cols.append(st)
        cols.append(b_edge)
        cols.append(b_n)
    elif test_id.strip() == "threshold":
        # stats.threshold has been removed from current SciPy; this branch assumes an older release.
        if nf == 0 and mf == 0:
            o = stats.threshold(map(float, sample_one), newval=args.new)
        else:
            o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
        for item in o:
            cols.append(item)
    elif test_id.strip() == "trimboth":
        o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
        for item in o:
            cols.append(item)
    elif test_id.strip() == "trim1":
        t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
        for item in t1:
            cols.append(item)
    elif test_id.strip() == "histogram":
        # stats.histogram has been removed from current SciPy; this branch assumes an older release.
        if nf == 0 and mf == 0:
            hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
        else:
            hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
        cols.append(hi)
        cols.append(low_range)
        cols.append(binsize)
        cols.append(ex)
    elif test_id.strip() == "cumfreq":
        if nf == 0 and mf == 0:
            cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
        else:
            cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
        cols.append(cum)
        cols.append(low_range)
        cols.append(binsize)
        cols.append(ex)
    elif test_id.strip() == "boxcox_normmax":
        if nf == 0 and mf == 0:
            ma = stats.boxcox_normmax(map(float, sample_one))
        else:
            ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
        cols.append(ma)
    elif test_id.strip() == "boxcox":
        if imbda == 0:
            box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
            cols.append(box)
            cols.append(ma)
            cols.append(ci)
        else:
            box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
            cols.append(box)
    elif test_id.strip() == "histogram2":
        # stats.histogram2 has been removed from current SciPy; this branch assumes an older release.
        h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
        for item in h2:
            cols.append(item)
    elif test_id.strip() == "ranksums":
        z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
        cols.append(z_statistic)
        cols.append(p_value)
    elif test_id.strip() == "ttest_1samp":
        t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
        for item in t:
            cols.append(item)
        for item in prob:
            cols.append(item)
    elif test_id.strip() == "ansari":
        AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
        cols.append(AB)
        cols.append(p_value)
    elif test_id.strip() == "linregress":
        slope, intercept, r_value, p_value, stderr = stats.linregress(map(float, sample_one), map(float, sample_two))
        cols.append(slope)
        cols.append(intercept)
        cols.append(r_value)
        cols.append(p_value)
        cols.append(stderr)
    elif test_id.strip() == "pearsonr":
        cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
        cols.append(cor)
        cols.append(p_value)
    elif test_id.strip() == "pointbiserialr":
        r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
        cols.append(r)
        cols.append(p_value)
    elif test_id.strip() == "ks_2samp":
        d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
        cols.append(d)
        cols.append(p_value)
    elif test_id.strip() == "mannwhitneyu":
        mw_stats_u, p_value = stats.mannwhitneyu(
            map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
        )
        cols.append(mw_stats_u)
        cols.append(p_value)
    elif test_id.strip() == "zmap":
        z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
        for item in z:
            cols.append(item)
    elif test_id.strip() == "ttest_ind":
        t_stat, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two), equal_var=args.equal_var)
        cols.append(t_stat)
        cols.append(p_value)
    elif test_id.strip() == "ttest_rel":
        t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
        cols.append(t)
        cols.append(prob)
    elif test_id.strip() == "mood":
        z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
        cols.append(z)
        cols.append(p_value)
    elif test_id.strip() == "shapiro":
        W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
        cols.append(W)
        cols.append(p_value)
        for item in a:
            cols.append(item)
    elif test_id.strip() == "kendalltau":
        k, p_value = stats.kendalltau(
            map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
        )
        cols.append(k)
        cols.append(p_value)
    elif test_id.strip() == "entropy":
        s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
        cols.append(s)
    elif test_id.strip() == "spearmanr":
        if sample2 == 1:
            rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
        else:
            rho, p_value = stats.spearmanr(map(float, sample_one))
        cols.append(rho)
        cols.append(p_value)
    elif test_id.strip() == "wilcoxon":
        if sample2 == 1:
            T, p_value = stats.wilcoxon(
                map(float, sample_one),
                map(float, sample_two),
                zero_method=args.zero_method,
                correction=args.correction,
            )
        else:
            T, p_value = stats.wilcoxon(map(float, sample_one), zero_method=args.zero_method, correction=args.correction)
        cols.append(T)
        cols.append(p_value)
    elif test_id.strip() == "chisquare":
        if sample2 == 1:
            rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
        else:
            rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
        cols.append(rho)
        cols.append(p_value)
    elif test_id.strip() == "power_divergence":
        if sample2 == 1:
            stat, p_value = stats.power_divergence(
                map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
            )
        else:
            stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
        cols.append(stat)
        cols.append(p_value)
    elif test_id.strip() == "theilslopes":
        if sample2 == 1:
            mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
        else:
            mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
        cols.append(mpe)
        cols.append(met)
        cols.append(lo)
        cols.append(up)
    elif test_id.strip() == "combine_pvalues":
        if sample2 == 1:
            stat, p_value = stats.combine_pvalues(
                map(float, sample_one), method=args.med, weights=map(float, sample_two)
            )
        else:
            stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
        cols.append(stat)
        cols.append(p_value)
    elif test_id.strip() == "obrientransform":
        ob = stats.obrientransform(*b_samples)
        for row in ob:
            elements = ",".join(map(str, row))
            cols.append(elements)
    elif test_id.strip() == "f_oneway":
        f_value, p_value = stats.f_oneway(*b_samples)
        cols.append(f_value)
        cols.append(p_value)
    elif test_id.strip() == "kruskal":
        h, p_value = stats.kruskal(*b_samples)
        cols.append(h)
        cols.append(p_value)
    elif test_id.strip() == "friedmanchisquare":
        fr, p_value = stats.friedmanchisquare(*b_samples)
        cols.append(fr)
        cols.append(p_value)
    elif test_id.strip() == "fligner":
        xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
        cols.append(xsq)
        cols.append(p_value)
    elif test_id.strip() == "bartlett":
        T, p_value = stats.bartlett(*b_samples)
        cols.append(T)
        cols.append(p_value)
    elif test_id.strip() == "levene":
        w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
        cols.append(w)
        cols.append(p_value)
    elif test_id.strip() == "median_test":
        stat, p_value, m, table = stats.median_test(
            ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
        )
        cols.append(stat)
        cols.append(p_value)
        cols.append(m)
        cols.append(table)
        for row in table:
            elements = ",".join(map(str, row))
            cols.append(elements)
    outfile.write("%s\n" % "\t".join(map(str, cols)))
outfile.close()
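# The "threshold", "histogram", and "histogram2" branches above call functions that were
# removed from SciPy some releases ago. If this dispatcher has to run against a current
# SciPy, roughly equivalent results can be built on NumPy. A minimal sketch; the helper
# names below are hypothetical and not part of the original tool.
import numpy as np

def threshold_like(a, threshmin=None, threshmax=None, newval=0):
    # Replacement for the removed stats.threshold: values outside
    # [threshmin, threshmax] are replaced by newval.
    a = np.asarray(a, dtype=float).copy()
    mask = np.zeros(a.shape, dtype=bool)
    if threshmin is not None:
        mask |= a < threshmin
    if threshmax is not None:
        mask |= a > threshmax
    a[mask] = newval
    return a

def histogram_like(a, numbins=10, defaultlimits=None):
    # np.histogram returns (counts, edges); repackage to mimic the old
    # (counts, low_range, binsize, extrapoints) tuple. Points outside the
    # limits are not tracked here, so extrapoints is reported as 0.
    counts, edges = np.histogram(a, bins=numbins, range=defaultlimits)
    return counts, edges[0], edges[1] - edges[0], 0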
def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = randn(500)

    # Scatter plot
    plot(x, '.')
    title('Scatter Plot')
    xlabel('X')
    ylabel('Y')
    draw()
    show()

    # Histogram
    hist(x)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, default settings')
    show()

    hist(x, 25)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, 25 bins')
    show()

    # Cumulative probability density
    numbins = 20
    cdf = stats.cumfreq(x, numbins)
    plot(cdf[0])
    xlabel('Data Values')
    ylabel('Cumulative Frequency')
    title('Cumulative probability density function')
    show()

    # Boxplot
    # The error bars indicate the range, and the box consists of the
    # first, second (middle) and third quartile
    boxplot(x)
    title('Boxplot')
    ylabel('Values')
    show()

    boxplot(x, vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()

    # Check for normality
    _ = stats.probplot(x, plot=plt)
    title('Probplot - check for normality')
    show()

    # Bivariate data -------------------------
    # Generate data
    x = randn(200)
    y = 10 + 0.5 * x + randn(len(x))

    # Scatter plot
    scatter(x, y)  # This one is quite similar to "plot(x,y,'.')"
    title('Scatter plot of data')
    xlabel('X')
    ylabel('Y')
    show()

    # LineFit
    M = vstack((ones(len(x)), x)).T
    pars = linalg.lstsq(M, y)[0]
    intercept = pars[0]
    slope = pars[1]
    scatter(x, y)
    plot(x, intercept + slope * x, 'r')
    show()
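# The demo above plots the cumulative counts against bin index. A minimal sketch,
# assuming only NumPy/SciPy/matplotlib, that reconstructs the bin positions from the
# values cumfreq returns so the empirical CDF is drawn against the data values:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

x = np.random.randn(500)
res = stats.cumfreq(x, numbins=20)
counts, lowerlimit, binsize = res[0], res[1], res[2]
bin_centers = lowerlimit + binsize * (np.arange(counts.size) + 0.5)
plt.step(bin_centers, counts / counts[-1], where='mid')
plt.xlabel('Data Values')
plt.ylabel('Empirical CDF')
plt.show()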
def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = np.random.randn(500)

    # Set the fonts the way I like them
    sns.set_context('poster')
    sns.set_style('ticks')
    # mystyle.set()

    # Scatter plot
    plt.scatter(np.arange(len(x)), x)
    plt.xlim([0, len(x)])
    mystyle.printout('scatterPlot.png', xlabel='x', ylabel='y', title='Scatter')

    # Histogram
    plt.hist(x)
    mystyle.printout('histogram_plain.png', xlabel='Data Values', ylabel='Frequency',
                     title='Histogram, default settings')

    plt.hist(x, 25)
    mystyle.printout('histogram.png', xlabel='Data Values', ylabel='Frequency',
                     title='Histogram, 25 bins')

    # Cumulative probability density
    numbins = 20
    plt.plot(stats.cumfreq(x, numbins)[0])
    mystyle.printout('CumulativeFrequencyFunction.png', xlabel='Data Values',
                     ylabel='CumFreq', title='Cumulative Frequency')

    # KDE-plot
    sns.kdeplot(x)
    mystyle.printout('kde.png', xlabel='Data Values', ylabel='Density', title='KDE_plot')

    # Boxplot
    # The box consists of the first, second (middle) and third quartile
    plt.boxplot(x, sym='*')
    mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot')

    plt.boxplot(x, sym='*', vert=False)
    plt.title('Boxplot, horizontal')
    plt.xlabel('Values')
    plt.show()

    # Errorbars
    x = np.arange(5)
    y = x**2
    errorBar = x / 2
    plt.errorbar(x, y, yerr=errorBar, fmt='o', capsize=5, capthick=3)
    plt.xlim([-0.2, 4.2])
    plt.ylim([-0.2, 19])
    mystyle.printout('Errorbars.png', xlabel='Data Values', ylabel='Measurements',
                     title='Errorbars')

    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))
    nd2 = stats.norm(loc=3, scale=1.5)
    data2 = nd2.rvs(size=(100))

    # Use pandas and the seaborn package for the violin plot
    df = pd.DataFrame({'Girls': data, 'Boys': data2})
    # sns.violinplot(df, color=["#999999", "#DDDDDD"])
    sns.violinplot(df)
    mystyle.printout('violinplot.png', title='Violinplot')

    # Barplot
    df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
    df.plot(kind='bar', grid=False)
    mystyle.printout('barplot.png', title='Barplot')

    # Grouped Boxplot
    sns.set_style('whitegrid')
    sns.boxplot(df)
    mystyle.printout('groupedBoxplot.png', title='sns.boxplot')

    # Bivariate Plots
    df2 = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd'])
    df2.plot(kind='scatter', x='a', y='b', s=df2['c'] * 300)
    mystyle.printout('bivariate.png')

    # Pieplot
    series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series')
    sns.set_palette("husl")
    series.plot(kind='pie', figsize=(6, 6))
    mystyle.printout('piePlot.png', title='pie-plot')
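# Newer seaborn releases are stricter about how a wide-form DataFrame is handed to the
# categorical plotting functions; if the sns.violinplot(df) call above errors, passing the
# frame through the data keyword is the safer spelling. A minimal, self-contained sketch
# of the same kind of plot (the Girls/Boys frame here is synthetic example data):
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df_wide = pd.DataFrame({'Girls': np.random.randn(100), 'Boys': np.random.randn(100) + 3})
sns.violinplot(data=df_wide)
plt.show()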
def plot_acceleration_or_instantaneous_curves(number_of_repacks): def check_for_zeros(latitude, longitude, latitude_index, longitude_index, current_value): if current_value == 0: return 1 return 0 def or_function(this_value, other_value): return numpy.logical_or(this_value, other_value) def calculate_population_of_zerows(datamap, populationmap): population = 0 for i in range(400): for j in range(600): if datamap.get_value_by_index(i, j) == 1: population += populationmap.get_value_by_index(i, j) return population colors = {7: 'b', 14: 'r', 22: 'g', 25: 'm'} for num_channels_removed in [25]: zerows_map = west.data_map.DataMap2DContinentalUnitedStates.create(400, 600) zerows_map.reset_all_values(0) datamap_spec = west.data_management.SpecificationDataMap(west.data_map.DataMap2DContinentalUnitedStates, 400, 600) is_in_region_map_spec = west.data_management.SpecificationRegionMap(west.boundary.BoundaryContinentalUnitedStates, datamap_spec) is_in_region_map = is_in_region_map_spec.fetch_data() population_map_spec = west.data_management.SpecificationPopulationMap(is_in_region_map_spec, west.population.PopulationData) population_map = population_map_spec.fetch_data() instantaneous_values = numpy.zeros(number_of_repacks) acceleration_values = numpy.zeros(number_of_repacks) num_repacks_index = numpy.arange(number_of_repacks) if num_channels_removed == 25: repack_file_list = os.listdir(os.path.join("data", "Pickled Files - Whitespace Maps", "A-%dChannelsRemoved"%num_channels_removed, "Only UHF")) repack_file_list = repack_file_list[1:] else: repack_file_list = [] for i in range(number_of_repacks): repack_file_list.append("%dUHFnewUSMinimumStationstoRemove_OnlyUHF_PLMRS_FCCcontours%d.pcl"%(num_channels_removed, i)) for i in range(number_of_repacks): print i print repack_file_list[i] if num_channels_removed == 25: wsmap = west.data_map.DataMap2DContinentalUnitedStates.from_pickle(os.path.join("data", "Pickled Files - Whitespace Maps", "A-%dChannelsRemoved"%num_channels_removed, "Only UHF", repack_file_list[i])) else: wsmap = west.data_map.DataMap2DContinentalUnitedStates.from_pickle(os.path.join("data", "Pickled Files - Whitespace Maps", "A-%dChannelsREmoved"%num_channels_removed, repack_file_list[i])) wsmap.update_all_values_via_function(check_for_zeros) zerows_map = zerows_map.combine_datamaps_with_function(wsmap, or_function) instantaneous_values[i] = calculate_population_of_zerows(wsmap, population_map) acceleration_values[i] = calculate_population_of_zerows(zerows_map, population_map) print instantaneous_values[i], acceleration_values[i] from scipy.stats import cumfreq num_bins = 100 inst_values_cdf = cumfreq(instantaneous_values, num_bins) xaxis = numpy.linspace(0, max(instantaneous_values), num_bins) plt.plot(xaxis, inst_values_cdf[0]/number_of_repacks) plt.xlabel("Population that sees zero whitespace after repack") plt.ylabel("CDF") plt.show()
# plot the histogram
plt.clf()
plt.bar(bins[:-1], n, width=0.4, color='red')
plt.xlabel('X', fontsize=20)
plt.ylabel('number of data points in the bin', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/hist.png')

# compute and plot the relfreq
relfreqs, lowlim, binsize, extrapoints = st.relfreq(x)
plt.clf()
plt.bar(bins[:-1], relfreqs, width=0.4, color='magenta')
plt.xlabel('X', fontsize=20)
plt.ylabel('Relative frequencies', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/relfreq.png')

# compute and plot pdf
plt.clf()
n, bins, patches = plt.hist(x, 10, normed=1, facecolor='yellow', alpha=0.5)
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/pdf.png')

# compute and plot cdf
cumfreqs, lowlim, binsize, extrapoints = st.cumfreq(x)
plt.clf()
plt.bar(bins[:-1], cumfreqs / cumfreqs[-1], width=0.4, color='black', alpha=0.45)
plt.xlabel('X', fontsize=15)
plt.ylabel('CDF', fontsize=15)
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/cdf.png')
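# The CDF bars above reuse the bin edges from the earlier plt.hist call. A minimal
# sketch, assuming the same x array and the st/plt imports of this script, that places
# the bars at the positions cumfreq itself used:
import numpy as np

cumfreqs, lowlim, binsize, extrapoints = st.cumfreq(x)
cdf_centers = lowlim + binsize * (np.arange(len(cumfreqs)) + 0.5)
plt.clf()
plt.bar(cdf_centers, cumfreqs / cumfreqs[-1], width=binsize, align='center', color='black', alpha=0.45)
plt.xlabel('X', fontsize=15)
plt.ylabel('CDF', fontsize=15)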
label_files = [open(filename).readlines() for filename in os.listdir(".") if filename.endswith(label_extension)]

# bootstrapping; assumes that the data here completely describe (are completely
# representative of) the underlying distribution.
sample_count = data_nodates.shape[0]
variable_count = data_nodates.shape[1]
jitter_count = 1000
distributions = reshape(array([diff(shuffler(data_nodates), axis=0) for jitter in range(jitter_count)]),
                        ((sample_count - 1) * jitter_count, variable_count))

# generate cdfs; (sample_count - 1) because the list of differences of n samples has n - 1 members
cdfs = array([cumfreq(distribution)[0:3] for distribution in distributions.transpose()])
print cdfs[1]

overview = plt.figure()
ax = overview.add_subplot(111)
for cdf in cdfs:
    xvals = array([cdf[1] + i * cdf[2] for i in range(10)])
    h, = ax.plot(xvals, cdf[0] / max(cdf[0]), '--.', markersize=30)
    h.set_clip_on(False)
# plt.legend(('CXR','ABD CT','ABD + Chest CT'), 'lower right', numpoints=1, fancybox=True, frameon=False, bbox_to_anchor=(1.1, 0.2))
tech.adjust_spines(ax, ['left', 'bottom'])
overview.text(0.5, 0.08, r'Weekly Change in Cases', ha='center', va='top', fontsize=30, weight='bold')  # xlabel
overview.text(0.02875, 0.5, r'Frequency of Occurrence', ha='center', va='center', rotation='vertical', fontsize=30, weight='bold')
plt.subplots_adjust(top=0.95, bottom=0.18, left=0.15)
plt.savefig('cdf_CXRT_s.png', dpi=300)
def main():
    # Univariate data -------------------------
    # Generate data that are normally distributed
    x = randn(500)

    # Scatter plot
    plot(x, '.')
    title('Scatter Plot')
    xlabel('X')
    ylabel('Y')
    draw()
    show()

    # Histogram
    hist(x)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, default settings')
    show()

    hist(x, 25)
    xlabel('Data Values')
    ylabel('Frequency')
    title('Histogram, 25 bins')
    show()

    # Cumulative probability density
    numbins = 20
    cdf = stats.cumfreq(x, numbins)
    plot(cdf[0])
    xlabel('Data Values')
    ylabel('Cumulative Frequency')
    title('Cumulative probability density function')
    show()

    # Boxplot
    # The error bars indicate the range, and the box consists of the
    # first, second (middle) and third quartile
    boxplot(x)
    title('Boxplot')
    ylabel('Values')
    show()

    boxplot(x, vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()

    # Violinplot
    nd = stats.norm
    data = nd.rvs(size=(100))
    nd2 = stats.norm(loc=0.5, scale=1.2)
    data2 = nd2.rvs(size=(100))

    # Use the seaborn package for the violin plot, and set the context for "poster"
    sns.set(context='poster')
    df = pd.DataFrame({'Girls': data, 'Boys': data2})
    sns.violinplot(df)
    show()

    # Check for normality
    _ = stats.probplot(x, plot=plt)
    title('Probplot - check for normality')
    show()

    # Bivariate data -------------------------
    # Generate data
    x = randn(200)
    y = 10 + 0.5 * x + randn(len(x))

    # Scatter plot
    scatter(x, y)  # This one is quite similar to "plot(x,y,'.')"
    title('Scatter plot of data')
    xlabel('X')
    ylabel('Y')
    show()

    # LineFit
    M = vstack((ones(len(x)), x)).T
    pars = linalg.lstsq(M, y)[0]
    intercept = pars[0]
    slope = pars[1]
    scatter(x, y)
    plot(x, intercept + slope * x, 'r')
    show()
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import cumfreq

P = 0.4
Q = 1 - P

# Three independent simulations of 1000 geometric draws each
a = np.random.geometric(P, size=1000)
b = np.random.geometric(P, size=1000)
c = np.random.geometric(P, size=1000)

common_params = dict(bins=[x for x in range(1, 14)], normed=1, range=(0, 15))
plt.title('Comparative histogram of simulations and theoretical probability')
plt.ylabel('Relative frequency')
plt.xlabel('Values')
plt.hist([a, b, c], **common_params)

# Theoretical geometric pmf, P(X = k) = P * Q**(k - 1)
gteo = [P * pow(Q, i - 1) for i in range(1, 14)]
g = cumfreq(gteo, 15)
plt.show()
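# The script computes the theoretical pmf (gteo) but only shows the histogram. A minimal
# sketch, assuming the arrays defined above, that overlays the empirical CDF of one
# simulation with the closed-form geometric CDF F(k) = 1 - (1 - P)**k:
plt.figure()
ks = np.arange(1, 14)
emp = cumfreq(a, numbins=13, defaultreallimits=(0.5, 13.5))
plt.step(ks, emp[0] / len(a), where='mid', label='empirical CDF')
plt.plot(ks, 1 - Q ** ks, 'o--', label='theoretical CDF')
plt.legend(loc='lower right')
plt.show()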
def histplot1D(datain, **kwargs):
    datatype = kwargs.get('datatype', 'df')
    if datatype == 'df':
        histvals = datain.values
        binrange = kwargs.get('binrange', [datain.min(), datain.max()])
    elif datatype == 'histdict':
        counts = datain['counts']
        edges = datain['edges']
        centers = datain['centers']
        step = np.diff(edges)
    numbins = kwargs.get('numbins', 100)
    missinglowval = kwargs.get('missinglowval', -99999)
    missinghighval = kwargs.get('missinghighval', 99999)
    normalize = kwargs.get('normalize', True)
    cumulative = kwargs.get('cumulative', False)
    doplot = kwargs.get('doplot', True)
    showplot = kwargs.get('showplot', False)
    saveplot = kwargs.get('saveplot', False)
    plotfilename = kwargs.get('plotfilename', '1Dhist_test.png')
    fsize = kwargs.get('fsize', 32)  # baseline font size
    ar = kwargs.get('ar', 1.0)  # aspect ratio
    figheight = kwargs.get('figheight', 12)  # inches
    dpi = kwargs.get('dpi', 100)
    fignum = kwargs.get('fignum', 0)
    xlog = kwargs.get('xlog', False)
    ylog = kwargs.get('ylog', False)
    xlimits = kwargs.get('xlimits', None)
    ylimits = kwargs.get('ylimits', None)
    xlabel = kwargs.get('xlabel', None)
    if ylog:
        ylabel = kwargs.get('ylabel', 'Log (Counts)')
    else:
        ylabel = kwargs.get('ylabel', 'Counts')

    if datatype == 'histdict':
        dictout = datain
    else:
        if not cumulative:
            counts, edges = np.histogram(datain.values, numbins, range=binrange, normed=normalize)
            step = np.diff(edges)
            centers = edges[:-1] + step * 0.5
            dictout = {'counts': counts, 'centers': centers, 'edges': edges}
        else:
            counts, lowlim, barwidths, extrapoints = cumfreq(datain.values, numbins=numbins,
                                                             defaultreallimits=binrange)
            if normalize:
                totcounts = ((datain.values > missinglowval) & (datain.values < missinghighval)).sum()
                counts = counts / totcounts
            step = (binrange[1] - binrange[0]) / numbins
            binvals = np.arange(binrange[0], binrange[1], step)
            centers = [v + step * 0.5 for v in binvals]
            edges = np.hstack((binvals, binvals[-1] + step))
            dictout = {'counts': counts, 'centers': centers, 'edges': edges}

    if doplot:
        plt.rc('font', family='serif', size=fsize)
        fig1 = plt.figure(fignum)
        if ar:
            fig1.set_size_inches(figheight * ar, figheight)
        ax1 = fig1.add_subplot(111)
        barwidths = step
        # log-scale the requested axes; bar() itself only takes a log flag for the y axis
        if xlog:
            plt.xscale('log')
        if ylog:
            plt.yscale('log')
        logplot = ylog
        ax1.set_aspect(ar)
        ax1.bar(centers, counts, width=barwidths, align='center', log=logplot)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        if ylimits:
            plt.ylim(ylimits)
        if xlimits:
            plt.xlim(xlimits)
        if saveplot:
            fig1.canvas.print_figure(plotfilename, dpi=dpi, edgecolor='b', bbox_inches='tight')
        if showplot:
            fig1.canvas.draw()
    return dictout
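# Possible call for the helper above; a sketch only, assuming the module-level imports
# (np, plt, cumfreq) used inside histplot1D and a pandas Series as input, which is what
# the default 'df' datatype expects via datain.values:
import numpy as np
import pandas as pd

ser = pd.Series(np.random.lognormal(mean=0.0, sigma=1.0, size=10000))
out = histplot1D(ser, numbins=50, cumulative=True, normalize=True,
                 xlabel='Value', ylabel='Cumulative fraction', showplot=True)
# out['counts'], out['centers'], and out['edges'] hold the binned CDF for reuse.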
def _calcCMC(self, size):
    cumfreqs = (cumfreq(self.matching_order, numbins=size)[0] / size) * 100.
    self.CMC = cumfreqs.astype(np.float32)
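# Standalone sketch of the same CMC (Cumulative Match Characteristic) computation, with
# hypothetical example data: matching_order holds, for each probe, the rank at which its
# correct gallery entry was retrieved (1 = best match), so the k-th cumulative value is
# the top-k identification rate.
import numpy as np
from scipy.stats import cumfreq

matching_order = np.array([1, 1, 2, 1, 3, 1, 2, 5, 1, 4])
gallery_size = 5
cmc = cumfreq(matching_order, numbins=gallery_size)[0] / len(matching_order) * 100.0
# cmc[k - 1] is the percentage of probes whose correct match appears within the top k ranks.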
## plot
# countMax = max(overlapSer)
# bins = np.arange(countMax+1)
# plt.hist(overlapSer,bins)
# plt.ylabel('freq',fontweight='bold')
# plt.xlabel('number of cell lines',fontweight='bold')
# plt.title('summlySpace DOS compounds (' + str(overlapCount) + ') - cell lines is_gold')
# plt.xticks(bins)
# outF = os.path.join(wkdir, 'DOS_summly_cell_line_distribution.png')
# plt.savefig(outF, bbox_inches='tight',dpi=200)
# plt.close()

from scipy.stats import cumfreq
num_bins = 20
b, lowlim, binsize, extrapoints = cumfreq(passSer, num_bins)
outF = os.path.join(wkdir, 'cdf_test.png')
plt.plot(b)
plt.savefig(outF, bbox_inches='tight', dpi=200)
plt.close()

# cumsum
dx = .01
X = np.arange(-2, 2, dx)
Y = pylab.exp(-X**2)

# Normalize the data to a proper PDF
Y /= (dx * Y).sum()

# Compute the CDF
CY = np.cumsum(Y * dx)
build_count_graph([(name, [s.rtt / 1000 for s in data], color) for name, data, color in datasets], counts_of='ms')
build_legend()
save_graph("rtt")

"""
This is a CDF of the bandwidth, which is very useful for comparing the
overall response of multiple versions/setups
"""
if "cdf" in args.graphs:
    build_graph("Cumulative Distribution of Throughput")
    # set up our x axis based on 1Gbps operation.
    hist_xpoints = range(0, 1000000000, 1000000)
    hist_xticks = xrange(0, 1000000000, 100000000)
    hist_xlabels = ["{:4.0f}".format(t / 1e6) for t in hist_xticks]
    for name, data, color in datasets:
        # cumfreq expects an integer bin count, so bin onto one point per x value
        # over the fixed (0, 1 Gbps) range.
        cdf = stats.cumfreq([s.bw for s in data], numbins=len(hist_xpoints) - 1, defaultreallimits=(0, 1e9))[0]
        plt.plot(hist_xpoints[:-1], cdf / len(data), label=name, color=color)
    plt.xticks(hist_xticks, hist_xlabels)
    hist_yticks = np.arange(0, 101, 10) / 100.0
    plt.yticks(hist_yticks, ["{:2.0%}".format(float(t)) for t in hist_yticks])
    plt.xlabel("Mbps")
    plt.ylabel("percentile")
    build_legend(loc=4)
    save_graph("cdf")
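# Sketch, assuming the datasets list used above: read a single percentile (here the
# median throughput) off the same per-run samples that feed the CDF plot.
for name, data, color in datasets:
    bw = sorted(s.bw for s in data)
    median_bw = bw[len(bw) // 2]
    print("%s median throughput: %.1f Mbps" % (name, median_bw / 1e6))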
loop_time = time.time() - start_time if sdOutput == "NA": print "randomSDFun while loop did not run in time" else: sd_list.append(sdOutput) print("--- %s seconds ---" % (time.time() - start_time)) if len(sd_list) == numBoots: minList= min(sd_list) maxList= max(sd_list) cumFreq = cumfreq(sd_list, numBins, defaultreallimits=(minList, maxList)) lowerLimit = cumFreq[1] countValues = cumFreq[0] freq_interval = cumFreq[2] upperLimit = lowerLimit + (freq_interval*numBins) xaxis = np.arange(lowerLimit, (upperLimit), freq_interval) if len(xaxis) > numBins: print "xaxis too long, length:", len(xaxis) del_index = numBins xaxis = np.delete(xaxis, del_index) result = (sd_data -minList) / freq_interval myIndex = int(round(result)) -1