def _SNR(self, minimum_noise_level=0.001):
    '''
    Calculate signal to noise ratio. Signal is the highest CWT intensity of
    all scales, noise is the 95% quantile of the lowest scale WT, which is
    dominated by noise.
    '''
    ridge_info = self.ridge_info
    cwt = self.CWT.getdata()
    noise_cwt = cwt[0]
    # minimum noise is the noise value for the whole dataset times minimum_noise_level
    minimum_noise = float(minimum_noise_level *
                          mquantiles(noise_cwt, 0.95, 3. / 8., 3. / 8.))
    for info in ridge_info:
        scale = max(3, info[2])  # get a minimal width of ~30 items for noise calculation
        signal = info[3]
        base_left = max(0, int(info[1] - scale * 5))
        base_right = int(info[1] + scale * 5)
        noise = mquantiles(noise_cwt[base_left:base_right + 1], 0.95, 3. / 8., 3. / 8.)
        noise = numpy.nan_to_num(noise)
        noise = float(max([minimum_noise, noise]))
        info.append(signal / noise)
def makeReferenceRLHistogram(alnRatios, refLengths, outfile, format, quantile=None):
    fig = plt.figure(dpi=300, figsize=(6, 6))
    ax = fig.add_subplot(111)
    ax.set_title("Aligned References RL Density Plot")
    fullPass = alnRatios[alnRatios['IsFullPass']]
    HQRegion = alnRatios[n.any([alnRatios['IsFullPass'], alnRatios['IsHQTrimmed']], axis=0)]
    max_y = 0
    for l, label in zip((alnRatios, HQRegion, fullPass),
                        ("Aln from All Subreads",
                         "Aln from HQRegion Subreads",
                         "Aln from HQRegion Full-Pass Subreads")):
        alnRefLength = l['RefLength']
        if quantile is not None:
            alnRefLength = alnRefLength[alnRefLength < mstats.mquantiles(alnRefLength, [quantile])[0]]
        num, bins, patches = ax.hist(alnRefLength, bins=100, histtype='step',
                                     label=label, normed=True)
        if n.max(num) > max_y:
            max_y = n.max(num)
    if quantile is not None:
        refLengths = refLengths[refLengths < mstats.mquantiles(refLengths, [quantile])[0]]
    num, bins, patches = ax.hist(refLengths, bins=100, histtype='step',
                                 label="All References", normed=True)
    if n.max(num) > max_y:
        max_y = n.max(num)
    ax.set_ylim(0, max_y * 1.1)
    ax.legend(loc='upper center', prop={'size': 'small'})
    fig.savefig(outfile, format=format)
def bootstrap(self, pred, expect):
    """
    Calculate bootstrapped values

    Parameters
    ----------
    pred : numpy array
        the bootstrapped predicted values
    expect : numpy array
        the bootstrapped expected values
    """
    nboots = pred.shape[1]
    nval = pred.shape[0]
    if nboots < 1:
        return
    self.bootstrapped = np.zeros(nboots)
    for i in range(nboots):
        self.bootstrapped[i] = self.eval(pred[:, i], expect[:, i])
    self.delta = self.bootstrapped - self.biased
    self.std = np.std(self.bootstrapped, ddof=1)
    self.av = np.sum(self.bootstrapped) / nboots
    self.bias = np.sum(self.delta) / nboots
    self.unbiased = self.biased + self.bias
    self.median = np.median(self.bootstrapped)
    self.nlow = self.lower(self.biased - 1.96 * self.std / np.sqrt(nval))
    self.nhigh = self.upper(self.biased + 1.96 * self.std / np.sqrt(nval))
    self.dlow = self.lower(self.unbiased - stat.mquantiles(self.delta, prob=[0.95])[0])
    self.dhigh = self.upper(self.unbiased - stat.mquantiles(self.delta, prob=[0.05])[0])
def distribution():
    data = get_statistics()
    ratio = data['OrderQty'] / data['adv']
    print data
    filter_ratio = ratio[ratio > 0.05]
    count, division = np.histogram(filter_ratio, 0.025 * np.arange(80))
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    filter_ratio.hist(ax=ax1, bins=division)
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111)
    ax2.set_title('Repartition of Order Turnover (greater than 1M)')
    ax2.set_xlabel('Order Turnover (Millions of Euros)')
    ax2.set_ylabel('Number of Orders')
    ax2.xaxis.set_major_formatter(FixedOrderFormatter(6))
    filtered_turnover = data[np.logical_and(np.isfinite(data['turnover']), data['turnover'] > 0)]
    turnover = filtered_turnover['turnover'] * filtered_turnover['rate_to_euro']
    count, division = np.histogram(turnover, bins=np.arange(1e6, max(turnover), 2.5e5))
    turnover.hist(ax=ax2, bins=division, color=kc_main_colors()['dark_blue'])
    print mquantiles(turnover.values, [0.99, 0.995, 0.999])
    plt.show()
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    X = np.asarray(X)
    y = np.asarray(y)
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0
    svr.fit(X, y)
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print 'MAE: ', mean_absolute_error(test_y, pred)
        print 'RMSE: ', sqrt(mean_squared_error(test_y, pred))
        print 'corrpearson: ', sp.stats.pearsonr(test_y, pred)
        print 'r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2
        print mquantiles(test_y, prob=[0.10, 0.90])
        print mquantiles(pred, prob=[0.10, 0.90])
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print >>output, p
    return
def get_map(self, name, burn, thinning, method='fit'):
    from scipy.stats.mstats import mquantiles
    d = name if isinstance(name, np.ndarray) else self.get_samples(burn, thinning, name)
    method = method.lower()
    if method == 'fit':
        map, ep, em = fit_distribution(d)
    elif method == 'fit2':
        map, ep, em, map2, ep2, em2, x = fit_distribution2(d)
        tt = np.linspace(d.min(), d.max(), 500)
        ft = ((1 - x) * asymmetric_gaussian(tt, map, ep, em)
              + x * asymmetric_gaussian(tt, map2, ep2, em2))
        map = tt[ft.argmax()]
        em = mquantiles(d[d < map], [1 - 2 * 0.341])[0]
        ep = mquantiles(d[d > map], [2 * 0.341])[0]
    elif method == 'median':
        map, ep, em = mquantiles(d, [0.5, 0.5 - 0.341, 0.5 + 0.341])
    elif method == 'histogram':
        # histogram the samples; ``res`` (the histogram resolution) is expected
        # to be defined in the enclosing scope
        nb, vl = np.histogram(d, res)
        mid = np.argmax(nb)
        map = 0.5 * (vl[mid] + vl[mid + 1])
        em = mquantiles(d[d < map], [1 - 2 * 0.341])[0]
        ep = mquantiles(d[d > map], [2 * 0.341])[0]
    return map, ep, em
def _compute_sig(self):
    """Calculates the significance level of the variable tested"""
    m = self._est_cond_mean()
    Y = self.endog
    X = self.exog
    n = np.shape(X)[0]
    u = Y - m
    u = u - np.mean(u)  # center
    fct1 = (1 - 5 ** 0.5) / 2.
    fct2 = (1 + 5 ** 0.5) / 2.
    u1 = fct1 * u
    u2 = fct2 * u
    r = fct2 / (5 ** 0.5)
    I_dist = np.empty((self.nboot, 1))
    for j in range(self.nboot):
        u_boot = copy.deepcopy(u2)
        prob = np.random.uniform(0, 1, size=(n, 1))
        ind = prob < r
        u_boot[ind] = u1[ind]
        Y_boot = m + u_boot
        I_dist[j] = self._compute_test_stat(Y_boot, X)
    sig = "Not Significant"
    if self.test_stat > mquantiles(I_dist, 0.9):
        sig = "*"
    if self.test_stat > mquantiles(I_dist, 0.95):
        sig = "**"
    if self.test_stat > mquantiles(I_dist, 0.99):
        sig = "***"
    return sig
def sy_integral_function(q, x, y):
    f1_inv = mquantiles(x, [q])
    f2_inv = mquantiles(y, [q])
    if f1_inv[0] == 0.0 and f2_inv[0] == 0.0:
        return 1.0
    else:
        return min(f1_inv[0], f2_inv[0]) / float(max(f1_inv[0], f2_inv[0]))
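# A minimal usage sketch for sy_integral_function (the sample arrays below are
# invented for illustration): at each quantile level q it compares the
# empirical quantiles of two positive-valued samples and returns their ratio,
# so two identical distributions give 1.0 at every q.
import numpy as np
from scipy.stats.mstats import mquantiles

x = np.random.gamma(2.0, 1.0, size=1000)
y = np.random.gamma(2.0, 1.5, size=1000)
for q in (0.25, 0.5, 0.75):
    print(q, sy_integral_function(q, x, y))  # < 1.0 where the quantiles diverge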
def _compute_min_std_IQR(data):
    """Compute the minimum of std and normalized IQR for each variable."""
    s1 = np.std(data, axis=0)
    q75 = mquantiles(data, 0.75, axis=0).data[0]
    q25 = mquantiles(data, 0.25, axis=0).data[0]
    s2 = (q75 - q25) / 1.349  # normalized IQR (matches the std for a Gaussian)
    dispersion = np.minimum(s1, s2)
    return dispersion
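# Hedged example of why the min(std, IQR/1.349) rule is used (the data below
# is synthetic): a single gross outlier inflates the standard deviation, while
# the IQR-based estimate stays close to the true spread, so taking the minimum
# gives a robust per-column dispersion.
import numpy as np
from scipy.stats.mstats import mquantiles

data = np.random.normal(0.0, 1.0, size=(500, 2))
data[0, 1] = 100.0  # one gross outlier in the second column
print(_compute_min_std_IQR(data))  # both entries stay near 1.0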
def _makeHexbinHist(x, y, x_label, y_label, title, outfile, format, quantile=None):
    nullfmt = NullFormatter()
    left, width = 0.1, 0.6
    bottom, height = 0.1, 0.6
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]
    # start with a rectangular Figure
    fig = plt.figure(dpi=300, figsize=(8, 8))
    fig.suptitle(title)
    axHexbin = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    # no labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)
    if quantile is not None:
        mask = n.all([x < mstats.mquantiles(x, [quantile])[0],
                      y < mstats.mquantiles(y, [quantile])[0]], axis=0)
        x = x[mask]
        y = y[mask]
    x_min = n.min(x)
    x_max = n.max(x)
    y_min = n.min(y)
    y_max = n.max(y)
    axHexbin.hexbin(x, y, bins='log', edgecolors='none', cmap=plt.cm.hot)
    axHexbin.set_xlim(x_min, x_max)
    axHexbin.set_xlabel(x_label)
    axHexbin.set_ylim(y_min, y_max)
    axHexbin.set_ylabel(y_label)
    if max(x) < 1.:  # values are fractions, use a fixed number of bins
        bins_for_x = 50
    else:
        bins_for_x = (max(x) - min(x)) / 100 + 1
    if max(y) < 1.:
        bins_for_y = 50
    else:
        bins_for_y = (max(y) - min(y)) / 100 + 1
    axHistx.hist(x, bins=bins_for_x)
    axHistx.set_xlim(x_min, x_max)
    axHisty.hist(y, bins=bins_for_y, orientation='horizontal')
    axHisty.set_ylim(y_min, y_max)
    for label in axHisty.get_xticklabels():
        label.set_rotation('vertical')
    fig.savefig(outfile, format=format)
def _real_paste(self, box, u):
    '''
    returns two candidate new boxes, pasted along the upper and lower dimension

    :param box: a PrimBox instance
    :param u: the uncertainty for which to paste
    :returns: two box lims and the associated indices
    '''
    box_diff = self.box_init[u][1] - self.box_init[u][0]
    pa = self.paste_alpha * box.yi.shape[0]
    pastes = []
    for direction in ['upper', 'lower']:
        box_paste = np.copy(box.box_lims[-1])
        test_box = np.copy(box.box_lims[-1])
        if direction == 'lower':
            i = 0
            box_diff = -1 * box_diff
            test_box[u][1] = test_box[u][i]
            test_box[u][i] = self.box_init[u][i]
            indices = self.in_box(test_box)
            data = self.x[indices][u]
            paste_value = self.box_init[u][i]
            if data.shape[0] > 0:
                b = (data.shape[0] - pa) / data.shape[0]
                paste_value = mquantiles(data, [b], alphap=self.alpha, betap=self.beta)[0]
        elif direction == 'upper':
            i = 1
            test_box[u][0] = test_box[u][i]
            test_box[u][i] = self.box_init[u][i]
            indices = self.in_box(test_box)
            data = self.x[indices][u]
            paste_value = self.box_init[u][i]
            if data.shape[0] > 0:
                b = pa / data.shape[0]
                paste_value = mquantiles(data, [b], alphap=self.alpha, betap=self.beta)[0]
        box_paste[u][i] = paste_value
        indices = self.in_box(box_paste)
        pastes.append((indices, box_paste))
    return pastes
def main(fname):
    collision_data = CollisionData.data_from_file(fname)
    t1 = collision_data.type1s
    t2 = collision_data.type2s
    deltaVs = collision_data.deltaVs
    distances = collision_data.distances
    low = np.min(deltaVs)
    high = np.max(deltaVs)
    #diff_indices = np.where(t1 != t2)
    #diff_deltaVs = deltaVs[diff_indices]
    #diff_hist = gaussian_kde(diff_deltaVs)
    #diff_xs = np.linspace(low, high, 200)
    #he_indices = np.where(np.logical_and(t1 == 0, t2 == 0))
    #he_deltaVs = deltaVs[he_indices]
    #he_hist = gaussian_kde(he_deltaVs)
    #he_xs = np.linspace(low, high, 200)
    #he_dist = distances[he_indices]
    xe_indices = np.where(np.logical_and(t1 == 7, t2 == 7))
    xe_deltaVs = deltaVs[xe_indices]
    xe_xs = np.linspace(low, high, 200)
    xe_dist = distances[xe_indices]
    #print np.mean(xe_dist), np.median(xe_dist), np.mean(distances), np.median(distances)
    print mquantiles(distances, prob=[0.8, 0.85, 0.9, 0.95, 0.975, 0.99])
    if len(xe_deltaVs) <= 1:
        return
    xe_hist = gaussian_kde(xe_deltaVs)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    # print np.mean(xe_dist)
    # update the view limits
    #ax.plot(diff_xs, diff_hist(diff_xs), c='r', marker='.', label='He-Xe')
    #ax.plot(he_xs, he_hist(he_xs), c='g', marker='.', label='He-He')
    ax.plot(xe_xs, xe_hist(xe_xs), c='b', marker='.', label='Xe-Xe')
    ax.set_xlim(0, high)
    ax.set_title("Collision Radius vs Difference in Velocity")
    ax.set_xlabel("Delta V (m/s)")
    ax.set_ylabel("Relative Density")
    ax.legend()
    fig.savefig(os.path.splitext(fname)[0] + ".png", dpi=250)
    plt.close()
    fig = None
    ax = None
def generate_quantile_summary(result_summary_table, quantiles=numpy.linspace(0, 1, 101)):
    from scipy.stats.mstats import mquantiles
    import pandas

    summary_table = pandas.DataFrame.from_items([
        ("id", result_summary_table["id"]),
        ("rmsd", result_summary_table["quartile"][..., 0])])
    result = numpy.empty_like(
        quantiles,
        dtype=[("quantile", float),
               ("global_quantile_value", float),
               ("worst_per_structure_quantile_value", float)])
    result["quantile"] = quantiles
    result["global_quantile_value"] = mquantiles(summary_table["rmsd"].values, quantiles)
    result["worst_per_structure_quantile_value"] = mquantiles(
        summary_table.groupby("id")["rmsd"].max().values, quantiles)
    return result
def filtering(control_file, affected_file, filtered_control_file, filtered_affected_file,
              max_pvalue=None, min_cov=None, max_cov=None, min_delta_methylation=None,
              filter_quantil=None):
    control_quantil = None
    affected_quantil = None
    if filter_quantil:
        control_quantil = mquantiles(
            np.loadtxt(control_file, delimiter='\t', usecols=(3,)),
            prob=[filter_quantil])[0]
        affected_quantil = mquantiles(
            np.loadtxt(affected_file, delimiter='\t', usecols=(3,)),
            prob=[filter_quantil])[0]
    non_filtered_sites = 0
    for site_counter, (control_line, affected_line) in enumerate(
            izip(open(control_file), open(affected_file))):
        c_chrom, c_start, c_end, c_cov, c_meth, c_strand = control_line.strip().split('\t')
        a_chrom, a_start, a_end, a_cov, a_meth, a_strand = affected_line.strip().split('\t')
        try:
            assert c_chrom == a_chrom
            assert c_start == a_start
            assert c_end == a_end
            assert c_strand == a_strand
        except AssertionError:
            sys.exit('This tool needs intersected input files, so that each site is present '
                     'in both files, affected and control.\n %s : %s \n %s : %s \n %s : %s \n %s : %s \n'
                     % (c_chrom, a_chrom, c_start, a_start, c_end, a_end, c_strand, a_strand))
        c_cov, c_meth, a_cov, a_meth = map(float, [c_cov, c_meth, a_cov, a_meth])
        if min_cov is not None and (a_cov < min_cov or c_cov < min_cov):
            continue
        if max_cov is not None and (a_cov > max_cov or c_cov > max_cov):
            continue
        if min_delta_methylation is not None and abs(a_meth - c_meth) < min_delta_methylation:
            continue
        if filter_quantil and (c_cov > control_quantil or a_cov > affected_quantil):
            continue
        if max_pvalue is not None:
            control_methylated = c_cov * c_meth / 100
            control_unmethylated = c_cov - control_methylated
            affected_methylated = a_cov * a_meth / 100
            affected_unmethylated = a_cov - affected_methylated
            try:
                # Try to use the much faster fisher module from http://pypi.python.org/pypi/fisher/
                p = fisher_exact.pvalue(control_methylated, control_unmethylated,
                                        affected_methylated, affected_unmethylated)
                pvalue = p.two_tail
            except:
                oddsratio, pvalue = stats.fisher_exact(
                    [(control_methylated, control_unmethylated),
                     (affected_methylated, affected_unmethylated)],
                    alternative='two-sided')
            if pvalue > max_pvalue:
                continue
        non_filtered_sites += 1
        filtered_control_file.write(control_line)
        filtered_affected_file.write(affected_line)
    sys.stdout.write("%s from %s filtered.\n" % (site_counter + 1 - non_filtered_sites,
                                                 site_counter + 1))
    filtered_affected_file.close()
    filtered_control_file.close()
def PreparePloting(self):
    # Set plot ranges and titles
    maxlim = mquantiles(self.m2g.Y * self.m2g.norm, .98)
    minlim = mquantiles(self.m2g.Y * self.m2g.norm, .02)
    if minlim == maxlim:
        minlim, maxlim = 0, 1
    self.ax1.set_ylim(minlim - (maxlim - minlim) * .1, maxlim + (maxlim - minlim) * .3)
    maxlim = mquantiles(self.m2g.K[:-1, :], .98)
    minlim = min(0, mquantiles(self.m2g.K[:-1, :], .02))
    if minlim == maxlim:
        minlim, maxlim = 0, 1
    self.ax2.set_ylim(minlim - (maxlim - minlim) * .1, maxlim + (maxlim - minlim) * .1)
    self.ax1.set_title('Fitted data: ' + self.name + ', '
                       + r'$\chi^2/doF$: %.2f' % self.m2g.chi2)
    self.plot_step()
def makeReferenceRLHistogram(alnRatios, refLengths, outfile, format, quantile):
    """
    X-axis: (unique) reference length
    Y-axis: count
    """
    fig = plt.figure(dpi=300, figsize=(10, 6))
    ax = fig.add_subplot(111)
    ax.set_title("Aligned Reference Length Distribution (qCov>=80%)")
    fullPass = alnRatios[alnRatios['IsFullPass'] & (alnRatios['rCov'] >= .8)]
    fullLength = alnRatios[alnRatios['IsFullLength'] & (alnRatios['rCov'] >= .8)]
    if quantile is not None:
        refLengths = refLengths[refLengths < mstats.mquantiles(refLengths, [quantile])[0]]
    # plot all references first
    bins = 50
    y, binEdges = n.histogram(refLengths, bins=bins)
    bincenters = 0.5 * (binEdges[1:] + binEdges[:-1])
    xnew = n.linspace(bincenters.min(), bincenters.max(), 100)
    ysmooth = spline(bincenters, y, xnew)
    # normalize by hand
    ysmooth = ysmooth * 1. / sum(ysmooth)
    ax.plot(xnew, ysmooth, '-', label="All References")
    max_y = max(ysmooth)
    for l, label in zip((fullPass, fullLength),
                        ("Aligned full-pass subreads",
                         "Aligned " + SeenName + " subreads")):
        alnRefLength = dict(zip(l['RefID'], l['RefLength']))
        if len(alnRefLength) == 0:
            continue
        alnRefLength = n.array(alnRefLength.values())
        if quantile is not None:
            alnRefLength = alnRefLength[alnRefLength < mstats.mquantiles(alnRefLength, [quantile])[0]]
        bins = (max(alnRefLength) - min(alnRefLength)) / 100 + 1
        y, binEdges = n.histogram(alnRefLength, bins=bins)
        bincenters = 0.5 * (binEdges[1:] + binEdges[:-1])
        xnew = n.linspace(bincenters.min(), bincenters.max(), 300)
        ysmooth = spline(bincenters, y, xnew)
        # normalize by hand
        ysmooth = ysmooth * 1. / sum(ysmooth)
        ax.plot(xnew, ysmooth, '-', label=label)
        max_y = max(max_y, max(ysmooth))
        #num, bins, patches = ax.hist(alnRefLength, bins=50, histtype='step', label=label, normed=True)
        #max_y = max(max_y, max(num))
    ax.set_ylim(0, max_y * 1.1)
    ax.legend(loc='upper center', prop={'size': 'small'})
    ax.set_xlabel("Reference Length")
    ax.set_ylabel("Fraction")
    fig.savefig(outfile, format=format)
def discretePaste(x_init, y_init, y, name, box, box_init, paste_alpha, n, direction, obj_func):
    box_diff = box_init[name][1] - box_init[name][0]
    if direction == 'lower':
        i = 0
        paste_alpha = 1 - paste_alpha
        box_diff = -1 * box_diff
    if direction == 'upper':
        i = 1
    box_paste = np.copy(box)
    y_paste = y
    test_box = np.copy(box)
    if direction == 'lower':
        test_box[name][i + 1] = test_box[name][i]
        test_box[name][i] = box_init[name][i]
        logical = in_box(x_init, test_box)
        data = x_init[logical][name]
        if data.shape[0] > 0:
            a = paste_alpha * y.shape[0]
            b = (data.shape[0] - a) / data.shape[0]
            paste_value = mquantiles(data, [b], alphap=1. / 3, betap=1. / 3)[0]
            paste_value = int(round(paste_value))
            box_paste[name][i] = paste_value
            logical = in_box(x_init, box_paste)
            y_paste = y_init[logical]
    if direction == 'upper':
        test_box[name][i - 1] = test_box[name][i]
        test_box[name][i] = box_init[name][i]
        logical = in_box(x_init, test_box)
        data = x_init[logical][name]
        if data.shape[0] > 0:
            a = paste_alpha * y.shape[0]
            b = a / data.shape[0]
            paste_value = mquantiles(data, [b], alphap=1. / 3, betap=1. / 3)[0]
            paste_value = int(round(paste_value))
            box_paste[name][i] = paste_value
            logical = in_box(x_init, box_paste)
            y_paste = y_init[logical]
    # y means of pasted boxes
    obj = obj_func(y, y_paste)
    # mass of pasted boxes
    mass_paste = y_init[logical].shape[0] / n
    return (obj, mass_paste, box_paste)
def prune(B, H=None, per=.2, deg=True, cap='in'):
    # prune Graph based on degree or properties in cap
    # node based
    # H will always be preserved
    G = nx.DiGraph(B)
    if deg:
        if cap == 'in':
            seq = G.in_degree().values()
            cut = mquantiles(seq, 1 - per)
            mk = 1
        elif cap == 'out':
            seq = G.out_degree().values()
            cut = mquantiles(seq, 1 - per)
            mk = 2
        else:
            print 'Error in setting cap string'
            return None
    else:
        seq = []
        S = zip(*G.nodes(True))[1]
        for w in S:
            try:
                seq.append(w[cap])
                mk = 3
            except KeyError:
                print 'Some nodes lack cap string'
                return None
        cut = mquantiles(seq, 1 - per)
    to_del = []
    for n in G.nodes_iter():
        G.node[n]['size'] = 2 * G.in_degree(n)
        if H is not None:
            if n in H.nodes():
                continue
        if mk == 1:
            t = G.in_degree(n)
            if t < cut:
                to_del.append(n)
        elif mk == 2:
            t = G.out_degree(n)
            if t < cut:
                to_del.append(n)
        else:
            t = G.node[n][cap]
            if t < cut:
                to_del.append(n)
    G.remove_nodes_from(to_del)
    return G
def create_grouped_index_df(bin_num):
    ## load the labels and start_time column for train and test data
    start_time = time.time()
    train_labels = pd.read_csv(data_path + train_num_file, index_col='Id',
                               usecols=['Id', dep_var_name])
    train_date_start_columm = pd.read_csv(data_path + train_date_file, index_col='Id',
                                          usecols=['Id', start_time_column_name])
    test_date_start_columm = pd.read_csv(data_path + test_date_file, index_col='Id',
                                         usecols=['Id', start_time_column_name])
    end_time = time.time()
    print 'data loading takes ', round((end_time - start_time), 1), ' seconds.'
    ## join the start_time with labels, then drop the NaN in start_time
    labeled_start_time = pd.merge(train_labels, train_date_start_columm, how='left',
                                  left_index=True, right_index=True)
    ## this labeled_start_time dataFrame doesn't contain the NaN, therefore it
    ## can be directly used for calculating the mquantiles
    labeled_start_time = labeled_start_time[~labeled_start_time[start_time_column_name].isnull()]
    ## section to subset the data by start_time
    prob_list = [1. * i / bin_num for i in range(1, bin_num)]
    quantile_values = mquantiles(labeled_start_time[start_time_column_name], prob=prob_list)
    bins = [labeled_start_time[start_time_column_name].min()]
    bins.extend(quantile_values)
    bins.append(labeled_start_time[start_time_column_name].max())
    bin_names = [str(i) for i in range(len(bins) - 1)]
    ## cut the entire dataframe into different time_windows by start_time
    tmp_train = train_date_start_columm.copy()
    tmp_test = test_date_start_columm.copy()
    tmp_train['time_window_num'] = pd.cut(tmp_train[start_time_column_name], bins, labels=bin_names)
    tmp_test['time_window_num'] = pd.cut(tmp_test[start_time_column_name], bins, labels=bin_names)
    ## create a row number column, start index is 1
    tmp_train['row_num'] = range(1, (tmp_train.shape[0] + 1))
    tmp_test['row_num'] = range(1, (tmp_test.shape[0] + 1))
    return tmp_train, tmp_test, bins, bin_names
def try_peel(x, y, j, peel_alpha, box, direction):
    '''
    make a test peel box

    returns a tuple (mean, volume, box)
    '''
    alpha = 1. / 3
    beta = 1. / 3
    i = 0
    if direction == 'upper':
        peel_alpha = 1 - peel_alpha
        i = 1
    box_peel = mquantiles(x[:, j], [peel_alpha], alphap=alpha, betap=beta)[0]
    if direction == 'lower':
        y_mean_peel = np.mean(y[x[:, j] >= box_peel])
    if direction == 'upper':
        y_mean_peel = np.mean(y[x[:, j] <= box_peel])
    temp_box = copy.deepcopy(box)
    temp_box[i, j] = box_peel
    box_vol = vol_box(temp_box)
    return (y_mean_peel, box_vol, temp_box)
def do_mock_distance(self, dset, theta, thetastar, rng):
    ncount = dset.get_data(0)
    data = []
    if ncount.get_len() > 0:
        if self.true_data:
            mass_vec = ncount.get_lnM_true()
            z_vec = ncount.get_z_true()
            for i in range(mass_vec.len()):
                data.append([z_vec.get(i), mass_vec.get(i)])
        else:
            mass_mat = ncount.get_lnM_obs()
            z_mat = ncount.get_z_obs()
            for i in range(mass_mat.nrows()):
                data.append([z_mat.get(i, 0), mass_mat.get(i, 0)])
    data = np.array(data)
    data_bin = np.array([[item[0] for item in data if item[1] >= self.dm_choose[i]]
                         for i in range(len(self.dm_choose) - 1)])
    mock_summary = [mquantiles(elem, prob=self.quant_list) if len(elem) > 0
                    else [0 for jj in self.quant_list]
                    for elem in data_bin]
    distance = [np.sqrt(sum([(self.data_summary[i][j] - mock_summary[i][j]) ** 2
                             for j in range(len(self.data_summary[i]))]))
                for i in range(len(self.data_summary))]
    del ncount
    del data
    del data_bin
    del mock_summary
    return sum(distance)
def _get_par_summary(sim, n, probs):
    """Summarize chains merged and individually

    Parameters
    ----------
    sim : dict
        from stanfit object
    n : int
        parameter index
    probs : iterable of int
        quantiles

    Returns
    -------
    summary : dict
        Dictionary containing summaries
    """
    # _get_samples gets chains for nth parameter
    ss = _get_samples(n, sim, inc_warmup=False)
    msdfun = lambda chain: (np.mean(chain), np.std(chain, ddof=1))
    qfun = lambda chain: mquantiles(chain, probs)
    c_msd = np.array([msdfun(s) for s in ss]).flatten()
    c_quan = np.array([qfun(s) for s in ss]).flatten()
    ass = np.asarray(ss).flatten()
    msd = np.asarray(msdfun(ass))
    quan = qfun(np.asarray(ass))
    return dict(msd=msd, quan=quan, c_msd=c_msd, c_quan=c_quan)
def TigerCalculateEfficiency(list_tigerruns, N=1, beta=[0.95], background=0):
    """
    CALCULATE EFFICIENCY FROM A LIST OF TIGERRUNS
    """
    OddsBeta = [mquantiles(list_tigerruns[background].odds(N), prob=[b]) for b in beta]
    efficiencies = empty((len(list_tigerruns) - 1, len(beta)))
    for i in xrange(len(list_tigerruns)):
        if N > list_tigerruns[i].nsources:
            stdout.write("... Warning: Not sufficient events (%s) to calculate the "
                         "efficiency for %s sources. Writing zeros\n"
                         % (list_tigerruns[i].nsources, N))
            if i < background:
                efficiencies[i, :] = 0.0
            else:
                efficiencies[i - 1, :] = 0.0
            continue
        if i != background:
            tmp = list_tigerruns[i].odds(N)
            for j in xrange(len(OddsBeta)):
                msk = tmp > OddsBeta[j]
                nabovebeta = len(tmp[msk])
                ntotal = len(tmp)
                eff = float(nabovebeta) / float(ntotal)
                if i < background:
                    efficiencies[i, j] = eff
                else:
                    efficiencies[i - 1, j] = eff
    return efficiencies
def outlier_detection(q, time, mq, k=1.5):
    """
    calculates outliers using geodesic distances of the SRSFs from the median

    :param q: numpy ndarray of N x M of M SRSF functions with N samples
    :param time: vector of size N describing the sample points
    :param mq: median calculated using :func:`time_warping.srsf_align`
    :param k: cutoff threshold (default = 1.5)

    :return: q_outlier: outlier functions
    """
    N = q.shape[1]
    ds = zeros(N)
    for kk in range(0, N):
        ds[kk] = sqrt(trapz((mq - q[:, kk]) ** 2, time))
    quartile_range = mquantiles(ds)
    IQR = quartile_range[2] - quartile_range[0]
    thresh = quartile_range[2] + k * IQR
    ind = (ds > thresh).nonzero()
    q_outlier = q[:, ind]
    return q_outlier
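# Hedged, self-contained demo of the quartile-fence rule used in
# outlier_detection (the distances below are synthetic): mquantiles with no
# prob argument returns the 0.25/0.5/0.75 quantiles, and anything beyond
# Q3 + k*IQR is flagged.
import numpy as np
from scipy.stats.mstats import mquantiles

ds = np.abs(np.random.normal(1.0, 0.2, size=200))
ds[:3] = 5.0  # three artificially extreme distances
q1, q2, q3 = mquantiles(ds)
thresh = q3 + 1.5 * (q3 - q1)
print(np.nonzero(ds > thresh)[0])  # indices of the flagged curves (0, 1, 2)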
def plotTomo(self, ax=plt.gca()):
    #fig = figure(figsize=(7,5))
    #ax = fig.add_axes([0.1, 0.1, 0.8, 0.85])
    fig = ax.get_figure()
    #ax.set_yscale('log', nonposy='clip')
    lim = mquantiles(self.G, 0.99)
    self.G[self.G > lim * 1.5] = lim * 1.5  # most probably a failure
    #dr = np.mean(np.diff(self.rho_grid))
    #dr = 1
    power = self.G[:, ::-1].T / 1e6
    img = ax.imshow(power,
                    extent=[self.tvec[0], self.tvec[-1],
                            self.rho_grid[0], self.rho_grid[-1]],
                    aspect='auto', clim=[0, lim / 1e6])
    minorLocator = plt.MultipleLocator(1)
    img.set_cmap('YlOrBr')
    ax.xaxis.set_minor_locator(minorLocator)
    ax.axis([self.tvec[0], self.tvec[-1], 0, 1])
    cb1 = fig.colorbar(img)
    cb1.set_label('$P $ [MW/m$^3$]')
    ax.set_ylabel(r'$\rho_\phi$ [-]')
    ax.set_xlabel('t [s]')
    Rvec, Tvec = np.meshgrid(self.rho_grid, self.tvec)
    CS = ax.contour(Tvec, Rvec, self.G, 10, colors='k', alpha=0.2)
    #plt.show()
    #fig.savefig('G%d.png'%self.shot)
    return fig
def _threshold_gradient(im):
    """Indicate pixel locations with gradient below the bottom 10th percentile

    Parameters
    ----------
    im : array
        The mean intensity images for each channel.
        Size: (num_channels, num_rows, num_columns).

    Returns
    -------
    array
        Binary values indicating whether the magnitude of the gradient
        is below the 10th percentile. Same size as im.
    """
    if im.shape[0] > 1:
        # Calculate directional relative derivatives
        _, g_x, g_y = np.gradient(np.log(im))
    else:
        # Calculate directional relative derivatives
        g_x, g_y = np.gradient(np.log(im[0]))
        g_x = g_x.reshape([1, g_x.shape[0], g_x.shape[1]])
        g_y = g_y.reshape([1, g_y.shape[0], g_y.shape[1]])
    gradient_magnitudes = np.sqrt((g_x ** 2) + (g_y ** 2))
    below_threshold = []
    for chan in gradient_magnitudes:
        threshold = mquantiles(chan[np.isfinite(chan)].flatten(), [0.1])[0]
        below_threshold.append(chan < threshold)
    return np.array(below_threshold)
def maxdd_montecarlo(changes, runs=5000, length=None, serial_dependence=None,
                     quantiles=(0.75, 0.9, 0.975), return_array=False):
    if not length:
        length = len(changes)
    if not serial_dependence:
        seq = changes
        pick = lambda seq: [random.choice(seq)]
    else:
        # Serial dependence detected? Let's sample windows!
        class serial_sampler(object):
            def __init__(self, seq, size):
                self.seq = seq
                self.size = size

            def __len__(self):
                return len(self.seq) - self.size

            def __getitem__(self, i):
                return self.seq[i - self.size: i + self.size]

        pick = lambda seq: random.choice(seq)
        seq = serial_sampler(changes, serial_dependence)
    maxdds = []
    for i in xrange(runs):
        # sample a maxdd
        new_seq = []
        while len(new_seq) < length:
            new_seq += pick(seq)
        maxdds.append(maxdd(new_seq))
    results = {
        'mean maxdd': numpy.mean(maxdds),
        'sd of maxdds': numpy.std(maxdds),
        'quantiles': dict(zip(quantiles, mquantiles(maxdds, quantiles)))
    }
    if return_array:
        results['array of maxdd samples'] = maxdds
    return results
def sample_n_genes_quartile(sample_size, sample, blast_report_suffix, quartile, RefSeq, blast_report_dir):
    #print "sampling this many:\t" + str(sample_size) + " genes for this sample\t" + sample + "\n"
    RefSeq_to_percent_covered_hash = make_refseq_to_percent_covered_hash(
        RefSeq, blast_report_suffix, blast_report_dir, sample)
    percent_covered_keys = RefSeq_to_percent_covered_hash.keys()
    percent_covered_scores = RefSeq_to_percent_covered_hash.values()
    quantiles = mquantiles(percent_covered_scores)
    score_range = ()
    if quartile == "lower":
        score_range = (0, quantiles[0])
    elif quartile == "median":
        score_range = (quantiles[0], quantiles[1])
    elif quartile == "upper":
        score_range = (quantiles[1], float("inf"))  # top bin is unbounded above
    sampled_counter = 0
    sampled_list = {}
    if len(RefSeq_to_percent_covered_hash.keys()) < sample_size:
        sample_size = len(RefSeq_to_percent_covered_hash.keys())
    while sampled_counter < sample_size:
        curr = percent_covered_keys[randint(0, len(percent_covered_scores))]
        curr_score = RefSeq_to_percent_covered_hash[curr]
        if curr_score > score_range[0] and curr_score < score_range[1]:
            sampled_list[curr] = 1
            sampled_counter = sampled_counter + 1
    return sampled_list.keys()
def realPeel(x, y, n, name, peel_alpha, box, direction, obj_func):
    '''
    make a test peel box

    returns a tuple (mean, volume, box)
    '''
    alpha = 1. / 3
    beta = 1. / 3
    i = 0
    if direction == 'upper':
        peel_alpha = 1 - peel_alpha
        i = 1
    box_peel = mquantiles(x[name], [peel_alpha], alphap=alpha, betap=beta)[0]
    if direction == 'lower':
        logical = x[name] >= box_peel
    if direction == 'upper':
        logical = x[name] <= box_peel
    obj = obj_func(y, y[logical])
    temp_box = np.copy(box)
    temp_box[name][i] = box_peel
    box_mass = y[logical].shape[0] / n
    box_vol = box_mass
    # box_vol = vol_box(temp_box)
    return (obj, box_vol, temp_box, logical)
def safety_production(CS, seuil):
    safety_production = 0
    for i in range(len(CS)):
        data = summ(CS[i]).values
        XXX = mquantiles(data, [seuil])
        safety_production += 10 ** -7 * XXX[0]
    return safety_production
def calibrate(self, X, Y, alpha, bbox=None, return_scores=False):
    if bbox is not None:
        self.init_bbox(bbox)
    # Store desired nominal level
    self.alpha = alpha
    # Compute predictions on calibration data
    q_calib = self.bbox.predict(X.astype(np.float32))
    # Estimate conditional histogram for calibration points
    d_calib = self.hist.compute_histogram(q_calib, self.ymin, self.ymax, alpha)
    # Initialize histogram accumulator (grey-box)
    accumulator = HistogramAccumulator(d_calib, self.grid_histogram, self.alpha,
                                       delta_alpha=self.delta_alpha)
    # Generate noise for randomization
    n2 = X.shape[0]
    if self.randomize:
        epsilon = np.random.uniform(low=0.0, high=1.0, size=n2)
    else:
        epsilon = None
    # Compute conformity scores
    if self.intervals:
        scores = accumulator.calibrate_intervals(Y.astype(np.float32), epsilon=epsilon)
    else:
        # TODO: implement this
        raise NotImplementedError
    # Compute upper quantile of scores
    level_adjusted = (1.0 - alpha) * (1.0 + 1.0 / float(n2))
    self.calibrated_alpha = np.round(
        1.0 - mquantiles(scores, prob=level_adjusted)[0], 4)
    # Print message
    print("Calibrated alpha (nominal level: {}): {:.3f}.".format(
        alpha, self.calibrated_alpha))
    return self.calibrated_alpha
def get_split(data, variables, y_variable, min_samples_leaf, n_quantiles):
    variance = np.var(data[y_variable])
    split_value = None
    for variable in variables:
        value_list = data[variable]
        if len(np.unique(value_list)) > n_quantiles:
            probs = [j / float(n_quantiles) for j in range(1, n_quantiles + 1)]
            values = sc_st_mst.mquantiles(value_list, probs)
        else:
            if len(np.unique(value_list)) == 1:
                continue
            values = np.unique(value_list)
        for value in values[:-1]:
            data_with_value = data[data[variable] <= value]
            data_without_value = data[data[variable] > value]
            without_len = len(data_without_value.index)
            with_len = len(data_with_value.index)
            if (with_len < min_samples_leaf) or (without_len < min_samples_leaf):
                continue
            ### fraction of rows falling on the <= side of the split
            ratio = with_len / float(len(data.index))
            ### split_variance measures how well the split separates the target values
            split_variance = (ratio * np.var(data_with_value[y_variable])
                              + (1 - ratio) * np.var(data_without_value[y_variable]))
            if split_variance < variance:
                variance = split_variance
                split_variable = variable
                split_value = value
    if split_value is None:
        return None
    return split_variable, split_value, variance
def binify_even_bin(X, N=10, dm=None, maxlag=None, **kwargs):
    """
    Returns a distance matrix with all entries sorted into bin numbers, along
    with an array of bin widths. The matrix has the same form as the distance
    matrix dm in squareform. The bins will be indexed from 0 to N - 1.

    If dm is None, then the point_dist function will be used to calculate a
    distance matrix; kwargs will be passed to point_matrix. N specifies the
    number of (equally populated) bins.

    :param X: np.array of x, y coordinates.
    :param N: int with the number of bins
    :param dm: numpy.ndarray with the distance matrix
    :param maxlag: maximum lag for the binning
    :param kwargs: will be passed to calculate the point_matrix if no dm is given
    :return:
    """
    _X = list(X)
    # check that all coordinates in the list have the same dimension and are not empty
    if not len(set([len(e) for e in _X])) == 1 or len(_X[0]) == 0:
        raise ValueError(
            "One or more coordinates are missing.\nPlease provide the coordinates for all values "
        )
    # get the distance matrix
    if dm is None:
        _dm = nd_dist(_X, **kwargs)
    else:
        _dm = dm
    # create bin matrix as copy of dm
    bm = copy.deepcopy(_dm)
    # get the upper bounds by calculating the quantiles of the distances
    binubound = mquantiles(np.array(_dm).flatten(),
                           prob=[i / N for i in range(1, N + 1)])
    # set all bins except the first one
    for i in range(1, N):
        bm[(_dm > binubound[i - 1]) & (_dm <= binubound[i])] = i
    # set the first bin
    bm[_dm < binubound[0]] = 0
    return np.matrix(bm), np.diff([0, *binubound])
def test_mquantiles_limit_keyword(self):
    # Regression test for Trac ticket #867
    data = np.array([[6., 7., 1.],
                     [47., 15., 2.],
                     [49., 36., 3.],
                     [15., 39., 4.],
                     [42., 40., -999.],
                     [41., 41., -999.],
                     [7., -999., -999.],
                     [39., -999., -999.],
                     [43., -999., -999.],
                     [40., -999., -999.],
                     [36., -999., -999.]])
    desired = [[19.2, 14.6, 1.45],
               [40.0, 37.5, 2.5],
               [42.8, 40.05, 3.55]]
    quants = mstats.mquantiles(data, axis=0, limit=(0, 50))
    assert_almost_equal(quants, desired)
def convert_to_8bit(img_array):
    """Converts to 8 bit, stretching the contrast according to image stats"""
    img_array = img_array.astype('float32')
    int_quant = mquantiles(img_array.ravel(), [0.01, 0.99])
    # if the image is flat return the image or 255
    if int_quant[0] == int_quant[1]:
        flat_field = min(img_array.max(), 255)
        return flat_field * np.ones_like(img_array)
    # Remove outliers
    img_array[img_array < int_quant[0]] = int_quant[0]
    img_array[img_array > int_quant[1]] = int_quant[1]
    img_array -= img_array.min()
    img_array /= img_array.max()
    img_array *= 255
    return img_array
def _set_thresholds(self, newthresholds=None):
    "Defines the indicator thresholds for the definition of ENSO phases."
    _optinfo = self.optinfo
    if newthresholds is not None:
        try:
            (low, high) = newthresholds
        except:
            raise TypeError("The input thresholds must be given as a "
                            "sequence (low, high)")
        if low > high:
            (low, high) = (high, low)
        thresholds = (float(low), float(high))
    else:
        thresholds = mquantiles(self._series, (.25, .75), axis=None)
    if thresholds != _optinfo.get('thresholds', None):
        self._cachedmonthly = {}
        self._cachedcurrent = None
    _optinfo['thresholds'] = thresholds
def plot_wwadist(wwa):
    ''' Plot the distribution of wwa with the 95% quantile line.

    Args:
        wwa (array): the weighted wavelet amplitude.

    Returns:
        fig (figure): the distribution plot with the 95% quantile marked
    '''
    sns.set(style="darkgrid", font_scale=2)
    plt.subplots(figsize=[20, 4])
    q95 = mstats.mquantiles(wwa, 0.95, alphap=0.5, betap=0.5)
    fig = sns.distplot(np.nan_to_num(wwa.flat))
    fig.axvline(x=q95, ymin=0, ymax=0.5, linewidth=2, linestyle='-')
    return fig
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles`` of ``X``.

    The grid is generated by placing ``grid_resolution`` equally spaced
    points between the ``percentiles`` of each column of ``X``.

    Parameters
    ----------
    X : ndarray
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme values
        of the grid axes.
    grid_resolution : int
        The number of equally spaced points that are placed on the grid.

    Returns
    -------
    grid : ndarray
        All data points on the grid; ``grid.shape[1] == X.shape[1]``
        and ``grid.shape[0] == grid_resolution * X.shape[1]``.
    axes : seq of ndarray
        The axes with which the grid has been created.
    """
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    if not all(0. <= x <= 1. for x in percentiles):
        raise ValueError('percentile values must be in [0, 1]')
    axes = []
    for col in range(X.shape[1]):
        uniques = np.unique(X[:, col])
        if uniques.shape[0] < grid_resolution:
            # feature has low resolution, use unique vals
            axis = uniques
        else:
            emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
            # create axis based on percentiles and grid resolution
            axis = np.linspace(emp_percentiles[0, col],
                               emp_percentiles[1, col],
                               num=grid_resolution, endpoint=True)
        axes.append(axis)
    return cartesian(axes), axes
def process_outliers(self, mode='zscore'):
    """
    Check for outliers in calculated summary stats.

    Outliers are the few very high or very low values that can potentially
    introduce bias in tasks such as parameter inference. One can either
    remove them, replace them with the mean value, or use a log scale for
    the statistic in question. This choice is left to the user.

    Parameters
    ----------
    mode : str, optional
        Either use z-scores ('zscore') or the inter-quantile range ('iqr'),
        by default 'zscore'

    Returns
    -------
    array
        Indices of the columns of the summary-statistics matrix ``self.s``
        containing outliers
    """
    if mode == 'zscore':
        # This will give us per-feature/per-statistic z-scores
        zscores = zscore(self.s, axis=0)
        # Find columns where abs(zscore) > threshold
        zscore_threshold = 3
        violation_indices = np.argwhere(np.abs(zscores) > zscore_threshold)
        if len(violation_indices) < 1:
            return
        outlier_indices = np.unique(violation_indices[:, 1])
    else:
        # Outlier detection using IQR
        quants = mquantiles(self.s)
        iqr = quants[2] - quants[0]
        iqr_factor = 1.5
        violations_left = self.s < quants[0] - iqr_factor * iqr
        violations_right = self.s > quants[2] + iqr_factor * iqr
        violation_indices = np.argwhere(violations_left | violations_right)
        if len(violation_indices) < 1:
            return
        outlier_indices = np.unique(violation_indices[:, 1])
    if len(outlier_indices) > 0:
        self.outlier_column_indices = outlier_indices
        print('Dataset:process_outliers: found outliers at indice(s) {}'.format(outlier_indices))
        print('Outliers can be transformed using the function '
              'Dataset.apply_func_to_outlier_columns if so desired.')
    return self.outlier_column_indices
def nearest_neighbors(lab_im, n=3, quantiles=[0.05, 0.25, 0.5, 0.75, 0.95]):
    """Find the distances to and angle between the n nearest neighbors.

    Parameters
    ----------
    lab_im : 2D array of int
        An image of labeled objects.
    n : int, optional
        How many nearest neighbors to check. (Angle is always between
        the two nearest only.)
    quantiles : list of float in [0, 1], optional
        Which quantiles of the features to compute.

    Returns
    -------
    nei : 1D array of float, shape (len(quantiles) * (n + 3),)
        The quantiles of sines, cosines, angles, and `n` nearest
        neighbor distances.
    names : list of string
        The name of each feature.
    """
    if lab_im.dtype == bool:
        lab_im = nd.label(lab_im)[0]
    centroids = np.array(
        [p.centroid for p in measure.regionprops(lab_im, coordinates='rc')])
    nbrs = (NearestNeighbors(n_neighbors=(n + 1), algorithm='kd_tree')
            .fit(centroids))
    distances, indices = nbrs.kneighbors(centroids)
    angles = triplet_angles(centroids, indices[:, :3])
    # ignore order/orientation of vectors, only measure acute angles
    angles[angles > np.pi] = 2 * np.pi - angles[angles > np.pi]
    distances[:, 0] = angles
    sines, cosines = np.sin(angles), np.cos(angles)
    features = np.hstack(
        (sines[:, np.newaxis], cosines[:, np.newaxis], distances))
    nei = mquantiles(features, quantiles, axis=0).ravel()
    colnames = (['sin-theta', 'cos-theta', 'theta'] +
                ['d-neighbor-%i-' % i for i in range(1, n + 1)])
    # mquantiles(..., axis=0).ravel() is quantile-major, so iterate quantiles
    # in the outer position to keep names aligned with nei
    names = ['%s-percentile-%i' % (colname, int(q * 100))
             for q, colname in it.product(quantiles, colnames)]
    return nei, names
def get_stats(arr):
    sz = arr.size
    amin, amax = arr.min(), arr.max()
    q = ms.mquantiles(arr, [0.1, 0.5, 0.9])
    mu = arr.mean()
    sigma = arr.std()
    cv = sigma / mu
    return {
        "size": sz,
        "min": amin,
        "max": amax,
        "pct10": q[0],
        "pct50": q[1],
        "pct90": q[2],
        "mu": mu,
        "sigma": sigma,
        "cv": cv
    }
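# Hedged usage sketch for get_stats (assumes ``ms`` is scipy.stats.mstats, as
# the function body implies; the sample is synthetic): summarize a log-normal
# sample and read off its percentiles.
import numpy as np
from scipy.stats import mstats as ms

sample = np.random.lognormal(mean=0.0, sigma=0.5, size=1000)
stats = get_stats(sample)
print(stats["pct10"], stats["pct50"], stats["pct90"])  # 10th/50th/90th percentiles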
def compute_CDF_quantiles(CDFs, confidence=95.0):
    """
    Takes a 2D array of CDFs of size (N_bs, N_bins), where N_bs is the number
    of bootstraps and N_bins is the number of bins within the CDF.

    Returns the median, lower and upper bounds at the desired confidence level.
    """
    # Create percentiles:
    lower_percentile = (1. - confidence / 100.) / 2.
    upper_percentile = 1. - lower_percentile
    # Compute the percentiles for each bin
    q = mstats.mquantiles(CDFs.T, prob=[lower_percentile, 0.5, upper_percentile], axis=1)
    lower = q.T[0]
    median = q.T[1]
    upper = q.T[2]
    return median, lower, upper
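# Hedged example of compute_CDF_quantiles on synthetic bootstrap CDFs: 200
# bootstrap replicates of a 50-point empirical CDF. The final check confirms
# the envelope brackets the median in every bin.
import numpy as np
from scipy.stats import mstats

rng = np.random.default_rng(0)
samples = rng.normal(size=(200, 500))  # 200 bootstrap samples of 500 points
bins = np.linspace(-3, 3, 50)
CDFs = np.array([np.searchsorted(np.sort(s), bins) / s.size for s in samples])
median, lower, upper = compute_CDF_quantiles(CDFs, confidence=95.0)
assert np.all(lower <= median) and np.all(median <= upper)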
def qq(data, ax, color):
    xmax = 0
    ymax = 0
    alpha = 0.9
    color = '#000000'
    n_quantiles = 100
    q_pos = np.concatenate([
        np.arange(99.) / len(data),
        np.logspace(-np.log10(len(data)) + 2, 0, n_quantiles)
    ])
    q_data = mquantiles(data, prob=q_pos, alphap=0, betap=1, limit=(0, 1))
    q_th = q_pos.copy()
    q_err = np.zeros([len(q_pos), 2])
    for i in range(0, len(q_pos)):
        q_err[i, :] = beta.interval(
            alpha, len(data) * q_pos[i], len(data) - len(data) * q_pos[i])
        q_err[i, q_err[i, :] < 0] = 1e-15
    slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
    xmax = np.max([xmax, -np.log10(q_th[1])])
    ymax = np.max([ymax, -np.log10(q_data[0])])
    ax.plot(-np.log10(q_th[n_quantiles - 1:]),
            -np.log10(q_data[n_quantiles - 1:]), '-', color=color)
    ax.plot(-np.log10(q_th[:n_quantiles]),
            -np.log10(q_data[:n_quantiles]), '.', color=color, label='gf')
    ax.plot([0, xmax], [0, xmax], '--k', color='#f42e30')
    ax.fill_between(
        -np.log10(q_th),
        -np.log10(q_err[:, 0]),
        -np.log10(q_err[:, 1]),
        color=color,
        alpha=0.1,
    )
def main(argv=None):
    parser = argparse.ArgumentParser(
        description="Compute various statistics related to the sequences either in the "
                    "provided fasta files or for the sequences piped in")
    # parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-p', dest="pretty", action="store_true",
                        help="Pretty print using PrettyTable module")
    parser.add_argument('-d', dest="delimiter", default=" ",
                        help="Column separator for output, defaults to whitespace")
    parser.add_argument('-t', dest="min_length", type=int, default=0,
                        help="Minimum length threshold to filter the fasta file")
    parser.add_argument('-r', dest="reference_length", default=0,
                        help="(Not yet implemented) Reference length used to compute corrected Nx values")
    parser.add_argument('-o', nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout, dest="outfile")
    parser.add_argument('FASTAFILE', action='append', nargs="+",
                        help='List of fasta files to keep. Use "*" to keep them all')
    args = parser.parse_args()
    all_records = []
    FASTAFILE = args.FASTAFILE[0]
    if args.pretty:
        import prettytable
    for f in FASTAFILE:
        for record in SeqIO.parse(f, "fasta", generic_dna):
            if len(record.seq) <= args.min_length:
                continue
            all_records.append(SequenceStat(f, record))
    # Display summary statistics per file
    sequences_per_files = collections.defaultdict(list)
    for s in all_records:
        sequences_per_files[s.file].append(s)
    if args.pretty:
        table = prettytable.PrettyTable(["File", "#Seqs", "Avg GC", "Avg Length(kb)",
                                         "Quant", "min", "max",
                                         "Sum Length(kb)", "N50(kb)", "L50"])
        table.align["File"] = "l"
        for file, seqs in sequences_per_files.items():
            lengths = [x.length for x in seqs]
            table.add_row([file, len(seqs),
                           round(scipy.average([x.gc for x in seqs]), 2),
                           round(scipy.average(lengths) / 1000, 2),
                           mquantiles(lengths), min(lengths), max(lengths),
                           round(sum(lengths) / 1000, 2),
                           round(N50.N50(lengths) / 1000, 2), N50.L50(lengths)])
        print >>args.outfile, table.get_string(sortby="N50(kb)")
    else:
        for file, seqs in sequences_per_files.items():
            lengths = [x.length for x in seqs]
            print >>args.outfile, " ".join(map(str, [
                file, len(seqs), scipy.average([x.gc for x in seqs]),
                scipy.average(lengths), sum(lengths),
                N50.N50(lengths), N50.L50(lengths)]))
def plotInfo(listex, listey, title):
    x = np.array(listex)
    y = np.array(listey)
    # compute the summary values
    n = len(x)
    mean = np.mean(x)
    var = np.var(x)
    f = plt.figure()
    ax = f.add_subplot(111)
    quantiles = ssm.mquantiles(x)
    leTexte = ("nbPoint: " + '%.0f' % n + "\n" +
               "mean: " + '%.4f' % mean + "\n" +
               "var: " + '%.4f' % var + "\n" +
               "quantile1: " + '%.4f' % quantiles[0] + "\n" +
               "quantile2: " + '%.4f' % quantiles[2])
    # display the summary text
    plt.text(0.82, 0.80, leTexte, horizontalalignment='center',
             verticalalignment='center', transform=ax.transAxes)
    # display the scatter plot
    plt.scatter(x, y, s=7)
    # display the quantiles (first and third quartiles)
    plt.axvline(x=quantiles[0], linewidth=3, color='g')
    plt.axvline(x=quantiles[2], linewidth=3, color='g')
    # labels
    plt.xlabel("Stickiness")
    plt.ylabel("Abundance")
    plt.title(title)
    plt.savefig(title)
def PredictAgeFreq(hdf5file, nAnimal, burn=0, quantile=[.025, .5, .975], MinYear=1875, MaxYear=1980):
    nyear = MaxYear - MinYear + 1
    PredAgeFreq = []
    for FileName in hdf5file:
        print(FileName)
        f = tables.open_file(FileName, mode='r+')
        nTable = len(f.list_nodes('/'))
        # Name of the table holding the samples
        curName = '//chain0//PyMCsamples'
        try:
            curTable = f.get_node(curName)
        except:
            print('PredictAgeFreq 28 ', FileName)
            curTable = f.get_node(curName)
        i = 0
        for t in curTable.iterrows():
            if i >= burn:
                try:
                    LogRecruit = [t['LogRecruit_' + str(s)] for s in range(MinYear, 1 + MaxYear)]
                except:
                    LogRecruit = [t['LogRecruit' + str(s)] for s in range(MinYear, 1 + MaxYear)]
                lnM = t['lnM']
                M = exp(lnM)
                # Unnormalized probabilities
                UnNorm = [exp(t + M * (y - nyear / 2)) for y, t in enumerate(LogRecruit)]
                NormProb = [t / sum(UnNorm) for t in UnNorm]
                # Random age frequency
                CurAgeFreq = list(multinomial(nAnimal, NormProb).rvs()[0])
                PredAgeFreq += [CurAgeFreq]
            i += 1
    # Quantiles on the number of animals for every age-class
    qanimal = [mquantiles([t[i] for t in PredAgeFreq], prob=quantile) for i in range(nyear)]
    result = {}
    for i, t in enumerate(quantile):
        result[t] = [s[i] for s in qanimal]
    return result
def check_qual(qual):
    """check seq for mean quality < 25
    drop qual scores in lowest decile
    """
    quals = []
    for i in qual:
        quals.append(ord(i) - 33)
    # drop lowest decile of qual scores
    decile = float(mquantiles(quals, prob=[0.1]))
    quals = [x for x in quals if x > decile]
    mean_qual = float(sum(quals)) / max(len(quals), 1)
    if mean_qual < 25:
        return False
    else:
        return True
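# Hedged usage example for check_qual on Phred+33 quality strings (the strings
# are invented; assumes mquantiles is imported as in the snippet above):
# 'I' encodes Q40, '+' encodes Q10, and '#' encodes Q2.
print(check_qual("IIIIIIIII+"))  # True: nine Q40 scores remain after the low decile is dropped
print(check_qual("##########"))  # False: all Q2, mean quality far below 25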
def calc_nonlin(spikes, generator, nr_bins=20):
    """
    Calculate nonlinearities from the spikes and the generator signal.

    Bins for the generator are defined such that they contain an equal
    number of samples. Since there are fewer samples for more extreme
    values of the generator signal, bins get wider.
    """
    quantiles = np.linspace(0, 1, nr_bins + 1)
    # m stands for masked, to be able to apply the function
    # to masked numpy arrays. In practice, masked arrays are rarely needed.
    quantile_bins = mquantiles(generator, prob=quantiles)
    res = binned_statistic(generator, spikes, bins=quantile_bins)
    nonlinearity = res.statistic
    bins = bin_midpoints(quantile_bins)
    return nonlinearity, bins
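# Hedged demo of the equal-population binning idea behind calc_nonlin (the
# data is synthetic, and the module's ``bin_midpoints`` helper is replaced by
# an inline midpoint computation): a rectified-linear nonlinearity is
# recovered as the mean spike count per generator-quantile bin.
import numpy as np
from scipy.stats import binned_statistic
from scipy.stats.mstats import mquantiles

rng = np.random.default_rng(1)
generator = rng.normal(size=10000)
spikes = rng.poisson(np.clip(generator, 0, None))  # rate = max(generator, 0)

edges = mquantiles(generator, prob=np.linspace(0, 1, 21))  # 20 equal-count bins
rate, _, _ = binned_statistic(generator, spikes, bins=edges)
midpoints = 0.5 * (edges[1:] + edges[:-1])
print(midpoints[:3], rate[:3])  # near-zero rate for the most negative bins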
def finalize(self):
    vals = []
    for x in self.states.values():
        vals.append(x.score)
    assert len(vals) > 0
    #assert tot >= 0.1
    limv = mquantiles(vals, 0.90)[0] / 3
    #limv = 1e-9
    tot = np.sum(vals)
    if tot < 1e-9:
        return False
    nstates = dict()
    for k, v in self.states.items():
        if v.score < limv:
            continue
        v.score /= tot
        nstates[k] = v
    assert len(nstates) > 0, vals
    self.states = nstates
    return True
def grid_from_X(x, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles`` of ``x``."""
    x = x[~x.isnull()]
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    if not all(0. <= p <= 1. for p in percentiles):
        raise ValueError('percentile values must be in [0, 1]')
    uniques = np.unique(x)
    if uniques.shape[0] < grid_resolution:
        # feature has low resolution, use unique vals
        return uniques
    else:
        emp_percentiles = mquantiles(x, prob=percentiles)
        # create axis based on percentiles and grid resolution
        return np.linspace(emp_percentiles[0],
                           emp_percentiles[1],
                           num=grid_resolution, endpoint=True)
def h_smooth(self, mag):
    '''
    Function to calculate the smoothing coefficient (h) for Gaussian
    kernel estimation - based on the Silverman (1986) formula

    :param numpy.ndarray mag:
        Magnitude vector
    :returns:
        Smoothing coefficient (h) (float)
    '''
    neq = float(len(mag))
    # Calculate inter-quartile range
    qtiles = mquantiles(mag, prob=[0.25, 0.75])
    iqr = qtiles[1] - qtiles[0]
    hfact = 0.9 * np.min([np.std(mag), iqr / 1.34]) * (neq ** (-1. / 5.))
    # Round h to 2 dp
    hfact = np.round(100. * hfact) / 100.
    return hfact
def h_smooth(mag):
    """
    Function to calculate the smoothing coefficient (h) for Gaussian
    kernel estimation - based on the Silverman (1986) formula.

    :param mag: Magnitude vector
    :type mag: numpy.ndarray
    :return hfact: Smoothing coefficient (h)
    :rtype hfact: Float
    """
    neq = float(np.shape(mag)[0])
    # Calculate inter-quartile range
    qtiles = mquantiles(mag, prob=[0.25, 0.75])
    iqr = qtiles[1] - qtiles[0]
    hfact = 0.9 * np.min([np.std(mag), iqr / 1.34]) * (neq ** (-1. / 5.))
    # Round h to 2 dp
    hfact = np.round(100. * hfact) / 100.
    return hfact
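# Hedged sanity check for the Silverman bandwidth above (synthetic
# magnitudes): for a unit-variance Gaussian sample of n = 1000, the rule
# reduces to roughly 0.9 * n**(-1/5), i.e. about 0.23.
import numpy as np
from scipy.stats.mstats import mquantiles

mags = np.random.normal(5.0, 1.0, size=1000)
print(h_smooth(mags))  # close to 0.9 * min(std, IQR/1.34) * 1000**-0.2 ~= 0.23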
def bin_data_in_energy(dtf, n_bins=20):
    '''
    Bin the data in dtf to n_bins with equal statistics.

    Parameters
    ----------
    dtf: pandas DataFrame
        The DataFrame containing the data.
        Must contain a 'log_reco_energy' column (used to calculate the bins).
    n_bins: int, default=20
        The number of reconstructed energy bins to divide the data in.

    Returns
    -------
    A dictionary of DataFrames (keys=energy ranges, values=separated DataFrames).
    '''
    dtf_e = dict()
    log_e_reco_bins = mstats.mquantiles(dtf['log_reco_energy'].values,
                                        np.linspace(0, 1, n_bins))
    for i_e_bin, log_e_high in enumerate(log_e_reco_bins):
        if i_e_bin == 0:
            continue
        mask = np.logical_and(
            dtf['log_reco_energy'] > log_e_reco_bins[i_e_bin - 1],
            dtf['log_reco_energy'] < log_e_high
        )
        this_dtf = dtf[mask]
        if len(this_dtf) < 1:
            raise RuntimeError('One of the energy bins is empty')
        this_e_range = '{:3.3f} < E < {:3.3f} TeV'.format(
            10 ** log_e_reco_bins[i_e_bin - 1],
            10 ** log_e_high
        )
        dtf_e[this_e_range] = this_dtf
    return dtf_e
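# Hedged usage sketch for bin_data_in_energy with a toy DataFrame (the column
# names and values are invented, except 'log_reco_energy', which the function
# requires; assumes mstats/np are imported as in the snippet above):
import numpy as np
import pandas as pd

dtf = pd.DataFrame({
    'log_reco_energy': np.random.uniform(-1.0, 2.0, size=1000),
    'gammaness': np.random.uniform(0.0, 1.0, size=1000),
})
dtf_e = bin_data_in_energy(dtf, n_bins=5)
for e_range, sub in dtf_e.items():
    print(e_range, len(sub))  # roughly equal counts per energy range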
def make_plot(df, site, mcmc_traces, x_range, depth, to_screen):
    #pm.traceplot(mcmc_traces)
    # posteriors for the parameters
    a_post = mcmc_traces["a"][:, None]
    b_post = mcmc_traces["b"][:, None]
    # mean prediction
    beta_pred = np_sigmoid(x_range, a_post, b_post)
    mean_pred = beta_pred.mean(0)
    # vectorized bottom and top 2.5% quantiles for "confidence interval"
    quantiles = mquantiles(beta_pred, [0.025, 0.975], axis=0)
    if to_screen:
        pm.traceplot(mcmc_traces)
        plt.figure(figsize=(10, 6))
        plt.fill_between(x_range, *quantiles, alpha=0.7, color="salmon")
        plt.plot(x_range, mean_pred, lw=2, ls="-", color="crimson")
        plt.scatter(df.sw.values, df.beta.values, color="k", s=50, alpha=0.5)
        plt.xlim(x_range.min(), x_range.max())
        plt.ylim(-0.02, 1.02)
        plt.xlabel("SW")
        plt.ylabel("Beta")
        plt.show()
    else:
        plt.figure(figsize=(10, 6))
        plt.fill_between(x_range, *quantiles, alpha=0.7, color="salmon")
        plt.plot(x_range, mean_pred, lw=2, ls="-", color="crimson")
        plt.scatter(df.sw.values, df.beta.values, color="k", s=50, alpha=0.5)
        plt.xlim(x_range.min(), x_range.max())
        plt.ylim(-0.02, 1.02)
        plt.xlabel("SW")
        plt.ylabel("Beta")
        plt.savefig("plots/%s_%s.png" % (site, depth), dpi=100)
        pm.traceplot(mcmc_traces)
        plt.savefig("plots/%s_%s_posterior.png" % (site, depth), dpi=100)
def smoothScatterCalcDensity(x, nbin, bandwidth=None, rangex=None):
    '''
    Preprocessing step for the kde function: 'nbin' initialization,
    'bandwidth' initialization and validation

    x : numpy array [shape = (2,N)] - array with coordinates of the points
    nbin : int or [int, int] - number of bins along both axes
        (in case of a single value, [nbin, nbin] is used)
    bandwidth : [optional] numeric positive array of size 2 with smoothing bandwidth

    return
        axes - pair of lists with axis breakpoints
        fhat - binned Kernel Density Estimation matrix (squared)
        bandwidth - smoothing bandwidth (own estimation in case of initial bandwidth = None)

    Source: R::KernSmooth::smoothScatterCalcDensity
    '''
    if isinstance(nbin, numbers.Number):
        nbin = (nbin, nbin)
    elif (isinstance(nbin, list) and len(nbin) == 1) or \
            (isinstance(nbin, np.ndarray) and len(nbin) == 1):
        nbin = (nbin[0], nbin[0])
    if len(nbin) != 2 or not (isinstance(nbin[0], numbers.Number)
                              and isinstance(nbin[1], numbers.Number)):
        raise ValueError("'nbin' must be numeric of length 1 or 2")
    if bandwidth is None:
        # R compatibility
        q_data = mquantiles(x, prob=[0.05, 0.95], alphap=1, betap=1, axis=0).data
        bandwidth = np.diff(q_data, axis=0) / 25
        bandwidth[bandwidth == 0] = 1
        bandwidth = bandwidth[0]
    else:
        if not (isinstance(bandwidth, numbers.Number) or isinstance(bandwidth, np.ndarray)):
            raise ValueError("'bandwidth' must be numeric")
        if isinstance(bandwidth, np.ndarray) and len(bandwidth[bandwidth <= 0]) > 0:
            raise ValueError("'bandwidth' must be positive")
    rv = bkde2D(x, bandwidth=bandwidth, gridsize=nbin, rangex=rangex)
    # return axes, fhat, bandwidth
    return rv[0], rv[1], bandwidth
def plot_features(FFSen, baseline_shape, baseline_mask, llimit=0.01, ulimit=0.99,
                  num_features=32, xmin=200, xmax=1600):
    """
    Visualize the sensitivity maps for the hidden layer units.

    :param FFSen:
    :param llimit:
    :param ulimit:
    :param num_features:
    :param xmin:
    :param xmax:
    :return:
    """
    cols = 2
    rows = num_features // cols
    plt.style.use('ggplot')
    plt.figure()
    plt.cla()
    for j, input in enumerate(FFSen[0:num_features, :]):
        input = input - np.mean(input, axis=0)
        input = input / np.max(np.abs(input)) + 1e-32
        quantiles = mquantiles(input, [llimit, ulimit])
        wt_vol = get3DVol(input, baseline_shape, baseline_mask)
        plt.subplot(rows, cols, j + 1)
        im = plt.imshow(wt_vol[:, xmin:xmax], cmap=plt.cm.RdBu_r, aspect='auto',
                        interpolation='none', vmin=-0.06, vmax=0.06)
        plt.grid()
        im.set_clim(quantiles[0], quantiles[1])
        plt.axis('off')
    plt.show()
def row_stats(row, as_strings=True, engin=False):
    q1, q2, q3 = mquantiles(row)
    stats = {}
    stats["N"] = len(row)
    stats["#0s"] = len([k for k in row if abs(k) < c_eps])
    stats["%0s"] = stats["#0s"] / float(len(row))
    stats["Sum"] = sum(row)
    stats["Min"] = min(row)
    stats["Q1"] = q1
    stats["Q2_Med"] = q2
    stats["Q3"] = q3
    stats["Max"] = max(row)
    stats["Mean"] = mean(row)
    stats["StDev"] = std(row)
    stats["CfVar"] = stats["StDev"] / stats["Mean"] if stats["Mean"] != 0 else 0
    if set(stats.keys()) != set(c_props):
        die("Inconsistent stat lists. Check code.")
    if as_strings:
        stats = {k: pretty(v, engin) for k, v in stats.items()}
    return stats
def gauss_degrade(image, margin=1.0, change=None, noise=0.02, minmargin=0.5, inner=1.0):
    if image.ndim == 3:
        image = mean(image, axis=2)
    m = mean([amin(image), amax(image)])
    image = 1.0 * (image > m)
    if margin < minmargin:
        return 1.0 * image
    pixels = sum(image)
    if change is not None:
        npixels = int((1.0 + change) * pixels)
    else:
        edt = distance_transform_edt(image == 0)
        npixels = sum(edt <= (margin + 1e-4))
    r = int(max(1, 2 * margin + 0.5))
    ri = int(margin + 0.5 - inner)
    if ri <= 0:
        mask = binary_dilation(image, iterations=r).astype(float) - image
    else:
        mask = (binary_dilation(image, iterations=r).astype(float)
                - binary_erosion(image, iterations=ri).astype(float))
    image += mask * randn(*image.shape) * noise * min(1.0, margin ** 2)
    smoothed = gaussian_filter(1.0 * image, margin)
    frac = max(0.0, min(1.0, npixels * 1.0 / prod(image.shape)))
    threshold = mquantiles(smoothed, prob=[1.0 - frac])[0]
    result = (smoothed > threshold)
    return 1.0 * result
def find_empirical_equiprobable_bins_midpoints(N, data):
    '''
    Given N, the number of equiprobable bins, and data, return the cutoffs,
    the conditional-expectation nodes (midpoints), and the empirical
    probability of each bin.

    NOTE that the empirical probabilities will likely *not* be exactly equal,
    due to the nature of the empirical data. As N_data -> infty, this will
    converge to the appropriate "true" equiprobable discrete values due to
    properties of the ECDF.

    Nathan M. Palmer
    '''
    # Get initial cutoffs:
    cutoffs0 = np.linspace(0, 1, (N + 1))
    # Need to plug into the inverse ecdf
    cutoffs = mquantiles(a=data, prob=cutoffs0, alphap=1.0 / 3.0, betap=1.0 / 3.0)
    # mquantiles(a, prob=[0.25, 0.5, 0.75], alphap=0.4, betap=0.4, axis=None, limit=())
    # (alphap, betap) = (1/3, 1/3): p(k) = (k-1/3)/(n+1/3). Then p(k) ~ median[F(x[k])].
    # The resulting quantile estimates are approximately median-unbiased
    # regardless of the distribution of x. (R type 8)
    # Set infinite upper and lower cutoffs:
    cutoffs[0] = -np.inf
    cutoffs[-1] = np.inf
    # Init containers
    EX = []
    pX = []
    for lo, hi in zip(cutoffs[:-1], cutoffs[1:]):
        bin_indx = np.logical_and(data >= lo, data < hi)
        EX.append(np.mean(data[bin_indx]))  # Should converge to the correct value
        pX.append(np.mean(bin_indx))        # Should also converge properly
    EX = np.array(EX)
    pX = np.array(pX)
    return EX, cutoffs[1:-1], pX  # slice off the -inf and inf bin cutoffs
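# Hedged usage example for find_empirical_equiprobable_bins_midpoints on a
# synthetic standard normal sample: with 5 bins, the conditional means
# straddle zero symmetrically and each bin holds roughly 20% of the mass.
import numpy as np
from scipy.stats.mstats import mquantiles

data = np.random.normal(size=100000)
EX, cutoffs, pX = find_empirical_equiprobable_bins_midpoints(5, data)
print(EX)  # approximately [-1.40, -0.53, 0.00, 0.53, 1.40]
print(pX)  # each entry close to 0.2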
def get_potential_energy(self, atoms, output=(.5,)):
    """Returns the potential energy from the ensemble for the atoms object.

    By default only returns the median prediction (50th percentile) of the
    ensemble, such that it works like a normal ASE calculator. To get
    uncertainty information, use the output keyword with the following codes:

        <q>: (where <q> is a float) return the q quantile of the ensemble
        (where the quantile is a decimal, as in 0.5 for 50th percentile)

        e: return the whole ensemble prediction as a list

    Join the arguments with commas. For example, to return the median
    prediction plus a centered spread covering 90% of the ensemble
    prediction, use output=[.5, .05, .95].

    If the ensemble is requested, it must be the last argument, e.g.,
    output=[.5, .025, .975, 'e'].

    Note a list is typically returned, but if only one attribute is
    requested it returns it as a float, so that it's ASE-like.
    """
    energies = [calc.get_potential_energy(atoms) for calc in self.ensemble]
    if output[-1] == 'e':
        quantiles = output[:-1]
        return_ensemble = True
    else:
        quantiles = output
        return_ensemble = False
    for quantile in quantiles:
        if (quantile > 1.0) or (quantile < 0.0):
            raise RuntimeError('Quantiles must be between 0 and 1.')
    result = mquantiles(energies, prob=quantiles)
    result = list(result)
    if return_ensemble:
        result.append(energies)
    if len(result) == 1:
        result = result[0]
    return result