def plot_core_pos_uncertainty_vs_R(table):
    """Plot the core position reconstruction error as a function of core distance.

    The error of each event is the Euclidean distance between the
    'reference_core_pos' and 'reconstructed_core_pos' columns of ``table``.
    Events are grouped by the 'r' column into 40 equal-width bins spanning
    0 to 50; for every non-empty bin the 25th, 50th and 75th percentiles
    of the error are computed.  The interquartile band is drawn as a grey
    fill, the median as a black line with markers, and the figure is
    handed to ``utils.saveplot()``.
    """
    figure()

    ref_x, ref_y = table.col('reference_core_pos').T
    rec_x, rec_y = table.col('reconstructed_core_pos').T
    offsets = sqrt((ref_x - rec_x) ** 2 + (ref_y - rec_y) ** 2)
    core_distance = table.col('r')

    edges = linspace(0, 50, 41)
    centers, lower, middle, upper = [], [], [], []
    for idx in range(len(edges) - 1):
        low, high = edges[idx], edges[idx + 1]
        in_bin = offsets.compress((low <= core_distance) & (core_distance < high))
        if len(in_bin) == 0:
            # Nothing fell into this distance bin; leave it out of the plot.
            continue
        centers.append((low + high) / 2)
        lower.append(scoreatpercentile(in_bin, 25))
        middle.append(scoreatpercentile(in_bin, 50))
        upper.append(scoreatpercentile(in_bin, 75))

    fill_between(centers, lower, upper, color='0.75')
    plot(centers, middle, 'o-', color='black')

    xlabel("Core distance [m]")
    ylabel("Core position uncertainty [m]")
    utils.saveplot()
def plot_percentiles(data, numbins, xlim, ylim, vert = True, color = 'k', linestyle = 'solid', linew = 2):
    """Draw the lines that split ``data`` into ``numbins`` equally populated parts.

    For i = 1 .. numbins - 1 the (100 * i / numbins)-th percentile of
    ``data`` is marked with a line on the current pyplot axes.

    Parameters:
        data      - values whose percentiles are marked.
        numbins   - number of equally populated parts; numbins - 1 lines
                    are drawn.
        xlim      - (min, max) extent used for horizontal lines.
        ylim      - (min, max) extent used for vertical lines.
        vert      - draw vertical lines when true, horizontal otherwise.
        color, linestyle, linew - passed on to vlines / hlines.
    """
    fraction = 1. / numbins
    # Pick the drawing routine and the extent of the lines once; the
    # extent is only indexed inside the loop, as in a per-line branch.
    draw_lines = plt.vlines if vert else plt.hlines
    extent = ylim if vert else xlim
    for step in range(1, numbins):
        position = sts.scoreatpercentile(data, step * fraction * 100.)
        draw_lines(position, extent[0], extent[1], color, linestyle,
                   linewidth = linew)
def lookatresults(data, modes, theta=None, vert=False, labels=None):
    """Plot a histogram and KDE for every parameter of the last sample set.

    ``data[-1][0]`` is used as a 2-D array ``P`` with one row per parameter
    and one column per sample.  The joint mode is located by minimising
    ``neg`` on a Gaussian KDE of ``P``, starting from the per-row mean.
    For each row a summary string "mode +upper/-lower" is built from the
    16th and 84th percentiles, and the row is drawn as a normalised
    histogram with its 1-D KDE on top.

    Parameters:
        data   - sequence whose last element holds the sample array at
                 index 0.
        modes  - ignored: the value is recomputed from the KDE.  Kept so
                 existing callers keep working.
        theta  - optional sequence; when given, a vertical line is drawn
                 at ``theta[0]`` on every subplot.
        vert   - True for a vertical stack, 'four' for a 2x2 grid,
                 anything else for a single row of subplots.
        labels - x-axis labels, one per parameter (default: empty).

    Returns:
        The matplotlib figure.
    """
    P = data[-1][0]
    n = P.shape[0]
    if labels is None:
        labels = [""] * n

    # `vert` may also be the string 'four', so it is compared against True
    # explicitly instead of being tested for truthiness.
    if vert == True:  # noqa: E712
        subplots = range(n*100+11, n*100+n+11, 1)
        figsize = (6, 3*n)
    elif vert == 'four':
        subplots = [221, 222, 223, 224]
        figsize = (10, 10)
    else:
        subplots = range(100+n*10+1, 100+n*10+1+n, 1)
        figsize = (5*n, 3)

    f = stats.gaussian_kde(P)
    int_guess = np.mean(P, axis=1)
    # args must be a tuple; the original `args=(f)` only worked because
    # scipy wraps non-tuples itself.
    modes = minimize(neg, int_guess, args=(f,)).x

    thetas = []
    labelpad = 20
    for i in range(n):
        x = P[i]
        t = r'$\theta_{3:}$ {1:.2f} +{2:.2f}/-{0:.2f}'.format(
            modes[i]-stats.scoreatpercentile(x, 16),
            modes[i],
            stats.scoreatpercentile(x, 84)-modes[i],
            i+1)
        thetas.append(t)

    # The bin count has to be an integer; np.sqrt returns a float.
    if P.shape[1] > 10:
        bins = int(np.sqrt(P.shape[1]))
    else:
        bins = 10

    fig = plt.figure(figsize=figsize)
    for i in range(n):
        print(subplots[i])
        plt.subplot(int(subplots[i]))
        ker = stats.gaussian_kde(P[i])
        # NOTE(review): `normed` was removed in matplotlib 3.1 in favour of
        # `density`; kept to match the matplotlib this file targets.
        h = plt.hist(P[i], bins=bins, normed=True, alpha=1)
        x = np.linspace(h[1][0], h[1][-1], 1000)
        plt.plot(x, ker(x))
        plt.xlabel(labels[i], labelpad=labelpad, fontsize=24)
        # `is not None`: `theta != None` is element-wise (and ambiguous in
        # an `if`) when theta is a numpy array.
        if theta is not None:
            # NOTE(review): every subplot marks theta[0]; theta[i] may have
            # been intended -- confirm with the author.
            plt.axvline(theta[0])

    for t in thetas:
        print(t)
    return fig
def __init__(self, a):
    """Store summary statistics of ``a`` on the instance.

    Sets ``min``, ``q1`` (25th percentile), ``median``, ``mean``,
    ``q3`` (75th percentile) and ``max``.  ``a`` must provide ``min()``,
    ``mean()`` and ``max()`` methods, as a numpy array does.
    """
    summary = (
        ('min', a.min()),
        ('q1', stats.scoreatpercentile(a, 25)),
        ('median', numpy.median(a)),
        ('mean', a.mean()),
        ('q3', stats.scoreatpercentile(a, 75)),
        ('max', a.max()),
    )
    for attribute, value in summary:
        setattr(self, attribute, value)
def plot_uncertainty_core_distance(table):
    """Plot the angle reconstruction uncertainty against core distance.

    For core distances R = 0, 20, ..., 80 the events of ``table`` are
    selected that lie within DR of R and within fixed windows around the
    particle density N, the zenith angle THETA and the energy LOGENERGY.
    The uncertainty in theta and phi is taken as half the distance between
    the 17th and 83rd percentiles of the (wrapped) reconstruction errors.
    The result is drawn together with simulation values loaded from
    'DIR-plot_uncertainty_core_distance.txt' in DATADIR, both with pylab
    and with a GraphArtist graph, and both plots are saved.
    """
    # NOTE: the condition string given to read_where() refers to these
    # names and to the loop variable R.  They are presumably resolved from
    # this function's local namespace (PyTables condvars behaviour -- TODO
    # confirm), so they must stay local variables with exactly these names.
    N = 2
    THETA = deg2rad(22.5)
    DTHETA = deg2rad(5.)
    DN = .5
    DR = 10
    LOGENERGY = 15
    DLOGENERGY = .5

    figure()
    x, y, y2 = [], [], []
    for R in range(0, 81, 20):
        x.append(R)
        events = table.read_where('(abs(min_n134 - N) <= DN) & (abs(reference_theta - THETA) <= DTHETA) & (abs(r - R) <= DR) & (abs(log10(k_energy) - LOGENERGY) <= DLOGENERGY)')
        # Keep the event counts on one line; the print() after the loop
        # ends that line.
        print(len(events), end=' ')

        errors = events['reference_theta'] - events['reconstructed_theta']
        # Make sure -pi < errors < pi
        errors = (errors + pi) % (2 * pi) - pi
        errors2 = events['reference_phi'] - events['reconstructed_phi']
        # Make sure -pi < errors2 < pi
        errors2 = (errors2 + pi) % (2 * pi) - pi

        # Half the 17-83 percentile spread is used instead of std().
        y.append((scoreatpercentile(errors, 83) - scoreatpercentile(errors, 17)) / 2)
        y2.append((scoreatpercentile(errors2, 83) - scoreatpercentile(errors2, 17)) / 2)
    print()

    print("R: theta_std, phi_std")
    for u, v, w in zip(x, y, y2):
        print(u, v, w)
    print()

    # Simulation data
    sx, sy, sy2 = loadtxt(os.path.join(DATADIR, 'DIR-plot_uncertainty_core_distance.txt'))

    graph = GraphArtist()

    # Plots: pylab shows every point, the GraphArtist graph leaves out the
    # last one of each series.
    plot(x, rad2deg(y), '^-', label="Theta")
    graph.plot(x[:-1], rad2deg(y[:-1]), mark='o')
    plot(sx, rad2deg(sy), '^-', label="Theta (sim)")
    graph.plot(sx[:-1], rad2deg(sy[:-1]), mark='square')
    plot(x, rad2deg(y2), 'v-', label="Phi")
    graph.plot(x[:-1], rad2deg(y2[:-1]), mark='*')
    plot(sx, rad2deg(sy2), 'v-', label="Phi (sim)")
    graph.plot(sx[:-1], rad2deg(sy2[:-1]), mark='square*')

    # Labels etc.  Raw strings keep the TeX backslashes without relying on
    # Python leaving unknown escapes such as "\p" untouched.
    xlabel(r"Core distance [m] $\pm %d$" % DR)
    graph.set_xlabel(r"Core distance [\si{\meter}] $\pm \SI{%d}{\meter}$" % DR)
    ylabel("Angle reconstruction uncertainty [deg]")
    graph.set_ylabel(r"Angle reconstruction uncertainty [\si{\degree}]")
    title(r"$N_{MIP} = %d \pm %.1f, \theta = 22.5^\circ \pm %d^\circ, %.1f \leq \log(E) \leq %.1f$" % (N, DN, rad2deg(DTHETA), LOGENERGY - DLOGENERGY, LOGENERGY + DLOGENERGY))
    ylim(ymin=0)
    graph.set_ylimits(min=0)
    xlim(-2, 62)
    legend(numpoints=1, loc='best')
    utils.saveplot()
    artist.utils.save_graph(graph, dirname='plots')
    print()
def freedmanDiaconisRule(self,data):
    """
    Calculate number of bins to use in histogram according to the
    Freedman-Diaconis rule: bin width = 2 * IQR * n ** (-1/3).

    Parameters:
        data - a numpy.ndarray containing the data for which a histogram
               is to be computed.

    Returns:
        The 'optimal' number of bins for the histogram (always >= 1).

    When the computed bin width is not positive (zero interquartile
    range), a fixed width of 60 is used instead.  Diagnostic values are
    printed when ``self.verbose`` is true.
    """
    # interquartile range, Q3-Q1....
    iqr = stats.scoreatpercentile(data, 75) - stats.scoreatpercentile(data, 25)
    # exact cube-root exponent instead of the truncated -0.3333333
    binwidth = 2 * iqr * len(data) ** (-1.0 / 3.0)
    if binwidth <= 0:
        binwidth = 60

    # calculate n bins
    rnge = max(data) - min(data)
    nbins = ceil(rnge / binwidth)

    if self.verbose:
        # Single-argument print calls: same output on Python 2 and 3.
        print("\t\tFreedman Diaconis Rule values for bins:")
        print("\t\t\tIQR:  %s" % (iqr,))
        print("\t\t\tBin Width:  %s" % (binwidth,))
        print("\t\t\tRange:  %s" % (rnge,))
        print("\t\t\tNumber of bins:  %s" % (nbins,))

    # A histogram needs at least one bin (constant data gives range 0).
    return max(int(nbins), 1)
def get_isize_stats(self, limit=1e5):
    """Estimate insert size median, mean and stdev.

    Also count pair orientations in ``self.pairs``.

    Alignments from ``self.bam`` are read until ``limit`` insert sizes
    have been collected.  Alignments with ``mapq`` below ``self.mapq`` or
    with ``isize < 1`` are skipped (the latter keeps one read per pair).
    Insert sizes outside the open interval between the ``self.q``-th and
    (100 - ``self.q``)-th percentiles are dropped before the statistics
    are stored in ``self.isize_median``, ``self.isize_mean`` and
    ``self.isize_stdev``.
    """
    if self.log:
        self.log.write("Estimating insert size stats...\n")
    isizes = []
    self.pairs = [0, 0, 0, 0]
    sam = pysam.Samfile(self.bam)
    try:
        for alg in sam:
            #take only reads with good alg quality and one read per pair
            if alg.mapq < self.mapq or alg.isize < 1:
                continue
            #store isize
            isizes.append(alg.isize)
            #store pair orientation
            self.pairs[self.alg2orientation(alg)] += 1
            #stop if limit reached
            if len(isizes) >= limit:
                break
    finally:
        # release the file handle even when iteration fails or stops early
        sam.close()
    #trim both tails: keep only values strictly between the q-th and
    #(100-q)-th percentiles
    maxins = stats.scoreatpercentile(isizes, 100-self.q)
    minins = stats.scoreatpercentile(isizes, self.q)
    # A list (not a lazy filter object) so numpy can consume it on Python 3.
    isizes = [size for size in isizes if minins < size < maxins]
    #store
    self.isize_median = np.median(isizes)
    self.isize_mean = np.mean(isizes)
    self.isize_stdev = np.std(isizes)
def _filter_ridge_lines(cwt, ridge_lines, window_size=None, min_length=None, min_snr=1, noise_perc=10):
    """Filter ridge lines by length and by signal-to-noise ratio.

    Parameters
    ----------
    cwt : 2-D array
        Transform matrix; row 0 is used to estimate the noise level.
    ridge_lines : sequence of lists
        Each ridge line is a list whose element 0 holds row indices and
        element 1 holds column indices.  Every line that passes the length
        test gets a diagnostic entry
        ``['snr=', snr, row, value, noise_level]`` appended in place.
    window_size : ignored
        Accepted for interface compatibility.  It only fed a per-point
        noise estimate whose result was never used, so that estimate is no
        longer computed.
    min_length : number, optional
        Minimum number of rows a line needs.  Default: ceil(n_rows / 4).
    min_snr : number
        Lines with an SNR below this value are dropped.
    noise_perc : number
        Percentile of row 0 of ``cwt`` used as the global noise level.

    Returns
    -------
    list
        The ridge lines that pass both tests.
    """
    if min_length is None:
        min_length = n.ceil(cwt.shape[0] / 4)

    row_one = cwt[0, :]
    noise_level = scoreatpercentile(row_one, per=noise_perc)

    def filt_func(line):
        if len(line[0]) < min_length:
            return False
        # Row used for the signal: half the last row index of the line.
        # Floor division keeps it a valid integer index under true division.
        c = int(line[0][-1]) // 2
        signal = cwt[c, line[1][0]]
        snr = signal / abs(noise_level) + 1
        line.append(['snr=', snr, c, signal, noise_level])
        if snr < min_snr:
            return False
        return True

    return list(filter(filt_func, ridge_lines))
def bin_and_plot_data(pop, num_samples):
    """Bin the composite max file of population ``pop`` and plot the counts.

    The file './maxfiles/random/<pop>.<num_samples>.composite.max' is read
    with ``read_data`` and binned with ``bin_data_2`` on the bins returned
    by ``make_bins``.  The bin counts are plotted against the bins, the
    x-axis is limited to [0, 1e6], and the time taken by the binning plus
    the number of values, their minimum, median and maximum are printed.
    """
    composite_max_file = './maxfiles/random/' + pop + '.' + str(num_samples) + '.' + 'composite' + '.max'
    data = read_data(composite_max_file)
    my_bins = make_bins()

    # time.clock() was removed in Python 3.8; use perf_counter() when it
    # exists and fall back to clock() on Python 2.
    timer = getattr(time, 'perf_counter', None) or time.clock
    start = timer()
    counter_2, weighted_counter = bin_data_2(composite_max_file, my_bins)
    # Single-argument print calls give the same output on Python 2 and 3.
    print('my sum is %s' % (sum(counter_2),))
    end = timer()
    print('The SORTED binning process took %s seconds.' % (end - start,))

    plt.plot(my_bins, counter_2, label = pop)
    P.xlim([0,1e6])

    print('%s %s' % (pop, len(data)))
    print(min(data))
    # Median: the percentile argument is on a 0-100 scale, so the former
    # value of .5 returned (almost) the minimum.
    print(stats.scoreatpercentile(data, 50))
    print(max(data))
def sample_571_2():
    """
    5.7.1 How the golden-section lines are defined

    Computes the 38.2% and 61.8% levels of the ``tsla_df`` closing prices
    in two ways -- as a fraction of the min-max price range ("visual") and
    as percentiles of the closing prices ("statistical") -- plots them on
    top of the closing prices and prints the spread between the highest
    61.8% level and the lowest 38.2% level.

    :return: namedtuple 'golden' with fields above618, below618,
             above382, below382
    """
    from collections import namedtuple

    closes = tsla_df.close
    # highest and lowest closing price of the series
    cs_max = closes.max()
    cs_min = closes.min()
    price_span = cs_max - cs_min

    # "visual" levels: fixed fractions of the price range
    sp382 = price_span * 0.382 + cs_min
    sp618 = price_span * 0.618 + cs_min
    # "statistical" levels: percentiles of the closing prices
    sp382_stats = stats.scoreatpercentile(closes, 38.2)
    sp618_stats = stats.scoreatpercentile(closes, 61.8)

    # for each level keep the larger and the smaller of the two variants
    above618 = np.maximum(sp618, sp618_stats)
    below618 = np.minimum(sp618, sp618_stats)
    above382 = np.maximum(sp382, sp382_stats)
    below382 = np.minimum(sp382, sp382_stats)

    # closing prices
    plt.plot(tsla_df.close)
    # horizontal lines: visual 382, statistical 382, visual 618,
    # statistical 618
    plt.axhline(sp382, c='r')
    plt.axhline(sp382_stats, c='m')
    plt.axhline(sp618, c='g')
    plt.axhline(sp618_stats, c='k')
    # shade the 618 band red and the 382 band green
    plt.fill_between(tsla_df.index, above618, below618,
                     alpha=0.5, color="r")
    plt.fill_between(tsla_df.index, above382, below382,
                     alpha=0.5, color="g")

    # wrap the four levels in a namedtuple for convenient access
    golden_type = namedtuple('golden', ['above618', 'below618', 'above382',
                                        'below382'])
    golden = golden_type(above618, below618, above382, below382)

    # legend entries follow the drawing order
    plt.legend(['close', 'sp382', 'sp382_stats', 'sp618', 'sp618_stats'],
               loc='best')
    plt.show()

    print('理论上的最高盈利: {}'.format(golden.above618 - golden.below382))
    return golden
def generate_scipy_comparison(csvPathname):
    """Read an integer CSV file and compute its deciles with scipy.

    The file is loaded as an int16 array, and the 10th, 20th, ..., 90th
    percentiles are computed over all values of the array (all columns
    flattened together).

    Parameters:
        csvPathname - path of a comma-separated file of integers.

    Returns:
        numpy array with the nine percentile values.  (Earlier versions
        computed these values and discarded them.)
    """
    from numpy import loadtxt
    from scipy import stats

    # Passing the path lets numpy open *and close* the file itself.
    dataset = loadtxt(csvPathname, delimiter=',', dtype='int16')
    # The message has always been emitted twice; kept so the output of
    # existing runs does not change.
    print("csv read for training, done")
    print("csv read for training, done")
    print(dataset.shape)

    return stats.scoreatpercentile(dataset, [10, 20, 30, 40, 50, 60, 70, 80, 90])
def generatePercentile(columns, theTimeFilter, trafficFilter, centricity, dataRes):
    # Run a TrafficOverallTimeSeriesReport and print percentile statistics
    # of its data.
    #
    # Parameters (all passed straight to report.run()):
    #   columns       - columns requested from the report
    #   theTimeFilter - time filter of the report
    #   trafficFilter - traffic expression string, wrapped in TrafficFilter
    #   centricity    - centricity of the report
    #   dataRes       - resolution of the report
    #
    # Besides its parameters the function reads these module-level names:
    # profiler, sumTime, outputData, clean, graph, percentileVal, Max, Min
    # and Median.  Nothing is returned; all results are printed.
    print "Running timeseries report"
    report = TrafficOverallTimeSeriesReport(profiler)
    report.run( columns = columns, timefilter = theTimeFilter, trafficexpr = TrafficFilter(trafficFilter), centricity = centricity, resolution = dataRes )
    data = report.get_data()
    # NOTE(review): delete() is skipped when run() or get_data() raises --
    # confirm whether the report should be cleaned up in that case too.
    report.delete()
    print "Getting data"
    # fixBucket is defined elsewhere; below, its result is indexed with [0]
    # and [1] -- presumably a pair of data series, TODO confirm.
    dataList = fixBucket(data, int(sumTime))
    # Raw data points are only shown in verbose (non-"clean") mode.
    if outputData == True and clean == False:
        print "Data points:"
        for x in dataList[1]:
            print x[0]
    if graph == True:
        genGraph(data, dataList, percentileVal)
    #print '{}% Average Bytes is {}'.format(percentileVal, stats.scoreatpercentile(data, percentileVal)[0])
    # NOTE(review): this branch takes the percentile of the whole dataList,
    # while the clean branch at the end uses dataList[0] -- verify which of
    # the two is intended.
    if clean == False:
        print '{}% Average Bytes is {}'.format(percentileVal, stats.scoreatpercentile(dataList, percentileVal)[0])
    if Max == True and clean == False:
        print 'Maximum Average Bytes is {}'.format(max(data)[0])
    if Min == True and clean == False:
        print 'Minimum Average Bytes is {}'.format(min(data)[0])
    if Median == True and clean == False:
        print 'Median Average Bytes is {}'.format(stats.scoreatpercentile(data, 50)[0])
    # Clean mode prints a single line: the traffic filter and the percentile.
    if clean == True:
        print '{} {}'.format(trafficFilter, stats.scoreatpercentile(dataList[0], percentileVal)[0])
def plot_histogram(trans_w, trans_g, index, nComps=0, doCombinedFitting=True):
    """Plot histograms of one component of two data sets on shared bins.

    Parameters:
        trans_w, trans_g  - the two data sets.  They are indexed with
                            ``[index]`` when ``index`` is a string and
                            with ``[:, index]`` otherwise.
        index             - field name or column index of the component.
        nComps, doCombinedFitting - passed to ``makeTitle`` to build the
                            x-axis label when ``index`` is not a string.

    Both histograms use 100 bins between the smaller of the two 1st
    percentiles and the larger of the two 99th percentiles, and are drawn
    with sqrt(count) error bars, labelled with the module-level
    ``w_label`` and ``g_label``.
    """
    import scipy.stats as sps

    nBins = 100
    # One isinstance() check drives both the indexing and the axis label
    # (a later `type(index) == str` test disagreed for str subclasses).
    by_name = isinstance(index, str)
    if by_name:
        ww = trans_w[index]
        gg = trans_g[index]
    else:
        ww = trans_w[:, index]
        gg = trans_g[:, index]

    # Clip the range at the 1st/99th percentiles so outliers do not
    # stretch the bins.
    low = min(sps.scoreatpercentile(gg, 1), sps.scoreatpercentile(ww, 1))
    high = max(sps.scoreatpercentile(gg, 99), sps.scoreatpercentile(ww, 99))

    h_w, bin_edges = np.histogram(ww, nBins, (low, high))
    h_g, bin_edges = np.histogram(gg, nBins, (low, high))
    bin_centers = (bin_edges[:-1] + bin_edges[1:])/2

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ebkw = {'linewidth':1,}
    ax.errorbar(bin_centers, h_w, np.sqrt(h_w), label=w_label, color='g', **ebkw)
    ax.errorbar(bin_centers, h_g, np.sqrt(h_g), label=g_label, color='b', **ebkw)

    if by_name:
        ax.set_xlabel(index, size='x-large')
    else:
        ax.set_xlabel(makeTitle(index, nComps, doCombinedFitting), size='x-large')
    ax.set_ylabel('Occupancy', size='x-large')
    ax.set_ylim(0,ax.get_ylim()[1])
    ax.grid()
    plt.legend()
def just_plot_rc(rc, rc_controls, ax=None, x=None, top_limit=None, **kwargs):
    """Plot ``rc`` normalised by the mean of its controls.

    Parameters:
        rc          - observed values, one per x position.
        rc_controls - 2-D array of control values, one row per control and
                      one column per x position.
        ax          - axes to draw on (default: ``gca()``).
        x           - x coordinates (default: 0 .. len(rc) - 1).
        top_limit   - optional upper y limit; applied when truthy.
        **kwargs    - passed on to ``ax.fill_between``.

    The normalised curve rc / mean(rc_controls) is drawn in black together
    with a dashed reference line at 1.  The area between the curve and 1
    is filled wherever ``rc`` lies outside the 2.5-97.5 percentile band of
    the controls.  The y-axis starts at 0 and reaches at least 2.

    Returns the axes.
    """
    from numpy import arange, array, mean
    from numpy.ma import masked_invalid
    from scipy.stats import scoreatpercentile

    if x is None:
        x = arange(len(rc))
    if ax is None:
        ax = gca()

    columns = range(len(x))

    def control_percentile(level):
        # percentile of the controls, evaluated column by column
        return array([scoreatpercentile(rc_controls[:, col], level)
                      for col in columns])

    upper95 = control_percentile(97.5)
    lower95 = control_percentile(2.5)
    outside_band = (rc > upper95) | (rc < lower95)

    # invalid ratios (e.g. division by a zero mean) are masked out
    rc_norm = masked_invalid(rc / mean(rc_controls, axis=0))

    ax.plot(x, rc_norm, linewidth=1, color='k')
    ax.plot(ax.get_xlim(), (1, 1), linestyle='--', color='k')

    ax.set_ylim(bottom=0)
    current_top = ax.get_ylim()[1]
    if current_top < 2:
        ax.set_ylim(top=2)
    if top_limit:
        ax.set_ylim(top=top_limit)

    ax.set_ylabel(r"$RC_{norm}$")
    ax.fill_between(x, 1, rc_norm, where=outside_band, **kwargs)
    return ax
def trend_bins(x, y, xlow=None, xhigh=None, xbinwidth=None, nmin=100, lowpc=2.5, highpc=97.5, usemedian=True):
    """Summarise the trend of ``y`` against ``x`` in bins of ``x``.

    Parameters:
        x, y       - numpy arrays of equal length.
        xlow       - centre of the first bin (default: 1st percentile of x).
        xhigh      - upper end of the bin centres (default: 99th percentile
                     of x).
        xbinwidth  - bin width (default:
                     (xhigh - xlow) * 100 * nmin / len(x)).
        nmin       - a bin is kept only when it holds MORE than nmin points.
        lowpc, highpc - percentiles of y reported per bin.
        usemedian  - report the median of y per bin when true, else the mean.

    Returns:
        xx_bin - median x of every kept bin.
        y_bin  - array of shape (3, n_kept): row 0 the median/mean of y,
                 row 1 the lowpc percentile, row 2 the highpc percentile.
    """
    if xlow is None:
        xlow = scoreatpercentile(x, 1)
    if xhigh is None:
        xhigh = scoreatpercentile(x, 99)
    if xbinwidth is None:
        xbinwidth = (xhigh - xlow) * 100*nmin / len(x)

    # bin centres; every bin extends half a width to either side
    x_bin = N.arange(xlow, xhigh+xbinwidth/2.0, xbinwidth)
    n_bin = len(x_bin)
    # Builtin float/bool: the N.float / N.bool aliases no longer exist in
    # current NumPy releases.
    xx_bin = N.zeros(n_bin, float) - 99999
    y_bin = N.zeros((3, n_bin), float) - 99999
    ok = N.ones(n_bin, bool)

    for i, xb in enumerate(x_bin):
        inbin = (x >= xb - 0.5*xbinwidth) & (x < xb + 0.5*xbinwidth)
        x_inbin = x[inbin]
        y_inbin = y[inbin]
        if len(y_inbin) > nmin:
            xx_bin[i] = median(x_inbin)
            if usemedian:
                y_bin[0, i] = median(y_inbin)
            else:
                y_bin[0, i] = N.mean(y_inbin)
            y_bin[1, i] = scoreatpercentile(y_inbin, lowpc)
            y_bin[2, i] = scoreatpercentile(y_inbin, highpc)
        else:
            # too few points: drop the bin from the result
            ok[i] = False

    xx_bin = xx_bin[ok]
    y_bin = y_bin[:, ok]
    return xx_bin, y_bin
def calc_quartiles(data):
    """Return the quartiles of ``data`` as a tuple ``(q1, q3, iqr)``.

    ``q1`` and ``q3`` are the 25th and 75th percentiles as computed by
    ``scoreatpercentile``; ``iqr`` is the interquartile range ``q3 - q1``.
    """
    q1, q3 = scoreatpercentile(data, [25, 75])
    return q1, q3, q3 - q1
def confidence_interval_1d(A, alpha=SIGMA1, metric=np.mean, numResamples=10000, interpolate=True):
    """Calculate a bootstrap confidence interval along a one dimensional array.

    ``A`` is resampled with replacement ``numResamples`` times, ``metric``
    is evaluated on every resample, and the bounds are read from the
    distribution of those values.

    Parameters:
        A            - one dimensional array-like of observations.
        alpha        - scalar or iterable of significance levels; each
                       level gives the interval between the
                       (100 * alpha / 2)-th and (100 - 100 * alpha / 2)-th
                       percentiles.
        metric       - callable invoked as ``metric(resamples, axis=-1)``.
        numResamples - number of bootstrap resamples.
        interpolate  - when true the bounds are interpolated percentiles
                       (``scoreatpercentile``); otherwise the nearest
                       element of the sorted bootstrap values is used.

    Returns:
        float array of length ``2 * len(alpha)`` laid out as
        ``[low_0, high_0, low_1, high_1, ...]``.
    """
    # np.iterable replaces collections.Iterable, which no longer exists in
    # the collections namespace on Python >= 3.10.
    if not np.iterable(alpha):
        alpha = np.array([alpha])

    # Fancy indexing below needs an ndarray; lists are accepted this way.
    A = np.asarray(A)
    N = len(A)
    resampleInds = np.random.randint(0, N, (numResamples, N))
    metricOfResampled = metric(A[resampleInds], axis=-1)

    confidenceInterval = np.zeros(2*len(alpha), dtype='float')

    if interpolate:
        for thisAlphaInd, thisAlpha in enumerate(alpha):
            percenPos = (thisAlpha * 100 / 2.0)
            samplePos = scoreatpercentile(metricOfResampled, percenPos)
            confidenceInterval[2*thisAlphaInd] = samplePos

            percenNeg = (100 - thisAlpha * 100 / 2.0)
            sampleNeg = scoreatpercentile(metricOfResampled, percenNeg)
            confidenceInterval[2*thisAlphaInd+1] = sampleNeg
    else:
        sortedMetricOfResampled = np.sort(metricOfResampled)
        # The rounded position can land on numResamples itself for very
        # small alpha, so it is clamped to the last valid index.
        lastInd = numResamples - 1
        for thisAlphaInd, thisAlpha in enumerate(alpha):
            percenPos = min(int(round(thisAlpha*numResamples / 2.0)), lastInd)
            samplePos = sortedMetricOfResampled[percenPos]
            confidenceInterval[2*thisAlphaInd] = samplePos

            percenNeg = min(int(round(numResamples - (thisAlpha * numResamples / 2.0))), lastInd)
            sampleNeg = sortedMetricOfResampled[percenNeg]
            confidenceInterval[2*thisAlphaInd+1] = sampleNeg

    return confidenceInterval
def main(datafile, feature1, feature2, bins, percentile, copula, logscale):
    """Show a 2-D histogram of two features read from ``datafile``.

    Parameters:
        datafile   - file handed to ``read_sah_h5`` (with just_good=False).
        feature1   - name of the feature on the x-axis.
        feature2   - name of the feature on the y-axis.
        bins       - bin specification for ``hist2d``; when ``percentile``
                     trimming applies it is the number of bin edges per
                     axis.
        percentile - when > 0 (and ``copula`` is false) the bin edges span
                     only the range between this percentile and its
                     complement (100 - percentile) on each axis.
        copula     - apply ``copula_transform`` to both features first.
        logscale   - use a logarithmic colour scale.
    """
    X, features = read_sah_h5(datafile, just_good=False)
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]

    trim_range = percentile > 0 and not copula
    if trim_range:
        def edges(values):
            lower = scoreatpercentile(values, percentile)
            upper = scoreatpercentile(values, 100-percentile)
            return np.linspace(lower, upper, bins)

        bx = edges(x)
        by = edges(y)
        bins = (bx, by)

    if copula:
        x = copula_transform(x)
        y = copula_transform(y)

    hist_options = {'bins': bins}
    if logscale:
        hist_options['norm'] = LogNorm()
    pl.hist2d(x, y, **hist_options)

    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
def freedman_diaconis(data):
    """
    Estimate an optimal number of histogram bins using the
    Freedman-Diaconis rule of thumb.

    The bin width is ``2 * IQR * n ** (-1/3)``.  When that width is not
    positive (zero interquartile range, e.g. constant or heavily tied
    data) the rule is undefined; Sturges' formula ``ceil(log2(n)) + 1``
    is used instead of failing.

    Parameters
    ----------
    data : array_like
        The data to histogram.

    Returns
    -------
    n_bins : int
        The number of bins to use (at least 1).
    """
    data = np.asarray(data).ravel()
    q3 = stats.scoreatpercentile(data, 75.0)
    q1 = stats.scoreatpercentile(data, 25.0)
    h = 2.0 * (q3 - q1) * np.power(len(data), -1.0/3.0)
    if not h > 0:
        # zero (or NaN) bin width: fall back to Sturges' formula
        return int(np.ceil(np.log2(len(data)))) + 1
    n_bins = int((np.max(data) - np.min(data)) / h)
    return max(n_bins, 1)
def filterOutliers(times, irq_range=.5, rtn_range=False):
    """Remove outliers from ``times`` using an interquartile-range fence.

    A value is kept when it lies within
    ``[q1 - irq_range * iqr, q3 + irq_range * iqr]`` and does not exceed
    the module-level ``max_key_time``.

    Parameters:
        times     - sequence of values; it is NOT modified (earlier
                    versions sorted the caller's list in place).
        irq_range - width of the fence in units of the interquartile range.
        rtn_range - when true, also return the accepted range.

    Returns:
        (newTimes, rm_indexes) or, with ``rtn_range``,
        (newTimes, rm_indexes, okRange).  ``newTimes`` holds the kept
        values in their original order and ``rm_indexes`` the positions of
        the removed values in ``times``.
    """
    # Keep the original order so the removed indexes refer to the input.
    original_times = list(times)
    np_times = array(sorted(original_times))
    q1 = scoreatpercentile(np_times, 25, interpolation_method='fraction')
    q3 = scoreatpercentile(np_times, 75, interpolation_method='fraction')
    irq = q3-q1
    slack = irq_range * irq
    okRange = (q1-slack, q3+slack)

    newTimes = []
    rm_indexes = []
    for index, value in enumerate(original_times):
        # With a single element the fence cannot flag an outlier, but the
        # value may still be erroneous: enforce the absolute limit first.
        if value > max_key_time:
            rm_indexes.append(index)
            continue
        if okRange[0] <= value <= okRange[1]:
            newTimes.append(value)
        else:
            rm_indexes.append(index)

    if rtn_range:
        return newTimes, rm_indexes, okRange
    else:
        return newTimes, rm_indexes
def mk_image(galaxy): base = './../../images_v5/GS_2.5as_matched/gs_all_' i_img = pyf.getdata(base+str(galaxy)+'_I.fits') j_img = pyf.getdata(base+str(galaxy)+'_J.fits') h_img = pyf.getdata(base+str(galaxy)+'_H.fits') #include 90% of pixels x = pyl.hstack(i_img) i_lim = scoreatpercentile(x,99) x = pyl.hstack(j_img) j_lim = scoreatpercentile(x,99) x = pyl.hstack(h_img) h_lim = scoreatpercentile(x,99) print galaxy, i_lim, j_lim, h_lim img = pyl.zeros((h_img.shape[0], h_img.shape[1], 3), dtype=float) img[:,:,0] = img_scale.asinh(h_img, scale_min=-0.1*h_lim, scale_max=h_lim, non_linear=0.5) img[:,:,1] = img_scale.asinh(j_img, scale_min=-0.1*j_lim, scale_max=j_lim, non_linear=0.5) img[:,:,2] = img_scale.asinh(i_img, scale_min=-0.1*i_lim, scale_max=i_lim, non_linear=0.5) return img
def average_bins(xdata, ydata, xmin, xmax, nxbins=15):
    """
    Computes the mean and the 16 and 84 percentiles of y-data in bins in x.

    ``nxbins`` is the number of bin EDGES, so ``nxbins - 1`` bins are
    produced.  A bin covers the half-open interval (left, right].  Bins
    holding fewer than 10 points keep the placeholder value -99.0.

    :param xdata: numpy array of xdata
    :param ydata: numpy array of ydata
    :param xmin: minimum value of x that data are binned to
    :param xmax: maximum value of x that data are binned to
    :param nxbins: number of bin edges in x

    :return: mid points of the bins, mean, 16 per cent percentile,
             and 84 per cent percentile.
    """
    edges = N.linspace(xmin, xmax, nxbins)
    n_bins = len(edges) - 1

    centres = N.zeros(n_bins)
    mean_y = N.zeros(n_bins) - 99.0
    low_y = N.zeros(n_bins) - 99.0
    high_y = N.zeros(n_bins) - 99.0

    for idx, (left, right) in enumerate(zip(edges[:-1], edges[1:])):
        centres[idx] = left + 0.5 * (right - left)
        selected = ydata[(xdata > left) & (xdata <= right)]
        if len(selected) < 10:
            # too few points: leave the -99 placeholders in place
            continue
        mean_y[idx] = N.mean(selected)
        low_y[idx] = ss.scoreatpercentile(selected, 16)
        high_y[idx] = ss.scoreatpercentile(selected, 84)

    return centres, mean_y, low_y, high_y
def find_offset(p1, p2):
    """Find the shift in longitude and latitude that best aligns two catalogues.

    Parameters:
        p1, p2 - structured numpy arrays with the fields 'long', 'lat'
                 and 'radius'.

    Both catalogues are restricted to the region where their long/lat
    ranges overlap.  When each still holds more than 100 entries, only
    entries with a radius above the smaller of the two 75th-percentile
    radii are kept.  The shift is then found by minimising ``comparedata``
    with ``fmin``, starting from (0, 0), and converted with the
    module-level ``degrees_per_metre``.

    Returns:
        The shift (dlong, dlat) in degrees as returned by ``fmin``, or
        ``[0.0, 0.0]`` when either catalogue is empty within the overlap.
    """
    # region covered by both catalogues
    long_min = max(p1['long'].min(), p2['long'].min())
    long_max = min(p1['long'].max(), p2['long'].max())
    lat_min = max(p1['lat'].min(), p2['lat'].min())
    lat_max = min(p1['lat'].max(), p2['lat'].max())

    select = (p1['long'] >= long_min) & (p1['long'] <= long_max) & (p1['lat'] >= lat_min) & (p1['lat'] <= lat_max)
    p1 = p1[select]
    select = (p2['long'] >= long_min) & (p2['long'] <= long_max) & (p2['lat'] >= lat_min) & (p2['lat'] <= lat_max)
    p2 = p2[select]

    if len(p1) < 1 or len(p2) < 1:
        print('Too few craters to find offset')
        return [0.0, 0.0]

    # With plenty of entries, match on the largest ones only.
    if len(p1) > 100 and len(p2) > 100:
        big = scoreatpercentile(p1['radius'], 75)
        big = min(big, scoreatpercentile(p2['radius'], 75))
        p1 = p1[p1['radius'] > big]
        p2 = p2[p2['radius'] > big]

    # comparedata receives four rows per catalogue: long, lat, radius and
    # a zero-filled 'minsize' row.
    minsize1 = numpy.zeros(p1.shape[0], [('minsize', numpy.double)])
    minsize2 = numpy.zeros(p2.shape[0], [('minsize', numpy.double)])
    X1 = numpy.asarray([p1[name] for name in ('long', 'lat', 'radius')]+[minsize1['minsize']], order='c', dtype=numpy.double)
    X2 = numpy.asarray([p2[name] for name in ('long', 'lat', 'radius')]+[minsize2['minsize']], order='c', dtype=numpy.double)

    results = fmin(comparedata, [0.0, 0.0], args=(X1, X2), xtol=0.001, maxiter=1000)
    results *= degrees_per_metre  # convert from rough metres to degrees
    print('Found a shift of dlong = %e deg, dlat = %e deg'%tuple(results))
    return results
def plot_xwt_wavetransf(power, time, wa, T, S, sig95, pangle, time_base, scalemin=0, scalemax=6, ylabel='Pressure (mb)', plot_percentile=False):
    """plotting WaveTransform Power with confidence interval contour and phase vectors

    Parameters:
        power      - 2-D array of wavelet power, contoured on the T/S grid.
        time       - time axis; sets the x limits and the arrow density.
        wa         - wavelet analysis object; ``scales``, ``coi`` and
                     ``fourier_period`` are used.
        T, S       - coordinate grids for the contour and quiver plots.
        sig95      - significance array; its level 1 is drawn as a black
                     contour.
        pangle     - phase angles, drawn as unit arrows.
        time_base  - unit name shown in the fourier-period axis label.
        scalemin, scalemax - limits of the (inverted) scale axis.
        ylabel     - currently unused: the axis is labelled 'scales'.
        plot_percentile - contour at the 25/50/75/95/100th percentiles of
                     ``power`` instead of the fixed levels 0.2 ... 1.

    Returns:
        (plt, fig)
    """
    fig = plt.figure(10)
    ax = plt.subplot(1,1,1)

    if plot_percentile:
        # contour at "percentiles variances" when using non-normalized
        # data to match web output
        contour_levels = [0] + [stats.scoreatpercentile(power, q)
                                for q in (25, 50, 75, 95, 100)]
    else:
        # contour at "normalized variances" BAMS
        contour_levels = [0, .2, .4, .6, .8, 1]
    plt.contourf(T, S, power, levels=contour_levels, colors=bmap)
    plt.colorbar(pad=.1, shrink=.5, format='%.4f', extend='both')  # move and shrink colorbar

    sig_levels = [-99, 1]  # values greater than 1 are significant
    plt.contour(T, S, sig95, sig_levels, colors='black', linewidths=1)

    ax.set_yscale('log')
    ax.grid(True)

    # plot phase relationship: about 60 arrows along time and 30 along
    # scale.  The strides must be integers and at least 1 to be usable as
    # slice steps (the former `arr_dens == 0` check compared the list
    # itself with 0 and never triggered).
    arr_dens = [60, 30]
    arr_densx = max(int(np.round(len(time) / float(arr_dens[0]))), 1)
    arr_densy = max(int(np.round(len(wa.scales) / float(arr_dens[1]))), 1)
    plt.quiver(T[::arr_densy, ::arr_densx], S[::arr_densy, ::arr_densx],
               (np.cos(pangle))[::arr_densy, ::arr_densx],
               (np.sin(pangle))[::arr_densy, ::arr_densx],
               width=.00125, headwidth=4, headlength=4, alpha=0.6, color='k')

    # put the ticks at powers of 2 in the scale
    ticks = np.unique(2 ** np.floor(np.log2(wa.scales)))[1:]
    ax.yaxis.set_ticks(ticks)
    ax.yaxis.set_ticklabels(ticks.astype(str))
    ax.set_ylim(scalemax, scalemin)
    ax.set_ylabel('scales')

    # second y scale with equivalent fourier periods to scales
    # except with the ticks at the powers of 2
    ax_fourier = ax.twinx()
    ax_fourier.set_yscale('log')
    # match the fourier ticks to the scale ticks
    ax_fourier.set_yticks(ticks)
    ax_fourier.set_yticklabels(ticks.astype(str))
    ax_fourier.set_ylabel('fourier period (%s)' % time_base )
    fourier_lim = [wa.fourier_period(i) for i in ax.get_ylim()]
    ax_fourier.set_ylim(fourier_lim)

    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
    fig.autofmt_xdate()

    # shade the region between the edge and coi
    coi_time, coi_scale = wa.coi
    ax.fill_between(x=coi_time, y1=coi_scale, y2=wa.scales.max(), color='gray', alpha=0.5)
    ax.set_xlim(time.min(), time.max())

    # double the figure width
    DefaultSize = fig.get_size_inches()
    fig.set_size_inches( (DefaultSize[0]*2, DefaultSize[1]) )

    return (plt, fig)
def shell_move(inAtom,atomIndex):
    """Move atom ``atomIndex`` of ``inAtom`` to a random point in the outer shell.

    The shell radius is drawn from a normal distribution centred midway
    between the 95th and 99th percentiles of the atoms' distances from
    the centre (as returned by ``distanceCenter``), with half their
    difference as the spread.  The x and y offsets from the centre of
    mass are drawn uniformly, z takes what is left of the radius, and
    every offset gets a sign from ``plusOrMinus()``.

    Remove extreme outliers before calling this, otherwise the percentiles
    (and hence the new position) are driven by them.  ``atomIndex`` should
    come from ``range(len(inAtom.get_positions()))``.

    Returns ``inAtom`` with the updated positions.  On an IndexError a
    message is printed and None is returned.
    """
    try:
        centre = inAtom.get_center_of_mass()
        distances = distanceCenter(inAtom)
        radius99 = stats.scoreatpercentile(distances,99)
        radius95 = stats.scoreatpercentile(distances,95)

        shell_radius = random.normal((radius99+radius95)/2,
                                     (radius99 - radius95)/2)
        # Split the radius over the three axes: x first, then y from what
        # is left, and z takes the remainder.
        x_offset = random.uniform(0,shell_radius)
        remaining = ((shell_radius**2) - (x_offset**2))**0.5
        y_offset = random.uniform(0,remaining)
        z_offset = ((remaining**2) - (y_offset**2))**0.5

        new_position = tuple(
            centre[axis] + plusOrMinus()*offset
            for axis, offset in enumerate((x_offset, y_offset, z_offset)))

        positions = inAtom.get_positions()
        positions[atomIndex] = new_position
        inAtom.set_positions(positions)
        return inAtom
    except IndexError:
        print("The index of the atom you wanted to move is too high or too low.")
        print("Please check your function call of shell_move(a,b)")
        print("-Jeff")
def fitnesscost_confidence(region, data, ax=None, fname=None):
    '''
    bootstrap the fitness cost estimates and make distributions of the
    bootstrapped values for subsets of sites with a defined median.
    this should give an impression of how variable the estimates are.
    three such distributions are combined in one figure

    Parameters:
        region - name of the genomic region; used as key into the
                 bootstrap results and ``data['mut_rate']`` and as the
                 text label of the panel.
        data   - passed to ``process_average_allele_frequencies``; must
                 provide ``data['mut_rate'][region]``.
        ax     - axes to draw on; a new 8x6 figure is created when None.
        fname  - when given, the figure is saved as fname.png, .svg, .pdf.

    Uses the module-level names ``af_cutoff``, ``cols`` and ``fs``.
    Errors while plotting now propagate to the caller; previously any
    exception dropped the process into an ipdb session.
    '''
    # generate bootstrap estimates of minor SNP frequencies
    av = process_average_allele_frequencies(data, [region], nbootstraps=100, bootstrap_type='bootstrap')
    minor_af_bs = av['minor_af_bs']

    # convert minor_af to (number of bootstraps) x (length of gene) array
    # of minor SNP frequencies
    minor_af_array = np.array(minor_af_bs[region])
    qtiles = np.vstack([scoreatpercentile(minor_af_array, x, axis=0) for x in [25, 50, 75]])

    # calculate selection coefficient quantiles corresponding to SNP_freq
    # quantiles; column 1 of scb belongs to the median frequency
    scb = (data['mut_rate'][region]/(af_cutoff+qtiles)).T
    sel_coeff_array = (data['mut_rate'][region]/(af_cutoff+minor_af_array))
    sel_coeff_array[sel_coeff_array<0.001] = 0.001
    sel_coeff_array[sel_coeff_array>0.1] = 0.1

    # assign sites to three narrow slices of the median distribution
    which_quantile = np.zeros(minor_af_array.shape[1], dtype=int)
    thres = [20,40,60]
    for i, ql in enumerate(thres):
        # take sites in percentile slice [ql, ql+2]
        sl, su = scoreatpercentile(scb[:,1], ql), scoreatpercentile(scb[:,1], ql+2)
        which_quantile[(scb[:,1]>=sl)&(scb[:,1]<su)] = i+1

    # clip only after the slices have been chosen on the unclipped values
    scb[scb>0.1] = 0.1
    scb[scb<0.001] = 0.001

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(8,6))

    for i in range(1, len(thres)+1):
        # sites of this slice without NaN in any bootstrap replicate
        ind = (which_quantile==i)&(~np.any(np.isnan(sel_coeff_array),axis=0))
        npoints = ind.sum()*sel_coeff_array.shape[0]
        # vertical marker at the median estimate of the slice
        ax.plot(np.median(scb[ind,1])*np.ones(2), [0,0.5], c=cols[i+3], lw=4)
        ax.hist(sel_coeff_array[:,ind].flatten(),
                weights=np.ones(npoints,dtype=float)/npoints,
                bins=np.logspace(-3, -1, 21), alpha=0.7, color=cols[i+3])

    ax.set_xscale('log')
    ax.set_xlabel('fitness cost', fontsize=fs)
    ax.set_ylabel('normalized counts', fontsize=fs)
    ax.tick_params(labelsize=fs*0.8)
    ax.text(0.1, 0.9, region, transform=ax.transAxes, fontsize=fs*1.5)

    if fname is not None:
        plt.tight_layout()
        for ext in ['png', 'svg', 'pdf']:
            plt.savefig(fname+'.'+ext)
def identify_at_risk(graph):
    '''
    We hypothesize that an actor is at risk for beginning to use drugs if:

    1. He weighs social influence more heavily than other actors.
       (His alpha [social susceptibility] is in the upper quartile.)
    2. He receives more highly influential inputs than other actors.
       (His influence kernel is in the upper quartile with respect to
       the number of influencers he receives, where those influencers
       are the strongest in the network.)
    3. Those inputs he weighs highly are from people who consume drugs
       frequently and have a positive attitude towards the consumption
       of drugs.

    Only criteria 1 and 2 are applied by the code below.  A threshold for
    recent increases in drinking is computed and printed, but it does not
    take part in the selection.

    Reads the module-level ``influence_kernel``, ``alpha`` and
    ``drinking_behavior`` and calls ``fraction_of_influencers``.

    Returns the list of nodes of ``graph`` that are considered at risk.
    '''
    # list(): dict.values() is a view on Python 3 and cannot be passed to
    # np.concatenate directly.
    all_kernel_values = np.concatenate(list(influence_kernel.values()))
    upper_quartile_influence_kernel = scoreatpercentile(all_kernel_values, 75)

    #Calculate distribution of fraction of upper_quartile_influencers per user
    fraction_of_influencers_per_user = [(influence_kernel[agent]>upper_quartile_influence_kernel).sum()/float(len(influence_kernel[agent]))
                                        for agent in graph.nodes()]

    #Identify those who receive more influencers than other people
    upper_quartile_receiving_influence = scoreatpercentile(fraction_of_influencers_per_user,75)

    #At risk if recent uptick in drinking
    distribution_of_increases_in_drinking = np.diff(drinking_behavior[np.nonzero(drinking_behavior)], axis=1).ravel()
    threshold_for_concerning_drinking = scoreatpercentile(distribution_of_increases_in_drinking,75)
    print(threshold_for_concerning_drinking)

    upper_quartile_alpha = scoreatpercentile(alpha,75)

    at_risk = [agent for agent in graph.nodes()
               if alpha[agent]>=upper_quartile_alpha
               and fraction_of_influencers(influence_kernel[agent],upper_quartile_influence_kernel)>upper_quartile_receiving_influence]
    return at_risk
def percentile_bins(xdata, ydata, xmin, xmax, nxbins=15, log=False, limit=6):
    """
    Computes median and 16 and 84 percentiles of y-data in bins in x.

    ``nxbins`` is the number of bin EDGES, so ``nxbins - 1`` bins are
    produced.  A bin covers the half-open interval (left, right].  Bins
    holding fewer than ``limit`` points keep the placeholder value -99.0.

    :param xdata: numpy array of xdata
    :param ydata: numpy array of ydata
    :param xmin: minimum value of x that data are binned to
    :param xmax: maximum value of x that data are binned to
    :param nxbins: number of bin edges in x
    :param log: if True, the edges come from ``logspace(xmin, xmax)``
                (xmin and xmax are then exponents), else from ``linspace``
    :param limit: the minimum number of values in a bin for which the
                  median and percentiles are returned for.

    :return: mid points of the bins, median, 16 per cent percentile,
             and 84 per cent percentile.
    """
    spacing = N.logspace if log else N.linspace
    edges = spacing(xmin, xmax, nxbins)
    n_bins = len(edges) - 1

    centres = N.zeros(n_bins)
    median_y = N.zeros(n_bins) - 99.0
    low_y = N.zeros(n_bins) - 99.0
    high_y = N.zeros(n_bins) - 99.0

    for idx, (left, right) in enumerate(zip(edges[:-1], edges[1:])):
        # arithmetic mid point, also for logarithmic edges
        centres[idx] = left + 0.5 * (right - left)
        selected = ydata[(xdata > left) & (xdata <= right)]
        if len(selected) < limit:
            # too few points: leave the -99 placeholders in place
            continue
        median_y[idx] = ss.scoreatpercentile(selected, 50)
        low_y[idx] = ss.scoreatpercentile(selected, 16)
        high_y[idx] = ss.scoreatpercentile(selected, 84)

    return centres, median_y, low_y, high_y
def coverage_plot(ax, x, data, color="red", percs=[50,90]):
    """
    Draw the median of a set of profiles with shaded central bands.

    ax = matplotlib axes instance
    x = x-axis coordinates
    data = profile data, one profile per row
    color = color in any way matplotlib accepts
    percs = central coverage of the bands in percent; the default draws a
            50% and a 90% band.  Two opacity levels are available, so at
            most two bands are shaded on each side of the median.
    """
    # Turn central coverages into lower-tail percentiles, widest band
    # first: [50, 90] -> [5.0, 25.0].
    tails = [(100 - float(cover)) / 2 for cover in percs[::-1]]
    opacities = [0.1, 0.4]

    vals = array(data)
    n_columns = len(vals[0])
    centre = median(vals, axis=0)

    def column_percentiles(level):
        return array([scoreatpercentile(vals[:, col], level)
                      for col in range(n_columns)])

    def shade(curves, alphas):
        # fill between each pair of neighbouring curves
        for lower, upper, alpha in zip(curves[:-1], curves[1:], alphas):
            ax.fill_between(x, lower, upper, facecolor=color, alpha=alpha,
                            edgecolor='face')

    # below the median: outermost curve first, ending at the median
    below = [column_percentiles(tail) for tail in tails] + [centre]
    shade(below, opacities)

    # above the median: starting at the median, moving outwards
    above = [centre] + [column_percentiles(100 - tail)
                        for tail in tails[::-1]]
    shade(above, opacities[::-1])

    # the median itself
    ax.plot(x, centre, color="black", alpha=0.95, linewidth=0.8)
def bootstrap_stat(arr, stat=np.mean, n_iters=1000, alpha=0.05):
    """
    Produce a boot-strap distribution of a statistic of an array on axis 0

    Parameters
    ----------
    arr : ndarray
        The array with data to be bootstrapped; rows (axis 0) are
        resampled with replacement.

    stat : callable
        The statistical function to call. will be called as
        `stat(arr, 0)`, so needs to accept that call signature.

    n_iters : int
        The number of bootstrap iterations to sample

    alpha : float
        The confidence interval size will be 1-alpha

    Returns
    -------
    stat_orig : the statistic of the original array, ``stat(arr, 0)``.

    eb : ndarray
        For every column, the width of the central (1 - alpha) interval
        of the bootstrap distribution, i.e. the distance between its
        100 * alpha / 2 and 100 * (1 - alpha / 2) percentiles.
    """
    stat_orig = stat(arr, 0)

    n_samples = arr.shape[0]
    boot_arr = np.empty((arr.shape[-1], n_iters))
    for ii in range(n_iters):
        # randint's upper bound is exclusive: indices 0 .. n_samples - 1
        this_arr = arr[np.random.randint(0, n_samples, n_samples)]
        boot_arr[:, ii] = stat(this_arr, 0)

    # scoreatpercentile expects percentiles on a 0-100 scale.
    lower = 100 * (alpha / 2)
    upper = 100 * (1 - (alpha / 2))
    eb = np.array([stats.scoreatpercentile(row, upper) -
                   stats.scoreatpercentile(row, lower)
                   for row in boot_arr])

    return stat_orig, eb
def add_graph(self, adjacency_matrix, node_coords,
              node_color='auto', node_size=50,
              edge_cmap=cm.bwr,
              edge_threshold=None,
              edge_kwargs=None, node_kwargs=None):
    """Plot undirected graph on each of the axes

    Parameters
    ----------
    adjacency_matrix: numpy array of shape (n, n)
        represents the edges strengths of the graph. Assumed to be
        a symmetric matrix. Sparse matrices are densified, NaNs are
        replaced through ``np.nan_to_num`` and masked entries are
        filled with zeros.
    node_coords: numpy array_like of shape (n, 3)
        3d coordinates of the graph nodes in world space.
    node_color: color or sequence of colors
        color(s) of the nodes. The string 'auto' picks one colour per
        node from the Set2 colormap.
    node_size: scalar or array_like
        size(s) of the nodes in points^2.
    edge_cmap: colormap
        colormap used for representing the strength of the edges.
    edge_threshold: str or number
        If it is a number only the edges with an abs(value) of at least
        edge_threshold will be shown.
        If it is a string it must finish with a percent sign,
        e.g. "25.3%", and only the edges with a abs(value) above
        the given percentile will be shown.
    edge_kwargs: dict
        will be passed as kwargs for each edge matplotlib Line2D.
    node_kwargs: dict
        will be passed as kwargs to the plt.scatter call that plots all
        the nodes in one go.

    Raises
    ------
    ValueError
        if 's' or 'c' is given in node_kwargs, if the shapes of
        adjacency_matrix and node_coords are invalid or inconsistent, if
        adjacency_matrix (or its mask) is not symmetric, or if a string
        edge_threshold is not a number followed by '%'.
    TypeError
        if edge_threshold is neither a string nor a real number.
    """
    # set defaults
    if edge_kwargs is None:
        edge_kwargs = {}
    if node_kwargs is None:
        node_kwargs = {}
    # Only compare against 'auto' when node_color really is a string:
    # comparing a numpy array of colours to a string is elementwise and
    # its truth value is ambiguous, which made the `if` raise.
    if isinstance(node_color, _basestring) and node_color == 'auto':
        nb_nodes = len(node_coords)
        node_color = mpl_cm.Set2(np.linspace(0, 1, nb_nodes))

    node_coords = np.asarray(node_coords)

    # decompress input matrix if sparse
    if sparse.issparse(adjacency_matrix):
        adjacency_matrix = adjacency_matrix.toarray()

    # make the lines below well-behaved
    adjacency_matrix = np.nan_to_num(adjacency_matrix)

    # safety checks
    if 's' in node_kwargs:
        raise ValueError("Please use 'node_size' and not 'node_kwargs' "
                         "to specify node sizes")
    if 'c' in node_kwargs:
        raise ValueError("Please use 'node_color' and not 'node_kwargs' "
                         "to specify node colors")

    adjacency_matrix_shape = adjacency_matrix.shape
    if (len(adjacency_matrix_shape) != 2 or
            adjacency_matrix_shape[0] != adjacency_matrix_shape[1]):
        raise ValueError(
            "'adjacency_matrix' is supposed to have shape (n, n)."
            ' Its shape was {0}'.format(adjacency_matrix_shape))

    node_coords_shape = node_coords.shape
    if len(node_coords_shape) != 2 or node_coords_shape[1] != 3:
        message = (
            "Invalid shape for 'node_coords'. You passed an "
            "'adjacency_matrix' of shape {0} therefore "
            "'node_coords' should be a array with shape ({0[0]}, 3) "
            'while its shape was {1}').format(adjacency_matrix_shape,
                                              node_coords_shape)

        raise ValueError(message)

    if node_coords_shape[0] != adjacency_matrix_shape[0]:
        raise ValueError(
            "Shape mismatch between 'adjacency_matrix' "
            "and 'node_coords'"
            "'adjacency_matrix' shape is {0}, 'node_coords' shape is {1}".
            format(adjacency_matrix_shape, node_coords_shape))

    if not np.allclose(adjacency_matrix, adjacency_matrix.T, rtol=1e-3):
        raise ValueError("'adjacency_matrix' should be symmetric")

    # For a masked array, masked values are replaced with zeros
    if hasattr(adjacency_matrix, 'mask'):
        if not (adjacency_matrix.mask == adjacency_matrix.mask.T).all():
            raise ValueError(
                "'adjacency_matrix' was masked with a non symmetric mask")
        adjacency_matrix = adjacency_matrix.filled(0)

    if edge_threshold is not None:
        if isinstance(edge_threshold, _basestring):
            message = ("If 'edge_threshold' is given as a string it "
                       'should be a number followed by the percent sign, '
                       'e.g. "25.3%"')
            if not edge_threshold.endswith('%'):
                raise ValueError(message)

            try:
                percentile = float(edge_threshold[:-1])
            except ValueError as exc:
                exc.args += (message, )
                raise

            # Keep a percentile of edges with the highest absolute
            # values, so only need to look at the covariance
            # coefficients below the diagonal
            lower_diagonal_indices = np.tril_indices_from(adjacency_matrix,
                                                          k=-1)
            lower_diagonal_values = adjacency_matrix[
                lower_diagonal_indices]
            edge_threshold = stats.scoreatpercentile(
                np.abs(lower_diagonal_values), percentile)

        elif not isinstance(edge_threshold, numbers.Real):
            raise TypeError('edge_threshold should be either a number '
                            'or a string finishing with a percent sign')

        # Work on a copy so the caller's matrix is not modified.
        adjacency_matrix = adjacency_matrix.copy()
        threshold_mask = np.abs(adjacency_matrix) < edge_threshold
        adjacency_matrix[threshold_mask] = 0

    # The graph is undirected: one line per non-zero entry strictly below
    # the diagonal is enough.
    lower_triangular_adjacency_matrix = np.tril(adjacency_matrix, k=-1)
    non_zero_indices = lower_triangular_adjacency_matrix.nonzero()

    line_coords = [node_coords[list(index)]
                   for index in zip(*non_zero_indices)]

    adjacency_matrix_values = adjacency_matrix[non_zero_indices]
    for ax in self.axes.values():
        ax._add_markers(node_coords, node_color, node_size, **node_kwargs)
        ax._add_lines(line_coords, adjacency_matrix_values, edge_cmap,
                      **edge_kwargs)

    plt.draw_if_interactive()
def setup():
    """Build the pruned Ripple credit-link graph and load the transactions.

    The credit-link trace is read into a directed graph in which every link
    with positive total capacity becomes two opposite edges, each carrying
    half of that capacity. Nodes with fewer than two neighbours are removed
    repeatedly until none is left, the remaining nodes are renumbered
    0..N-1, and every edge receives a random cost in [0, 10) (seeded with 2
    for reproducibility); the cost of a random 10% of the edges is then
    multiplied by 10. Summary statistics are printed along the way.

    Returns
    -------
    G : nx.DiGraph
        Graph with contiguous integer node ids and 'capacity' / 'cost'
        attributes on every edge.
    trans : list of (int, int, float)
        (source, destination, amount) for every trace row with a positive
        amount whose endpoints, taken modulo the node count, differ.
    """
    GG = nx.DiGraph()
    with open('traces/ripple/jan2013-lcc-t0.graph_CREDIT_LINKS', 'r') as f:
        for line in f:
            fields = line.split()
            source = int(fields[0])
            destination = int(fields[1])
            total_channel_cap = ((float(fields[3]) - float(fields[2])) +
                                 (float(fields[4]) - float(fields[3])))
            if total_channel_cap > 0:
                GG.add_edge(source, destination,
                            capacity=total_channel_cap / 2)
                GG.add_edge(destination, source,
                            capacity=total_channel_cap / 2)

    # Removing a node can drop a neighbour below two links, so keep pruning
    # until a pass removes nothing.
    while True:
        nodes_to_remove = [node for node in list(GG.nodes())
                           if len(list(GG.neighbors(node))) < 2]
        if not nodes_to_remove:
            break
        for node in nodes_to_remove:
            GG.remove_node(node)

    node_list = list(GG.nodes())
    random.seed(2)

    # make the node index be continuous
    # (dict lookup instead of list.index(), which is O(N) per edge)
    node_index = {node: idx for idx, node in enumerate(node_list)}
    G = nx.DiGraph()
    for u, v in GG.edges():
        iu, iv = node_index[u], node_index[v]
        G.add_edge(iu, iv, capacity=GG[u][v]['capacity'],
                   cost=random.random() * 10)
        G.add_edge(iv, iu, capacity=GG[v][u]['capacity'],
                   cost=random.random() * 10)

    # transaction fees for 10% edges are especially high
    num_edges = G.number_of_edges()
    random_edges = set(random.sample(range(num_edges),
                                     int(num_edges * 0.1)))
    for i, (u, v) in enumerate(G.edges()):
        if i in random_edges:
            G[u][v]['cost'] = G[u][v]['cost'] * 10

    # NOTE: each directed edge contributes two entries (itself and its
    # reverse), so len(listC) is twice G.number_of_edges().
    listC = []
    for u, v in G.edges():
        listC.append(G[u][v]['capacity'])
        listC.append(G[v][u]['capacity'])

    print("number of nodes %s" % len(G))
    print('average channel cap %s' % (float(sum(listC)) / len(listC)))
    print('num of edges %s' % len(listC))
    sorted_var = np.sort(listC)
    print('medium capacity %s' % stats.scoreatpercentile(sorted_var, 50))

    num_nodes = len(G)
    trans = []
    with open('traces/ripple/ripple_val.csv', 'r') as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            if float(row[2]) > 0:
                # Fold trace node ids onto the renumbered graph.
                src = int(row[0]) % num_nodes
                dst = int(row[1]) % num_nodes
                if src == dst:
                    continue
                trans.append((int(src), int(dst), float(row[2])))

    print('num of transactions %s' % len(trans))

    return G, trans
def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
        max_iter=1000, p_tol=1e-6, **kwargs):
    '''Solve by Iterative Weighted Least Squares

    Parameters
    ----------
    q : float
        Quantile must be between 0 and 1
    vcov : string, method used to calculate the variance-covariance matrix
        of the parameters. Default is ``robust``:

        - robust : heteroskedasticity robust standard errors (as suggested
          in Greene 6th edition)
        - iid : iid errors (as in Stata 12)

    kernel : string, kernel to use in the kernel density estimation for the
        asymptotic covariance matrix:

        - biw: Biweight
        - epa: Epanechnikov
        - cos: Cosine
        - gau: Gaussian
        - par: Parzene

    bandwidth: string, Bandwidth selection method in kernel density
        estimation for asymptotic covariance estimate (full
        references in QuantReg docstring):

        - hsheather: Hall-Sheather (1988)
        - bofinger: Bofinger (1975)
        - chamberlain: Chamberlain (1994)

    max_iter : int
        Maximum number of reweighting iterations.
    p_tol : float
        Iteration stops once the largest absolute change in any parameter
        between two iterations is no larger than this value.
    **kwargs
        Accepted but not used by this method.

    Returns
    -------
    RegressionResultsWrapper
        Wraps a QuantRegResults instance that additionally carries ``q``,
        ``iterations``, ``sparsity``, ``bandwidth`` and ``history``.

    Raises
    ------
    Exception
        If ``q`` is outside [0, 1] or ``kernel``, ``bandwidth`` or ``vcov``
        is not one of the supported names.
    '''

    if q < 0 or q > 1:
        raise Exception('q must be between 0 and 1')

    kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
    if kernel not in kern_names:
        raise Exception("kernel must be one of " + ', '.join(kern_names))
    else:
        kernel = kernels[kernel]

    if bandwidth == 'hsheather':
        bandwidth = hall_sheather
    elif bandwidth == 'bofinger':
        bandwidth = bofinger
    elif bandwidth == 'chamberlain':
        bandwidth = chamberlain
    else:
        raise Exception(
            "bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

    endog = self.endog
    exog = self.exog
    nobs = self.nobs
    exog_rank = np_matrix_rank(self.exog)
    self.rank = exog_rank
    self.df_model = float(self.rank - self.k_constant)
    self.df_resid = self.nobs - self.rank
    n_iter = 0
    xstar = exog

    beta = np.ones(exog_rank)
    # TODO: better start, initial beta is used only for convergence check

    # Note the following doesn't work yet,
    # the iteration loop always starts with OLS as initial beta
    # if start_params is not None:
    #    if len(start_params) != rank:
    #       raise ValueError('start_params has wrong length')
    #       beta = start_params
    #    else:
    #       # start with OLS
    #       beta = np.dot(np.linalg.pinv(exog), endog)

    diff = 10
    cycle = False

    history = dict(params=[], mse=[])
    while n_iter < max_iter and diff > p_tol and not cycle:
        n_iter += 1
        beta0 = beta
        xtx = np.dot(xstar.T, exog)
        xty = np.dot(xstar.T, endog)
        beta = np.dot(pinv(xtx), xty)
        resid = endog - np.dot(exog, beta)

        # Keep residuals away from zero so the reweighting below never
        # divides by zero. np.sign() maps an exact 0 to 0, which would leave
        # such residuals untouched, so zeros are treated as positive.
        mask = np.abs(resid) < .000001
        resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
        resid = np.where(resid < 0, q * resid, (1 - q) * resid)
        resid = np.abs(resid)
        xstar = exog / resid[:, np.newaxis]
        diff = np.max(np.abs(beta - beta0))
        history['params'].append(beta)
        history['mse'].append(np.mean(resid * resid))

        if (n_iter >= 300) and (n_iter % 100 == 0):
            # check for convergence circle, shouldn't happen
            for ii in range(2, 10):
                if np.all(beta == history['params'][-ii]):
                    cycle = True
                    break
            # Warn only when a repeated parameter vector was actually found.
            if cycle:
                warnings.warn("Convergence cycle detected",
                              ConvergenceWarning)

    if n_iter == max_iter:
        # Report the limit that was actually in force, not a fixed 1000.
        warnings.warn("Maximum number of iterations (%d) reached."
                      % max_iter, IterationLimitWarning)

    e = endog - np.dot(exog, beta)
    # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
    # h = 0.9 * np.std(e) / (nobs**0.2)
    # Instead, we calculate bandwidth as in Stata 12
    iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
    h = bandwidth(nobs, q)
    h = min(np.std(endog),
            iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

    # Kernel density estimate of the residuals at zero.
    fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

    if vcov == 'robust':
        d = np.where(e > 0, (q / fhat0)**2, ((1 - q) / fhat0)**2)
        xtxi = pinv(np.dot(exog.T, exog))
        xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
        vcov = chain_dot(xtxi, xtdx, xtxi)
    elif vcov == 'iid':
        vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
    else:
        raise Exception("vcov must be 'robust' or 'iid'")

    lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

    lfit.q = q
    lfit.iterations = n_iter
    lfit.sparsity = 1. / fhat0
    lfit.bandwidth = h
    lfit.history = history

    return RegressionResultsWrapper(lfit)
def _read_javelin_lc(band, ras, decs):
    """Load one light curve of the source at (ras, decs) from its FITS file.

    Returns (jd, flux, errflux, redshift): the 'JD' column minus the
    band-specific offset, the flux and flux-error columns scaled by 1e27,
    and the 'REDSHIFT' header keyword.
    """
    if band != 'Q':
        arch = pf.open(lc_path + band + '/agn_' + str(ras) + '_' +
                       str(decs) + '_' + band + '.fits')
        jd_0 = 55000
        flux_col, err_col = 'FLUX_2', 'FLUXERR_2'
    else:
        # 'Q' light curves live in files named after the field and come in
        # a "onechip" or a "morechip" flavour; try them in that order.
        tail = '_' + str(ras) + '_' + str(decs) + '_' + field + '.fits'
        try:
            arch = pf.open(lc_path + band + '/bin3_onechip' + tail)
        except (IOError, OSError):
            arch = pf.open(lc_path + band + '/bin3_morechip' + tail)
        jd_0 = 2455000
        flux_col, err_col = 'fluxQ', 'errfluxQ'

    try:
        head = arch[0].header
        datos = arch[1].data
        jd = datos['JD'] - jd_0
        # the flux value is multiplicated by 1e27 to avoid numerical errors
        # produced by small numbers
        flux = datos[flux_col] * 1e27
        errflux = datos[err_col] * 1e27
        zspec = head['REDSHIFT']
    finally:
        arch.close()
    return jd, flux, errflux, zspec


def do_javelin(ra, dec):
    """Estimate the lag between the driving and responding light curves of
    one source with JAVELIN.

    Both light curves are written to text files under ``temp/`` with their
    times divided by (1 + driving redshift), the continuum model and the
    two-band model are sampled by MCMC, and the lag column of the resulting
    chain file is summarised.

    Parameters
    ----------
    ra, dec : float
        Source coordinates; formatted with six decimals to build file names.

    Returns
    -------
    tuple
        ``(ra, dec, zspec_driving, len_chain, centau_median, centau_loerr,
        centau_uperr)`` on success, or ``(ra, dec, -9999, -9999, -9999,
        -9999, -9999)`` when the driving light curve cannot be loaded or any
        later step fails.
    """
    # ra and dec are converted to 6 decimals format
    ras = "%.6f" % ra
    decs = "%.6f" % dec
    failure = (ra, dec, -9999, -9999, -9999, -9999, -9999)

    print("########## computing Javelin for source located at ra=%f and dec=%f  ##########" % (
        ra, dec))

    try:
        # the driving lc is loaded, taking into account the different
        # formats used for opt and NIR data.
        (jd_driving, flux_driving, errflux_driving,
         zspec_driving) = _read_javelin_lc(driving_filter, ras, decs)

        lcd_name = ('temp/driving_lc_' + driving_filter + '_' + str(ras) +
                    '_' + str(decs) + '.txt')
        np.savetxt(
            lcd_name,
            np.transpose([
                jd_driving / (1.0 + zspec_driving), flux_driving,
                errflux_driving
            ]))
    except Exception:
        print("########## computing iccf FAILS for source located at ra=%f and dec=%f, NO DRIVING LC available ##########" % (
            ra, dec))
        return failure

    try:
        # reading the responding filter data. The shared loader fixes the
        # 'Q' branch, which opened an undefined name and built the file
        # name without the declination, so it could never succeed.
        (jd_responding, flux_responding, errflux_responding,
         _zspec_responding) = _read_javelin_lc(responding_filter, ras, decs)

        # converting lcs into a format accepted by the fortran method
        # (times are scaled with the driving-band redshift, as before)
        lcr_name = ('temp/responding_lc_' + responding_filter + '_' +
                    str(ras) + '_' + str(decs) + '.txt')
        np.savetxt(
            lcr_name,
            np.transpose([
                jd_responding / (1.0 + zspec_driving), flux_responding,
                errflux_responding
            ]))

        # running Javelin
        pair_tag = (driving_filter + "_vs_" + responding_filter + "_" +
                    str(ras) + "_" + str(decs) + ".txt")
        cont = get_data([lcd_name], names=[driving_filter])
        cmod = Cont_Model(cont)
        cmod.do_mcmc(nwalkers=100, nburn=50, nchain=100,
                     fchain=jav_stat_path + "chain_cont_" + pair_tag)

        bothdata = get_data([lcd_name, lcr_name],
                            names=[driving_filter, responding_filter])
        mod_2band = Pmap_Model(bothdata)  # Rmap_Model(bothdata)
        chain_file = jav_stat_path + "jav_chain_all_" + pair_tag
        mod_2band.do_mcmc(nwalkers=100, nburn=50, nchain=100,
                          conthpd=cmod.hpd,
                          laglimit=[[cent_lowlimit, cent_uplimit]],
                          widlimit=widlimit,
                          fchain=chain_file)

        _sigma, _tau, lag, _width, _scale = np.loadtxt(
            chain_file, unpack=True, usecols=[0, 1, 2, 3, 4])

        centau_median = np.median(lag)
        centau_uperr = (stats.scoreatpercentile(lag, perclim)) - centau_median
        centau_loerr = centau_median - (stats.scoreatpercentile(
            lag, (100. - perclim)))
        len_chain = len(lag[np.where(lag > -2000000000000)])

        return (ra, dec, zspec_driving, len_chain, centau_median,
                centau_loerr, centau_uperr)
    except Exception:
        print("########## computing iccf FAILS for source located at ra=%f and dec=%f, NO RESPONDING LC available ##########" % (
            ra, dec))
        # Best-effort cleanup of the driving file written above.
        try:
            os.remove(lcd_name)
        except OSError:
            pass
        return failure
def main():
    """Process every pending 'CC' job: preprocess the day's data, slide
    over it in windows, whiten and cross-correlate the traces for each
    configured filter, then export/stack the results and flag the jobs.

    The function exits the interpreter via ``sys.exit()`` when no filter is
    defined. For every batch returned by ``get_next_job`` it:

    1. collects the stations and components needed and calls ``preprocess``;
    2. iterates over ``stream.slide(...)`` windows, dropping channels with
       gaps and traces shorter than the longest one;
    3. optionally clips the data (``params.windsorizing``), tapers it and
       computes per-trace spectra used for whitening;
    4. for each filter, whitens the FFTs in place and computes the
       correlations with ``myCorr2``, storing them in ``allcorr``;
    5. exports all windows (``params.keep_all``) and/or the daily stack
       (``params.keep_days``), marks the jobs 'D' and creates 'STACK' jobs.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s [%(levelname)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('*** Starting: Compute CC ***')

    # Connection to the DB
    db = connect()

    if len(get_filters(db, all=False)) == 0:
        logging.info("NO FILTERS DEFINED, exiting")
        sys.exit()

    # Get Configuration
    params = get_params(db)
    filters = get_filters(db, all=False)
    logging.info("Will compute %s" % " ".join(params.components_to_compute))

    if params.remove_response:
        logging.debug('Pre-loading all instrument response')
        responses = preload_instrument_responses(db)
    else:
        responses = None
    logging.info("Checking if there are jobs to do")
    while is_next_job(db, jobtype='CC'):
        logging.info("Getting the next job")
        jobs = get_next_job(db, jobtype='CC')

        if len(jobs) == 0:
            # edge case, should only occur when is_next returns true, but
            # get_next receives no jobs (heavily parallelised code)
            continue

        stations = []
        pairs = []
        refs = []

        # Each job pair is "netsta1:netsta2"; goal_day ends up being the day
        # of the last job in the batch.
        for job in jobs:
            refs.append(job.ref)
            pairs.append(job.pair)
            netsta1, netsta2 = job.pair.split(':')
            stations.append(netsta1)
            stations.append(netsta2)
            goal_day = job.day

        stations = np.unique(stations)

        logging.info("New CC Job: %s (%i pairs with %i stations)" %
                     (goal_day, len(pairs), len(stations)))
        jt = time.time()

        # Any requested component pair containing R or T makes the E and N
        # channels be requested instead of the pair's own letters.
        comps = []
        for comp in params.components_to_compute:
            if comp[0] in ["R", "T"] or comp[1] in ["R", "T"]:
                comps.append("E")
                comps.append("N")
            else:
                comps.append(comp[0])
                comps.append(comp[1])

        comps = np.unique(comps)
        stream = preprocess(db, stations, comps, goal_day, params, responses)
        if not len(stream):
            logging.info("Not enough data for this day !")
            logging.info("Marking job Done and continuing with next !")
            for job in jobs:
                update_job(db, job.day, job.pair, 'CC', 'D', ref=job.ref)
            continue
        # print '##### STREAMS ARE ALL PREPARED AT goal Hz #####'
        dt = 1. / params.goal_sampling_rate
        logging.info("Starting slides")
        start_processing = time.time()
        # allcorr maps "<pair>_<filter>_<date>" -> {window start time: CCF}
        allcorr = {}
        for tmp in stream.slide(params.corr_duration,
                                params.corr_duration * (1 - params.overlap)):
            logging.info("Processing %s - %s" % (tmp[0].stats.starttime,
                                                 tmp[0].stats.endtime))
            tmp = tmp.copy().sort()

            # Collect the ids (net.sta.loc.chan) of channels with gaps.
            # NOTE(review): gap[-2] is presumably the gap duration, positive
            # for a real gap — confirm against the get_gaps() return format.
            channels_to_remove = []
            for gap in tmp.get_gaps(min_gap=0):
                if gap[-2] > 0:
                    channels_to_remove.append(".".join(
                        [gap[0], gap[1], gap[2], gap[3]]))

            for chan in np.unique(channels_to_remove):
                logging.debug("%s contains gap(s), removing it" % chan)
                net, sta, loc, chan = chan.split(".")
                for tr in tmp.select(network=net,
                                     station=sta,
                                     location=loc,
                                     channel=chan):
                    tmp.remove(tr)
            if len(tmp) == 0:
                logging.debug("No traces without gaps")
                continue

            # The longest trace sets the reference length; the window is
            # skipped unless it is longer than 2 * maxlag samples + 1.
            base = np.amax([tr.stats.npts for tr in tmp])
            if base <= (params.maxlag * params.goal_sampling_rate * 2 + 1):
                logging.debug("All traces shorter are too short to export"
                              " +-maxlag")
                continue

            for tr in tmp:
                if tr.stats.npts != base:
                    tmp.remove(tr)
                    logging.debug("One trace is too short, removing it")

            if len(tmp) == 0:
                logging.debug("No traces left in slice")
                continue

            nfft = next_fast_len(tmp[0].stats.npts)
            tmp.detrend("demean")

            # windsorizing == -1: keep only the sign of each sample;
            # any other non-zero value: clip at that multiple of the standard
            # deviation of the samples between the 1st and 99th percentile.
            for tr in tmp:
                if params.windsorizing == -1:
                    np.sign(tr.data, tr.data)  # inplace
                elif params.windsorizing != 0:
                    imin, imax = scoreatpercentile(tr.data, [1, 99])
                    not_outliers = np.where((tr.data >= imin) &
                                            (tr.data <= imax))[0]
                    rms = tr.data[not_outliers].std() * params.windsorizing
                    np.clip(tr.data, -rms, rms, tr.data)  # inplace
            # TODO should not hardcode 4 percent!
            tmp.taper(0.04)

            # TODO should not hardcode 100 taper points in spectrum
            napod = 100

            data = np.asarray([tr.data for tr in tmp])
            names = [tr.id.split(".") for tr in tmp]

            # index net.sta comps for energy later
            channel_index = {}
            psds = []
            for i, name in enumerate(names):
                n1, s1, l1, c1 = name
                netsta = "%s.%s" % (n1, s1)
                if netsta not in channel_index:
                    channel_index[netsta] = {}
                # keyed by the last letter of the channel code
                channel_index[netsta][c1[-1]] = i

                pxx, freqs = mlab.psd(tmp[i].data,
                                      Fs=tmp[i].stats.sampling_rate,
                                      NFFT=nfft,
                                      detrend='mean')
                psds.append(np.sqrt(pxx))
            psds = np.asarray(psds)

            # Stations having both E and N get the mean of the two spectra
            # assigned to both channels.
            for chan in channel_index:
                comps = channel_index[chan].keys()
                if "E" in comps and "N" in comps:
                    i_e = channel_index[chan]["E"]
                    i_n = channel_index[chan]["N"]
                    # iZ = channel_index[chan]["Z"]
                    mm = psds[[i_e, i_n]].mean(axis=0)
                    psds[i_e] = mm
                    psds[i_n] = mm
                    # psds[iZ] = mm

            # define pairwise CCs
            tmptime = tmp[0].stats.starttime.datetime
            thisdate = tmptime.strftime("%Y-%m-%d")
            thistime = tmptime.strftime("%Y-%m-%d %H:%M:%S")
            pair_index = []

            # Different iterator func if autocorr:
            if params.autocorr:
                iterfunc = itertools.combinations_with_replacement
            else:
                iterfunc = itertools.combinations
            # pair_index entries: [label, row of trace 1, row of trace 2]
            for sta1, sta2 in iterfunc(names, 2):
                n1, s1, l1, c1 = sta1
                n2, s2, l2, c2 = sta2
                comp = "%s%s" % (c1[-1], c2[-1])
                if comp in params.components_to_compute:
                    pair_index.append([
                        "%s.%s_%s.%s_%s" % (n1, s1, n2, s2, comp),
                        names.index(sta1),
                        names.index(sta2)
                    ])

            for filterdb in filters:
                filterid = filterdb.ref
                low = float(filterdb.low)
                high = float(filterdb.high)

                # Convert the filter corner frequencies into spectrum
                # indices: p1/p2 bound the pass band, low/high extend it by
                # napod points on each side, clamped to [1, nfft // 2].
                freq_vec = scipy.fftpack.fftfreq(nfft, d=dt)[:nfft // 2]
                freq_sel = np.where((freq_vec >= low) & (freq_vec <= high))[0]
                low = freq_sel[0] - napod
                if low <= 0:
                    low = 1
                p1 = freq_sel[0]
                p2 = freq_sel[-1]
                high = freq_sel[-1] + napod
                if high > nfft / 2:
                    high = int(nfft // 2)

                ffts = scipy.fftpack.fftn(data, shape=[
                    nfft,
                ], axes=[
                    1,
                ])
                # TODO: AC will require a more clever handling, no whiten...
                whiten2(ffts, nfft, low, high, p1, p2, psds,
                        params.whitening)  # inplace
                # energy = np.sqrt(np.sum(np.abs(ffts)**2, axis=1)/nfft)
                energy = np.real(
                    np.sqrt(
                        np.mean(scipy.fftpack.ifft(ffts, n=nfft, axis=1)**2,
                                axis=1)))

                # logging.info("Pre-whitened %i traces"%(i+1))
                corr = myCorr2(ffts,
                               np.ceil(params.maxlag / dt),
                               energy,
                               pair_index,
                               plot=False,
                               nfft=nfft)

                for key in corr:
                    ccfid = key + "_%02i" % filterid + "_" + thisdate
                    if ccfid not in allcorr:
                        allcorr[ccfid] = {}
                    allcorr[ccfid][thistime] = corr[key]
                del corr

                # Needed to clean the FFT memory caching of SciPy
                clean_scipy_cache()

        if params.keep_all:
            for ccfid in allcorr.keys():
                export_allcorr2(db, ccfid, allcorr[ccfid])

        if params.keep_days:
            for ccfid in allcorr.keys():
                # ccfid was built as "<sta1>_<sta2>_<comp>_<filter>_<date>"
                station1, station2, components, filterid, date = \
                    ccfid.split('_')

                corrs = np.asarray(list(allcorr[ccfid].values()))
                if not len(corrs):
                    logging.debug("No data to stack.")
                    continue
                corr = stack(corrs, params.stack_method, params.pws_timegate,
                             params.pws_power, params.goal_sampling_rate)
                if not len(corr):
                    logging.debug("No data to save.")
                    continue
                thisdate = goal_day
                thistime = "0_0"
                add_corr(db,
                         station1.replace('.', '_'),
                         station2.replace('.', '_'),
                         int(filterid),
                         thisdate,
                         thistime,
                         params.min30 / params.goal_sampling_rate,
                         components,
                         corr,
                         params.goal_sampling_rate,
                         day=True,
                         ncorr=corrs.shape[0],
                         params=params)

        # THIS SHOULD BE IN THE API
        massive_update_job(db, jobs, "D")
        for job in jobs:
            update_job(db, job.day, job.pair, 'STACK', 'T')

        logging.info(
            "Job Finished. It took %.2f seconds (preprocess: %.2f s & "
            "process %.2f s)" % ((time.time() - jt), start_processing - jt,
                                 time.time() - start_processing))
        del stream
    logging.info('*** Finished: Compute CC ***')
def do_iccf(source_name, source_ID, zspec_driving, jd_driving, flux_driving,
            errflux_driving, jd_responding, flux_responding,
            errflux_responding):
    """Measure the lag between two light curves with the interpolated
    cross-correlation function (ICCF) and Monte Carlo error estimation.

    Times of both light curves are divided by (1 + zspec_driving) before
    being handed to ``myccf.peakcent`` (single CCF) and ``myccf.xcor_mc``
    (Monte Carlo realisations). The centroid and peak lag distributions are
    summarised by their median and by the ``perclim`` / ``100 - perclim``
    percentiles, and the distributions plus the original CCF are written to
    text files under ``iccf_stat/``.

    Returns
    -------
    tuple
        ``(source_ID, zspec_driving, nsuccess_peak, tlag_peak, peaktau,
        peaktau_loerr, peaktau_uperr, nsuccess_centroid, tlag_centroid,
        centau, centau_loerr, centau_uperr)``
    """
    # Calculate lag with python CCF program
    print("########## computing ICCF for source %d  ##########" % (source_ID))

    rest_jd_driving = jd_driving / (zspec_driving + 1.0)
    rest_jd_responding = jd_responding / (zspec_driving + 1.0)

    (tlag_peak, status_peak, tlag_centroid, status_centroid, ccf_pack,
     max_rval, status_rval, pval) = myccf.peakcent(
        rest_jd_driving, flux_driving,
        rest_jd_responding, flux_responding,
        lag_range[0], lag_range[1], interp)

    (tlags_peak, tlags_centroid, nsuccess_peak, nfail_peak,
     nsuccess_centroid, nfail_centroid, max_rvals, nfail_rvals,
     pvals) = myccf.xcor_mc(
        rest_jd_driving, flux_driving, abs(errflux_driving),
        rest_jd_responding, flux_responding, abs(errflux_responding),
        lag_range[0], lag_range[1], interp,
        nsim=nsim, mcmode=mcmode, sigmode=sigmode)

    lag = ccf_pack[1]
    r = ccf_pack[0]

    # Calculate the best peak and centroid and their uncertainties using the
    # median of the distributions.
    centau = stats.scoreatpercentile(tlags_centroid, 50)
    centau_uperr = (stats.scoreatpercentile(tlags_centroid, perclim)) - centau
    centau_loerr = centau - (stats.scoreatpercentile(tlags_centroid,
                                                     (100. - perclim)))
    print('Centroid, error: %10.3f  (+%10.3f -%10.3f)' % (
        centau, centau_uperr, centau_loerr))
    print("centroid org: %s" % (tlag_centroid,))

    peaktau = stats.scoreatpercentile(tlags_peak, 50)
    peaktau_uperr = (stats.scoreatpercentile(tlags_peak, perclim)) - peaktau
    peaktau_loerr = peaktau - (stats.scoreatpercentile(tlags_peak,
                                                       (100. - perclim)))
    print('Peak, errors: %10.3f  (+%10.3f -%10.3f)' % (
        peaktau, peaktau_uperr, peaktau_loerr))
    print("peak org: %s" % (tlag_peak,))

    # Write results out to a file in case we want them later. `with` makes
    # sure each file is closed even if a write fails.
    suffix = '_iccf_dt' + str(interp) + '_' + source_name + '.txt'

    with open('iccf_stat/centdist' + suffix, 'w') as centfile:
        for m in range(0, np.size(tlags_centroid)):
            centfile.write('%5.5f    \n' % (tlags_centroid[m]))

    with open('iccf_stat/peakdist' + suffix, 'w') as peakfile:
        for m in range(0, np.size(tlags_peak)):
            peakfile.write('%5.5f    \n' % (tlags_peak[m]))

    with open('iccf_stat/org' + suffix, 'w') as ccf_file:
        for m in range(0, np.size(lag)):
            ccf_file.write('%5.5f    %5.5f  \n' % (lag[m], r[m]))

    return (source_ID, zspec_driving, nsuccess_peak, tlag_peak, peaktau,
            peaktau_loerr, peaktau_uperr, nsuccess_centroid, tlag_centroid,
            centau, centau_loerr, centau_uperr)
def pairwise_bootstrap_plot(theta_est, theta_star, alpha, axis_limits=None,
                            filename=None):
    """
    Draw a pair grid of the theta estimates, overlaying confidence regions
    derived from a multivariate normal fit and from a gaussian kernel
    density estimate.

    Parameters
    ----------
    theta_est: `pandas DataFrame` (columns = variable names)
        Theta estimates (returned by parmest.bootstrap). A column named
        'samples', if present, is dropped before plotting.
    theta_star: `dict` or `pandas Series` (index = variable names)
        Theta*; a dict is converted to a Series.
    alpha: `float`
        Confidence interval; the density threshold handed to the region
        plots is the ``(1 - alpha) * 100`` percentile of the densities
        evaluated at the estimates.
    axis_limits: `dict` or `pandas Series` (optional)
        Axis limits in the format {variable: [min, max]}, applied to the
        off-diagonal axes.
    filename: `string` (optional)
        Filename used to save the figure

    Returns
    --------
    Multivariate normal distribution (scipy.stats.multivariate_normal),
    gaussian kde distribution (scipy.stats.gaussian_kde)
    """
    if 'samples' in theta_est.columns:
        theta_est = theta_est.drop('samples', axis=1)
    if isinstance(theta_star, dict):
        theta_star = pd.Series(theta_star)

    cutoff_percentile = (1 - alpha) * 100

    # Multivariate normal fitted to the sample mean and covariance.
    mvn_dist = stats.multivariate_normal(theta_est.mean(), theta_est.cov(),
                                         allow_singular=True)
    mvn_density = mvn_dist.pdf(theta_est)
    mvn_score = stats.scoreatpercentile(mvn_density.transpose(),
                                        cutoff_percentile)

    # gaussian_kde expects data shaped (#dim, #data).
    kde_dist = stats.gaussian_kde(theta_est.transpose().values)
    kde_density = kde_dist.pdf(theta_est.transpose())
    kde_score = stats.scoreatpercentile(kde_density.transpose(),
                                        cutoff_percentile)

    columns = theta_est.columns
    ncells = 100

    grid = sns.PairGrid(theta_est)
    grid.map_diag(sns.distplot, kde=False, hist=True, norm_hist=False)

    scatter_args = dict(columns=columns, theta_star=theta_star)
    grid.map_upper(_add_scatter, **scatter_args)
    grid.map_lower(_add_scatter, **scatter_args)

    grid.map_lower(_add_rectangle_CI, columns=columns, alpha=alpha)
    grid.map_lower(_add_multivariate_normal_CI,
                   columns=columns,
                   ncells=ncells,
                   alpha=mvn_score,
                   mvn_dist=mvn_dist,
                   theta_star=theta_star)
    grid.map_lower(_add_gaussian_kde_CI,
                   columns=columns,
                   ncells=ncells,
                   alpha=kde_score,
                   kde_dist=kde_dist,
                   theta_star=theta_star)

    if axis_limits is not None:
        for ax in grid.fig.get_axes():
            xvar, yvar, (xloc, yloc) = _get_variables(ax, columns)
            if xloc == yloc:
                # diagonal panels keep their automatic limits
                continue
            ax.set_ylim(axis_limits[yvar])
            ax.set_xlim(axis_limits[xvar])

    if filename is not None:
        plt.savefig(filename)

    return mvn_dist, kde_dist
# Load the price series and turn them into scaled log returns.
data = pd.read_csv('data/Chapter9_Data.csv', parse_dates=True,
                   index_col='date')
log_prices = data.apply(np.log)
returns = log_prices - log_prices.shift()
returns.dropna(inplace=True)
returns *= scale
returns.plot()

# Correlation between 'sp' and 'tn' restricted to joint tail events: for
# p <= 50 both returns lie at or below their p-th percentile, for p > 50
# both lie above it.
percentile = range(15, 86)
cor0 = pd.DataFrame(index=percentile, columns=['cor'])
for p in percentile:
    score_sp = stats.scoreatpercentile(returns['sp'], p)
    score_tn = stats.scoreatpercentile(returns['tn'], p)
    if p <= 50:
        cut = returns.loc[(returns['sp'] <= score_sp) &
                          (returns['tn'] <= score_tn), ]
    else:
        cut = returns.loc[(returns['sp'] > score_sp) &
                          (returns['tn'] > score_tn), ]
    cor_num = stats.pearsonr(cut['sp'], cut['tn'])
    # Store the coefficient for both tails; the upper-tail branch used to
    # compute it and throw it away, leaving NaN for every p > 50.
    cor0.loc[p, 'cor'] = cor_num[0]
######################################################################
# Generate figures
# ----------------

with warnings.catch_warnings():
    warnings.simplefilter('ignore', DeprecationWarning)

    for index, (ic_map, ic_terms) in enumerate(
            zip(ica_maps, term_weights_for_components)):
        # Orient every component so that its largest-magnitude value is
        # positive; the term weights are flipped along with the map.
        if ic_map.max() < -ic_map.min():
            ic_map, ic_terms = -ic_map, -ic_terms

        # Only the strongest 10% of the map (by absolute value) is shown.
        ic_threshold = stats.scoreatpercentile(np.abs(ic_map), 90)
        ic_img = masker.inverse_transform(ic_map)

        # Three highest-weighted terms, listed strongest first in the title.
        important_terms = vocabulary[np.argsort(ic_terms)[-3:]]
        title = 'IC%i  %s' % (index, ', '.join(reversed(important_terms)))

        plotting.plot_stat_map(ic_img,
                               threshold=ic_threshold,
                               colorbar=False,
                               title=title)

######################################################################
# As we can see, some of the components capture cognitive or neurological
# maps, while others capture noise in the database. More data, better
# filtering, and better cognitive labels would give better maps.

# Done.
def conditionsPlot(self, results):
    """Summarise simulation results per experimental condition, then plot and tabulate them.

    ``results`` maps a simulation id of the form
    ``<genomeId>-<seqLen>-<comp>-<cont>`` to an indexable record. Entries 2, 6
    and 10 of that record are lists of completeness deltas for the 'best',
    'domain' and 'selected' marker sets; entries 3, 7 and 11 are the
    corresponding contamination deltas.

    Side effects: writes two outlier reports under ``./simulations/``, a box
    plot to ``self.plotPrefix + '.conditions.png'`` and a summary table to
    ``self.simCompareConditionOut``. All files are closed even if an error
    occurs part-way through.
    """

    def meanAbs(values):
        """Mean of the absolute values of ``values``."""
        return mean(abs(array(values)))

    def reportOutliers(fout, expCondStr, entries, echo=False):
        """Write one report row: condition, 1st/99th percentiles, outlier counts per genome.

        ``entries`` is a list of ``[delta, genomeId]`` pairs; it is sorted in place.
        """
        fout.write(expCondStr)
        entries.sort()

        deltas = array([r[0] for r in entries])
        perc1 = scoreatpercentile(deltas, 1)
        perc99 = scoreatpercentile(deltas, 99)
        if echo:
            print(expCondStr, perc1, perc99)

        fout.write('\t%.2f\t%.2f' % (perc1, perc99))

        # A genome counts once per delta falling outside the [1%, 99%] band.
        outliers = [item[1] for item in entries if item[0] < perc1 or item[0] > perc99]
        for genomeId, count in Counter(outliers).most_common():
            fout.write('\t' + genomeId + ': ' + str(count))
        fout.write('\n')

    # summarize results for each experimental condition
    print(' Tabulating results for each experimental condition using marker sets.')

    itemsProcessed = 0
    compDataDict = defaultdict(lambda: defaultdict(list))
    contDataDict = defaultdict(lambda: defaultdict(list))

    compOutliers = defaultdict(list)
    contOutliers = defaultdict(list)

    genomeIds = set()
    for simId in results:
        itemsProcessed += 1
        statusStr = ' Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        genomeId, seqLen, comp, cont = simId.split('-')
        genomeIds.add(genomeId)

        # Normalise the numeric fields so the key matches the str(float)/str(int)
        # keys built in the plotting and table sections below.
        expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))

        record = results[simId]

        compDataDict[expCondStr]['best'] += record[2]
        compDataDict[expCondStr]['domain'] += record[6]
        compDataDict[expCondStr]['selected'] += record[10]
        for dComp in record[2]:
            compOutliers[expCondStr] += [[dComp, genomeId]]

        contDataDict[expCondStr]['best'] += record[3]
        contDataDict[expCondStr]['domain'] += record[7]
        contDataDict[expCondStr]['selected'] += record[11]
        for dCont in record[3]:
            contOutliers[expCondStr] += [[dCont, genomeId]]

    print(' There are %d unique genomes.' % len(genomeIds))

    sys.stdout.write('\n')

    print(' There are %d experimental conditions.' % (len(compDataDict)))

    # plot data
    print(' Plotting results.')
    compData = []
    contData = []
    rowLabels = []

    with open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w') as foutComp, \
            open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w') as foutCont:
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for msStr in ['best', 'selected', 'domain']:
                    for seqLen in [20000]:
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))

                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)

                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])

                        # NOTE(review): the outlier rows do not depend on msStr, so each
                        # condition is written once per marker set (three times).
                        # Kept as in the original output -- confirm whether intended.

                        # report completeness outliers
                        reportOutliers(foutComp, expCondStr, compOutliers[expCondStr], echo=True)

                        # report contamination outliers
                        reportOutliers(foutCont, expCondStr, contOutliers[expCondStr])

    # compData/contData are ordered best, selected, domain for every condition.
    print('best:\t%.2f\t%.2f' % (meanAbs(compData[0::3]), meanAbs(contData[0::3])))
    print('selected:\t%.2f\t%.2f' % (meanAbs(compData[1::3]), meanAbs(contData[1::3])))
    print('domain:\t%.2f\t%.2f' % (meanAbs(compData[2::3]), meanAbs(contData[2::3])))

    boxPlot = BoxPlot()
    plotFilename = self.plotPrefix + '.conditions.png'
    boxPlot.plot(plotFilename, compData, contData, rowLabels,
                 r'$\Delta$' + ' % Completion', 'Simulation Conditions', r'$\Delta$' + ' % Contamination', None,
                 rowsPerCategory = 3, dpi = self.dpi)

    # print table of results
    # NOTE(review): the header lists best/selected/domain, but each group of six
    # values is written as domain, selected, best (completeness) followed by
    # domain, selected, best (contamination). Order kept as in the original.
    msOrder = ['domain', 'selected', 'best']
    rowFormat = '\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f'
    seqLens = [5000, 20000, 50000]

    with open(self.simCompareConditionOut, 'w') as tableOut:
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')

        # Pooled deltas per sequence length, used for the final 'Average' row.
        avgComp = defaultdict(lambda: defaultdict(list))
        avgCont = defaultdict(lambda: defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:

                tableOut.write('%d\t%d' % (comp*100, cont*100))

                for seqLen in seqLens:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)

                    for msStr in msOrder:
                        avgComp[seqLen][msStr] += compDataDict[expCondStr][msStr]
                        avgCont[seqLen][msStr] += contDataDict[expCondStr][msStr]

                    values = [meanAbs(compDataDict[expCondStr][msStr]) for msStr in msOrder]
                    values += [meanAbs(contDataDict[expCondStr][msStr]) for msStr in msOrder]
                    tableOut.write(rowFormat % tuple(values))
                tableOut.write('\n')

        tableOut.write('\tAverage:')
        for seqLen in seqLens:
            values = [meanAbs(avgComp[seqLen][msStr]) for msStr in msOrder]
            values += [meanAbs(avgCont[seqLen][msStr]) for msStr in msOrder]
            tableOut.write(rowFormat % tuple(values))

        tableOut.write('\n')
#! /usr/bin/python
#-*-coding:utf-8 -*-
"""Draw 900 uniform random samples, print a few summary statistics and show a histogram."""
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sta

data = np.random.random_sample(900)

# Maximum-likelihood normal fit: location and scale.
Mean, std = sta.norm.fit(data)
print(Mean, std)

# Skewness, kurtosis and combined normality tests, in that order.
for normality_test in (sta.skewtest, sta.kurtosistest, sta.normaltest):
    print(normality_test(data))

# Median and 1st percentile.
for per in (50, 1):
    print(sta.scoreatpercentile(data, per))

plt.hist(data)
plt.show()
def _filter_ridge_lines(cwt, ridge_lines, window_size=None, min_length=None,
                        min_snr=1, noise_perc=10):
    """
    Keep only the ridge lines that are long enough and stand out from the noise.

    Intended to be used for finding relative maxima.

    Parameters
    ----------
    cwt : 2-D ndarray
        Continuous wavelet transform from which the `ridge_lines` were
        defined.
    ridge_lines : 1-D sequence
        Each element holds two sequences: the rows and the columns of one
        ridge line (in that order).
    window_size : int, optional
        Width of the window used to estimate the noise floor.
        Default is ``cwt.shape[1] / 20`` (rounded up).
    min_length : int, optional
        Minimum number of points a ridge line must have.
        Default is ``cwt.shape[0] / 4`` (rounded up), i.e. a quarter of the
        number of widths.
    min_snr : float, optional
        Minimum signal-to-noise ratio. Default 1. The signal is the value of
        `cwt` at the first point of the ridge line; the noise is the
        `noise_perc`-th percentile of the first row of `cwt` inside a window
        of `window_size` around that point's column.
    noise_perc : float, optional
        Percentile of the windowed data taken as the noise floor, computed
        with scipy.stats.scoreatpercentile. Default 10.

    Returns
    -------
    list
        The elements of `ridge_lines` that pass both criteria, in their
        original order.

    References
    ----------
    Bioinformatics (2006) 22 (17): 2059-2065.
    :doi:`10.1093/bioinformatics/btl355`
    http://bioinformatics.oxfordjournals.org/content/22/17/2059.long

    """
    n_cols = cwt.shape[1]
    if min_length is None:
        min_length = np.ceil(cwt.shape[0] / 4)
    if window_size is None:
        window_size = np.ceil(n_cols / 20)

    window_size = int(window_size)
    half_width, remainder = divmod(window_size, 2)

    # Noise floor for every column, estimated on the first row of the transform.
    first_row = cwt[0, :]
    noise_floor = np.zeros_like(first_row)
    for col in range(len(first_row)):
        lo = max(col - half_width, 0)
        hi = min(col + half_width + remainder, n_cols)
        noise_floor[col] = scoreatpercentile(first_row[lo:hi],
                                             per=noise_perc)

    def is_acceptable(line):
        rows, cols = line[0], line[1]
        if len(rows) < min_length:
            return False
        snr = abs(cwt[rows[0], cols[0]] / noise_floor[cols[0]])
        return not snr < min_snr

    return [line for line in ridge_lines if is_acceptable(line)]
".h5") #fetch h5 file to allow faster preprocessing keys[idx], modes[idx] = ut.get_key_feature(track, h5) loudnesses[idx], loudnesses_var[idx], loudnesses_interval[ idx] = ut.get_loudness(track, h5) tempos[idx] = ut.get_tempo_feature(track, h5) time_signatures[idx] = ut.get_time_signature(track, h5) timbre_means[idx], timbre_vars[idx], timbre_median[idx], timbre_min[ idx], timbre_max[idx] = ut.get_timbre(track, h5) pitches_means[idx], pitches_vars[idx], pitches_median[idx], pitches_min[ idx], pitches_max[idx] = ut.get_pitches(track, h5) energies[idx] = ut.get_energy_feature(track) h5.close() #use binning for continious data #problem: number of bins => freedman-driaconis rule num_bins = 2 * (stats.scoreatpercentile(loudnesses_interval, 75) - stats.scoreatpercentile(loudnesses_interval, 25) ) * len(loudnesses_interval)**(1 / 3) bins = np.linspace(min(loudnesses_interval), max(loudnesses_interval), num=num_bins) d_loudnesses_interval = np.digitize(loudnesses_interval, bins) num_bins = 2 * (stats.scoreatpercentile(loudnesses, 75) - stats.scoreatpercentile(loudnesses, 25)) * len(loudnesses)**( 1 / 3) bins = np.linspace(min(loudnesses), max(loudnesses), num=100) d_loudnesses = np.digitize(loudnesses, bins) num_bins = 2 * (stats.scoreatpercentile(tempos, 75) - stats.scoreatpercentile(tempos, 25)) * len(tempos)**(1 / 3)
def _redsker_panel(fig, position, color, colorerr, red_percentile, red_sigma,
                   label, ntot, title=None, xlim=None):
    """Fit the two-component mixture to one colour and draw it as subplot `position`."""
    # Starting values: equal weights, component means seeded from two
    # percentiles of the colour distribution.
    alpha = np.array([0.5, 0.5])
    mu = np.array([
        sts.scoreatpercentile(color, per=red_percentile),
        sts.scoreatpercentile(color, per=30)
    ])
    sigma = np.array([red_sigma, 0.3])
    # NOTE(review): alpha/mu/sigma are displayed after this call, so
    # aic_ecgmm presumably updates them in place -- confirm in gmm.
    aic2 = gmm.aic_ecgmm(color, colorerr, alpha, mu, sigma)
    aic1 = gmm.wstat(color, colorerr)[3]

    ax = fig.add_subplot(2, 3, position)
    # NOTE: `normed` was removed in Matplotlib 3.1; use `density=True` when
    # this code is moved to a newer Matplotlib.
    pl.hist(color, bins=30, normed=True, histtype='step')
    x = np.arange(-1, 5, 0.01)
    gmm.ecgmmplot(x, alpha, mu, sigma)
    pl.xlabel(label)
    if title is not None:
        pl.title(title)
    if xlim is not None:
        pl.xlim(*xlim)

    annotations = [
        (0.85, r'$\alpha$: ' + str(np.round(alpha, 4))),
        (0.8, r'$\mu$: ' + str(np.round(mu, 4))),
        (0.75, r'$\sigma$: ' + str(np.round(sigma, 4))),
        (0.68, r'$Ngals$: ' + str(np.round(ntot * alpha[0]))),
        (0.6, r'$AIC$: ' + str(np.round(aic1)) + ', ' + str(np.round(aic2))),
    ]
    for ypos, text in annotations:
        pl.text(0.1, ypos, text, transform=ax.transAxes)


def redsker(b, idx, err=True):
    """Plot colour histograms with mixture-model fits for the neighbours of object `idx`.

    Parameters
    ----------
    b : record array exposing ``field(name)``
        Must provide 'ra', 'dec', 'z', 'omag', 'm200' and, when `err` is
        true, 'omagerr'. The code reads columns 0..3 of 'omag'/'omagerr';
        the axis labels suggest these are g, r, i, z.
    idx : int
        Index of the object around which neighbours are matched.
    err : bool, optional
        If True, colour errors are taken from 'omagerr'; otherwise they are
        set to zero.

    Returns
    -------
    str
        The literal ``'Plot is done!'``.

    Notes
    -----
    Draws a 2x3 figure with one panel per colour (g-r, r-i, i-z, g-z, r-z,
    g-i). Compared with the previous version the six copy-pasted sections
    are handled by `_redsker_panel`; the only ordering change is that the
    figure is created before the first fit instead of after it.
    """
    depth = 12
    h = es.htm.HTM(depth)
    ra = b.field('ra')
    dec = b.field('dec')
    photoz = b.field('z')
    omag = b.field('omag')
    num = len(ra)

    # (first column, second column) of 'omag' for each colour.
    pairs = {
        'gmr': (0, 1), 'rmi': (1, 2), 'imz': (2, 3),
        'gmz': (0, 3), 'rmz': (1, 3), 'gmi': (0, 2),
    }
    colors = dict((name, omag[:, i] - omag[:, j]) for name, (i, j) in pairs.items())
    if err:
        omagerr = b.field('omagerr')
        # NOTE(review): colour errors are formed as the *difference* of the
        # magnitude errors, as in the original code -- confirm this is intended.
        errors = dict((name, omagerr[:, i] - omagerr[:, j])
                      for name, (i, j) in pairs.items())
    else:
        errors = dict((name, np.zeros(num)) for name in pairs)

    imag = omag[:, 2]
    # Search radius in degrees, derived from the distance at the object's redshift.
    srad = np.rad2deg(1. / es.cosmology.Da(0, photoz[idx], h=0.7) / (1 + photoz[idx]))
    m1, m2, d12 = h.match(ra[idx], dec[idx], ra, dec, srad, maxmatch=5000)
    # Keep neighbours brighter than the redshift-dependent limit but fainter
    # than the central object itself.
    indices = (imag[m2] <= limi(photoz[idx])) * (imag[m2] > imag[m1])
    #indices=(iamag[m2]<=-20)*(iamag[m2]>iamag[m1])
    selected = m2[indices]
    ntot = len(selected)

    fig = pl.figure(figsize=(15, 8))
    # (colour, percentile seeding the first mean, first sigma, x label, title, xlim)
    panels = [
        ('gmr', 80, 0.04, 'g - r', 'M200: ' + str(b[idx].field('m200')), None),
        ('rmi', 80, 0.04, 'r - i', 'photoz: ' + str(photoz[idx]), (-0.2, 2.5)),
        ('imz', 60, 0.02, 'i - z', 'Ntot: ' + str(ntot), (-0.2, 2.5)),
        ('gmz', 60, 0.02, 'g - z', None, None),
        ('rmz', 60, 0.02, 'r - z', None, None),
        ('gmi', 60, 0.02, 'g - i', None, None),
    ]
    for position, (name, percentile, sigma0, label, title, xlim) in enumerate(panels, 1):
        _redsker_panel(fig, position, colors[name][selected], errors[name][selected],
                       percentile, sigma0, label, ntot, title=title, xlim=xlim)
    return ('Plot is done!')
def scoreatpercentile(cum_preds, p):
    """Return a list with the `p`-th percentile of every column of `cum_preds`."""
    per_column = []
    # Iterating over the transpose walks the columns of the 2-D input.
    for column in cum_preds.T:
        per_column.append(stats.scoreatpercentile(column, p))
    return per_column
def crop(f, a, b):
    """Return a boolean mask selecting the values of `f` inside a percentile band.

    The band runs from the `a`-th percentile up to the ``(100 - b)``-th
    percentile of `f`, both ends inclusive.
    """
    from scipy.stats import scoreatpercentile as percentile_of

    lower, upper = percentile_of(f, a), percentile_of(f, 100 - b)
    assert lower <= upper
    above_lower = f >= lower
    below_upper = f <= upper
    return np.logical_and(above_lower, below_upper)
def plot_best(trace=None, data_train=None, data_test=None, samples=1000, burn=200, axs=None):
    """Plot BEST significance analysis.

    Parameters
    ----------
    trace : pymc3.sampling.BaseTrace, optional
        trace object as returned by model_best()
        If not passed, will run model_best(), for which
        data_train and data_test are required.
    data_train : pandas.Series, optional
        Returns of in-sample period.
        Required if trace=None.
    data_test : pandas.Series, optional
        Returns of out-of-sample period.
        Required if trace=None.
    samples : int, optional
        Posterior samples to draw.
    burn : int
        Posterior samples to discard as burn-in.
    axs : array of matplotlib.axes objects, optional
        Plot into passed axes objects. Needs 7 axes, indexable as
        ``axs[0]`` .. ``axs[6]``. If omitted, a 4x2 grid is created and
        flattened; its last axis stays empty.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `trace` is None and `data_train` or `data_test` is missing.

    See Also
    --------
    model_best : Estimation of BEST model.
    """
    if trace is None:
        # Without a trace both data sets are needed to estimate one. (The
        # previous check was inverted and rejected exactly the valid calls.)
        if data_train is None or data_test is None:
            raise ValueError('Either pass trace or data_train and data_test')
        trace = model_best(data_train, data_test, samples=samples)

    trace = trace[burn:]
    if axs is None:
        # Seven panels are drawn below, so a 3x2 grid is one axis short, and
        # plt.subplots returns a 2-D array that must be flattened before it
        # can be indexed as axs[0] .. axs[6].
        fig, axs = plt.subplots(ncols=2, nrows=4, figsize=(16, 4))
        axs = axs.flatten()

    def distplot_w_perc(trace, ax):
        """Draw the distribution of `trace` and mark its 2.5/97.5 percentiles."""
        sns.distplot(trace, ax=ax)
        ax.axvline(stats.scoreatpercentile(trace, 2.5), color='0.5', label='2.5 and 97.5 percentiles')
        ax.axvline(stats.scoreatpercentile(trace, 97.5), color='0.5')

    # Means.
    sns.distplot(trace['group1_mean'], ax=axs[0], label='backtest')
    sns.distplot(trace['group2_mean'], ax=axs[0], label='forward')
    axs[0].legend(loc=0)
    distplot_w_perc(trace['difference of means'], axs[1])
    # The legend is built after plotting so the percentile label is picked up.
    axs[1].legend(loc=0)
    axs[0].set(xlabel='mean', ylabel='belief', yticklabels=[])
    axs[1].set(xlabel='difference of means', yticklabels=[])

    # Annual volatility.
    sns.distplot(trace['group1_annual_volatility'], ax=axs[2], label='backtest')
    sns.distplot(trace['group2_annual_volatility'], ax=axs[2], label='forward')
    distplot_w_perc(
        trace['group2_annual_volatility'] - trace['group1_annual_volatility'],
        axs[3])
    axs[2].set(xlabel='Annual volatility', ylabel='belief', yticklabels=[])
    axs[2].legend(loc=0)
    axs[3].set(xlabel='difference of volatility', yticklabels=[])

    # Sharpe ratio.
    sns.distplot(trace['group1_sharpe'], ax=axs[4], label='backtest')
    sns.distplot(trace['group2_sharpe'], ax=axs[4], label='forward')
    distplot_w_perc(trace['group2_sharpe'] - trace['group1_sharpe'], axs[5])
    axs[4].set(xlabel='Sharpe', ylabel='belief', yticklabels=[])
    axs[4].legend(loc=0)
    axs[5].set(xlabel='difference of Sharpes', yticklabels=[])

    # Effect size.
    sns.distplot(trace['effect size'], ax=axs[6])
    axs[6].axvline(stats.scoreatpercentile(trace['effect size'], 2.5), color='0.5')
    axs[6].axvline(stats.scoreatpercentile(trace['effect size'], 97.5), color='0.5')
    axs[6].set(xlabel='difference of means normalized by volatility', ylabel='belief', yticklabels=[])
def plot_tags_per_basepair(self, data, labels, title='', xlabel='', ylabel='', window_len=100, ymax_percentile=99.5, tag_scalars=None, show_moving_average=True, show_count=False):
    '''
    Draw one line per data frame in `data`; each frame needs the columns
    ``basepair`` and ``tag_count``.

    With `show_moving_average` the (scaled) raw counts are drawn as faint
    markers underneath a smoothed line; otherwise the raw counts are drawn
    as the line itself. Returns the axes that were drawn on.
    '''
    pyplot.figure(figsize=[12, 6])

    # Set up plot
    ax = pyplot.subplot(111)
    ax.set_xlim([self.from_bp, self.to_bp])

    scaled_counts = []
    palette = self.get_colors(len(data))

    if show_moving_average:
        for idx, frame in enumerate(data):
            # tag_scalars may be a sequence (one factor per data set), a
            # single number, or None; the fallback covers the latter two.
            try:
                frame['tag_count'] = frame['tag_count'] * tag_scalars[idx]
            except TypeError:
                frame['tag_count'] = frame['tag_count'] * (tag_scalars or 1)
            scaled_counts.extend(frame['tag_count'])
            pyplot.plot(frame['basepair'], frame['tag_count'], '.',
                        markeredgecolor=palette[idx], markerfacecolor='None',
                        alpha=.2, markeredgewidth=.5)

    # Second pass, so that every line ends up on top of every marker.
    for idx, frame in enumerate(data):
        # Alternate solid / dashed lines.
        style = '--' if idx % 2 else '-'
        if not show_moving_average:
            xs, ys = frame['basepair'], frame['tag_count']
        else:
            xs, ys = self.smooth(frame['basepair'], frame['tag_count'],
                                 window_len=window_len)
        pyplot.plot(xs, ys, style, color=palette[idx], label=labels[idx],
                    linewidth=2)

        if show_count:
            count_text = 'Tag count: {0}'.format(sum(frame['tag_count']))
            pyplot.text(.05, .9 - idx * .05, count_text, color=palette[idx],
                        transform=ax.transAxes)

    # Limit yaxis by percentile if desired:
    if show_moving_average and ymax_percentile:
        upper = stats.scoreatpercentile(scaled_counts, ymax_percentile)
        ax.set_ylim([0, int(math.ceil(upper))])

    pyplot.legend()

    self.add_title(title or 'Tag counts around transcription start sites', ax)
    self.add_axis_labels(xlabel or 'Basepairs from TSS',
                         ylabel or 'Normalized number of tag starts')
    return ax
def iqr(a):
    """Return the interquartile range (75th minus 25th percentile) of `a`."""
    values = np.asarray(a)
    lower, upper = (stats.scoreatpercentile(values, q) for q in (25, 75))
    return upper - lower
def distplot_w_perc(trace, ax):
    """Draw the distribution of `trace` on `ax` and mark its 2.5th/97.5th percentiles.

    Both percentiles are drawn as grey vertical lines; only the first one
    carries the legend label, so the legend shows a single entry.
    """
    sns.distplot(trace, ax=ax)
    lower, upper = (stats.scoreatpercentile(trace, q) for q in (2.5, 97.5))
    ax.axvline(lower, color='0.5', label='2.5 and 97.5 percentiles')
    ax.axvline(upper, color='0.5')
clf.fit(X_train) # predict raw anomaly score scores_pred = clf.decision_function(X_train)*-1 # prediction of a datapoint category outlier or inlier y_pred = clf.predict(X_train) # no of errors in prediction n_errors = (y_pred != Y_train).sum() print('No of Errors : ',clf_name, n_errors) # rest of the code is to create the visualization # threshold value to consider a datapoint inlier or outlier threshold = stats.scoreatpercentile(scores_pred,100 *outlier_fraction) # decision function calculates the raw anomaly score for every point Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1 Z = Z.reshape(xx.shape) subplot = plt.subplot(1, 2, i + 1) # fill blue colormap from minimum anomaly score to threshold value subplot.contourf(xx, yy, Z, levels = np.linspace(Z.min(), threshold, 10),cmap=plt.cm.Blues_r) # draw red contour line where anomaly score is equal to threshold a = subplot.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red') # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')
def high_variance_confounds(series, n_confounds=5, percentile=2., detrend=True):
    """ Extract confound time series from the highest-variance features.

    Parameters
    ==========
    series: numpy.ndarray
        Timeseries, one per column of the array.
        shape (sample number, feature number)

    n_confounds: int, optional
        Number of confounds to return

    percentile: float, optional
        Share (in percent) of the highest-variance series that is kept
        before the decomposition, 0. <= `percentile` <= 100.
        series.shape[0] * percentile / 100 must be greater than n_confounds

    detrend: bool, optional
        If True, detrend timeseries before processing.

    Returns
    =======
    v: numpy.ndarray
        highest variance confounds. Shape: (samples, n_confounds)

    Notes
    ======
    This method is related to what has been published in the literature
    as 'CompCor' (Behzadi NeuroImage 2007).

    The steps are:

    - compute the mean of squares of each time series (no mean removal)
    - keep the series whose value exceeds the (100 - percentile)-th
      percentile
    - decompose the kept series
    - return the `n_confounds` components with the largest eigenvalues.

    See also
    ========
    nilearn.image.high_variance_confounds
    """
    data = _detrend(series) if detrend else series  # _detrend copies

    # Variance proxy per feature, computed without removing the mean.
    energy = _mean_of_squares(data)
    cutoff = stats.scoreatpercentile(energy, 100. - percentile)
    kept = data[:, energy > cutoff]  # column (feature) selection

    # Work on the symmetric (samples x samples) matrix and solve its
    # eigenvalue problem instead of running an SVD; this increases stability.
    gram = kept.dot(kept.T) / kept.shape[0]
    eigenvalues, eigenvectors = linalg.eigh(gram)
    # argsort is ascending; reverse it to put the largest eigenvalues first.
    leading = np.argsort(eigenvalues)[::-1][:n_confounds]
    return eigenvectors[:, leading].copy()
def high_variance_confounds(series, n_confounds=10, percentile=1., detrend=True):
    """ Extract confound time series from the highest-variance features.

    Parameters
    ==========
    series: numpy.ndarray
        Timeseries, one per column of the array.
        shape (sample number, feature number)

    n_confounds: int
        Number of confounds to return

    percentile: float
        Share (in percent) of the highest-variance series that is kept
        before computing the singular value decomposition.
        series.shape[0] * percentile must be greater than n_confounds.

    detrend: bool
        If True, detrend timeseries before processing.

    Returns
    =======
    v: numpy.ndarray
        highest variance confounds. Shape: (samples, n_confounds)

    Notes
    ======
    This method is related to what has been published in the literature
    as 'CompCor' (Behzadi NeuroImage 2007).

    The steps are:

    - compute the mean of squares of each time series (no mean removal)
    - keep the series whose value exceeds the (100 - percentile)-th
      percentile
    - compute an svd of the kept series
    - return the first `n_confounds` left singular vectors.

    See also
    ========
    nisl.image.high_variance_confounds
    """
    # FIXME: the variance computation below can be made chunk-by-chunk,
    # which uses almost no extra memory, and is as fast (if not faster).
    data = _detrend(series) if detrend else series  # _detrend copies

    # Variance proxy per feature, computed without removing the mean.
    energy = np.square(data).mean(axis=0)
    cutoff = stats.scoreatpercentile(energy, 100. - percentile)
    kept = data[:, energy > cutoff]  # column (feature) selection

    # svd returns the singular values in decreasing order, so the first
    # columns of the left factor belong to the largest singular values.
    left_vectors = linalg.svd(kept, full_matrices=False)[0]
    return left_vectors[:, :n_confounds].copy()
def ecgmmRidge(ra_c=None, dec_c=None, photoz_c=None, r200_c=None, m200_c=None, ra=None, dec=None, color=None, colorErr=None, mag=None, candidateIdx=None):
    """Fit a two-component mixture to the colours of the galaxies matched to each candidate.

    The ``*_c`` arrays describe the candidates and are subset with
    `candidateIdx`; `ra`, `dec`, `color`, `colorErr` and `mag` describe the
    galaxies. Galaxies are matched to each candidate within a radius derived
    from ``r200_c`` and the distance at the candidate redshift, then cut by
    a redshift-dependent magnitude limit (`limz`, defined elsewhere).

    A fit is recorded only when ``aic2 < aic1``. NOTE(review): presumably
    this means the two-component model is preferred over the single
    component one -- confirm against `gmm`.

    Returns eleven arrays with one entry per recorded candidate, in this
    order: alpha0, alpha1, mu0, mu1, sigma0, sigma1, aic1, aic2, amp,
    photoz, m200. Index 0 is the component with the smaller sigma.
    """
    #--define some quantity to be returned ----
    rac = ra_c[candidateIdx]
    decc = dec_c[candidateIdx]
    photozc = photoz_c[candidateIdx]
    r200c = r200_c[candidateIdx]
    m200c = m200_c[candidateIdx]
    # Keep only galaxies whose colour lies in [-1, 5].
    ok = (color >= -1) * (color <= 5.)
    color = color[ok]
    colorErr = colorErr[ok]
    #colorErr = np.zeros(len(color))
    mag = mag[ok]
    ra = ra[ok]
    dec = dec[ok]
    BCGalpha0 = []
    BCGalpha1 = []
    BCGmu0 = []
    BCGmu1 = []
    BCGsigma0 = []
    BCGsigma1 = []
    BCGntot = []
    BCGamp = []
    BCGaic1 = []
    BCGaic2 = []
    BCGphotoz = []
    BCGm200c = []
    #-----------------------------------------
    Ncandidates = len(photozc)
    ridgeZ = np.zeros(Ncandidates)
    depth = 10
    h = es.htm.HTM(depth)
    Cosmo = es.cosmology.Cosmo(h=0.7)
    DA = Cosmo.Da(0, photozc)
    # Matching radius in degrees: r200 divided by the distance DA.
    srad = np.rad2deg(r200c / DA)
    m1, m2, d12 = h.match(rac, decc, ra, dec, srad, maxmatch=5000)
    r12 = np.deg2rad(d12) * DA[m1]
    indices = (mag[m2] <= limz(photozc[m1]))  # no bcg assumed
    m1 = m1[indices]
    m2 = m2[indices]
    # `h` is rebound here: from now on it is the histogram of m1, not the
    # HTM matcher. `rev` holds the reverse indices of that histogram.
    h, rev = es.stat.histogram(m1, binsize=1, rev=True)
    startTime = time.time()
    for i in range(h.size):
        # Equal consecutive reverse-index offsets mean bin i is empty.
        if rev[i] != rev[i + 1]:
            print i
            indx = rev[rev[i]:rev[i + 1]]
            # Starting values: equal weights, means seeded from the 70th and
            # 40th colour percentiles.
            alpha = np.array([0.5, 0.5])
            mu = np.array([
                sts.scoreatpercentile(color[m2[indx]], per=70),
                sts.scoreatpercentile(color[m2[indx]], per=40)
            ])
            sigma = np.array([0.04, 0.3])
            # NOTE(review): alpha/mu/sigma are read after this call, so
            # aic_ecgmm presumably updates them in place -- confirm.
            aic2 = gmm.aic_ecgmm(color[m2[indx]], colorErr[m2[indx]], alpha, mu, sigma)
            aic1 = gmm.wstat(color[m2[indx]], colorErr[m2[indx]])[2]
            if aic2 < aic1:
                # Order the two components by width, narrowest first.
                srt = np.argsort(sigma)
                BCGalpha0.append(alpha[srt[0]])
                BCGalpha1.append(alpha[srt[1]])
                BCGmu0.append(mu[srt[0]])
                BCGmu1.append(mu[srt[1]])
                BCGsigma0.append(sigma[srt[0]])
                BCGsigma1.append(sigma[srt[1]])
                BCGaic1.append(aic1)
                BCGaic2.append(aic2)
                # Number of matched galaxies times the weight of the
                # narrow component.
                BCGamp.append(len(indx) * alpha[srt[0]])
                BCGphotoz.append(photozc[m1[indx[0]]])
                BCGm200c.append(m200c[m1[indx[0]]])
                print aic2, aic1
    endTime = time.time()
    elapseTime = endTime - startTime
    print '---elapsed time: ' + str(elapseTime)
    return np.array(BCGalpha0), np.array(BCGalpha1), np.array(
        BCGmu0), np.array(BCGmu1), np.array(BCGsigma0), np.array(
            BCGsigma1), np.array(BCGaic1), np.array(BCGaic2), np.array(
                BCGamp), np.array(BCGphotoz), np.array(BCGm200c)
def test_2D(self): x = array([[1, 1, 1], [1, 1, 1], [4, 4, 3], [1, 1, 1], [1, 1, 1]]) assert_array_equal(stats.scoreatpercentile(x, 50), [1, 1, 1])
def learn_view(self, X_view, words_view, joint_from_view_index, C=1.0, aperture=0.90, aperture_type='probability', update_joint=True, sample_weight=1):
    """Fit a model on the already-labelled rows of one view and optionally extend the joint labels.

    Rows whose current joint prediction (``self.pred_joint``) is not -1 form
    the training set; rows with -1 form the test set. The training target is
    +1 where the joint prediction equals the row's entry in `words_view`
    and -1 otherwise.

    Parameters
    ----------
    X_view : sparse matrix
        Feature rows of this view (row-indexable; stacked with `vstack`).
    words_view : numpy array
        One value per row of `X_view`, compared against the joint prediction.
    joint_from_view_index : numpy array
        For every view row, the index of its entry in the joint vectors.
    C : float
        Passed to ``self.model(C=C)``.
    aperture, aperture_type
        How test rows are selected for updating the joint predictions:
        'percentile' keeps rows scoring above the `aperture`-th percentile
        of the predicted probabilities, 'absolute' keeps the `aperture`
        highest-scoring rows, anything else keeps rows whose probability
        exceeds `aperture`.
    update_joint : bool
        If True, write the selected rows' `words_view` values into
        ``self.pred_joint``.
    sample_weight : int
        Rows with an initial label are appended ``sample_weight - 1`` extra
        times.

    Returns
    -------
    The fitted model.

    Raises
    ------
    IndexError
        If no row is left with an unknown (-1) joint prediction.
    """
    # Mark the rows that carry an initial label (anything but -1) in
    # self.data["y_lookup_init"].
    initial_test_filter = np.empty(shape=(len(words_view), ), dtype=bool)
    for word_id in xrange(len(words_view)):
        y = self.data["y_lookup_init"][joint_from_view_index[word_id]]
        initial_test_filter[word_id] = (y != -1)

    # extend all the variables appropriately
    X_view_w = X_view
    words_view_w = words_view
    joint_from_view_index_w = joint_from_view_index

    # Boolean mask -> integer row indices.
    initial_test_filter = initial_test_filter.nonzero()[0]

    # Append the initially labelled rows once per extra unit of sample_weight.
    for i in range(sample_weight - 1):
        X_view_w = vstack((X_view_w, X_view[initial_test_filter]), format="csr")
        words_view_w = np.concatenate(
            (words_view_w, words_view[initial_test_filter]))
        joint_from_view_index_w = np.concatenate(
            (joint_from_view_index_w, joint_from_view_index[initial_test_filter]))

    # make a new empty vector for predicted values
    # (pred_view is predicted population sizes; not true/false)
    pred_view_w = np.empty(
        shape=(len(words_view_w), ), dtype=int)

    # create answer vectors with the seed answers
    for word_id in xrange(len(pred_view_w)):
        pred_view_w[word_id] = self.pred_joint[
            joint_from_view_index_w[word_id]]

    # set Trues to 1 and Falses to -1
    y_view_w = (pred_view_w == words_view_w) * 2 - 1

    # set filter vectors (-1 = unknown)
    filter_train = (pred_view_w != -1).nonzero()[0]
    filter_test = (pred_view_w == -1).nonzero()[0]

    if len(filter_test) == 0:
        print "leaving early - run out of data!"
        raise IndexError("out of data")

    # set training vectors
    X_train = X_view_w[filter_train]
    y_train = y_view_w[filter_train]

    # and test vectors as the rest
    X_test = X_view_w[filter_test]
    y_test = y_view_w[filter_test]

    # and the numbers to go with it for illustration purposes
    words_test = words_view_w[filter_test]
    joint_from_view_index_test = joint_from_view_index_w[filter_test]

    # make and fit new model
    model = self.model(C=C)
    logging.debug("fitting model to cochrane data...")
    model.fit(X_train, y_train)

    if update_joint:
        # Column 1 of predict_proba is read as the score of each unknown row.
        preds = model.predict_proba(X_test)[:, 1]  # predict unknowns

        # get top results (by aperture type selected)
        if aperture_type == "percentile":
            # NOTE(review): scoreatpercentile expects a value in [0, 100];
            # with the default aperture of 0.90 this is the 0.9th percentile.
            top_pc_score = stats.scoreatpercentile(preds, aperture)
            top_result_indices = (preds > top_pc_score).nonzero()[0]
        elif aperture_type == "absolute":
            # argsort is ascending, so the tail holds the highest scores.
            top_result_indices = np.argsort(preds)[-aperture:]
        else:
            top_pc_score = aperture
            top_result_indices = (preds > top_pc_score).nonzero()[0]

        # extend the joint predictions
        for i in top_result_indices:
            self.pred_joint[joint_from_view_index_test[i]] = words_test[i]

    return model
def run_spherical_gNFW(self, par, plot=False, save=True, path='./',
                       fname='single_rst', vmap='map', markersize=0.5,
                       rDot=0.24):
    """Evaluate the spherical gNFW JAM model once for the given parameters.

    Configures the module-level ``model`` dict for the spherical gNFW case,
    builds the JAM object, evaluates ``lnprob_spherical_gNFW`` for ``par``
    and compares the result with the observed ``self.rms`` over
    ``self.goodbins``.

    Args:
        par: parameter values, printed next to the names in
            ``model['JAMpars']`` (cosinc, beta, ml, logrho_s, rs, gamma).
        plot: when true, save a three-panel figure (model, observation,
            residual) to ``<path>/<fname>.png``.
        save: when true, pickle the result dict to ``<path>/<fname>.dat``.
        path: output directory for the pickle and the figure.
        fname: output file name without extension.
        vmap: forwarded to ``velocity_plot``.
        markersize: forwarded to ``velocity_plot``.
        rDot: forwarded to ``velocity_plot`` as ``size``.

    Returns:
        dict with keys ``xbin``, ``ybin``, ``rms``, ``errRms``,
        ``goodbins``, ``rmsModel``, ``chi2``, ``chi2_dof`` and ``pars``.
    """
    print('--------------------------------------------------')
    print('Run spherical gNFW model with given parameters')
    model['lnprob'] = lnprob_spherical_gNFW
    model['type'] = 'spherical_gNFW'
    model['ndim'] = 6
    model['JAMpars'] = ['cosinc', 'beta', 'ml', 'logrho_s', 'rs', 'gamma']
    # initialize the JAM class and pass to the global parameter
    model['JAM'] = \
        pyjam.axi_rms.jam(model['lum2d'], model['pot2d'], model['distance'],
                          model['xbin'], model['ybin'], mbh=model['bh'],
                          quiet=True, sigmapsf=model['sigmapsf'],
                          pixsize=model['pixsize'], nrad=model['nrad'],
                          shape=model['shape'])
    # NOTE(review): the two flags presumably ask for the model rms map
    # instead of the log-probability -- confirm against
    # lnprob_spherical_gNFW.
    rmsModel = lnprob_spherical_gNFW(par, True, False)

    xbin = model['xbin']
    ybin = model['ybin']
    rms = self.rms
    errRms = self.errRms
    goodbins = self.goodbins

    chi2 = np.sum(
        ((rms[goodbins] - rmsModel[goodbins]) / errRms[goodbins])**2)
    # Normalised by goodbins.sum() (the number of good bins if goodbins is
    # a boolean mask); the number of fitted parameters is not subtracted.
    chi2_dof = chi2 / goodbins.sum()

    for i, value in enumerate(par):
        print('{}: {:.4f}'.format(model['JAMpars'][i], value))
    print('chi2: {:.4f}'.format(chi2))
    print('chi2/dof: {:.4f}'.format(chi2_dof))
    print('--------------------------------------------------')

    rst = {
        'xbin': xbin,
        'ybin': ybin,
        'rms': rms,
        'errRms': errRms,
        'goodbins': goodbins,
        'rmsModel': rmsModel,
        'chi2': chi2,
        'chi2_dof': chi2_dof,
        'pars': par
    }
    if save:
        with open('{}/{}.dat'.format(path, fname), 'wb') as f:
            pickle.dump(rst, f)

    if plot:
        fig = plt.figure(figsize=(18 / 1.5, 5. / 1.5))
        axes0a = fig.add_subplot(131)
        axes0b = fig.add_subplot(132)
        axes0c = fig.add_subplot(133)
        fig.subplots_adjust(left=0.05, bottom=0.1, right=0.92,
                            top=0.99, wspace=0.4)

        # Shared colour scale for observation and model, clipped to the
        # 0.5-99.5 percentile range of the observed good bins.
        vmin, vmax = stats.scoreatpercentile(rms[goodbins], [0.5, 99.5])
        norm = colors.Normalize(vmin=vmin, vmax=vmax)
        # Raw strings keep the LaTeX backslashes without relying on
        # invalid escape sequences.
        velocity_plot(xbin, ybin, rms, ax=axes0b,
                      text=r'$\mathbf{V_{rms}: Obs}$', size=rDot,
                      norm=norm, vmap=vmap, markersize=markersize)
        velocity_plot(xbin, ybin, rmsModel, ax=axes0a,
                      text=r'$\mathbf{V_{rms}: JAM}$', size=rDot,
                      norm=norm, bar=False, vmap=vmap,
                      markersize=markersize)

        # Symmetric colour scale for the residual map.
        residualValue = rmsModel - rms
        vmax = \
            stats.scoreatpercentile(abs(residualValue[goodbins])
                                    .clip(-100, 100.), 99.5)
        norm_residual = colors.Normalize(vmin=-vmax, vmax=vmax)
        velocity_plot(xbin, ybin, residualValue, ax=axes0c,
                      text=r'$\mathbf{Residual}$', size=rDot,
                      norm=norm_residual, vmap=vmap, markersize=markersize)
        fig.savefig('{}/{}.png'.format(path, fname), dpi=300)
    return rst
# Perform outlier detection predicted_data = clf.predict(data) inlier_predicted_data = data[predicted_data == 1] outlier_predicted_data = data[predicted_data == -1] num_inliers_predicted = inlier_predicted_data.shape[0] num_outliers_predicted = outlier_predicted_data.shape[0] # Plot decision function values xr = np.linspace(3, 10, 500) yr = np.linspace(-5, 45, 500) xx, yy = np.meshgrid(xr, yr) zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) zz = zz.reshape(xx.shape) scores = clf.decision_function(data) threshold = stats.scoreatpercentile(scores, 100 * contamination) plt.contourf(xx, yy, zz, levels=np.linspace(zz.min(), threshold, 7), cmap=plt.cm.Blues_r) # Outlier plt.contour(xx, yy, zz, levels=np.array([threshold]), linewidths=2, colors="red") # The frontier plt.contourf(xx, yy, zz, levels=np.linspace(threshold, zz.max(), 7),
def fit(self, X, y):
    """Store the ``alpha`` quantile of ``y`` on the instance.

    ``self.alpha`` is a fraction; it is scaled to a percentage before
    being handed to ``stats.scoreatpercentile``. ``X`` is accepted but
    not used. The result is written to ``self.quantile``.
    """
    percent = self.alpha * 100.0
    self.quantile = stats.scoreatpercentile(y, percent)
def test_percentile(self): x = arange(8) * 0.5 assert_equal(stats.scoreatpercentile(x, 0), 0.) assert_equal(stats.scoreatpercentile(x, 100), 3.5) assert_equal(stats.scoreatpercentile(x, 50), 1.75)