def get_stats(group_id):
    """Summarise run durations per run type for one group.

    Only runs whose ``end_dt`` is set contribute.  For each run type the
    durations (``run.seconds()``) are averaged; once a type has at least
    ``MIN_RUNS`` such runs, durations flagged by ``is_outlier`` against the
    ``MIN_QUANT``/``MAX_QUANT`` quantile boundaries are left out of the
    average.

    Args:
        group_id: identifier handed to ``get_group_runs``.

    Returns:
        dict mapping run type to ``{'avg', 'count', 'last'}`` where ``avg``
        is the mean of the retained durations, ``count`` is the number of
        runs with an end time (outliers included) and ``last`` is the
        duration of the first such run returned by ``get_group_runs``.
        Types with no retained duration are omitted.
    """
    durations_by_type = {}
    last = {}
    for run in get_group_runs(group_id):
        if run.end_dt is None:
            continue  # no end time recorded, so nothing to aggregate
        rtype = run.type()
        secs = run.seconds()
        # Keep the first duration seen per type.
        # NOTE(review): this is the "most recent" run only if
        # get_group_runs yields newest-first -- confirm.
        last.setdefault(rtype, secs)
        durations_by_type.setdefault(rtype, []).append(secs)

    output = {}
    for rtype, durations in durations_by_type.items():
        total = len(durations)
        if total < MIN_RUNS:
            # Too few samples for outlier filtering: keep everything and
            # skip the quantile computation, whose result would be unused.
            kept = durations
        else:
            ordered = sorted(durations)
            # NOTE(review): the trailing (7, True) arguments presumably
            # select the quantile type and mark the input as pre-sorted
            # -- confirm against quantile().
            bounds = {
                'min': quantile(ordered, MIN_QUANT, 7, True),
                'max': quantile(ordered, MAX_QUANT, 7, True),
            }
            kept = [s for s in durations if not is_outlier(s, bounds)]
        if kept:
            output[rtype] = {
                'avg': sum(kept) / len(kept),
                'count': total,
                'last': last[rtype],
            }
    return output
def histogram_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    """Plot a histogram of spot intensities and save it as a PNG.

    Column 6 of ``spots`` (read through ``column``) is used, keeping only
    values below the module-level ``maxintensity``.  The figure is written
    to ``plot_intensity_histogram.png`` inside ``mskpath``.

    Args:
        spots: spot records understood by ``column``.
        mskpath: output directory for the PNG.
    """
    print("Building histogram of spot intensities...")
    intensities = [i for i in column(spots, 6) if i < maxintensity]

    # NOTE(review): presumably forces '.' as decimal separator for the
    # plotting backend -- confirm why the numeric locale is reset here.
    import locale
    locale.setlocale(locale.LC_NUMERIC, 'C')

    pl.figure()
    # Bin count: half the number of spots, but at least 50.  It has to be
    # an integer -- current NumPy rejects a float bin count.
    mybins = max(len(spots) // 2, 50)
    # The old ``normed=0`` argument only restated the default (no
    # normalisation) and no longer exists in recent matplotlib, so it is
    # simply omitted.
    _, _, patches = pl.hist(intensities, bins=mybins, histtype='stepfilled')
    pl.setp(patches, 'facecolor', 'g', 'alpha', 0.75)
    pl.xlabel("Intensity")
    pl.ylabel("Frequency")
    pl.xlim(xmin=0)
    # the highest 2 % of intensities are not displayed in the histogram
    # (they are usually outliers)
    pl.xlim(xmax=min(quantile(intensities, 0.98), maxintensity))
    pl.grid(True)
    pl.savefig(join(mskpath, 'plot_intensity_histogram.png'))
    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("Intensity histogram contains %d spots." % len(intensities))
    print("Finished building histogram of spot intensities.")
def mode_est(x,w,nSigmaA,nSigmaB):
    """Estimate the peak position and width of weighted data via Gaussian fits.

    The values ``x`` are filled, with weights ``w``, into a 60-bin histogram.
    A Gaussian is then fitted twice: the first fit window is built from the
    histogram mean and RMS, the second from the mean and sigma of the first
    fit.  Each window is clipped to the range spanned by the bins whose
    content exceeds one tenth of the histogram maximum.

    Args:
        x: data values; ``min``/``max`` are taken over them.
        w: per-value weights, paired with ``x`` through ``zip``.
        nSigmaA: extent of the fit window above the mean, in sigmas.
        nSigmaB: extent of the fit window below the mean, in sigmas.

    Returns:
        ``(means, sigmas, minRange, maxRange)``: arrays holding one fitted
        mean and sigma per bin setting (a single setting at present), and
        the window used for the last fit.
    """
    # NOTE(review): ``r`` is presumably the PyROOT module and ``array`` the
    # NumPy constructor -- confirm against the file's imports.
    means = []
    sigmas = []
    isResponse = quantile(x,w,0.1)>0 and quantile(x,w,0.9)<2 #if 80% of the data is between 0 and 2, probably response data
    # Only one bin count is tried; the loop form allows adding more.
    for bins in [60]:
        if isResponse:
            # Response-like data: fixed histogram range [0, 2].
            hist = r.TH1F("h1","h1",bins,0,2)
        else:
            hist = r.TH1F("h1","h1",bins,min(x),max(x))
        for xx,ww in zip(x,w):
            hist.Fill(xx,ww)
        # Starting values for the first fit window.
        mean = hist.GetMean()
        sigma = hist.GetRMS()
        max_val = hist.GetMaximum()
        lowestX = 0
        highestX = 0
        # Ascending scan: the last match leaves the centre of the highest
        # bin above 10% of the maximum.
        for i in range(hist.GetNbinsX()):
            if hist.GetBinContent(i+1) > max_val/10:
                highestX = hist.GetBinCenter(i+1)
        # Descending scan: the last match leaves the centre of the lowest
        # bin above 10% of the maximum.
        for i in reversed(range(hist.GetNbinsX())):
            if hist.GetBinContent(i+1) > max_val/10:
                lowestX = hist.GetBinCenter(i+1)
        #gfit = r.TF1("Gaussian","gaus", mean - nSigmaB * sigma, mean + nSigmaA * sigma) # Create the fit function
        #gfit.SetParameters(mean, sigma);
        #hist.Fit(gfit,"RQ0"); # Fit histogram h
        # Two passes: the second window is centred on the first fit result.
        for nFit in range(2):
            minRange = mean - nSigmaB * sigma
            maxRange = mean + nSigmaA * sigma
            # Clip the window to the populated part of the histogram.
            if minRange < lowestX:
                minRange = lowestX
            if maxRange > highestX:
                maxRange = highestX
            gfit = r.TF1("Gaussian","gaus", minRange, maxRange) # Create the fit function
            # Constrain parameter 1 (read back below as the mean) to the window.
            gfit.SetParLimits(1, minRange, maxRange)
            hist.Fit(gfit,"RQ0") # Fit histogram h
            mean=gfit.GetParameter(1)
            sigma=gfit.GetParameter(2)
        # Sanity warnings when the fitted mean contradicts the data-type guess.
        if isResponse and mean>2:
            print 'Interpreting data as response data, but mean is above 2. Possible error.'
        if not isResponse and mean<2:
            print 'Interpreting data as reco data, but mean is below 2. Possible error.'
        means.append(mean)
        sigmas.append(sigma)
    means = array(means)
    sigmas = array(sigmas)
    #print sqrt(average((means-average(means))**2))
    #print sqrt(average((sigmas-average(sigmas))**2))
    return means,sigmas,minRange,maxRange
def print_report(self):
    """Log a summary of the finished comment-score check cycle.

    Reports the upvoted / unvoted / downvoted counts from ``self.counts``
    with their share of ``self.counts['total']``, the total, the average
    score (``self.avg_score``) and, when ``have_quantile`` is set, the
    0.25 / 0.5 / 0.75 quantiles of the values in ``self.score_map``.
    """
    print("")
    log("COMMENT SCORE CHECK CYCLE COMPLETED")
    total = self.counts['total']

    def percent(key):
        # Whole-number share of the total; 0 when nothing was counted, so
        # an empty cycle no longer raises ZeroDivisionError.
        if not total:
            return 0
        return int(round(self.counts[key] / float(total) * 100))

    # The percentage is formatted as an integer instead of erasing ".0"
    # with backspaces: that trick assumed round() returns a float, and on
    # Python 3 (where it returns an int) it wiped out the digits.
    warn("Upvoted: %s\t%d %%" % (self.counts['upvoted'], percent('upvoted')))
    warn("Unvoted %s\t%d %%" % (self.counts['unvoted'], percent('unvoted')))
    warn("Downvoted: %s\t%d %%" % (self.counts['downvoted'], percent('downvoted')))
    warn("Total: %s" % total)
    warn("Avg Score: %f" % self.avg_score)
    if have_quantile:
        score_list = sorted(self.score_map.values())
        # A quantile of an empty list is undefined, so skip the line then.
        if score_list:
            quant = [quantile(score_list, q, issorted=True)
                     for q in (0.25, 0.5, 0.75)]
            warn("Quantiles: %.1f-%.1f-%.1f" % tuple(quant))
    sys.stdout.flush()
def print_report(self):
    """Log a summary of the finished comment-score check cycle.

    Reports the upvoted / unvoted / downvoted counts from ``self.counts``
    with their share of ``self.counts['total']``, the total, the average
    score (``self.avg_score``) and, when ``have_quantile`` is set, the
    0.25 / 0.5 / 0.75 quantiles of the values in ``self.score_map``.
    """
    print("")
    log("COMMENT SCORE CHECK CYCLE COMPLETED")
    total = self.counts['total']

    def percent(key):
        # Whole-number share of the total; 0 when nothing was counted, so
        # an empty cycle no longer raises ZeroDivisionError.
        if not total:
            return 0
        return int(round(self.counts[key] / float(total) * 100))

    # The percentage is formatted as an integer instead of erasing ".0"
    # with backspaces: that trick assumed round() returns a float, and on
    # Python 3 (where it returns an int) it wiped out the digits.
    warn("Upvoted: %s\t%d %%" % (self.counts['upvoted'], percent('upvoted')))
    warn("Unvoted %s\t%d %%" % (self.counts['unvoted'], percent('unvoted')))
    warn("Downvoted: %s\t%d %%" % (self.counts['downvoted'], percent('downvoted')))
    warn("Total: %s" % total)
    warn("Avg Score: %f" % self.avg_score)
    if have_quantile:
        score_list = sorted(self.score_map.values())
        # A quantile of an empty list is undefined, so skip the line then.
        if score_list:
            quant = [quantile(score_list, q, issorted=True)
                     for q in (0.25, 0.5, 0.75)]
            warn("Quantiles: %.1f-%.1f-%.1f" % tuple(quant))
    sys.stdout.flush()
def histogram_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    """Plot a histogram of spot intensities and save it as a PNG.

    Column 6 of ``spots`` (read through ``column``) is used, keeping only
    values below the module-level ``maxintensity``.  The figure is written
    to ``plot_intensity_histogram.png`` inside ``mskpath``.

    Args:
        spots: spot records understood by ``column``.
        mskpath: output directory for the PNG.
    """
    print("Building histogram of spot intensities...")
    intensities = [i for i in column(spots, 6) if i < maxintensity]

    # NOTE(review): presumably forces '.' as decimal separator for the
    # plotting backend -- confirm why the numeric locale is reset here.
    import locale
    locale.setlocale(locale.LC_NUMERIC, 'C')

    pl.figure()
    # Bin count: half the number of spots, but at least 50.  It has to be
    # an integer -- current NumPy rejects a float bin count.
    mybins = max(len(spots) // 2, 50)
    # The old ``normed=0`` argument only restated the default (no
    # normalisation) and no longer exists in recent matplotlib, so it is
    # simply omitted.
    _, _, patches = pl.hist(intensities, bins=mybins, histtype='stepfilled')
    pl.setp(patches, 'facecolor', 'g', 'alpha', 0.75)
    pl.xlabel("Intensity")
    pl.ylabel("Frequency")
    pl.xlim(xmin=0)
    # the highest 2 % of intensities are not displayed in the histogram
    # (they are usually outliers)
    pl.xlim(xmax=min(quantile(intensities, 0.98), maxintensity))
    pl.grid(True)
    pl.savefig(join(mskpath, 'plot_intensity_histogram.png'))
    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("Intensity histogram contains %d spots." % len(intensities))
    print("Finished building histogram of spot intensities.")
def _scatter_against_background(background, intensities, ylabel, filename,
                                mskpath):
    """Scatter ``intensities`` over ``background`` and save the figure."""
    pl.figure()
    pl.scatter(background, intensities, s=3**2, marker='o', c='r')
    pl.xlabel("Background (median intensity) of cell")
    pl.ylabel(ylabel)
    # Show the 2 %..98 % background range with a 1 % margin on either side.
    pl.xlim(xmin=quantile(background, 0.02) * 0.99)
    pl.xlim(xmax=quantile(background, 0.98) * 1.01)
    pl.ylim(ymin=0)
    # Cap the y axis at the 0.98 quantile, never above maxintensity.
    pl.ylim(ymax=min(quantile(intensities, 0.98), maxintensity))
    pl.grid(True)
    pl.savefig(join(mskpath, filename))


def scatterplot_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    """Save scatterplots of spot intensity against cell background.

    Two PNGs are written into ``mskpath``: one for column 5 of ``spots``
    (background unsubtracted) and one for column 6 (background
    subtracted), each plotted over column 7 (background).

    Args:
        spots: spot records understood by ``column``.
        mskpath: output directory for the PNGs.
    """
    print("Building scatterplot of spot intensities_unsubtracted...")
    intensities_unsubtracted = column(spots, 5)
    intensities_subtracted = column(spots, 6)
    background = column(spots, 7)

    # Both plots share the same layout; only the data, label and file differ.
    _scatter_against_background(
        background, intensities_unsubtracted,
        "Spot intensity (background unsubtracted)",
        'plot_scatterplot_intensities_unsubtracted.png', mskpath)
    _scatter_against_background(
        background, intensities_subtracted,
        "Spot intensity (background subtracted)",
        'plot_scatterplot_intensities_subtracted.png', mskpath)

    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("Scatterplot subtracted contains %d spots."
          % len(intensities_subtracted))
    print("Scatterplot unsubtracted contains %d spots."
          % len(intensities_unsubtracted))
    print("Finished building scatterplots.")
def _scatter_against_background(background, intensities, ylabel, filename,
                                mskpath):
    """Scatter ``intensities`` over ``background`` and save the figure."""
    pl.figure()
    pl.scatter(background, intensities, s=3**2, marker='o', c='r')
    pl.xlabel("Background (median intensity) of cell")
    pl.ylabel(ylabel)
    # Show the 2 %..98 % background range with a 1 % margin on either side.
    pl.xlim(xmin=quantile(background, 0.02) * 0.99)
    pl.xlim(xmax=quantile(background, 0.98) * 1.01)
    pl.ylim(ymin=0)
    # Cap the y axis at the 0.98 quantile, never above maxintensity.
    pl.ylim(ymax=min(quantile(intensities, 0.98), maxintensity))
    pl.grid(True)
    pl.savefig(join(mskpath, filename))


def scatterplot_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    """Save scatterplots of spot intensity against cell background.

    Two PNGs are written into ``mskpath``: one for column 5 of ``spots``
    (background unsubtracted) and one for column 6 (background
    subtracted), each plotted over column 7 (background).

    Args:
        spots: spot records understood by ``column``.
        mskpath: output directory for the PNGs.
    """
    print("Building scatterplot of spot intensities_unsubtracted...")
    intensities_unsubtracted = column(spots, 5)
    intensities_subtracted = column(spots, 6)
    background = column(spots, 7)

    # Both plots share the same layout; only the data, label and file differ.
    _scatter_against_background(
        background, intensities_unsubtracted,
        "Spot intensity (background unsubtracted)",
        'plot_scatterplot_intensities_unsubtracted.png', mskpath)
    _scatter_against_background(
        background, intensities_subtracted,
        "Spot intensity (background subtracted)",
        'plot_scatterplot_intensities_subtracted.png', mskpath)

    # Single-argument print(...) yields the same text on Python 2 and 3.
    print("Scatterplot subtracted contains %d spots."
          % len(intensities_subtracted))
    print("Scatterplot unsubtracted contains %d spots."
          % len(intensities_unsubtracted))
    print("Finished building scatterplots.")
def distribution_values(data,weights,central,eff=1): if not len(data)==len(weights): raise RuntimeError('Lengths of data vector and weight vector have to be the same') weights=weights/sum(weights) #normalize # maximum likelihood estimates mean = average(data,weights=weights) var = average((data-mean)**2,weights=weights) std = sqrt(var) mean_err = std*sqrt(sum(weights**2)) var_err = var*sqrt(2*sum(weights**2)) # from https://web.eecs.umich.edu/~fessler/papers/files/tr/stderr.pdf #var = sigma^2 -> var_err/var = 2*sigma_err/sigma std_err = 0.5*var_err/std err = False if central == 'absolute_median': mu = quantile(data,weights,(0.5-(1-eff))/eff) mu_err = 1.2533*mean_err #http://influentialpoints.com/Training/standard_error_of_median.htm mu_ests = [mu-mu_err,mu,mu+mu_err] sigmas = [] for mu_est in mu_ests: mu_quantile = sum(weights[data<mu_est]) absolute_mu_quantile = mu_quantile*eff+(1-eff) #should be approximately 0.5 err = False if abs(absolute_mu_quantile-0.5) > 0.1: print '<< Fitted mode is > .1 away from 50th percentile. Efficiency might be less than 50%. Calibration value might not be very useful. >>' err = True if absolute_mu_quantile > 1-.3413: print '<< Fitted mode is at > 65th percentile! Bad fit. Efficiency might be less than 50%. Returning max value - mode. 
>>' upper_quantile = max(data) err = True else: upper_quantile = quantile(data,weights,mu_quantile+.3413/eff) # = (absolute_mu_quantile+0.3413-(1-eff))/eff if absolute_mu_quantile<0.3413+1-eff: lower_quantile = float('-inf') else: lower_quantile = quantile(data,weights,mu_quantile-.3413/eff) # = (absolute_mu_quantile-0.3413-(1-eff))/eff sigma = (upper_quantile-mu_est) sigmas.append(sigma) #sigma_err = 1.573*std_err #http://stats.stackexchange.com/questions/110902/error-on-interquartile-range seems reasonable sigma = average(sigmas) sigma_err = 0.5*(sigmas[2]-sigmas[0]) return mu,mu_err,sigma,sigma_err,upper_quantile,lower_quantile,err if central == 'median': mu = quantile(data,weights,0.5) mu_err = 1.2533*mean_err #http://influentialpoints.com/Training/standard_error_of_median.htm upper_quantile = quantile(data,weights,0.8413) #CDF(1) lower_quantile = quantile(data,weights,0.1587) #CDF(-1) sigma = 0.5*(upper_quantile-lower_quantile) sigma_err = 1.573*std_err #http://stats.stackexchange.com/questions/110902/error-on-interquartile-range seems reasonable return mu,mu_err,sigma,sigma_err,upper_quantile,lower_quantile if central == 'mean': return mean,mean_err,std,std_err if central == 'trimmed' or central == 'mode' or central=='kde_mode': n,bins = numpy.histogram(data,weights=weights,bins=100) newmean_ests,newstd_ests,lower_val,upper_val = mode_est(data,weights,1.75,1.75) newmean_est = average(newmean_ests) newstd_est = average(newstd_ests) newweights = weights[numpy.all([data>lower_val,data<upper_val],axis=0)] newweights/=sum(newweights) new_mean_err = newstd_est*sqrt(sum(newweights**2)) new_var_err = newstd_est**2*sqrt(2*sum(newweights**2)) # from https://web.eecs.umich.edu/~fessler/papers/files/tr/stderr.pdf #var = sigma^2 -> var_err/var = 2*sigma_err/sigma new_std_err = 0.5*new_var_err/newstd_est if central == 'trimmed': return newmean_est,new_mean_err,newstd_est,new_std_err,lower_val,upper_val #else central == 'mode' or central=='kde_mode': use absolute IQR if 
central == 'mode': #mu = newmean_est sigmas = [] for mu in newmean_ests: mu_quantile = sum(weights[data<mu]) absolute_mu_quantile = mu_quantile*eff+(1-eff) #should be approximately 0.5 err = False if abs(absolute_mu_quantile-0.5) > 0.1: print '<< Fitted mode is > .1 away from 50th percentile. Efficiency might be less than 50%. Calibration value might not be very useful. >>' err = True if absolute_mu_quantile > 1-.3413: print '<< Fitted mode is at > 65th percentile! Bad fit. Efficiency might be less than 50%. Returning max value - mode. >>' upper_quantile = max(data) err = True else: upper_quantile = quantile(data,weights,mu_quantile+.3413/eff) # = (absolute_mu_quantile+0.3413-(1-eff))/eff if absolute_mu_quantile<0.3413+1-eff: lower_quantile = float('-inf') else: lower_quantile = quantile(data,weights,mu_quantile-.3413/eff) # = (absolute_mu_quantile-0.3413-(1-eff))/eff sigma = (upper_quantile-mu) sigma_err = 1.573*std_err #http://stats.stackexchange.com/questions/110902/error-on-interquartile-range seems reasonable sigmas.append(sigma) #print sqrt(average((sigmas-average(sigmas))**2)) #print sigma_err sigma = average(sigmas) mu = average(newmean_ests) return mu,new_mean_err,sigma,sigma_err,upper_quantile,lower_quantile,err '''if central == 'kde_mode':
def interquartile_range(nums):
    """Return the 0.75 quantile of ``nums`` minus its 0.25 quantile."""
    upper = quantile(nums, 0.75)
    lower = quantile(nums, 0.25)
    return upper - lower
def load_empirical_data(filename, quantile_value=1.00):
    """Build an empirical distribution from a file of integers.

    Args:
        filename: path to a text file with one integer per line.
        quantile_value: when different from 1.00, values at or above this
            quantile of the data are discarded before the distribution is
            built.

    Returns:
        A ``dist.EmpiricalDistribution`` over the retained values.

    Raises:
        ValueError: if a line cannot be parsed as an integer.
    """
    # Use a context manager so the file handle is closed as soon as the
    # data is read, instead of lingering until garbage collection.
    with open(filename) as handle:
        data = [int(line.strip()) for line in handle]
    if quantile_value != 1.00:
        q = quantile(data, quantile_value)
        # Strict comparison: values equal to the cut-off are dropped too.
        data = [d for d in data if d < q]
    return dist.EmpiricalDistribution(data)