Example #1
0
def get_stats(group_id):
    """Summarise the durations of a group's finished runs, per run type.

    Runs come from ``get_group_runs(group_id)``; runs whose ``end_dt`` is
    ``None`` are ignored.  For every run type the result holds:

    * ``'avg'``   -- mean ``seconds()`` of the runs kept after outlier
      filtering (filtering is skipped when the type has fewer than
      ``MIN_RUNS`` finished runs),
    * ``'count'`` -- number of finished runs of that type (outliers included),
    * ``'last'``  -- ``seconds()`` of the first finished run seen for the type
      (presumably the most recent one -- this relies on the order in which
      ``get_group_runs`` yields runs; confirm).

    Types for which no run survives the filtering are left out of the result.
    """
    finished_by_type = {}
    first_seen_secs = {}

    # Pass 1: bucket the finished runs by type.
    for run in get_group_runs(group_id):
        kind = run.type()
        if run.end_dt is None:
            continue
        bucket = finished_by_type.setdefault(kind, [])
        if kind not in first_seen_secs:
            first_seen_secs[kind] = run.seconds()
        bucket.append(run)

    # Pass 2: lower/upper duration limits per type, taken from quantiles of
    # the sorted durations.
    limits = {}
    for kind, bucket in finished_by_type.items():
        durations = sorted(
            item.seconds() for item in bucket if item.end_dt is not None)
        limits[kind] = {
            'min': quantile(durations, MIN_QUANT, 7, True),
            'max': quantile(durations, MAX_QUANT, 7, True),
        }

    # Pass 3: accumulate the durations that fall inside the limits.
    tallies = {}
    for kind, bucket in finished_by_type.items():
        total = len(bucket)
        kept = 0
        kept_secs = 0
        for run in bucket:
            secs = run.seconds()
            if run.end_dt is None:
                continue
            # With enough samples, drop anything flagged as an outlier.
            if total >= MIN_RUNS and is_outlier(secs, limits[kind]):
                continue
            kept += 1
            kept_secs += secs
        tallies[kind] = {'count': kept, 'secs': kept_secs, 'total': total}

    output = {}
    for kind, tally in tallies.items():
        if tally['count'] > 0:
            output[kind] = {
                'avg': tally['secs'] / tally['count'],
                'count': tally['total'],
                'last': first_seen_secs[kind],
            }

    return output
def histogram_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    """Save a histogram of spot intensities as ``plot_intensity_histogram.png``.

    Intensities are read from column 6 of ``spots``; values at or above the
    module-level ``maxintensity`` are discarded before plotting.  The x axis
    runs from 0 to the 98 % quantile of the remaining intensities (capped at
    ``maxintensity``), so the top 2 % are not displayed.

    Args:
        spots: table of spot records, accessed through ``column(spots, 6)``
            and ``len(spots)``.
        mskpath: directory the PNG file is written to.

    Side effects: sets ``LC_NUMERIC`` to ``'C'``, opens a new pyplot figure,
    writes the PNG and prints progress messages.
    """
    print("Building histogram of spot intensities...")
    #intensities = column(spots, 5)
    intensities = [i for i in column(spots, 6) if i < maxintensity]
    import locale
    locale.setlocale(locale.LC_NUMERIC, 'C')
    pl.figure()
    # How many bins in the histogram:
    # At least 50
    # For large number of spots: half the number of spots
    # The bin count must be an integer: a float count is rejected by current
    # NumPy/Matplotlib, so use floor division instead of "/ 2.0".
    mybins = max(len(spots) // 2, 50)
    # Raw counts are hist()'s default, so no normalisation flag is passed.
    _, _, patches = pl.hist(intensities,
                            bins=mybins,
                            histtype='stepfilled')
    pl.setp(patches, 'facecolor', 'g', 'alpha', 0.75)
    pl.xlabel("Intensity")
    pl.ylabel("Frequency")
    pl.xlim(xmin=0)
    # the highest 2 % of intensities are not displayed in the histogram (they are usually outliers)
    pl.xlim(xmax=min(quantile(intensities, 0.98), maxintensity))
    pl.grid(True)

    pl.savefig(join(mskpath, 'plot_intensity_histogram.png'))
    # Single-argument print with %-formatting emits the same text whether
    # print is a statement or a function.
    print("Intensity histogram contains %d spots." % len(intensities))
    print("Finished building histogram of spot intensities.")
def mode_est(x,w,nSigmaA,nSigmaB):
  """Estimate the peak position and width of weighted data by an iterated Gaussian fit.

  The data are filled into a ROOT histogram (``r`` is presumably the ROOT
  module -- confirm), and a "gaus" function is fitted twice: first around the
  histogram mean/RMS, then around the mean/sigma returned by the first fit.

  Args:
    x: data values.
    w: per-value weights, paired with ``x`` when filling the histogram.
    nSigmaA: number of sigmas above the current mean used as upper fit edge.
    nSigmaB: number of sigmas below the current mean used as lower fit edge.

  Returns:
    (means, sigmas, minRange, maxRange): ``means`` and ``sigmas`` are arrays
    with one fitted mean / sigma per binning tried (currently only 60 bins,
    so one entry each); ``minRange`` / ``maxRange`` are the fit range used in
    the last fit iteration.
  """
  means = []
  sigmas = []
  # Heuristic data-type detection from the weighted 10 % and 90 % quantiles.
  isResponse = quantile(x,w,0.1)>0 and quantile(x,w,0.9)<2 #if 80% of the data is between 0 and 2, probably response data
  # Loop over bin counts; only a single setting (60) is used at present.
  for bins in [60]:
    # Response-like data get a fixed [0, 2] axis, anything else spans the data range.
    if isResponse: hist = r.TH1F("h1","h1",bins,0,2)
    else: hist = r.TH1F("h1","h1",bins,min(x),max(x))
    for xx,ww in zip(x,w): hist.Fill(xx,ww)

    # Starting values for the fit, taken from the histogram itself.
    mean = hist.GetMean()
    sigma = hist.GetRMS()
    max_val = hist.GetMaximum()

    # Find the centres of the lowest and highest bins whose content exceeds a
    # tenth of the maximum; bins are addressed as 1..GetNbinsX(), hence i+1.
    # Both stay 0 if no bin qualifies.
    lowestX = 0
    highestX = 0
    # Ascending scan: the last match is the highest qualifying bin.
    for i in range(hist.GetNbinsX()):
      if hist.GetBinContent(i+1) > max_val/10: highestX = hist.GetBinCenter(i+1)
    # Descending scan: the last match is the lowest qualifying bin.
    for i in reversed(range(hist.GetNbinsX())):
      if hist.GetBinContent(i+1) > max_val/10: lowestX = hist.GetBinCenter(i+1)

    #gfit = r.TF1("Gaussian","gaus", mean - nSigmaB * sigma, mean + nSigmaA * sigma) # Create the fit function
    #gfit.SetParameters(mean, sigma);
    #hist.Fit(gfit,"RQ0"); # Fit histogram h

    # Two fit passes: the second re-centres the range on the first fit's result.
    for nFit in range(2):
      minRange = mean - nSigmaB * sigma
      maxRange = mean + nSigmaA * sigma
      # Clamp the fit range to the populated part of the histogram.
      if minRange < lowestX: minRange = lowestX
      if maxRange > highestX: maxRange = highestX
      gfit = r.TF1("Gaussian","gaus", minRange, maxRange) # Create the fit function
      # Parameter 1 is read back as the mean below, so this keeps the fitted
      # mean inside the fit range.
      gfit.SetParLimits(1, minRange, maxRange)
      # NOTE(review): option string presumably means R = use the function's
      # range, Q = quiet, 0 = do not draw -- confirm against TH1::Fit docs.
      hist.Fit(gfit,"RQ0") # Fit histogram h
      mean=gfit.GetParameter(1)
      sigma=gfit.GetParameter(2)
    # Sanity warnings when the fitted mean contradicts the data-type guess.
    if isResponse and mean>2: print 'Interpreting data as response data, but mean is above 2. Possible error.'
    if not isResponse and mean<2: print 'Interpreting data as reco data, but mean is below 2. Possible error.'
    means.append(mean)
    sigmas.append(sigma)
  means = array(means)
  sigmas = array(sigmas)
  #print sqrt(average((means-average(means))**2))
  #print sqrt(average((sigmas-average(sigmas))**2))

  # minRange/maxRange are those of the last fit of the last binning.
  return means,sigmas,minRange,maxRange
  def print_report(self):
    """Report the results of a completed comment-score check cycle.

    Reads ``self.counts`` (keys ``'upvoted'``, ``'unvoted'``, ``'downvoted'``
    and ``'total'``), ``self.avg_score`` and ``self.score_map`` and emits one
    line per figure through ``log`` / ``warn``.  The 25/50/75 % quantiles of
    the scores are added only when ``have_quantile`` is truthy.  Standard
    output is flushed at the end.
    """
    print ("")
    log("COMMENT SCORE CHECK CYCLE COMPLETED")
    total = self.counts['total']

    def percent(key):
      # Whole-number share of the total; 0 for an empty cycle so that a
      # zero total does not raise ZeroDivisionError.
      if not total:
        return 0
      return int(round(self.counts[key] / float(total) * 100))

    # The rate is formatted as an integer with %d rather than printing a
    # float and erasing its ".0" with backspaces, which garbles the number
    # whenever round() already yields an int.
    warn("Upvoted:      %s\t%d %%"%(self.counts['upvoted'],percent('upvoted')))
    warn("Unvoted       %s\t%d %%"%(self.counts['unvoted'],percent('unvoted')))
    warn("Downvoted:    %s\t%d %%"%(self.counts['downvoted'],percent('downvoted')))
    warn("Total:        %s"%total)
    warn("Avg Score:    %f"%self.avg_score)
    if have_quantile:
      quantspots = [0.25,0.5,0.75]
      score_list = sorted(self.score_map.values())
      quant = [quantile(score_list, q, issorted=True) for q in quantspots]
      warn("Quantiles:    %.1f-%.1f-%.1f"%tuple(quant))

    sys.stdout.flush()
    def print_report(self):
        """Report the results of a completed comment-score check cycle.

        Reads ``self.counts`` (keys ``'upvoted'``, ``'unvoted'``,
        ``'downvoted'`` and ``'total'``), ``self.avg_score`` and
        ``self.score_map`` and emits one line per figure through ``log`` /
        ``warn``.  The 25/50/75 % quantiles of the scores are added only when
        ``have_quantile`` is truthy.  Standard output is flushed at the end.
        """
        print("")
        log("COMMENT SCORE CHECK CYCLE COMPLETED")
        total = self.counts['total']

        def percent(key):
            # Whole-number share of the total; 0 for an empty cycle so that
            # a zero total does not raise ZeroDivisionError.
            if not total:
                return 0
            return int(round(self.counts[key] / float(total) * 100))

        # The rate is formatted as an integer with %d rather than printing a
        # float and erasing its ".0" with backspaces, which garbles the
        # number whenever round() already yields an int.
        warn("Upvoted:      %s\t%d %%" % (self.counts['upvoted'],
                                          percent('upvoted')))
        warn("Unvoted       %s\t%d %%" % (self.counts['unvoted'],
                                          percent('unvoted')))
        warn("Downvoted:    %s\t%d %%" % (self.counts['downvoted'],
                                          percent('downvoted')))
        warn("Total:        %s" % total)
        warn("Avg Score:    %f" % self.avg_score)
        if have_quantile:
            quantspots = [0.25, 0.5, 0.75]
            score_list = sorted(self.score_map.values())
            quant = [
                quantile(score_list, q, issorted=True) for q in quantspots
            ]
            warn("Quantiles:    %.1f-%.1f-%.1f" % tuple(quant))

        sys.stdout.flush()
def histogram_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    """Save a histogram of spot intensities as ``plot_intensity_histogram.png``.

    Intensities are read from column 6 of ``spots``; values at or above the
    module-level ``maxintensity`` are discarded before plotting.  The x axis
    runs from 0 to the 98 % quantile of the remaining intensities (capped at
    ``maxintensity``), so the top 2 % are not displayed.

    Args:
        spots: table of spot records, accessed through ``column(spots, 6)``
            and ``len(spots)``.
        mskpath: directory the PNG file is written to.

    Side effects: sets ``LC_NUMERIC`` to ``'C'``, opens a new pyplot figure,
    writes the PNG and prints progress messages.
    """
    print("Building histogram of spot intensities...")
    #intensities = column(spots, 5)
    intensities = [i for i in column(spots, 6) if i < maxintensity]
    import locale
    locale.setlocale(locale.LC_NUMERIC, 'C')
    pl.figure()
    # How many bins in the histogram:
    # At least 50
    # For large number of spots: half the number of spots
    # The bin count must be an integer: a float count is rejected by current
    # NumPy/Matplotlib, so use floor division instead of "/2.0".
    mybins = max(len(spots) // 2, 50)
    # Raw counts are hist()'s default, so no normalisation flag is passed.
    _, _, patches = pl.hist(intensities, bins=mybins, histtype='stepfilled')
    pl.setp(patches, 'facecolor', 'g', 'alpha', 0.75)
    pl.xlabel("Intensity")
    pl.ylabel("Frequency")
    pl.xlim(xmin=0)
    # the highest 2 % of intensities are not displayed in the histogram (they are usually outliers)
    pl.xlim(xmax=min(quantile(intensities, 0.98), maxintensity))
    pl.grid(True)

    pl.savefig(join(mskpath, 'plot_intensity_histogram.png'))
    # Single-argument print with %-formatting emits the same text whether
    # print is a statement or a function.
    print("Intensity histogram contains %d spots." % len(intensities))
    print("Finished building histogram of spot intensities.")
def scatterplot_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    print "Building scatterplot of spot intensities_unsubtracted..."

    intensities_unsubtracted, intensities_subtracted, background = column(
        spots, 5), column(spots, 6), column(spots, 7)
    #ib = [(i, j) for (i, j) in zip(column(spots, 5), column(spots, 7)) if i < 2000]
    #intensities_unsubtracted, background = zip(*ib)

    area = 3**2  # radius

    pl.figure()
    pl.scatter(background, intensities_unsubtracted, s=area, marker='o', c='r')
    pl.xlabel("Background (median intensity) of cell")
    pl.ylabel("Spot intensity (background unsubtracted)")

    pl.xlim(xmin=quantile(background, 0.02) * 0.99)
    pl.xlim(xmax=quantile(background, 0.98) * 1.01)
    pl.ylim(ymin=0)
    pl.ylim(ymax=min(quantile(intensities_unsubtracted, 0.98), maxintensity))
    pl.grid(True)

    pl.savefig(join(mskpath, 'plot_scatterplot_intensities_unsubtracted.png'))

    pl.figure()
    pl.scatter(background, intensities_subtracted, s=area, marker='o', c='r')
    pl.xlabel("Background (median intensity) of cell")
    pl.ylabel("Spot intensity (background subtracted)")

    pl.xlim(xmin=quantile(background, 0.02) * 0.99)
    pl.xlim(xmax=quantile(background, 0.98) * 1.01)
    pl.ylim(ymin=0)
    pl.ylim(ymax=min(quantile(intensities_subtracted, 0.98), maxintensity))
    pl.grid(True)

    pl.savefig(join(mskpath, 'plot_scatterplot_intensities_subtracted.png'))

    print "Scatterplot subtracted contains", len(
        intensities_subtracted), "spots."
    print "Scatterplot unsubtracted contains", len(
        intensities_unsubtracted), "spots."
    print "Finished building scatterplots."
def scatterplot_intensities(spots, mskpath=join(SIC_ROOT, SIC_PROCESSED)):
    print "Building scatterplot of spot intensities_unsubtracted..."

    intensities_unsubtracted, intensities_subtracted, background = column(spots, 5), column(spots, 6), column(spots, 7) 
    #ib = [(i, j) for (i, j) in zip(column(spots, 5), column(spots, 7)) if i < 2000]
    #intensities_unsubtracted, background = zip(*ib)

    area = 3**2 # radius

    pl.figure()
    pl.scatter(background, intensities_unsubtracted, s=area, marker='o', c='r')
    pl.xlabel("Background (median intensity) of cell")
    pl.ylabel("Spot intensity (background unsubtracted)")

    pl.xlim(xmin=quantile(background, 0.02)*0.99)
    pl.xlim(xmax=quantile(background, 0.98)*1.01)
    pl.ylim(ymin=0)
    pl.ylim(ymax=min(quantile(intensities_unsubtracted, 0.98), maxintensity))
    pl.grid(True)

    pl.savefig(join(mskpath, 'plot_scatterplot_intensities_unsubtracted.png'))

    pl.figure()
    pl.scatter(background, intensities_subtracted, s=area, marker='o', c='r')
    pl.xlabel("Background (median intensity) of cell")
    pl.ylabel("Spot intensity (background subtracted)")

    pl.xlim(xmin=quantile(background, 0.02)*0.99)
    pl.xlim(xmax=quantile(background, 0.98)*1.01)
    pl.ylim(ymin=0)
    pl.ylim(ymax=min(quantile(intensities_subtracted, 0.98), maxintensity))
    pl.grid(True)
    
    pl.savefig(join(mskpath, 'plot_scatterplot_intensities_subtracted.png'))

    print "Scatterplot subtracted contains", len(intensities_subtracted), "spots."
    print "Scatterplot unsubtracted contains", len(intensities_unsubtracted), "spots."
    print "Finished building scatterplots."
def distribution_values(data,weights,central,eff=1):
      if not len(data)==len(weights): raise RuntimeError('Lengths of data vector and weight vector have to be the same')
      weights=weights/sum(weights) #normalize
      # maximum likelihood estimates
      mean = average(data,weights=weights)
      var = average((data-mean)**2,weights=weights)
      std = sqrt(var)
      mean_err = std*sqrt(sum(weights**2))
      var_err = var*sqrt(2*sum(weights**2)) # from https://web.eecs.umich.edu/~fessler/papers/files/tr/stderr.pdf
      #var = sigma^2 -> var_err/var = 2*sigma_err/sigma
      std_err = 0.5*var_err/std
      err = False
      if central == 'absolute_median':
        mu = quantile(data,weights,(0.5-(1-eff))/eff)
        mu_err = 1.2533*mean_err #http://influentialpoints.com/Training/standard_error_of_median.htm
        mu_ests = [mu-mu_err,mu,mu+mu_err]
        sigmas = []
        for mu_est in mu_ests:
          mu_quantile = sum(weights[data<mu_est]) 
          absolute_mu_quantile = mu_quantile*eff+(1-eff) #should be approximately 0.5
          err = False
          if abs(absolute_mu_quantile-0.5) > 0.1:
            print '<< Fitted mode is > .1 away from 50th percentile. Efficiency might be less than 50%. Calibration value might not be very useful. >>'
            err = True
          if absolute_mu_quantile > 1-.3413:
            print '<< Fitted mode is at > 65th percentile! Bad fit. Efficiency might be less than 50%. Returning max value - mode. >>'
            upper_quantile = max(data)
            err = True
          else: upper_quantile = quantile(data,weights,mu_quantile+.3413/eff) # = (absolute_mu_quantile+0.3413-(1-eff))/eff
          if absolute_mu_quantile<0.3413+1-eff: lower_quantile = float('-inf')
          else: lower_quantile = quantile(data,weights,mu_quantile-.3413/eff) # = (absolute_mu_quantile-0.3413-(1-eff))/eff
          sigma = (upper_quantile-mu_est)
          sigmas.append(sigma)
        #sigma_err = 1.573*std_err #http://stats.stackexchange.com/questions/110902/error-on-interquartile-range seems reasonable
        sigma = average(sigmas)
        sigma_err = 0.5*(sigmas[2]-sigmas[0])
        return mu,mu_err,sigma,sigma_err,upper_quantile,lower_quantile,err
      if central == 'median':
        mu = quantile(data,weights,0.5)
        mu_err = 1.2533*mean_err #http://influentialpoints.com/Training/standard_error_of_median.htm
        upper_quantile = quantile(data,weights,0.8413) #CDF(1)
        lower_quantile = quantile(data,weights,0.1587) #CDF(-1)
        sigma = 0.5*(upper_quantile-lower_quantile)
        sigma_err = 1.573*std_err #http://stats.stackexchange.com/questions/110902/error-on-interquartile-range seems reasonable
        return mu,mu_err,sigma,sigma_err,upper_quantile,lower_quantile
      if central == 'mean':
        return mean,mean_err,std,std_err
      if central == 'trimmed' or central == 'mode' or central=='kde_mode':
        n,bins = numpy.histogram(data,weights=weights,bins=100)
        newmean_ests,newstd_ests,lower_val,upper_val = mode_est(data,weights,1.75,1.75)
        newmean_est = average(newmean_ests)
        newstd_est = average(newstd_ests)

        newweights = weights[numpy.all([data>lower_val,data<upper_val],axis=0)]
        newweights/=sum(newweights)
        new_mean_err = newstd_est*sqrt(sum(newweights**2))
        new_var_err = newstd_est**2*sqrt(2*sum(newweights**2)) # from https://web.eecs.umich.edu/~fessler/papers/files/tr/stderr.pdf
        #var = sigma^2 -> var_err/var = 2*sigma_err/sigma
        new_std_err = 0.5*new_var_err/newstd_est

        if central == 'trimmed': return newmean_est,new_mean_err,newstd_est,new_std_err,lower_val,upper_val 
        #else central == 'mode' or central=='kde_mode': use absolute IQR

        if central == 'mode':
          #mu = newmean_est
          sigmas = []
          for mu in newmean_ests:
            mu_quantile = sum(weights[data<mu]) 
            absolute_mu_quantile = mu_quantile*eff+(1-eff) #should be approximately 0.5
            err = False
            if abs(absolute_mu_quantile-0.5) > 0.1:
              print '<< Fitted mode is > .1 away from 50th percentile. Efficiency might be less than 50%. Calibration value might not be very useful. >>'
              err = True
            if absolute_mu_quantile > 1-.3413:
              print '<< Fitted mode is at > 65th percentile! Bad fit. Efficiency might be less than 50%. Returning max value - mode. >>'
              upper_quantile = max(data)
              err = True
            else: upper_quantile = quantile(data,weights,mu_quantile+.3413/eff) # = (absolute_mu_quantile+0.3413-(1-eff))/eff
            if absolute_mu_quantile<0.3413+1-eff: lower_quantile = float('-inf')
            else: lower_quantile = quantile(data,weights,mu_quantile-.3413/eff) # = (absolute_mu_quantile-0.3413-(1-eff))/eff
            sigma = (upper_quantile-mu)
            sigma_err = 1.573*std_err #http://stats.stackexchange.com/questions/110902/error-on-interquartile-range seems reasonable
            sigmas.append(sigma)
          #print sqrt(average((sigmas-average(sigmas))**2))
          #print sigma_err
          sigma = average(sigmas)
          mu = average(newmean_ests)
          return mu,new_mean_err,sigma,sigma_err,upper_quantile,lower_quantile,err


        '''if central == 'kde_mode':
def interquartile_range(nums):
    """Return the spread between the 75 % and 25 % quantiles of ``nums``."""
    upper_quartile = quantile(nums, 0.75)
    lower_quartile = quantile(nums, 0.25)
    return upper_quartile - lower_quartile
Example #11
0
def load_empirical_data(filename, quantile_value=1.00):
    """Build an empirical distribution from a file holding one integer per line.

    Args:
        filename: path of the text file to read.  Blank lines are ignored.
        quantile_value: when different from 1.0, only values strictly below
            ``quantile(data, quantile_value)`` are kept.

    Returns:
        ``dist.EmpiricalDistribution`` built from the (possibly trimmed) data.

    Raises:
        ValueError: if a non-blank line cannot be parsed as an integer.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as handle:
        # int() tolerates surrounding whitespace, including the newline.
        data = [int(line) for line in handle if line.strip()]
    if quantile_value != 1.00:
        cutoff = quantile(data, quantile_value)
        data = [d for d in data if d < cutoff]
    return dist.EmpiricalDistribution(data)