def read_pvalues(bedfilename, log_pvalues, verbose): ''' read in p-values from a bed file score field. returns: list sorted by signifance (most significant first)''' pvals = [] if verbose: print >>sys.stderr, ">> reading p-values from %s .." % bedfilename with maybe_gzip_open(bedfilename) as bedfile: for datum in read_bed(bedfile): if log_pvalues: pval = datum.score else: pval = -1 * log10(pvalue) pvals.append(pval) if verbose: print >>sys.stderr, ">> read %d p-values" % len(pvals) # sort the pvalues from most to least signif (smallest to largest) and # reverse so largest are first pvals.sort() # if pvals are log transformed, biggest (i.e. most significant) are # first if log_pvalues: pvals.reverse() return pvals
def read_qvalues(qvalue_filename): ''' read in q-values from file. returns: sorted list of q-values''' qvalues = [] with maybe_gzip_open(qvalue_filename) as qvalue_file: for datum in DictReader(qvalue_file, QVAL_FIELDNAMES): qvalues.append(float(datum['qvalue'])) qvalues.sort() return qvalues
def get_region_counts(bedfilenames, verbose): # counts the number of times a base is covered by a peak call region_counts = defaultdict(Counter) for bedfilename in bedfilenames: if verbose: print >>sys.stderr, ">> loading regions from %s" % \ bedfilename with maybe_gzip_open(bedfilename) as bedfile: for datum in read_bed(bedfile): for pos in range(datum.chromStart, datum.chromEnd): region_counts[datum.chrom][pos] += 1 return region_counts
def calc_qvalues(real_bedfilename, null_bedfilename, log_pvalues, verbose): # read in real p-values. real_pvals = read_pvalues(real_bedfilename, log_pvalues, verbose) # read in null p-values null_pvals = read_pvalues(null_bedfilename, log_pvalues, verbose) num_real = float(len(real_pvals)) num_null = float(len(null_pvals)) # make sure both are defined assert num_real and num_null # normalization factor to account for different numbers of real and # null p-values frac_real = num_real / num_null if verbose: print >>sys.stderr, ">> normalization factor: %.5f" % frac_real # compute pvalue thresholds pval_thresh = compute_pval_thresh(real_pvals, null_pvals, verbose) # go back over real pvalues and assign qvalues with maybe_gzip_open(real_bedfilename) as bedfile: for datum in read_bed(bedfile): if log_pvalues: pval = float(datum.score) else: pval = -1 * log10(pvalue) qval = pval_thresh[pval] norm_qval = qval * frac_real # print in table format fields = (datum.chrom, datum.chromStart, datum.chromEnd, datum.name, pval, datum.strand, norm_qval) print '\t'.join(map(str, fields))
def interpolate_peaks(gdfilename, bedfilename, trackname, spec_chrom, verbose): warnings.simplefilter('ignore') with Genome(gdfilename) as genome, \ maybe_gzip_open(bedfilename) as bedfile: for datum in read_native(bedfile): chrom = datum.chrom peak_start = datum.chromStart peak_end = datum.chromEnd # can parallelize by chrom if spec_chrom and spec_chrom != chrom: continue xs = range(peak_start, peak_end) ys = genome[chrom][peak_start:peak_end, trackname] interp_start, interp_end = fit_spline(xs, ys) fields = (chrom, interp_start, interp_end, datum.name, datum.score, datum.strand) print '\t'.join(map(str, fields))