Example 1
def read_pvalues(bedfilename, log_pvalues, verbose):
    ''' read in p-values from a bed file score field.

    returns: list sorted by significance (most significant first)'''
    pvals = []

    if verbose:
        print >>sys.stderr, ">> reading p-values from %s .." % bedfilename

    with maybe_gzip_open(bedfilename) as bedfile:
        for datum in read_bed(bedfile):
            pvalue = float(datum.score)

            if log_pvalues:
                # score field is already -log10 transformed
                pval = pvalue
            else:
                pval = -1 * log10(pvalue)

            pvals.append(pval)

    if verbose:
        print >>sys.stderr, ">> read %d p-values" % len(pvals)

    # sort the p-values in ascending order (smallest first)
    pvals.sort()

    # if the p-values are -log10 transformed, the largest
    # (i.e. most significant) come first
    if log_pvalues:
        pvals.reverse()

    return pvals
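The -log10 convention used above determines the sort direction: raw p-values are most significant when smallest, -log10 scores when largest. A minimal self-contained sketch of that transform, with made-up p-values and only the standard library:

from math import log10

raw_pvals = [0.04, 1e-6, 0.2]

# -log10 transform: smaller p-values map to larger scores
log_scores = [-1 * log10(p) for p in raw_pvals]

# most significant first: ascending for raw p-values,
# descending for -log10 scores
print(sorted(raw_pvals))                 # [1e-06, 0.04, 0.2]
print(sorted(log_scores, reverse=True))  # [6.0, 1.397..., 0.698...]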
Example 2
def read_qvalues(qvalue_filename):
    ''' read in q-values from file.

    returns: sorted list of q-values'''
    qvalues = []

    with maybe_gzip_open(qvalue_filename) as qvalue_file:
        for datum in DictReader(qvalue_file, QVAL_FIELDNAMES):
            qvalues.append(float(datum['qvalue']))

    qvalues.sort()

    return qvalues
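DictReader called with an explicit fieldnames argument treats every row as data rather than consuming a header line, which is what the loop above relies on. The sketch below assumes a comma-delimited layout and uses a hypothetical stand-in for QVAL_FIELDNAMES (the real column names are defined elsewhere):

import csv

# hypothetical stand-in for QVAL_FIELDNAMES
fieldnames = ['pvalue', 'qvalue']

# no header line: every row is treated as data
qvalue_lines = ['6.2,0.001', '1.3,0.20', '4.0,0.01']

qvalues = []
for datum in csv.DictReader(qvalue_lines, fieldnames):
    qvalues.append(float(datum['qvalue']))

qvalues.sort()
print(qvalues)  # [0.001, 0.01, 0.2]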
Example 3
def get_region_counts(bedfilenames, verbose):
    ''' count the number of times each base is covered by a peak call.

    returns: dict of chrom -> Counter mapping position to coverage count'''
    region_counts = defaultdict(Counter)

    for bedfilename in bedfilenames:
        if verbose:
            print >>sys.stderr, ">> loading regions from %s" % \
                bedfilename

        with maybe_gzip_open(bedfilename) as bedfile:
            for datum in read_bed(bedfile):
                for pos in range(datum.chromStart, datum.chromEnd):
                    region_counts[datum.chrom][pos] += 1

    return region_counts
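The defaultdict(Counter) layout above keys coverage by chromosome and then by base position. A self-contained sketch of the same structure on made-up intervals (half-open, like BED chromStart/chromEnd):

from collections import Counter, defaultdict

# toy intervals: (chrom, start, end), half-open like BED records
peaks = [('chr1', 10, 13), ('chr1', 12, 15), ('chr2', 0, 2)]

region_counts = defaultdict(Counter)

for chrom, start, end in peaks:
    for pos in range(start, end):
        region_counts[chrom][pos] += 1

print(region_counts['chr1'][12])  # 2: covered by both chr1 intervals
print(region_counts['chr1'][14])  # 1
print(region_counts['chr2'][5])   # 0: Counter returns 0 for missing keys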
Example 4
def calc_qvalues(real_bedfilename, null_bedfilename, log_pvalues, verbose):
    ''' compute q-values for the records in the real bed file by comparing
    their p-values against those from the null bed file.

    prints one tab-delimited line per record with its normalized q-value'''

    # read in real p-values.
    real_pvals = read_pvalues(real_bedfilename, log_pvalues, verbose)

    # read in null p-values
    null_pvals = read_pvalues(null_bedfilename, log_pvalues, verbose)

    num_real = float(len(real_pvals))
    num_null = float(len(null_pvals))

    # make sure both files contained p-values
    assert num_real > 0 and num_null > 0

    # normalization factor to account for different numbers of real and
    # null p-values
    frac_real = num_real / num_null

    if verbose:
        print >>sys.stderr, ">> normalization factor: %.5f" % frac_real

    # compute pvalue thresholds
    pval_thresh = compute_pval_thresh(real_pvals, null_pvals, verbose)

    # go back over real pvalues and assign qvalues
    with maybe_gzip_open(real_bedfilename) as bedfile:
        for datum in read_bed(bedfile):

            if log_pvalues:
                pval = float(datum.score)
            else:
                pval = -1 * log10(float(datum.score))

            qval = pval_thresh[pval]

            norm_qval = qval * frac_real

            # print in table format
            fields = (datum.chrom, datum.chromStart, datum.chromEnd,
                      datum.name, pval, datum.strand, norm_qval)
            print '\t'.join(map(str, fields))
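compute_pval_thresh is not shown in this example, so the p-value to q-value mapping is opaque here. One common construction consistent with the frac_real scaling above compares how many null versus real p-values clear each threshold; the function and toy inputs below are hypothetical, a sketch of that idea rather than the actual implementation, and assume -log10-scaled inputs (larger = more significant):

from bisect import bisect_left

def empirical_qvalue_sketch(real_pvals, null_pvals):
    ''' map each real p-value to (# null values >= it) / (# real values >= it)'''
    real_sorted = sorted(real_pvals)
    null_sorted = sorted(null_pvals)

    thresh = {}
    for pval in real_pvals:
        # number of values at or above this threshold
        num_null_pass = len(null_sorted) - bisect_left(null_sorted, pval)
        num_real_pass = len(real_sorted) - bisect_left(real_sorted, pval)
        thresh[pval] = float(num_null_pass) / num_real_pass

    return thresh

real_pvals = [6.0, 4.2, 3.1, 1.0]
null_pvals = [1.2, 0.8, 0.5]

pval_thresh = empirical_qvalue_sketch(real_pvals, null_pvals)
frac_real = float(len(real_pvals)) / len(null_pvals)

for pval in real_pvals:
    print((pval, pval_thresh[pval] * frac_real))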
Example 5
def interpolate_peaks(gdfilename, bedfilename, trackname, spec_chrom,
                      verbose):
    ''' refine peak boundaries by fitting a spline to the signal under
    each peak, printing the interpolated coordinates in BED6 format'''

    # suppress warning output (e.g. from opening the genomedata archive)
    warnings.simplefilter('ignore')
    with Genome(gdfilename) as genome, \
        maybe_gzip_open(bedfilename) as bedfile:
        for datum in read_native(bedfile):

            chrom = datum.chrom
            peak_start = datum.chromStart
            peak_end = datum.chromEnd

            # restrict to a single chromosome so separate runs can be
            # parallelized by chromosome
            if spec_chrom and spec_chrom != chrom:
                continue

            xs = range(peak_start, peak_end)
            ys = genome[chrom][peak_start:peak_end, trackname]

            interp_start, interp_end = fit_spline(xs, ys)

            fields = (chrom, interp_start, interp_end,
                      datum.name, datum.score, datum.strand)
            print '\t'.join(map(str, fields))
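fit_spline is not shown in this example; the sketch below is one plausible reading of it, assuming scipy is available: smooth the per-base signal with a cubic spline and report the span where the smoothed signal stays above a fraction of its peak. The function name, the frac cutoff, and the toy signal are all hypothetical:

import numpy as np
from scipy.interpolate import UnivariateSpline

def fit_spline_sketch(xs, ys, frac=0.5):
    ''' return the first and last positions where the spline-smoothed
    signal reaches frac of its maximum (as a half-open interval)'''
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)

    spline = UnivariateSpline(xs, ys, k=3)
    smoothed = spline(xs)

    cutoff = frac * smoothed.max()
    above = np.nonzero(smoothed >= cutoff)[0]

    return int(xs[above[0]]), int(xs[above[-1]]) + 1

# toy signal: a Gaussian bump over 40 bases with a small baseline
xs = list(range(1000, 1040))
ys = np.exp(-0.5 * ((np.arange(40) - 20) / 5.0) ** 2) + 0.05

print(fit_spline_sketch(xs, ys))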