예제 #1
0
def _qvality(fbed_file, col_num, col_null):
    """Yield (qvalue, PEP, raw_line) for each data row of `fbed_file`.

    Observed p-values come from column `col_num`; the null distribution
    from column `col_null`. Requires the optional `qvality` package.
    """
    from qvality import qvality

    observed = [row['p'] for row in bediter(fbed_file, col_num)]
    null_ps = [row['p'] for row in bediter(fbed_file, col_null)]
    bed_fh = ts.nopen(fbed_file)
    drop_header(bed_fh)
    for (pval, pep, qval), line in izip(qvality(observed, null_ps, r=None), bed_fh):
        yield qval, pep, line
예제 #2
0
def _qvality(fbed_file, col_num, col_null):
    """Generator of (qvalue, posterior-error-prob, line) tuples.

    Computes q-values via the external `qvality` package for the
    p-values in column `col_num`, using column `col_null` as nulls,
    and pairs each result with its original (post-header) file line.
    """
    from qvality import qvality

    pvals = list(b['p'] for b in bediter(fbed_file, col_num))
    null_pvals = list(b['p'] for b in bediter(fbed_file, col_null))
    handle = ts.nopen(fbed_file)
    drop_header(handle)
    qv_iter = qvality(pvals, null_pvals, r=None)
    for (p, pep, q), raw_line in izip(qv_iter, handle):
        yield q, pep, raw_line
예제 #3
0
def obs_fdr(fbed_file, col_num, col_null=None):
    """Yield (qvalue, raw_line) pairs for each data row in `fbed_file`.

    When `col_null` is None, the null is the uniform i/n distribution
    (Benjamini-Hochberg); otherwise nulls are read from column `col_null`.
    """
    observed = [row['p'] for row in bediter(fbed_file, col_num)]
    if col_null is not None:
        nulls = [row['p'] for row in bediter(fbed_file, col_null)]
    else:
        # Benjamini-Hochberg: uniform null i/n for i = 1..n.
        n = len(observed)
        nulls = np.arange(1, n + 1, dtype=np.float64) / float(n)
    handle = ts.nopen(fbed_file)
    drop_header(handle)
    for q, line in izip(relative_fdr(observed, nulls), handle):
        yield q, line
예제 #4
0
def obs_fdr(fbed_file, col_num, col_null=None):
    """Generator of (qvalue, line) over `fbed_file` using relative_fdr.

    `col_num` holds the observed p-values; `col_null`, if given, holds
    the null p-values, else a Benjamini-Hochberg uniform null is used.
    """
    pvals = [b['p'] for b in bediter(fbed_file, col_num)]
    if col_null is None:
        # Benjamini-Hochberg.
        n_p = float(len(pvals))
        nulls = np.arange(1, len(pvals) + 1, dtype=np.float64) / n_p
    else:
        nulls = [b['p'] for b in bediter(fbed_file, col_null)]
    fh = ts.nopen(fbed_file)
    drop_header(fh)
    qvals = relative_fdr(pvals, nulls)
    for qval, bed_line in izip(qvals, fh):
        yield qval, bed_line
def _get_ps_in_regions(fregions, fpvals, col_num):
    """
    find the pvalues associated with each region

    Walks the p-value stream from `fpvals` a single time while iterating
    the regions from `fregions` sorted by (chrom, start), collecting the
    p-value rows whose interval falls inside each region. Returns a list
    of (region_line, region_length, rows) tuples, one per region, in the
    sorted region order. Assumes both inputs are position-sorted per
    chromosome — TODO confirm with callers.
    """
    region_info = []
    # a trailing None sentinel lets the inner loops detect exhaustion.
    piter = chain(bediter(fpvals, col_num), [None])
    prow = piter.next()
    nr = 0  # regions seen; cross-checked against region_info below.
    for rchrom, rstart, rend, region_line in sorted(gen_regions(fregions),
                                                key=itemgetter(0, 1)):
        prows = []
        nr += 1
        # grab the p-values in the bed file that are within the current region
        # first, skip stream rows until we reach this region's chrom/start.
        # NOTE(review): if the stream was exhausted by a previous region,
        # prow is None here and prow["chrom"] would raise TypeError —
        # presumably regions never outrun the p-value stream; verify.
        while (prow["chrom"] != rchrom or prow["start"] < rstart):
        #while (prow["chrom"] != rchrom or prow["end"] < rstart):
            prow = piter.next()
            if prow is None: break

        # then collect rows until one ends past the region end (tuple
        # comparison also stops at a chromosome change).
        #while prow is not None and (rchrom, rend) > (prow["chrom"], prow["start"]):
        while prow is not None and (rchrom, rend) >= (prow["chrom"], prow["end"]):
            prows.append(prow)
            prow = piter.next()
            if prow is None: break
        if not prows:
            print >>sys.stderr, "missed,:", prows, (region_line)
        region_len = rend - rstart
        # copy prows so the `del` below can't affect the stored list.
        region_info.append((region_line, region_len, prows[:]))
        del prows
    assert nr == len(region_info), (nr, len(region_info))
    return region_info
예제 #6
0
def region_p(fpvals, fregions, col_num, nsims, step, mlog=False, z=False):
    """
    Yield [region_line, slk_p, sidak_slk_p, sim_p-or-"NA"] per region.

    p-values from column `col_num` of `fpvals` falling inside each
    region of `fregions` are combined via z_score_combine (z=True) or
    stouffer_liptak, then Sidak-corrected against the total covered
    bases. When `nsims` > 0, a simulated p-value is added via sl_sim.
    """
    # just use 2 for col_num, but dont need the p from regions.

    # bail out early when the regions file has no non-comment lines.
    if(sum(1 for _ in open(fregions) if _[0] != "#") == 0):
        print >>sys.stderr, "no regions in %s" % (fregions, )
        sys.exit()

    # total coverage is computed in a separate process; joined below
    # after the (expensive) region binning and ACF generation.
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)
    region_info = _get_ps_in_regions(fregions, fpvals, col_num)

    acfs = _gen_acf(region_info, (fpvals,), col_num, step, mlog=mlog)
    process.join()
    total_coverage = total_coverage_sync.value

    # regions first and then create ACF for the longest one.
    print >>sys.stderr, "%i bases used as coverage for sidak correction" % \
                                (total_coverage)
    # pool of all observed p-values; sl_sim draws from it when nsims > 0.
    sample_distribution = np.array([b["p"] for b in bediter(fpvals,
                                                                col_num)])
    for region_line, region_len, prows in region_info:
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print >>sys.stderr,("bad region", region_line)
            continue

        # calculate the SLK for the region.

        if z:
            region_slk = z_score_combine(ps, sigma)
        else:
            region_slk = stouffer_liptak(ps, sigma)

        if not region_slk["OK"]:
            print >>sys.stderr, "problem with:", region_slk, ps

        slk_p = region_slk["p"]

        sidak_slk_p = sidak(slk_p, region_len, total_coverage)

        result = [region_line, slk_p, sidak_slk_p]

        # corroborate those with p-values < 0.1 by simulation
        #"""
        if nsims > 0:

            # adjust nsims so it's an adjusted p-value.
            q_nsims = int(0.5 + total_coverage / float(region_len))
            assert sample_distribution is not None
            # trim sigma because we may have trimmed the ps above.
            sim_p = sl_sim(sigma, ps, q_nsims, sample_distribution)
            result.append(sim_p)
        else:
            result.append("NA")
        #"""
        #result.append("NA")
        yield result
예제 #7
0
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True):
    """
    calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance. Partial
    autocorrelation may be calculated as well.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Returns a list of ((lag_min, lag_max), value) pairs sorted by lag,
    where value is the correlation alone when `simple` is True, else
    (correlation, n_pairs, p_value). With `mlog`, values are first
    transformed to -log10 (exact zeros clamped to 1e-12).
    """
    # reversing allows optimization below.
    imap = get_map()

    arg_list = [] # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \
                    chromlist in \
                    groupby(bediter(fname, col_num0), lambda a: a["chrom"])))

    unmerged_acfs = [] # separated by chrom. need to merge later.
    for chrom_acf in imap(_acf_by_chrom, arg_list):
        unmerged_acfs.append(chrom_acf)

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # iterate over it backwards and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            # partial ACF: each lag bin uses only its own pairs.
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # add the inner layers as we move out.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            print >>sys.stderr, "no values found at lag: %i-%i. skipping" \
                    % (lmin, lmax)
            continue
        if mlog:
            # clamp exact zeros so -log10 stays finite.
            xs[xs == 0] = 1e-12
            ys[ys == 0] = 1e-12
            xs, ys = -np.log10(xs), -np.log10(ys)
        #slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys)
        # NOTE: using spearman (rank) correlation here; the linregress
        # (pearson) alternative above assumes normality.
        corr, p_val = ss.spearmanr(xs, ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
예제 #8
0
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=False):
    """
    calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance. Partial
    autocorrelation may be calculated as well.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Returns a list of ((lag_min, lag_max), value) pairs sorted by lag,
    where value is the correlation alone when `simple` is True, else
    (correlation, n_pairs, p_value).
    """
    # reversing allows optimization below.
    imap = get_map()

    arg_list = [] # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \
                    chromlist in \
                    groupby(bediter(fname, col_num0), lambda a: a["chrom"])))

    unmerged_acfs = [] # separated by chrom. need to merge later.
    for chrom_acf in imap(_acf_by_chrom, arg_list):
        unmerged_acfs.append(chrom_acf)

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # iterate over it backwards and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            # partial ACF: each lag bin uses only its own pairs.
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # add the inner layers as we move out.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            print >>sys.stderr, "no values found at lag: %i-%i. skipping" \
                    % (lmin, lmax)
            continue
        if mlog:
            # replace exact zeros with half the smallest positive value
            # so -log10 stays finite.
            # NOTE(review): assumes at least one positive value exists;
            # .min() on an empty selection would raise — confirm inputs.
            xs[xs == 0] = 0.5 * xs[xs > 0].min()
            ys[ys == 0] = 0.5 * ys[ys > 0].min()
            xs, ys = -np.log10(xs), -np.log10(ys)
        slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys)
        # NOTE: using pearson correlation, which assumes normality.
        # could switch to spearman as below.
        #corr, p_val = ss.spearmanr(xs, ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
def get_total_coverage(fpvals, col_num, out_val):
    """
    Calculate total bases of coverage in `fpvals`.
    Used for the sidak correction

    The count is per-chromosome unique bases (overlapping features are
    not double-counted); the result is written to `out_val.value`.
    """
    covered = 0
    grouped = groupby(bediter(fpvals, col_num), itemgetter('chrom'))
    for chrom, features in grouped:
        seen = set()
        for row in features:
            seen.update(range(row['start'], row['end']))
        covered += len(seen)
    out_val.value = covered
예제 #10
0
def _get_ps_in_regions(tree, fpvals, col_num):
    """
    find the pvalues associated with each region

    `tree` maps chromosome -> interval structure of regions; each row
    from `fpvals` (p-value in column `col_num`) is appended to every
    region whose interval it overlaps. Returns a dict mapping the
    region's field tuple to its list of overlapping p-value rows.
    """
    region_info = defaultdict(list)
    for row in bediter(fpvals, col_num):
        for region in tree[row['chrom']].find((row['start'], row['end'])):
            # region[-1] holds the original region fields; the tuple is
            # hashable and serves as the grouping key.
            # (removed unused local `region_len` computed here before.)
            region_tup = tuple(region[-1])
            region_info[region_tup].append(row)
    # every region that received measurements must come from the tree.
    assert sum(len(v) for v in tree.values()) >= len(region_info)
    if sum(len(v) for v in tree.values()) > len(region_info):
        sys.stderr.write("# note: not all regions contained measurements\n")
    return region_info
예제 #11
0
def _get_ps_in_regions(tree, fpvals, col_num):
    """
    find the pvalues associated with each region

    Bins every p-value row of `fpvals` (column `col_num`) into each
    overlapping region found in the per-chromosome interval structure
    `tree`. Returns {region_tuple: [rows...]}.
    """
    region_info = defaultdict(list)
    for row in bediter(fpvals, col_num):
        for region in tree[row['chrom']].find((row['start'], row['end'])):
            # key on the region's (hashable) field tuple. The unused
            # `region_len` local that used to be computed here is gone.
            region_tup = tuple(region[-1])
            region_info[region_tup].append(row)
    # regions with measurements can never exceed regions in the tree.
    assert sum(len(v) for v in tree.values()) >= len(region_info)
    if sum(len(v) for v in tree.values()) > len(region_info):
        sys.stderr.write("# note: not all regions contained measurements\n")
    return region_info
예제 #12
0
def adjust_pvals(fnames, col_num0, acfs, z=True):
    """Yield (chrom, results) of SLK-adjusted p-values per chromosome.

    `acfs` is the sorted list of ((lag_min, lag_max), ...) correlations;
    the largest lag bounds the window passed to _slk_chrom. Work is
    distributed across chromosomes via the map from get_map().
    """
    lag_max = acfs[-1][0][1]

    # parallelize if multiprocesing is installed.
    imap = get_map()
    arg_iter = []
    for fname in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        # fix: the floor was mistyped as 9e-117, contradicting this note
        # and the other adjust_pvals variants which use 9e-17.
        arg_iter = chain(arg_iter, ((list(chromlist), lag_max, acfs,
            z) \
                    for key, chromlist in groupby(bediter(fname, col_num0, 9e-17),
                            itemgetter("chrom"))))

    for chrom, results in imap(_slk_chrom, arg_iter):
        yield chrom, results
def stepsize(bed_files, col):
    """Estimate a step size (in bases) for the input bed files.

    Returns the median distance between consecutive feature starts
    within a chromosome, pooled over all `bed_files`, rounded to the
    nearest 10 (via round(median + 5, -1)).
    """
    D1 = []
    for bed_file in bed_files:
        for _, chromlist in groupby(bediter(bed_file, col), itemgetter('chrom')):
            L = list(chromlist)

            last_start = 0
            for i, ibed in enumerate(L):
                # rows must be position-sorted within each chromosome.
                assert ibed['start'] >= last_start
                # fix: track the previous start so the assert actually
                # checks sortedness (it previously compared against 0).
                last_start = ibed['start']
                # look around ibed. nearest could be up or down-stream
                if i + 2 == len(L): break
                D1.append(L[i + 1]['start'] - ibed['start'])
        # round up to the nearest 10
    return int(round(np.median(D1) + 5, -1))
예제 #14
0
def adjust_pvals(fnames, col_num0, acfs, stringent=False):
    """Yield SLK-adjusted result rows for every chromosome of each file.

    The largest lag in `acfs` bounds the window handed to _slk_chrom;
    per-chromosome work is distributed via the map from get_map().
    """
    lag_max = acfs[-1][0][1]

    # parallelize if multiprocesing is installed.
    imap = get_map()
    arg_iter = []
    for fname in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        by_chrom = groupby(bediter(fname, col_num0, 9e-17), itemgetter("chrom"))
        chrom_args = ((list(chromlist), lag_max, acfs, stringent)
                      for key, chromlist in by_chrom)
        arg_iter = chain(arg_iter, chrom_args)

    for results in imap(_slk_chrom, arg_iter):
        for r in results:
            yield r
예제 #15
0
def stepsize(bed_files, col):
    """Return the median gap between consecutive starts, rounded to 10.

    Gaps are collected per chromosome across all `bed_files` and the
    pooled median is rounded to the nearest 10 via round(m + 5, -1).
    """
    D1 = []
    for bed_file in bed_files:
        for _, chromlist in groupby(bediter(bed_file, col),
                                    itemgetter('chrom')):
            L = list(chromlist)

            last_start = 0
            for i, ibed in enumerate(L):
                # rows must be sorted by start within a chromosome.
                assert ibed['start'] >= last_start
                # fix: update the tracker so the assert is meaningful
                # (it was never updated, so the check was vacuous).
                last_start = ibed['start']
                # look around ibed. nearest could be up or down-stream
                if i + 2 == len(L): break
                D1.append(L[i + 1]['start'] - ibed['start'])
        # round up to the nearest 10
    return int(round(np.median(D1) + 5, -1))
예제 #16
0
def get_total_coverage(fpvals, col_num, step, out_val):
    """
    Calculate total bases of coverage in `fpvals`.
    Used for the sidak correction

    Unique bases are counted per chromosome (overlaps not double
    counted) and the total is written to `out_val.value`.
    NOTE: `step` is accepted for interface compatibility but unused.
    """
    covered = 0
    for chrom, rows in groupby(bediter(fpvals, col_num),
                               itemgetter('chrom')):
        positions = set()
        for row in rows:
            start, end = row['start'], row['end']
            # a zero-length feature still covers one base.
            if start == end:
                end += 1
            positions.update(range(start, end))
        covered += len(positions)
    out_val.value = covered
예제 #17
0
def adjust_pvals(fnames, col_num0, acfs, stringent=False):
    """Yield SLK-adjusted result rows for each chromosome of each file.

    Falls back to serial itertools.imap when multiprocessing is not
    available; otherwise chromosomes are processed by a worker pool.
    """
    lag_max = acfs[-1][0][1]

    # parallelize if multiprocesing is installed.
    try:
        from multiprocessing import Pool
        imap = Pool().imap
    except ImportError:
        import itertools
        imap = itertools.imap

    arg_iter = []
    for fname in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        chrom_groups = groupby(bediter(fname, col_num0, 9e-17),
                               itemgetter("chrom"))
        arg_iter = chain(arg_iter,
                         ((list(chromlist), lag_max, acfs, stringent)
                          for key, chromlist in chrom_groups))

    for results in imap(_slk_chrom, arg_iter):
        for r in results:
            yield r
예제 #18
0
def region_p(fpvals, fregions, col_num, step, z=True):
    """Yield [region_line, slk_p, sidak_slk_p, "NA"] for each region.

    p-values from column `col_num` of `fpvals` that fall inside each
    region of `fregions` are combined via z_score_combine (z=True) or
    stouffer_liptak, then Sidak-corrected against the total number of
    covered bases in `fpvals`.
    """
    # just use 2 for col_num, but dont need the p from regions.

    tree = read_regions(fregions)
    # coverage is computed in a separate process while we bin p-values
    # into regions and build the ACF; joined below.
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)

    region_info = _get_ps_in_regions(tree, fpvals, col_num)

    acfs = _gen_acf(region_info, (fpvals, ), col_num, step)
    process.join()
    total_coverage = total_coverage_sync.value

    # regions first and then create ACF for the longest one.
    print("%i bases used as coverage for sidak correction" % \
                                (total_coverage), file=sys.stderr)
    # fix: removed the unused `sample_distribution` array — it re-read
    # every p-value in `fpvals` but was never referenced (no simulation
    # is performed here; the result row always ends with "NA").

    combine = z_score_combine if z else stouffer_liptak
    for region, prows in region_info.items():
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print("bad region", region, file=sys.stderr)
            continue

        # calculate the SLK for the region.
        region_slk = combine(ps, sigma)
        if not region_slk["OK"]:
            print("problem with:", region_slk, ps, file=sys.stderr)

        slk_p = region_slk["p"]

        # Sidak scales by region length relative to total coverage.
        sidak_slk_p = sidak(slk_p,
                            int(region[2]) - int(region[1]), total_coverage)

        result = ["\t".join(region), slk_p, sidak_slk_p, "NA"]
        yield result
예제 #19
0
def region_p(fpvals, fregions, col_num, step, z=True):
    """
    Yield [region_line, slk_p, sidak_slk_p, "NA"] for each region.

    Combines the p-values (column `col_num` of `fpvals`) inside each
    region of `fregions` with z_score_combine (z=True) or
    stouffer_liptak, then applies the Sidak correction using the total
    covered bases.
    """
    # just use 2 for col_num, but dont need the p from regions.

    tree = read_regions(fregions)
    # coverage runs in a separate process while regions are binned.
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)

    region_info = _get_ps_in_regions(tree, fpvals, col_num)

    acfs = _gen_acf(region_info, (fpvals,), col_num, step)
    process.join()
    total_coverage = total_coverage_sync.value

    # regions first and then create ACF for the longest one.
    print >>sys.stderr, "%i bases used as coverage for sidak correction" % \
                                (total_coverage)
    # fix: dropped the unused `sample_distribution` array that scanned
    # the whole p-value file again but was never read (the simulated
    # p-value column is always "NA" in this variant).

    combine = z_score_combine if z else stouffer_liptak
    for region, prows in region_info.iteritems():
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print >>sys.stderr,("bad region", region)
            continue

        # calculate the SLK for the region.
        region_slk = combine(ps, sigma)
        if not region_slk["OK"]:
            print >>sys.stderr, "problem with:", region_slk, ps

        slk_p = region_slk["p"]

        sidak_slk_p = sidak(slk_p, int(region[2]) - int(region[1]), total_coverage)

        result = ["\t".join(region), slk_p, sidak_slk_p, "NA"]
        yield result