Beispiel #1
0
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=False):
    """
    Calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance. Partial
    autocorrelation may be calculated as well.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Arguments:
        fnames   -- iterable of bed file names; each is read with `bediter`.
        lags     -- lag specification, passed unchanged to `_acf_by_chrom`.
        col_num0 -- column selector handed to `bediter`.
        partial  -- if True, each lag range is correlated on its own pairs
                    only; if False, the pairs of every lag range processed so
                    far are pooled with the current one.
        simple   -- if True, store only the correlation per lag range.
        mlog     -- if True, correlate -log10 of the values. Zeros are first
                    replaced by half of the smallest positive value.

    Returns a list of ((lmin, lmax), value) sorted by lag range, where value
    is `corr` when `simple` is set and `(corr, n_pairs, p_val)` otherwise.
    Lag ranges without any pairs are reported on stderr and omitted.
    """
    imap = get_map()

    arg_list = [] # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \
                    chromlist in \
                    groupby(bediter(fname, col_num0), lambda a: a["chrom"])))

    # separated by chrom. need to merge later.
    unmerged_acfs = list(imap(_acf_by_chrom, arg_list))

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    # untransformed pairs. in non-partial mode these accumulate across lags,
    # so they must never be overwritten with the log-transformed values.
    raw_xs = np.array([], dtype='f')
    raw_ys = np.array([], dtype='f')
    # consume from the end and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            raw_xs, raw_ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # pool the layers seen so far with the current one.
            raw_xs = np.hstack((raw_xs, xys["x"]))
            raw_ys = np.hstack((raw_ys, xys["y"]))
        if len(raw_xs) == 0:
            # sys.stderr.write behaves the same on python 2 and 3.
            sys.stderr.write("no values found at lag: %i-%i. skipping\n"
                             % (lmin, lmax))
            continue
        if mlog:
            # np.where builds new arrays, leaving the raw accumulators intact
            # for the next lag range.
            xs = np.where(raw_xs == 0, 0.5 * raw_xs[raw_xs > 0].min(), raw_xs)
            ys = np.where(raw_ys == 0, 0.5 * raw_ys[raw_ys > 0].min(), raw_ys)
            xs, ys = -np.log10(xs), -np.log10(ys)
        else:
            xs, ys = raw_xs, raw_ys
        slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys)
        # NOTE: using pearson correlation, which assumes normality.
        # could switch to spearman as below.
        #corr, p_val = ss.spearmanr(xs, ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
Beispiel #2
0
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True):
    """
    Calculate the correlation of the numbers in `col_num0` from the bed files
    in `fnames` at various lags. The lags are specified by distance. Partial
    autocorrelation may be calculated as well.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Arguments:
        fnames   -- iterable of bed file names; each is read with `bediter`.
        lags     -- lag specification, passed unchanged to `_acf_by_chrom`.
        col_num0 -- column selector handed to `bediter`.
        partial  -- if True, each lag range is correlated on its own pairs
                    only; if False, the pairs of every lag range processed so
                    far are pooled with the current one.
        simple   -- if True, store only the correlation per lag range.
        mlog     -- if True (the default), correlate -log10 of the values.
                    Zeros are first replaced by 1e-12.

    Returns a list of ((lmin, lmax), value) sorted by lag range, where value
    is `corr` when `simple` is set and `(corr, n_pairs, p_val)` otherwise.
    Lag ranges without any pairs are reported on stderr and omitted.
    """
    imap = get_map()

    arg_list = [] # chaining
    for fname in fnames:
        # groupby chromosome.
        arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \
                    chromlist in \
                    groupby(bediter(fname, col_num0), lambda a: a["chrom"])))

    # separated by chrom. need to merge later.
    unmerged_acfs = list(imap(_acf_by_chrom, arg_list))

    acfs = merge_acfs(unmerged_acfs)
    acf_res = {}
    # untransformed pairs. in non-partial mode these accumulate across lags,
    # so they must never be overwritten with the log-transformed values.
    raw_xs = np.array([], dtype='f')
    raw_ys = np.array([], dtype='f')
    # consume from the end and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            raw_xs, raw_ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # pool the layers seen so far with the current one.
            raw_xs = np.hstack((raw_xs, xys["x"]))
            raw_ys = np.hstack((raw_ys, xys["y"]))
        if len(raw_xs) == 0:
            # sys.stderr.write behaves the same on python 2 and 3.
            sys.stderr.write("no values found at lag: %i-%i. skipping\n"
                             % (lmin, lmax))
            continue
        if mlog:
            # np.where builds new arrays, leaving the raw accumulators intact
            # for the next lag range.
            xs = np.where(raw_xs == 0, 1e-12, raw_xs)
            ys = np.where(raw_ys == 0, 1e-12, raw_ys)
            xs, ys = -np.log10(xs), -np.log10(ys)
        else:
            xs, ys = raw_xs, raw_ys
        # NOTE: using spearman (rank) correlation. the pearson alternative,
        # which assumes normality, is kept below for reference.
        #slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys)
        corr, p_val = ss.spearmanr(xs, ys)
        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
Beispiel #3
0
def adjust_pvals(fnames, col_num0, acfs, stringent=False):
    """
    Generator: run `_slk_chrom` over every chromosome of every bed file in
    `fnames` and yield its results one at a time.

    The maximum lag is read from the upper bound of the last lag range in
    `acfs`. Each work item handed to `_slk_chrom` is the tuple
    (records of one chromosome, lag_max, acfs, stringent).
    """
    final_range = acfs[-1][0]
    lag_max = final_range[1]

    # get_map() supplies the map implementation (presumably a parallel one
    # when multiprocessing is available -- confirm in get_map).
    mapper = get_map()

    by_chrom = itemgetter("chrom")
    per_file = []
    for bed_path in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        grouped = groupby(bediter(bed_path, col_num0, 9e-17), by_chrom)
        per_file.append((list(rows), lag_max, acfs, stringent)
                        for _, rows in grouped)

    work_items = chain.from_iterable(per_file)
    for chrom_results in mapper(_slk_chrom, work_items):
        for res in chrom_results:
            yield res
Beispiel #4
0
def adjust_pvals(fnames, col_num0, acfs, z=True):
    """
    Generator: run `_slk_chrom` over every chromosome of every bed file in
    `fnames` and yield a (chrom, results) pair for each.

    The maximum lag is read from the upper bound of the last lag range in
    `acfs`. Each work item handed to `_slk_chrom` is the tuple
    (records of one chromosome, lag_max, acfs, z).
    """
    final_range = acfs[-1][0]
    lag_max = final_range[1]

    # get_map() supplies the map implementation (presumably a parallel one
    # when multiprocessing is available -- confirm in get_map).
    mapper = get_map()

    by_chrom = itemgetter("chrom")
    per_file = []
    for bed_path in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        # NOTE(review): the value actually passed is 9e-117, not 9e-17 --
        # confirm which one is intended.
        grouped = groupby(bediter(bed_path, col_num0, 9e-117), by_chrom)
        per_file.append((list(rows), lag_max, acfs, z)
                        for _, rows in grouped)

    work_items = chain.from_iterable(per_file)
    for chrom, results in mapper(_slk_chrom, work_items):
        yield chrom, results