Example #1
def run(args):
    # get rid of N, just keep the correlation.
    col_num = get_col_num(args.c)
    col_null = get_col_num(args.null) if args.null else None
    if args.qvality:
        for qval, pep, l in _qvality(args.bed_file, col_num, col_null):
            print "%s\t%.4g\t%4g" % (l.rstrip("\r\n"), qval, pep)
    else:
        for qval, l in obs_fdr(args.bed_file, col_num, col_null):
            print "%s\t%.4g" % (l.rstrip("\r\n"), qval)
Example #2
def run(args):
    # get rid of N, just keep the correlation.
    col_num = get_col_num(args.c)
    col_null = get_col_num(args.null) if args.null else None
    if args.qvality:
        for qval, pep, l in _qvality(args.bed_file, col_num, col_null):
            print "%s\t%.4g\t%4g" % (l.rstrip("\r\n"), qval, pep)
    else:
        for qval, l in obs_fdr(args.bed_file, col_num, col_null):
            print "%s\t%.4g" % (l.rstrip("\r\n"), qval)
Example #3
def run(args):
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    for chrom, results in adjust_pvals(args.files, col_num, acf_vals):
        fmt = chrom + "\t%i\t%i\t%.5g\t%.5g\n"
        for row in results:
            sys.stdout.write(fmt % tuple(row))
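Each result row is expected to unpack into the four remaining format slots (two integers and two floats). A quick illustration with made-up values, assuming the columns are start, end, raw p-value, and adjusted p-value:

# Illustrative only; the column meanings are an assumption based on the format string.
import sys
fmt = "chr1" + "\t%i\t%i\t%.5g\t%.5g\n"
sys.stdout.write(fmt % (1000, 1050, 0.0031, 0.012))
# prints the tab-separated line: chr1  1000  1050  0.0031  0.012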
Example #4
def run(args):
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    for chrom, results in adjust_pvals(args.files, col_num, acf_vals):
        fmt = chrom + "\t%i\t%i\t%.5g\t%.5g\n"
        for row in results:
            sys.stdout.write(fmt % tuple(row))
Example #5
def run(args):
    col_num = get_col_num(args.c)
    file_iter =  (l.rstrip("\r\n").split("\t")
                  for l in open(args.file) if l[0] != "#")

    pvals = np.array([float(b[col_num]) for b in file_iter])
    kwargs = {"bins": args.n} if args.n else {}
    hist, bins = np.histogram(pvals, normed=True, **kwargs)
    xlabels = "|".join("%.2f-%.2f" % b for b in pairwise(bins))
    print "#", chart(hist, xlabels)
    hist, bins = np.histogram(pvals, normed=False, **kwargs)

    print "# median: %.3f mean:%.3f; std: %.3f min:%.3f; max:%.3f" % (
        np.median(pvals), pvals.mean(), pvals.std(), pvals.min(), pvals.max())

    try:
        from scipy.stats import chisquare
        chisq, p = chisquare(hist)
        print "#chi-square test of uniformity. p: %.3g " \
              "(low value means reject null of uniformity)" % p
    except ImportError:
        pass
    print "#bin_start\tbin_end\tn"
    for bin, val in zip(pairwise(bins), hist):
        print "%.2f\t%.2f\t%i" % (bin[0], bin[1], val)
Example #6
def run(args):
    col_num = get_col_num(args.c)
    # order in results is slk, uniform, sample
    for region_line, slk, slk_sidak, sim_p in region_p(args.pvals, args.regions,
            col_num, args.N, args.step, mlog=args.mlog, z=args.z):
        #if sim_p != "NA":
        #    sim_p = "%.4g" % (sim_p)
        print "%s\t%.4g\t%.4g" % (region_line, slk, slk_sidak)
Example #7
def run(args):
    col_num = get_col_num(args.c)
    # order in results is slk, uniform, sample
    for region_line, slk, slk_sidak, sim_p in region_p(args.pvals, args.regions,
            col_num, args.step, z=True):
        #if sim_p != "NA":
        #    sim_p = "%.4g" % (sim_p)
        print "%s\t%.4g\t%.4g" % (region_line, slk, slk_sidak)
Example #8
def run(args):
    col_num = get_col_num(args.c)
    # order in results is slk, uniform, sample
    #for region_line, slk, slk_sidak, sim_p in region_p(args.pvals, args.regions,
    for region_line, slk, slk_sidak, sim_p in region_p(args.pvals, args.regions,
            col_num, args.N, args.step):
        if sim_p != "NA":
            sim_p = "%.4g" % (sim_p)
        print "%s\t%.4g\t%.4g\t%s" % (region_line, slk, slk_sidak, sim_p)
        """
def run(args):
    """
    general function that takes an args object (from argparse)
    with the necessary options and calls acf()
    """
    d = map(int, args.d.split(":"))
    d[1] += 1 # adjust for non-inclusive end-points...
    assert len(d) == 3
    lags = range(*d)

    acf_vals = acf(args.files, lags, get_col_num(args.c), partial=(not
                                                            args.full))
    write_acf(acf_vals, sys.stdout)
Example #10
def run(args):
    """
    general function that takes an args object (from argparse)
    with the necessary options and calls acf()
    """
    d = map(int, args.d.split(":"))
    assert len(d) == 3, ("-d argument must be in the format start:end:step")
    d[1] += 1 # adjust for non-inclusive end-points...
    lags = range(*d)

    acf_vals = acf(args.files, lags, get_col_num(args.c), partial=(not
                                                            args.full))
    write_acf(acf_vals, sys.stdout)
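For reference, the -d argument is a start:end:step lag specification, and the += 1 makes the end-point inclusive before it is handed to range(). A worked example with illustrative values:

# Worked example of the lag parsing above (values are illustrative).
d = list(map(int, "0:500:100".split(":")))  # -> [0, 500, 100]
d[1] += 1                                   # make the end-point inclusive
lags = range(*d)                            # 0, 100, 200, 300, 400, 500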
Example #11
def main():
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)

    p.add_argument("-c", dest="c", help="column number that has the value to"
                   "take the  acf", default='4')
    p.add_argument("--dist", dest="dist", help="Maximum dist to extend the"
             " ACF calculation", type=int)
    p.add_argument("--step", dest="step", help="step size for bins in the"
             " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                 " large/small in order to seed a region.", type=float,
                 default=0.1)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                 " of at least this number can extend a region. ",
                 type=float)
    p.add_argument("-p", "--prefix", dest="prefix",
            help="prefix for output files", default=None)

    p.add_argument("--mlog", dest="mlog", action="store_true",
                   default=False, help="do the correlation on the -log10 of"
                   "the p-values. Default is to do it on the raw values")

    p.add_argument("--region-filter-p", help="max adjusted region-level p-value to be reported"
                 "in final output", type=float, default=1)

    p.add_argument("--region-filter-n", help="require at least this many probes"
                 "for a region to be reported in final output", type=int, default=1)

    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()


    if not (args.prefix):
        sys.exit(p.print_help())

    if not args.threshold:
        args.threshold = args.seed
    assert op.exists(args.bed_files[0])

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step, args.dist, args.prefix,
            args.threshold, args.seed,
            args.bed_files, mlog=args.mlog,
            region_filter_p=args.region_filter_p,
            region_filter_n=args.region_filter_n)
Example #12
def run(args):
    col_num = get_col_num(args.c)
    file_iter =  (l.rstrip("\r\n").split("\t")
                  for l in ts.nopen(args.file) if l[0] != "#")

    pvals = np.array([float(b[col_num]) for b in file_iter])
    kwargs = {"bins": args.n} if args.n else {}
    hist, bins = np.histogram(pvals, normed=True, **kwargs)
    xlabels = "|".join("%.2f-%.2f" % b for b in pairwise(bins))
    hist, bins = np.histogram(pvals, normed=False, **kwargs)

    print("# median: %.3f mean:%.3f; std: %.3f min:%.3f; max:%.3f" % (
        np.median(pvals), pvals.mean(), pvals.std(), pvals.min(), pvals.max()))

    try:
        from scipy.stats import chisquare
        chisq, p = chisquare(hist)
        print("#chi-square test of uniformity. p: %.3g " \
              "(low value means reject null of uniformity)" % p)
    except ImportError:
        pass
    print("#bin_start\tbin_end\tn")
    for bin, val in zip(pairwise(bins), hist):
        print("%.2f\t%.2f\t%i" % (bin[0], bin[1], val))
Example #13
def main():
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)

    p.add_argument("-c", dest="c", help="column number that has the value to"
                   "take the  acf", default='4')
    p.add_argument("--dist", "--distance", dest="dist", help="Maximum dist to extend the"
             " ACF calculation", type=int)
    p.add_argument("--step", dest="step", help="step size for bins in the"
             " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                 " large/small in order to seed a region.", type=float,
                 default=0.1)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                 " of at least this number can extend a region. ",
                 type=float)
    p.add_argument("-p", "--prefix", dest="prefix",
            help="prefix for output files", default=None)
    p.add_argument("-z", "--z-score", action="store_true", default=False,
            help="use z-score correction instead of liptak")

    p.add_argument("--genomic-control", dest="genomic_control",
            help="perform the genomic control correction on the input"
            " pvalues", action="store_true", default=False)

    p.add_argument("--mlog", "--nlog", dest="mlog", action="store_true",
                   default=False, help="do the correlation on the -log10 of"
                   "the p-values. Default is to do it on the raw values")

    p.add_argument("--region-filter-p", help="max adjusted region-level p-value"
                 " to be reported "
                 "in final output. this requires the input bed file to have"
                 " chrom, start, end, 't' columns", type=float, default=1)

    p.add_argument("--region-filter-n", help="require at least this many probes"
                 "for a region to be reported in final output. "
                 " this requires the input bed file to have chrom, start, "
                 "end, 't' columns", type=int, default=None)
    p.add_argument("--annotate", help="annotate with refGen from this db" \
            "in UCSC (e.g. hg19) requires cruzdb", default=None)

    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    if not (args.prefix):
        sys.exit(p.print_help())

    if not args.threshold:
        args.threshold = args.seed
    assert op.exists(args.bed_files[0])

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step, args.dist, args.prefix,
            args.threshold, args.seed,
            args.bed_files, mlog=args.mlog,
            region_filter_p=args.region_filter_p,
            region_filter_n=args.region_filter_n,
            genome_control=args.genomic_control,
            db=args.annotate,
            z=args.z_score)
def run(args):
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    for row in adjust_pvals(args.files, col_num, acf_vals, args.stringent):
        sys.stdout.write("%s\t%i\t%i\t%.5g\t%.5g\n" % row)
Example #15
def run(args):
    col = get_col_num(args.c)
    print(stepsize((args.bed_file,), col))
Example #16
def main():
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    p.add_argument("-c",
                   dest="c",
                   help="column number that has the value to"
                   "take the  acf",
                   default='4')
    p.add_argument("--dist", "--distance"
                   "--peak-dist",
                   dest="dist",
                   help="Maximum dist to "
                   " search for adjacent peaks.",
                   type=int)
    p.add_argument("--acf-dist",
                   help="distance/window-size to use for "
                   " smoothing. Defaults to 1/3 * peak-dist ",
                   type=int,
                   default=None)

    p.add_argument("--step",
                   dest="step",
                   help="step size for bins in the"
                   " ACF calculation",
                   type=int)
    p.add_argument("--seed",
                   dest="seed",
                   help="A value must be at least this"
                   " large/small in order to seed a region.",
                   type=float,
                   default=0.05)
    p.add_argument("--threshold",
                   dest="threshold",
                   help="After seeding, a value"
                   " of at least this number can extend a region. ",
                   type=float)
    p.add_argument(
        "--no-fdr",
        dest="no_fdr",
        help="Don't use FDR-corrected p-values "
        "for finding peaks (either way, we still do multiple-testing correction "
        "on the p-values for the regions).",
        action='store_true',
        default=False)
    p.add_argument("-p",
                   "--prefix",
                   dest="prefix",
                   help="prefix for output files",
                   default=None)

    p.add_argument("--genomic-control",
                   dest="genomic_control",
                   help="perform the genomic control correction on the input"
                   " pvalues",
                   action="store_true",
                   default=False)

    p.add_argument("--region-filter-p",
                   help="max adjusted region-level p-value"
                   " to be reported "
                   "in final output. this requires the input bed file to have"
                   " chrom, start, end, 't' columns",
                   type=float,
                   default=1)

    p.add_argument("--region-filter-n",
                   help="require at least this many probes"
                   "for a region to be reported in final output. "
                   " this requires the input bed file to have chrom, start, "
                   "end, 't' columns",
                   type=int,
                   default=None)
    p.add_argument("--annotate", help="annotate with refGen from this db" \
            "in UCSC (e.g. hg19) requires cruzdb", default=None)

    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    if not (args.prefix):
        sys.exit(p.print_help())

    if not args.threshold:
        args.threshold = args.seed
    assert op.exists(args.bed_files[0])

    if args.acf_dist is None:
        args.acf_dist = int(round(0.33333 * args.dist, -1))
        sys.stderr.write("setting --acf-dist to 0.33 * --dist == %i\n" %
                         args.acf_dist)

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num,
                    args.step,
                    args.dist,
                    args.acf_dist,
                    args.prefix,
                    args.threshold,
                    args.seed,
                    args.bed_files,
                    region_filter_p=args.region_filter_p,
                    region_filter_n=args.region_filter_n,
                    genome_control=args.genomic_control,
                    db=args.annotate,
                    use_fdr=not args.no_fdr)
Example #17
def main():
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)

    p.add_argument("-c", dest="c", help="column number that has the value to"
                   "take the  acf", default='4')
    p.add_argument("--dist", "--distance" "--peak-dist", dest="dist", help="Maximum dist to "
            " search for adjacent peaks.", type=int, required=True)
    p.add_argument("--acf-dist", help="distance/window-size to use for "
            " smoothing. Defaults to 1/3 * peak-dist ", type=int, default=None)

    p.add_argument("--step", dest="step", help="step size for bins in the"
             " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                 " large/small in order to seed a region.", type=float,
                 default=0.05)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                 " of at least this number can extend a region. ",
                 type=float)
    p.add_argument("--no-fdr", dest="no_fdr", help="Don't use FDR-corrected p-values "
            "for finding peaks (either way, we still do multiple-testing correction "
            "on the p-values for the regions).", action='store_true',
            default=False)
    p.add_argument("-p", "--prefix", dest="prefix",
            help="prefix for output files", default=None)

    p.add_argument("--genomic-control", dest="genomic_control",
            help="perform the genomic control correction on the input"
            " pvalues", action="store_true", default=False)

    p.add_argument("--region-filter-p", help="max adjusted region-level p-value"
                 " to be reported "
                 "in final output. this requires the input bed file to have"
                 " chrom, start, end, 't' columns", type=float, default=1)

    p.add_argument("--region-filter-n", help="require at least this many probes"
                 "for a region to be reported in final output. "
                 " this requires the input bed file to have chrom, start, "
                 "end, 't' columns", type=int, default=None)
    p.add_argument("--annotate", help="annotate with refGen from this db" \
            "in UCSC (e.g. hg19) requires cruzdb", default=None)

    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    if not (args.prefix):
        sys.exit(p.print_help())

    if not args.threshold:
        args.threshold = args.seed
    assert op.exists(args.bed_files[0])

    if args.acf_dist is None:
        args.acf_dist = int(round(0.33333 * args.dist, -1))
        sys.stderr.write("setting --acf-dist to 0.33 * --dist == %i\n" %
                args.acf_dist)

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step,
            args.dist, args.acf_dist, args.prefix,
            args.threshold, args.seed,
            args.bed_files,
            region_filter_p=args.region_filter_p,
            region_filter_n=args.region_filter_n,
            genome_control=args.genomic_control,
            db=args.annotate,
            use_fdr=not args.no_fdr)
Example #18
def run(args):
    col = get_col_num(args.c)
    print(stepsize((args.bed_file, ), col))
Example #19
def run(args):
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    for row in adjust_pvals(args.files, col_num, acf_vals):
        sys.stdout.write("%s\t%i\t%i\t%.5g\t%.5g\n" % row)