def run(args):
    """Print every input line with its q-value (and PEP) appended.

    Reads ``args.c``, ``args.null``, ``args.qvality`` and ``args.bed_file``.
    With ``args.qvality`` set, rows come from ``_qvality`` and two extra
    tab-separated columns (q-value, PEP) are appended; otherwise rows come
    from ``obs_fdr`` and only the q-value is appended.  Output goes to
    stdout, one line per yielded record.
    """
    # NOTE(review): the original carried the comment "get rid of N, just keep
    # the correlation." -- nothing in this function refers to N; likely stale.
    col_num = get_col_num(args.c)
    # The null column is optional; None is handed through when it is not given.
    col_null = get_col_num(args.null) if args.null else None

    if args.qvality:
        for qval, pep, line in _qvality(args.bed_file, col_num, col_null):
            # Both numeric columns use "%.4g".  The previous "%4g" for the PEP
            # was a minimum field *width*, which padded short values with
            # spaces inside a tab-separated field.
            print("%s\t%.4g\t%.4g" % (line.rstrip("\r\n"), qval, pep))
    else:
        for qval, line in obs_fdr(args.bed_file, col_num, col_null):
            print("%s\t%.4g" % (line.rstrip("\r\n"), qval))
def run(args):
    """Write the rows produced by ``adjust_pvals`` to stdout, per chromosome.

    Reads ``args.acf`` (passed to ``read_acf``), ``args.c`` (passed to
    ``get_col_num``) and ``args.files``.  ``adjust_pvals`` is iterated as
    ``(chrom, results)`` pairs; each row in ``results`` is written as a
    tab-separated line: the chromosome, two integers and two floats
    (5 significant digits).
    """
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    row_suffix = "\t%i\t%i\t%.5g\t%.5g\n"

    for chrom, results in adjust_pvals(args.files, col_num, acf_vals):
        # The chromosome name is spliced directly into the %-template, so the
        # template has to be rebuilt for every chromosome.
        template = chrom + row_suffix
        for record in results:
            sys.stdout.write(template % tuple(record))
def run(args):
    """Summarise the distribution of one numeric column of a tab-separated file.

    Reads ``args.c`` (column spec, resolved by ``get_col_num``), ``args.file``
    (path opened with ``open``) and ``args.n`` (bin count; falsy means the
    NumPy default).  Lines whose first character is ``#`` are skipped.

    Printed to stdout, in order:
      * ``# `` followed by whatever ``chart(hist, xlabels)`` returns for the
        normalised histogram,
      * a ``#`` line with median / mean / std / min / max,
      * a ``#`` line with a chi-square p-value (only if SciPy is importable),
      * a header and one ``bin_start  bin_end  n`` row per histogram bin.
    """
    col_num = get_col_num(args.c)

    # Use a context manager so the handle is closed as soon as the values are
    # read (the original left the file open until garbage collection).
    with open(args.file) as handle:
        pvals = np.array([float(line.rstrip("\r\n").split("\t")[col_num])
                          for line in handle if line[0] != "#"])

    kwargs = {"bins": args.n} if args.n else {}

    # ``density=True`` replaces the ``normed=True`` keyword, which current
    # NumPy no longer accepts.  The two agree for equal-width bins.
    # NOTE(review): assumes args.n is a bin count, not a list of edges -- confirm.
    hist, bins = np.histogram(pvals, density=True, **kwargs)
    xlabels = "|".join("%.2f-%.2f" % b for b in pairwise(bins))
    print("# %s" % (chart(hist, xlabels),))

    # Raw counts are the default, so no normalisation keyword is needed.
    hist, bins = np.histogram(pvals, **kwargs)
    print("# median: %.3f mean:%.3f; std: %.3f min:%.3f; max:%.3f" % (
        np.median(pvals), pvals.mean(), pvals.std(), pvals.min(), pvals.max()))

    # SciPy is optional: silently skip the uniformity test when it is missing.
    try:
        from scipy.stats import chisquare
    except ImportError:
        pass
    else:
        chisq, p = chisquare(hist)
        print("#chi-square test of uniformity. p: %.3g "
              "(low value means reject null of uniformity)" % p)

    print("#bin_start\tbin_end\tn")
    for edges, count in zip(pairwise(bins), hist):
        print("%.2f\t%.2f\t%i" % (edges[0], edges[1], count))
def run(args):
    """Print each region line followed by two ``%.4g``-formatted values.

    Reads ``args.c``, ``args.pvals``, ``args.regions``, ``args.N``,
    ``args.step``, ``args.mlog`` and ``args.z`` and forwards them to
    ``region_p``.  Each yielded 4-tuple is unpacked as
    ``(region_line, slk, slk_sidak, sim_p)``; the fourth value is not printed.
    """
    col_num = get_col_num(args.c)
    # original note: order in results is slk, uniform, sample
    region_results = region_p(args.pvals, args.regions, col_num, args.N,
                              args.step, mlog=args.mlog, z=args.z)
    for region_line, slk, slk_sidak, _sim_p in region_results:
        # The simulated p-value is deliberately left out of the output here.
        print("%s\t%.4g\t%.4g" % (region_line, slk, slk_sidak))
def run(args):
    """Print each region line followed by two ``%.4g``-formatted values.

    Reads ``args.c``, ``args.pvals``, ``args.regions`` and ``args.step`` and
    calls ``region_p`` with ``z=True``.  Each yielded 4-tuple is unpacked as
    ``(region_line, slk, slk_sidak, sim_p)``; the fourth value is not printed.
    """
    col_num = get_col_num(args.c)
    # original note: order in results is slk, uniform, sample
    region_results = region_p(args.pvals, args.regions, col_num, args.step,
                              z=True)
    for region_line, slk, slk_sidak, _sim_p in region_results:
        # The simulated p-value is deliberately left out of the output here.
        print("%s\t%.4g\t%.4g" % (region_line, slk, slk_sidak))
def run(args):
    """Print each region line with its two p-values and the simulated p-value.

    Reads ``args.c``, ``args.pvals``, ``args.regions``, ``args.N`` and
    ``args.step`` and forwards them to ``region_p``.  Each yielded 4-tuple is
    unpacked as ``(region_line, slk, slk_sidak, sim_p)``.  ``sim_p`` is
    formatted with ``%.4g`` unless it equals the string ``"NA"``, in which
    case it is printed as-is.
    """
    col_num = get_col_num(args.c)
    # original note: order in results is slk, uniform, sample
    region_results = region_p(args.pvals, args.regions, col_num, args.N,
                              args.step)
    for region_line, slk, slk_sidak, sim_p in region_results:
        # "NA" is passed through verbatim; anything else is treated as a number.
        sim_text = "%.4g" % (sim_p) if sim_p != "NA" else sim_p
        print("%s\t%.4g\t%.4g\t%s" % (region_line, slk, slk_sidak, sim_text))
def run(args):
    """
    general function that takes an args object (from argparse) with the
    necessary options and calls acf()

    Reads ``args.d`` (a ``start:end:step`` string of integers), ``args.files``,
    ``args.c`` and ``args.full``.  The resulting ACF values are written to
    stdout through ``write_acf``.

    Raises AssertionError when ``args.d`` does not have exactly three
    colon-separated fields, and ValueError (from ``int``) when a field is not
    an integer.
    """
    # A real list (not ``map``) so it can be indexed and measured on Python 3.
    d = [int(field) for field in args.d.split(":")]
    # Validate *before* touching d[1]; otherwise a one-field value dies with
    # an IndexError instead of the intended validation failure.
    assert len(d) == 3, "-d argument must be in the format start:end:step"
    d[1] += 1  # adjust for non-inclusive end-points...
    lags = list(range(*d))

    acf_vals = acf(args.files, lags, get_col_num(args.c),
                   partial=(not args.full))
    write_acf(acf_vals, sys.stdout)
def run(args):
    """
    general function that takes an args object (from argparse) with the
    necessary options and calls acf()

    Reads ``args.d`` (a ``start:end:step`` string of integers), ``args.files``,
    ``args.c`` and ``args.full``.  The resulting ACF values are written to
    stdout through ``write_acf``.

    Raises AssertionError when ``args.d`` does not have exactly three
    colon-separated fields, and ValueError (from ``int``) when a field is not
    an integer.
    """
    # A real list (not ``map``): ``len()`` and item assignment both fail on
    # the iterator that ``map`` returns under Python 3.
    d = [int(field) for field in args.d.split(":")]
    assert len(d) == 3, "-d argument must be in the format start:end:step"
    d[1] += 1  # adjust for non-inclusive end-points...
    lags = list(range(*d))

    acf_vals = acf(args.files, lags, get_col_num(args.c),
                   partial=(not args.full))
    write_acf(acf_vals, sys.stdout)
def main():
    """Parse the command line and hand the options to ``pipeline``.

    Behaviour visible in this function:
      * if ``--prefix`` is missing or empty, the help text is printed and the
        process exits via ``sys.exit`` (with ``print_help()``'s return value);
      * a falsy ``--threshold`` falls back to the value of ``--seed``;
      * the first bed file must exist (checked with ``assert``);
      * the value column is resolved with ``get_col_num`` against the first
        bed file.

    Returns whatever ``pipeline`` returns.
    """
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # continuation piece below carries its own leading space.
    p.add_argument("-c", dest="c", help="column number that has the value to"
                   " take the acf", default='4')
    p.add_argument("--dist", dest="dist", help="Maximum dist to extend the"
                   " ACF calculation", type=int)
    p.add_argument("--step", dest="step", help="step size for bins in the"
                   " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large/small in order to seed a region.", type=float,
                   default=0.1)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                   " of at least this number can extend a region. ",
                   type=float)
    p.add_argument("-p", "--prefix", dest="prefix",
                   help="prefix for output files", default=None)
    p.add_argument("--mlog", dest="mlog", action="store_true",
                   default=False, help="do the correlation on the -log10 of"
                   " the p-values. Default is to do it on the raw values")
    p.add_argument("--region-filter-p",
                   help="max adjusted region-level p-value to be reported"
                   " in final output", type=float, default=1)
    p.add_argument("--region-filter-n",
                   help="require at least this many probes"
                   " for a region to be reported in final output",
                   type=int, default=1)
    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    # A prefix is mandatory: show usage and stop when it is absent.
    if not (args.prefix):
        sys.exit(p.print_help())

    # Without an explicit extension threshold, reuse the seed threshold.
    if not args.threshold:
        args.threshold = args.seed

    assert op.exists(args.bed_files[0])

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step, args.dist, args.prefix,
                    args.threshold, args.seed, args.bed_files,
                    mlog=args.mlog,
                    region_filter_p=args.region_filter_p,
                    region_filter_n=args.region_filter_n)
def run(args):
    """Summarise the distribution of one numeric column of a tab-separated file.

    Reads ``args.c`` (column spec, resolved by ``get_col_num``), ``args.file``
    (opened with ``ts.nopen``) and ``args.n`` (bin count; falsy means the
    NumPy default).  Lines whose first character is ``#`` are skipped.

    Printed to stdout, in order:
      * a ``#`` line with median / mean / std / min / max,
      * a ``#`` line with a chi-square p-value (only if SciPy is importable),
      * a header and one ``bin_start  bin_end  n`` row per histogram bin.
    """
    col_num = get_col_num(args.c)
    rows = (line.rstrip("\r\n").split("\t")
            for line in ts.nopen(args.file) if line[0] != "#")
    pvals = np.array([float(row[col_num]) for row in rows])

    kwargs = {"bins": args.n} if args.n else {}
    # Only raw counts are reported.  The earlier normalised histogram and its
    # x-axis labels were computed and then thrown away, so they are gone.
    # The ``normed`` keyword is dropped too: counts are the default and
    # current NumPy rejects that keyword.
    hist, bins = np.histogram(pvals, **kwargs)

    print("# median: %.3f mean:%.3f; std: %.3f min:%.3f; max:%.3f" % (
        np.median(pvals), pvals.mean(), pvals.std(), pvals.min(), pvals.max()))

    # SciPy is optional: silently skip the uniformity test when it is missing.
    try:
        from scipy.stats import chisquare
    except ImportError:
        pass
    else:
        chisq, p = chisquare(hist)
        print("#chi-square test of uniformity. p: %.3g "
              "(low value means reject null of uniformity)" % p)

    print("#bin_start\tbin_end\tn")
    for edges, count in zip(pairwise(bins), hist):
        print("%.2f\t%.2f\t%i" % (edges[0], edges[1], count))
def main():
    """Parse the command line and hand the options to ``pipeline``.

    Behaviour visible in this function:
      * if ``--prefix`` is missing or empty, the help text is printed and the
        process exits via ``sys.exit`` (with ``print_help()``'s return value);
      * a falsy ``--threshold`` falls back to the value of ``--seed``;
      * the first bed file must exist (checked with ``assert``);
      * the value column is resolved with ``get_col_num`` against the first
        bed file.

    Returns whatever ``pipeline`` returns.
    """
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # continuation piece below carries its own leading space.
    p.add_argument("-c", dest="c", help="column number that has the value to"
                   " take the acf", default='4')
    p.add_argument("--dist", "--distance", dest="dist",
                   help="Maximum dist to extend the"
                   " ACF calculation", type=int)
    p.add_argument("--step", dest="step", help="step size for bins in the"
                   " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large/small in order to seed a region.", type=float,
                   default=0.1)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                   " of at least this number can extend a region. ",
                   type=float)
    p.add_argument("-p", "--prefix", dest="prefix",
                   help="prefix for output files", default=None)
    p.add_argument("-z", "--z-score", action="store_true", default=False,
                   help="use z-score correction instead of liptak")
    p.add_argument("--genomic-control", dest="genomic_control",
                   help="perform the genomic control correction on the input"
                   " pvalues", action="store_true", default=False)
    p.add_argument("--mlog", "--nlog", dest="mlog", action="store_true",
                   default=False, help="do the correlation on the -log10 of"
                   " the p-values. Default is to do it on the raw values")
    p.add_argument("--region-filter-p",
                   help="max adjusted region-level p-value"
                   " to be reported "
                   "in final output. this requires the input bed file to have"
                   " chrom, start, end, 't' columns", type=float, default=1)
    p.add_argument("--region-filter-n",
                   help="require at least this many probes"
                   " for a region to be reported in final output. "
                   " this requires the input bed file to have chrom, start, "
                   "end, 't' columns", type=int, default=None)
    p.add_argument("--annotate", help="annotate with refGen from this db"
                   " in UCSC (e.g. hg19) requires cruzdb", default=None)
    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    # A prefix is mandatory: show usage and stop when it is absent.
    if not (args.prefix):
        sys.exit(p.print_help())

    # Without an explicit extension threshold, reuse the seed threshold.
    if not args.threshold:
        args.threshold = args.seed

    assert op.exists(args.bed_files[0])

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step, args.dist, args.prefix,
                    args.threshold, args.seed, args.bed_files,
                    mlog=args.mlog,
                    region_filter_p=args.region_filter_p,
                    region_filter_n=args.region_filter_n,
                    genome_control=args.genomic_control,
                    db=args.annotate, z=args.z_score)
def run(args):
    """Write every row produced by ``adjust_pvals`` to stdout.

    Reads ``args.acf`` (passed to ``read_acf``), ``args.c`` (passed to
    ``get_col_num``), ``args.files`` and ``args.stringent``.  Each row is
    %-formatted directly, so it has to be a 5-tuple: a string, two integers
    and two floats (written with 5 significant digits), tab-separated.
    """
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    row_format = "%s\t%i\t%i\t%.5g\t%.5g\n"

    adjusted = adjust_pvals(args.files, col_num, acf_vals, args.stringent)
    for record in adjusted:
        sys.stdout.write(row_format % record)
def run(args):
    """Print the result of ``stepsize`` for a single bed file.

    Reads ``args.c`` (resolved with ``get_col_num``) and ``args.bed_file``,
    which is wrapped in a 1-tuple before being passed to ``stepsize``.
    """
    value_col = get_col_num(args.c)
    # stepsize() is handed a tuple of files even though there is only one.
    bed_files = (args.bed_file,)
    step = stepsize(bed_files, value_col)
    print(step)
def main():
    """Parse the command line and hand the options to ``pipeline``.

    Behaviour visible in this function:
      * if ``--prefix`` is missing or empty, the help text is printed and the
        process exits via ``sys.exit`` (with ``print_help()``'s return value);
      * a falsy ``--threshold`` falls back to the value of ``--seed``;
      * the first bed file must exist (checked with ``assert``);
      * when ``--acf-dist`` is not given it is derived from ``--dist``
        (one third, rounded to the nearest ten) and reported on stderr;
        in that case ``--dist`` must have been supplied;
      * the value column is resolved with ``get_col_num`` against the first
        bed file.

    Returns whatever ``pipeline`` returns.
    """
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # continuation piece below carries its own leading space.
    p.add_argument("-c", dest="c", help="column number that has the value to"
                   " take the acf", default='4')
    # Three separate aliases.  A missing comma previously fused the last two
    # into the single, unusable flag "--distance--peak-dist".
    p.add_argument("--dist", "--distance", "--peak-dist", dest="dist",
                   help="Maximum dist to "
                   " search for adjacent peaks.", type=int)
    p.add_argument("--acf-dist", help="distance/window-size to use for "
                   " smoothing. Defaults to 1/3 * peak-dist ", type=int,
                   default=None)
    p.add_argument("--step", dest="step", help="step size for bins in the"
                   " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large/small in order to seed a region.", type=float,
                   default=0.05)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                   " of at least this number can extend a region. ",
                   type=float)
    p.add_argument(
        "--no-fdr", dest="no_fdr", help="Don't use FDR-corrected p-values "
        "for finding peaks (either way, we still do multiple-testing correction "
        "on the p-values for the regions).", action='store_true',
        default=False)
    p.add_argument("-p", "--prefix", dest="prefix",
                   help="prefix for output files", default=None)
    p.add_argument("--genomic-control", dest="genomic_control",
                   help="perform the genomic control correction on the input"
                   " pvalues", action="store_true", default=False)
    p.add_argument("--region-filter-p",
                   help="max adjusted region-level p-value"
                   " to be reported "
                   "in final output. this requires the input bed file to have"
                   " chrom, start, end, 't' columns", type=float, default=1)
    p.add_argument("--region-filter-n",
                   help="require at least this many probes"
                   " for a region to be reported in final output. "
                   " this requires the input bed file to have chrom, start, "
                   "end, 't' columns", type=int, default=None)
    p.add_argument("--annotate", help="annotate with refGen from this db"
                   " in UCSC (e.g. hg19) requires cruzdb", default=None)
    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    # A prefix is mandatory: show usage and stop when it is absent.
    if not (args.prefix):
        sys.exit(p.print_help())

    # Without an explicit extension threshold, reuse the seed threshold.
    if not args.threshold:
        args.threshold = args.seed

    assert op.exists(args.bed_files[0])

    if args.acf_dist is None:
        # The default is derived from --dist, so --dist must be present;
        # report that cleanly instead of failing on ``0.33333 * None``.
        if args.dist is None:
            p.error("--dist is required when --acf-dist is not given")
        # round(..., -1) rounds to the nearest multiple of ten.
        args.acf_dist = int(round(0.33333 * args.dist, -1))
        sys.stderr.write("setting --acf-dist to 0.33 * --dist == %i\n" %
                         args.acf_dist)

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step, args.dist, args.acf_dist,
                    args.prefix, args.threshold, args.seed, args.bed_files,
                    region_filter_p=args.region_filter_p,
                    region_filter_n=args.region_filter_n,
                    genome_control=args.genomic_control,
                    db=args.annotate,
                    use_fdr=not args.no_fdr)
def main():
    """Parse the command line and hand the options to ``pipeline``.

    Behaviour visible in this function:
      * ``--dist`` is a required option;
      * if ``--prefix`` is missing or empty, the help text is printed and the
        process exits via ``sys.exit`` (with ``print_help()``'s return value);
      * a falsy ``--threshold`` falls back to the value of ``--seed``;
      * the first bed file must exist (checked with ``assert``);
      * when ``--acf-dist`` is not given it is derived from ``--dist``
        (one third, rounded to the nearest ten) and reported on stderr;
      * the value column is resolved with ``get_col_num`` against the first
        bed file.

    Returns whatever ``pipeline`` returns.
    """
    import argparse
    from _common import get_col_num

    p = argparse.ArgumentParser(description=__doc__,
                   formatter_class=argparse.RawDescriptionHelpFormatter)

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # continuation piece below carries its own leading space.
    p.add_argument("-c", dest="c", help="column number that has the value to"
                   " take the acf", default='4')
    # Three separate aliases.  A missing comma previously fused the last two
    # into the single, unusable flag "--distance--peak-dist".
    p.add_argument("--dist", "--distance", "--peak-dist", dest="dist",
                   help="Maximum dist to "
                   " search for adjacent peaks.", type=int, required=True)
    p.add_argument("--acf-dist", help="distance/window-size to use for "
                   " smoothing. Defaults to 1/3 * peak-dist ", type=int,
                   default=None)
    p.add_argument("--step", dest="step", help="step size for bins in the"
                   " ACF calculation", type=int)
    p.add_argument("--seed", dest="seed", help="A value must be at least this"
                   " large/small in order to seed a region.", type=float,
                   default=0.05)
    p.add_argument("--threshold", dest="threshold", help="After seeding, a value"
                   " of at least this number can extend a region. ",
                   type=float)
    p.add_argument("--no-fdr", dest="no_fdr",
                   help="Don't use FDR-corrected p-values "
                   "for finding peaks (either way, we still do multiple-testing correction "
                   "on the p-values for the regions).", action='store_true',
                   default=False)
    p.add_argument("-p", "--prefix", dest="prefix",
                   help="prefix for output files", default=None)
    p.add_argument("--genomic-control", dest="genomic_control",
                   help="perform the genomic control correction on the input"
                   " pvalues", action="store_true", default=False)
    p.add_argument("--region-filter-p",
                   help="max adjusted region-level p-value"
                   " to be reported "
                   "in final output. this requires the input bed file to have"
                   " chrom, start, end, 't' columns", type=float, default=1)
    p.add_argument("--region-filter-n",
                   help="require at least this many probes"
                   " for a region to be reported in final output. "
                   " this requires the input bed file to have chrom, start, "
                   "end, 't' columns", type=int, default=None)
    p.add_argument("--annotate", help="annotate with refGen from this db"
                   " in UCSC (e.g. hg19) requires cruzdb", default=None)
    p.add_argument('bed_files', nargs='+', help='sorted bed file to process')

    args = p.parse_args()

    # A prefix is mandatory: show usage and stop when it is absent.
    if not (args.prefix):
        sys.exit(p.print_help())

    # Without an explicit extension threshold, reuse the seed threshold.
    if not args.threshold:
        args.threshold = args.seed

    assert op.exists(args.bed_files[0])

    if args.acf_dist is None:
        # round(..., -1) rounds to the nearest multiple of ten.
        args.acf_dist = int(round(0.33333 * args.dist, -1))
        sys.stderr.write("setting --acf-dist to 0.33 * --dist == %i\n" %
                         args.acf_dist)

    col_num = get_col_num(args.c, args.bed_files[0])
    return pipeline(col_num, args.step, args.dist, args.acf_dist,
                    args.prefix, args.threshold, args.seed, args.bed_files,
                    region_filter_p=args.region_filter_p,
                    region_filter_n=args.region_filter_n,
                    genome_control=args.genomic_control,
                    db=args.annotate,
                    use_fdr=not args.no_fdr)
def run(args):
    """Compute and print ``stepsize`` for the one bed file in ``args``.

    ``args.c`` is resolved to a column with ``get_col_num``; ``args.bed_file``
    is passed to ``stepsize`` inside a 1-tuple.  The result goes to stdout.
    """
    column = get_col_num(args.c)
    # A 1-tuple, because stepsize() is called with a collection of files.
    single_file = (args.bed_file, )
    print(stepsize(single_file, column))
def run(args):
    """Write every row produced by ``adjust_pvals`` to stdout.

    Reads ``args.acf`` (passed to ``read_acf``), ``args.c`` (passed to
    ``get_col_num``) and ``args.files``.  Each row is %-formatted directly,
    so it has to be a 5-tuple: a string, two integers and two floats
    (written with 5 significant digits), tab-separated.
    """
    acf_vals = read_acf(args.acf)
    col_num = get_col_num(args.c)
    row_format = "%s\t%i\t%i\t%.5g\t%.5g\n"

    adjusted = adjust_pvals(args.files, col_num, acf_vals)
    for record in adjusted:
        sys.stdout.write(row_format % record)