def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    """Run every analysis stage on ``bed_files``, writing ``<prefix>.*`` files.

    Stages, in order (progress messages go to stderr):

    1. ``<prefix>.args.txt``: command-line arguments, date, package version.
    2. ``<prefix>.acf.txt``: autocorrelation values from ``acf.acf``, cut off
       once a correlation below 0.04 has been collected.
    3. ``<prefix>.slk.bed.gz``: rows yielded by ``slk.adjust_pvals``.
    4. ``<prefix>.slk.gc.bed.gz``: only when ``genome_control`` is true; each
       line of the slk file with an adjusted value appended.
    5. ``<prefix>.fdr.bed.gz``: output of ``fdr.fdr`` on the file from
       stage 3 (or stage 4 when genome control is on).
    6. ``<prefix>.regions.bed.gz``: written by ``peaks.peaks``.  If it has
       no lines the process ends via ``sys.exit()``.
    7. ``<prefix>.regions-p.bed.gz``: per-region values from
       ``region_p.region_p``.
    8. ``<prefix>.regions-t.bed``: rows from ``filter.filter`` that pass the
       ``region_filter_p`` / ``region_filter_n`` cut-offs.
    9. ``<prefix>.manhattan.png``: skipped silently on ``ImportError``.
    10. ``<prefix>.anno.<db>.bed``: only when ``db`` is not None.

    Parameters
    ----------
    col_num : passed as the p-value column to ``stepsize``, ``acf``, ``slk``
        and (as ``p_col_name``) ``filter``.
    step : lag step size; when None it is computed as
        ``min(acf_dist, stepsize.stepsize(bed_files, col_num))``.
    dist : distance argument handed to ``peaks.peaks``.
    acf_dist : upper bound of the lag range used for the ACF.
    prefix : output path prefix; trailing "." characters are stripped.
    threshold, seed : forwarded to ``peaks.peaks``.
    bed_files : input files; the first one is also used by ``filter.filter``.
    mlog : forwarded to ``acf.acf``.
    region_filter_p : regions-t rows whose column 6 exceeds this are dropped.
    region_filter_n : regions-t rows whose column 4 is below this are
        dropped; None is treated as 0.
    genome_control : write and use the genome-control adjusted slk file.
    db : database name given to ``cruzdb.Genome`` for annotation, or None.
    use_fdr : peaks are called on the last column of the fdr file when true,
        otherwise on the second-to-last column.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print("calculated stepsize as: %i" % step, file=sys.stderr)

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # Walk out towards the maximum requested distance, but stop once a
    # correlation below 0.04 has been added (always keep at least the first
    # entry, and stop before adding a low one when 3+ are already kept).
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print(" ".join(sys.argv[1:]) + "\n", file=fh)
        import datetime
        print("date: %s" % datetime.datetime.today(), file=fh)
        from .__init__ import __version__
        print("version:", __version__, file=fh)

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        # Progress message belongs on stderr; writing it to `fh` would
        # append a stray "wrote: ..." line to the ACF file itself.
        print("wrote: %s" % fh.name, file=sys.stderr)
    with open(prefix + ".acf.txt") as acf_fh:
        print("ACF:\n", acf_fh.read(), file=sys.stderr)

    # Compact float arrays: one entry per row of the slk output.
    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    print("# original lambda: %.2f" % genomic_control(opvals),
          file=sys.stderr)
    del opvals  # no longer needed; free it before the later stages

    gc_lambda = genomic_control(spvals)
    print("wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda),
          file=sys.stderr)

    if genome_control:
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed.gz", -1)])
        # NOTE(review): the slk file starts with a "#chrom" header line that
        # this loop also enumerates; whether `adj` lines up with it depends
        # on how `bediter` treats that line -- confirm.
        # `fhslk` is rebound on purpose: the fdr stage reads `fhslk.name`.
        with ts.nopen(prefix + ".slk.gc.bed.gz", "w") as fhslk:
            for i, line in enumerate(ts.nopen(prefix + ".slk.bed.gz")):
                print("%s\t%.5g" % (line.rstrip("\r\n"), adj[i]),
                      file=fhslk)
        print("wrote: %s" % fhslk.name, file=sys.stderr)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print("wrote: %s" % fh.name, file=sys.stderr)

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        # list() drains the iterable returned by peaks.peaks.
        list(peaks.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
                         threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    print("wrote: %s (%i regions)" % (fregions, n_regions), file=sys.stderr)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed.gz",
                prefix + ".regions.bed.gz", -2,
                step):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print("wrote: %s, (regions with corrected-p < 0.05: %i)"
              % (fh.name, N), file=sys.stderr)

    regions_bed = fh.name
    if region_filter_n is None:
        region_filter_n = 0
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                               p_col_name=col_num)):
            if i == 0:
                # first row is the header: mark it as a comment line
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                N += 1
            # Rows must go to the regions-t file (not stderr); the manhattan
            # and annotation stages below read this file back.
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
               "< %.3f and n-probes >= %i: %i)")
              % (fh.name, region_filter_p, region_filter_n, N),
              file=sys.stderr)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed.gz", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name  # the regions-t file written above
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            fh.write('#')
            g.annotate(lastf, ("refGene", "cpgIslandExt"), out=fh,
                       feature_strand=True, parallel=len(spvals) > 500)
        print("wrote: %s annotated with %s" % (fh.name, db),
              file=sys.stderr)
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files,
             mlog=False, region_filter_p=1, region_filter_n=1):
    """Run the analysis stages on ``bed_files`` (Python 2 code).

    Files written, in order: ``<prefix>.args.txt``, ``<prefix>.acf.txt``,
    ``<prefix>.slk.bed``, ``<prefix>.fdr.bed``, ``<prefix>.regions.bed``,
    ``<prefix>.regions-p.bed`` and ``<prefix>.regions-t.bed``.  A
    "wrote: ..." line is sent to stderr after each stage.

    ``step`` is computed with ``stepsize.stepsize`` when it is None, and is
    used both for the ACF lags (up to ``dist``) and as the distance argument
    of ``peaks.peaks``.  Trailing "." characters are stripped from
    ``prefix``.  Rows of the regions-t file are dropped when column 6
    exceeds ``region_filter_p`` or column 4 is below ``region_filter_n``.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    import datetime
    import operator
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        print >>sys.stderr, "calculated stepsize as: %i" % step

    # Python 2: range() returns a list, so it can be extended in place.
    lags = range(1, dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    acf_path = prefix + ".acf.txt"
    slk_path = prefix + ".slk.bed"
    fdr_path = prefix + ".fdr.bed"
    fregions = prefix + ".regions.bed"

    # Collect ACF entries out towards the maximum distance.  Each entry is
    # ((lmin, lmax), (corr, N)).  Stop before adding a correlation below
    # 0.04 when more than two are already kept; otherwise add it and stop.
    acf_vals = []
    for entry in acf.acf(bed_files, lags, col_num, simple=False, mlog=mlog):
        is_low = entry[1][0] < 0.04
        if is_low and len(acf_vals) > 2:
            break
        acf_vals.append(entry)
        if is_low:
            break

    # Record how this run was invoked.
    with open(prefix + ".args.txt", "w") as args_out:
        print >>args_out, " ".join(sys.argv[1:]) + "\n"
        print >>args_out, "date: %s" % datetime.datetime.today()

    with open(acf_path, "w") as acf_out:
        acf_vals = acf.write_acf(acf_vals, acf_out)
        print >>sys.stderr, "wrote: %s" % acf_out.name
    with open(acf_path) as acf_in:
        acf_text = acf_in.read()
    print >>sys.stderr, "ACF:\n", acf_text

    with open(slk_path, "w") as slk_out:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            slk_out.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
        print >>sys.stderr, "wrote: %s" % slk_out.name

    with open(fdr_path, "w") as fdr_out:
        for qval, line in fdr.fdr(slk_path, -1):
            fdr_out.write("%s\t%.4g\n" % (line.rstrip("\r\n"), qval))
        print >>sys.stderr, "wrote: %s" % fdr_out.name

    with open(fregions, "w") as regions_out:
        # NOTE(review): the return value is discarded here; if peaks.peaks
        # is a generator nothing would be written -- confirm.
        peaks.peaks(fdr_path, -1, threshold, seed, step, regions_out,
                    operator.le)
    with open(fregions) as regions_in:
        n_regions = sum(1 for _line in regions_in)
    print >>sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)

    with open(prefix + ".regions-p.bed", "w") as regp_out:
        n_significant = 0
        regp_out.write(
            "#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # -2 selects the original, uncorrected p-values in slk.bed
        region_rows = region_p.region_p(slk_path, fregions, -2, 0, step,
                                        mlog=mlog)
        for region_line, slk_p, slk_sidak_p, sim_p in region_rows:
            regp_out.write("%s\t%.4g\t%.4g\n"
                           % (region_line, slk_p, slk_sidak_p))
            regp_out.flush()
            if slk_sidak_p < 0.05:
                n_significant += 1
        print >>sys.stderr, \
            "wrote: %s, (regions with corrected-p < 0.05: %i)" \
            % (regp_out.name, n_significant)

    regions_bed = regp_out.name
    with open(prefix + ".regions-t.bed", "w") as regt_out:
        n_kept = 0
        filtered = filter.filter(bed_files[0], regions_bed)
        for idx, toks in enumerate(filtered):
            if idx == 0:
                # header row: turn it into a comment line
                toks[0] = "#" + toks[0]
            elif (float(toks[6]) > region_filter_p
                  or int(toks[4]) < region_filter_n):
                continue
            else:
                n_kept += 1
            print >>regt_out, "\t".join(toks)
        print >>sys.stderr, \
            "wrote: %s, (regions with region-p < %.3f and n-probes >= %i: %i)" \
            % (regt_out.name, region_filter_p, region_filter_n, n_kept)
def pipeline(col_num, step, dist, acf_dist, prefix, threshold, seed,
             bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
             genome_control=False, db=None, use_fdr=True):
    """Run the analysis stages on ``bed_files`` (Python 2 code).

    Files written, in order: ``<prefix>.args.txt``, ``<prefix>.acf.txt``,
    ``<prefix>.slk.bed.gz``, optionally ``<prefix>.slk.gc.bed.gz`` (when
    ``genome_control`` is true), ``<prefix>.fdr.bed.gz``,
    ``<prefix>.regions.bed.gz``, ``<prefix>.regions-p.bed.gz``,
    ``<prefix>.regions-t.bed``, ``<prefix>.manhattan.png`` (skipped on
    ImportError) and, when ``db`` is given, ``<prefix>.anno.<db>.bed``.
    Progress messages go to stderr.  The process ends via ``sys.exit()``
    when the regions file has no lines.

    ``step`` defaults to ``min(acf_dist, stepsize.stepsize(...))`` when it
    is None.  ``dist`` is the distance handed to ``peaks.peaks``; peaks are
    called on the last column of the fdr file when ``use_fdr`` is true and
    on the second-to-last column otherwise.  ``region_filter_n`` of None is
    treated as 0.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    import datetime
    import operator
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter

    if step is None:
        step = min(acf_dist, stepsize.stepsize(bed_files, col_num))
        print >> sys.stderr, "calculated stepsize as: %i" % step

    # Python 2: range() returns a list, so it can be extended in place.
    lags = range(1, acf_dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")
    acf_path = prefix + ".acf.txt"
    slk_path = prefix + ".slk.bed.gz"
    fdr_path = prefix + ".fdr.bed.gz"
    fregions = prefix + ".regions.bed.gz"

    # Collect ACF entries out towards the maximum distance.  Each entry is
    # ((lmin, lmax), (corr, N)).  Stop before adding a correlation below
    # 0.04 when more than two are already kept; otherwise add it and stop.
    acf_vals = []
    for entry in acf.acf(bed_files, lags, col_num, simple=False, mlog=mlog):
        is_low = entry[1][0] < 0.04
        if is_low and len(acf_vals) > 2:
            break
        acf_vals.append(entry)
        if is_low:
            break

    # Record how this run was invoked.
    with open(prefix + ".args.txt", "w") as args_out:
        print >> args_out, " ".join(sys.argv[1:]) + "\n"
        print >> args_out, "date: %s" % datetime.datetime.today()
        from .__init__ import __version__
        print >> args_out, "version:", __version__

    with open(acf_path, "w") as acf_out:
        acf_vals = acf.write_acf(acf_vals, acf_out)
        print >> sys.stderr, "wrote: %s" % acf_out.name
    with open(acf_path) as acf_in:
        acf_text = acf_in.read()
    print >> sys.stderr, "ACF:\n", acf_text

    adjusted_ps, original_ps = [], []
    with ts.nopen(slk_path, "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            original_ps.append(row[-2])
            adjusted_ps.append(row[-1])

    print >> sys.stderr, \
        "# original lambda: %.2f" % genomic_control(original_ps)
    del original_ps  # not needed past this point

    gc_lambda = genomic_control(adjusted_ps)
    print >> sys.stderr, \
        "wrote: %s with lambda: %.2f" % (fhslk.name, gc_lambda)

    if genome_control:
        # `fhslk` is rebound on purpose: the fdr stage reads `fhslk.name`.
        fhslk = ts.nopen(prefix + ".slk.gc.bed.gz", "w")
        slk_pvals = [rec['p'] for rec in bediter(slk_path, -1)]
        adj = genome_control_adjust(slk_pvals)
        # NOTE(review): the slk file starts with a "#chrom" header line that
        # this loop also enumerates; whether `adj` lines up with it depends
        # on how `bediter` treats that line -- confirm.
        for line_no, line in enumerate(ts.nopen(slk_path)):
            print >> fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[line_no])
        fhslk.close()
        print >> sys.stderr, "wrote: %s" % fhslk.name

    with ts.nopen(fdr_path, "w") as fdr_out:
        fdr_out.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for qval, line in fdr.fdr(fhslk.name, -1):
            fdr_out.write("%s\t%.4g\n" % (line.rstrip("\r\n"), qval))
        print >> sys.stderr, "wrote: %s" % fdr_out.name

    peak_col = -1 if use_fdr else -2
    with ts.nopen(fregions, "w") as regions_out:
        # Drain the iterable returned by peaks.peaks; results are unused.
        for _peak in peaks.peaks(fdr_path, peak_col, threshold, seed, dist,
                                 regions_out, operator.le):
            pass
    n_regions = sum(1 for _line in ts.nopen(fregions))
    print >> sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)
    if n_regions == 0:
        sys.exit()

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as regp_out:
        n_significant = 0
        regp_out.write(
            "#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # -2 selects the original, uncorrected p-values in slk.bed
        region_rows = region_p.region_p(slk_path, fregions, -2, step)
        for region_line, slk_p, slk_sidak_p, sim_p in region_rows:
            regp_out.write("%s\t%.4g\t%.4g\n"
                           % (region_line, slk_p, slk_sidak_p))
            regp_out.flush()
            if slk_sidak_p < 0.05:
                n_significant += 1
        print >>sys.stderr, \
            "wrote: %s, (regions with corrected-p < 0.05: %i)" \
            % (regp_out.name, n_significant)

    regions_bed = regp_out.name
    # NOTE(review): `header` is never read below; the call is kept so the
    # behavior of this function is unchanged.
    header = ts.header(bed_files[0])
    if region_filter_n is None:
        region_filter_n = 0

    with ts.nopen(prefix + ".regions-t.bed", "w") as regt_out:
        n_kept = 0
        filtered = filter.filter(bed_files[0], regions_bed,
                                 p_col_name=col_num)
        for idx, toks in enumerate(filtered):
            if idx == 0:
                # header row: turn it into a comment line
                toks[0] = "#" + toks[0]
            elif (float(toks[6]) > region_filter_p
                  or int(toks[4]) < region_filter_n):
                continue
            else:
                n_kept += 1
            print >> regt_out, "\t".join(toks)
        print >>sys.stderr, ("wrote: %s, (regions with region-p "
                             "< %.3f and n-probes >= %i: %i)") \
            % (regt_out.name, region_filter_p, region_filter_n, n_kept)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(regt_out.name)
        manhattan.manhattan(slk_path, 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=False)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        genome = Genome(db)
        with open(prefix + ".anno.%s.bed" % db, "w") as anno_out:
            anno_out.write('#')
            genome.annotate(regt_out.name, ("refGene", "cpgIslandExt"),
                            out=anno_out, feature_strand=True,
                            parallel=len(adjusted_ps) > 500)
        print >> sys.stderr, \
            "wrote: %s annotated with %s" % (anno_out.name, db)
def pipeline(col_num, step, dist, prefix, threshold, seed, bed_files,
             mlog=False, region_filter_p=1, region_filter_n=1,
             genome_control=False, db=None):
    """Run the analysis stages on ``bed_files`` (Python 2 code).

    Files written, in order: ``<prefix>.args.txt``, ``<prefix>.acf.txt``,
    ``<prefix>.slk.bed``, optionally ``<prefix>.slk.gc.bed`` (when
    ``genome_control`` is true), ``<prefix>.fdr.bed``,
    ``<prefix>.regions.bed``, ``<prefix>.regions-p.bed``,
    ``<prefix>.regions-t.bed`` (only when the first input file's header has
    the columns 't', 'start' and 'end'), ``<prefix>.manhattan.png`` (skipped
    on ImportError) and, when ``db`` is given, ``<prefix>.anno.<db>.bed``.
    Progress messages go to stderr.

    Parameters
    ----------
    col_num : p-value column passed to ``stepsize``, ``acf``, ``slk`` and
        (as ``p_col_name``) ``filter``.
    step : lag step; computed by ``stepsize.stepsize`` when None.  Also used
        as the distance argument of ``peaks.peaks``.
    dist : upper bound of the lag range used for the ACF.
    prefix : output path prefix; trailing "." characters are stripped.
    threshold, seed : forwarded to ``peaks.peaks``.
    bed_files : input files; the first one supplies the header check and is
        passed to ``filter.filter``.
    mlog : forwarded to ``acf.acf`` and ``region_p.region_p``.
    region_filter_p : regions-t rows whose column 6 exceeds this are dropped.
    region_filter_n : regions-t rows whose column 4 is below this are dropped.
    genome_control : write and use the genome-control adjusted slk file.
    db : database name given to ``cruzdb.Genome`` for annotation, or None.
    """
    sys.path.insert(0, op.join(op.dirname(__file__), ".."))
    from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = stepsize.stepsize(bed_files, col_num)
        print >>sys.stderr, "calculated stepsize as: %i" % step

    # Python 2: range() returns a list, so it can be extended in place.
    lags = range(1, dist, step)
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")

    putative_acf_vals = acf.acf(bed_files, lags, col_num, simple=False,
                                mlog=mlog)
    acf_vals = []
    # Walk out towards the maximum requested distance, but stop once a
    # correlation below 0.04 has been added (always keep at least the first
    # entry, and stop before adding a low one when 3+ are already kept).
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    # save the arguments that this was called with.
    with open(prefix + ".args.txt", "w") as fh:
        print >>fh, " ".join(sys.argv[1:]) + "\n"
        import datetime
        print >>fh, "date: %s" % datetime.datetime.today()

    with open(prefix + ".acf.txt", "w") as fh:
        acf_vals = acf.write_acf(acf_vals, fh)
        print >>sys.stderr, "wrote: %s" % fh.name
    with open(prefix + ".acf.txt") as acf_fh:
        acf_text = acf_fh.read()
    print >>sys.stderr, "ACF:\n", acf_text

    spvals, opvals = [], []
    with open(prefix + ".slk.bed", "w") as fhslk:
        for row in slk.adjust_pvals(bed_files, col_num, acf_vals):
            fhslk.write("%s\t%i\t%i\t%.4g\t%.4g\n" % row)
            opvals.append(row[-2])
            spvals.append(row[-1])

    print >>sys.stderr, "# original lambda: %.2f" % genomic_control(opvals)
    del opvals  # no longer needed; free it before the later stages

    gc_lambda = genomic_control(spvals)
    print >>sys.stderr, "wrote: %s with lambda: %.2f" % (fhslk.name,
                                                        gc_lambda)

    if genome_control:
        adj = genome_control_adjust(
            [d['p'] for d in bediter(prefix + ".slk.bed", -1)])
        # `fhslk` is rebound on purpose: the fdr stage reads `fhslk.name`.
        with open(prefix + ".slk.gc.bed", "w") as fhslk:
            with open(prefix + ".slk.bed") as slk_in:
                for i, line in enumerate(slk_in):
                    print >>fhslk, "%s\t%.5g" % (line.rstrip("\r\n"), adj[i])
        print >>sys.stderr, "wrote: %s" % fhslk.name

    with open(prefix + ".fdr.bed", "w") as fh:
        for bh, l in fdr.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        print >>sys.stderr, "wrote: %s" % fh.name

    fregions = prefix + ".regions.bed"
    with open(fregions, "w") as fh:
        # list() drains the iterable returned by peaks.peaks.
        list(peaks.peaks(prefix + ".fdr.bed", -1, threshold, seed, step, fh,
                         operator.le))
    with open(fregions) as regions_in:
        n_regions = sum(1 for _ in regions_in)
    print >>sys.stderr, "wrote: %s (%i regions)" % (fregions, n_regions)

    with open(prefix + ".regions-p.bed", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tslk_p\tslk_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        for region_line, slk_p, slk_sidak_p, sim_p in region_p.region_p(
                prefix + ".slk.bed",
                prefix + ".regions.bed", -2,
                0, step, mlog=mlog):
            fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
            fh.flush()
            N += int(slk_sidak_p < 0.05)
        print >>sys.stderr, \
            "wrote: %s, (regions with corrected-p < 0.05: %i)" \
            % (fh.name, N)

    regions_bed = fh.name

    # Read the column names from the first input file.  The line ending is
    # stripped before splitting so the last column name compares equal to
    # 't'/'start'/'end' below, and the handle is always closed.
    opener = gzip.open if bed_files[0].endswith(".gz") else open
    header_fh = opener(bed_files[0])
    try:
        header = next(header_fh).rstrip("\r\n").split("\t")
    finally:
        header_fh.close()

    # When the check fails, `fh` keeps pointing at the regions-p file, which
    # is then what the manhattan and annotation stages below read.
    if all(h in header for h in ('t', 'start', 'end')):
        with open(prefix + ".regions-t.bed", "w") as fh:
            N = 0
            for i, toks in enumerate(filter.filter(bed_files[0], regions_bed,
                                                   p_col_name=col_num)):
                if i == 0:
                    # first row is the header: mark it as a comment line
                    toks[0] = "#" + toks[0]
                else:
                    if float(toks[6]) > region_filter_p:
                        continue
                    if int(toks[4]) < region_filter_n:
                        continue
                    N += 1
                print >>fh, "\t".join(toks)
            # Trailing space after "region-p" keeps the two literal halves
            # from running together in the message.
            print >>sys.stderr, ("wrote: %s, (regions with region-p "
                                 "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N)

    try:
        from cpv import manhattan
        regions = manhattan.read_regions(fh.name)

        manhattan.manhattan(prefix + ".slk.bed", 3,
                            prefix.rstrip(".") + ".manhattan.png",
                            False, ['#959899', '#484B4C'], "", False, None,
                            regions=regions, bonferonni=True)
    except ImportError:
        pass  # they dont have matplotlib

    if db is not None:
        from cruzdb import Genome
        g = Genome(db)
        lastf = fh.name  # most recent regions file written above
        with open(prefix + ".anno.%s.bed" % db, "w") as fh:
            g.annotate(lastf, ("refGene", "cpgIslandExt", "cytoBand"),
                       out=fh, feature_strand=True,
                       parallel=len(spvals) > 500)
        print >>sys.stderr, "wrote: %s annotated with %s" % (fh.name, db)