def do_indel_realign(p, rd, jobsize, sample, refid, ref, deps):
    """Submit the three-stage indel realignment chain for one sample.

    Stage 1 runs GATK RealignerTargetCreator, stage 2 runs GATK
    IndelRealigner, and stage 3 regenerates MD tags with ``samtools calmd``
    and indexes the result.  Each stage is submitted ``afterok`` the
    previous one; stage 1 is submitted ``afterok`` ``deps``.

    The input is ``<sample>/<sample>.<refid>.bam``; the final output is
    ``<sample>.<refid>.bam`` (no directory component).

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping indexed by stage name ("06:realign1", "06:realign2",
            "06:realign3"); each value is handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, refid: used to build file names and job names.
        ref: reference passed to GATK ``-R`` and to ``samtools calmd``.
        deps: dependencies for stage 1; also tested with ``has_deps``.

    Returns:
        ``(None, obam2)`` when the output BAM and index exist, the BAM is
        newer (ctime) than the input, there are no pending deps and
        ``samtools quickcheck`` succeeds.  Otherwise ``(stage3, obam2)``
        where ``stage3`` is whatever ``j3.sub()`` returns.
    """
    odir = sample
    pfx = "{}.{}".format(sample, refid)
    opfx = "{}/{}".format(odir, pfx)
    ibam = "{}.bam".format(opfx)
    obam1 = "{}.realigned.bam".format(opfx)
    intervals = "{}.intervals".format(opfx)
    obam2 = "{}.{}.bam".format(sample, refid)
    obai2 = "{}.bai".format(obam2)

    if os.path.exists(obam2) and os.path.exists(obai2) and \
            os.path.getctime(obam2) > os.path.getctime(ibam) and \
            not has_deps(deps):
        ret = subprocess.call(["samtools", "quickcheck", obam2])
        if ret == 0:
            # nothing to do
            return None, obam2

    # train realigner
    res1 = Resource(rd["06:realign1"], jobsize)
    j1 = p.new_job("06:realign1:{}".format(pfx), res1, force=True)
    j1.add_cmd("rm -f {}".format(intervals))
    gatk_cmd1 = [
        "$gatk",  # must have this defined somewhere
        "-T RealignerTargetCreator",
        "-nt {}".format(res1.cpus),
        "-R {}".format(ref),
        "-I {}".format(ibam),
        "-o {}".format(intervals),
    ]
    j1.add_cmd(" \\\n\t".join(gatk_cmd1))
    stage1 = j1.sub(afterok=deps)

    # do indel realignment
    res2 = Resource(rd["06:realign2"], jobsize)
    j2 = p.new_job("06:realign2:{}".format(pfx), res2, force=True)
    j2.add_cmd("rm -f {}".format(obam1))
    gatk_cmd2 = [
        "$gatk",
        "-T IndelRealigner",
        "-R {}".format(ref),
        "-I {}".format(ibam),
        "--bam_compression 0",
        "--disable_bam_indexing",
        "-targetIntervals {}".format(intervals),
        "-o {}".format(obam1),
    ]
    j2.add_cmd(" \\\n\t".join(gatk_cmd2))
    # the intervals file is only needed by this stage
    j2.add_cmd("rm {}".format(intervals))
    stage2 = j2.sub(afterok=stage1)

    # regenerate MD tags to match realignments
    res3 = Resource(rd["06:realign3"], jobsize)
    # Stage 3 must be created with its own resource spec (res3), not the
    # stage-2 one.
    j3 = p.new_job("06:realign3:{}".format(pfx), res3, force=True)
    j3.add_cmd("rm -f {}".format(obam2))
    j3.add_cmd("rm -f {}".format(obai2))
    calmd_cmd = ["samtools calmd", "-b", obam1, ref, "> {}".format(obam2)]
    j3.add_cmd(" \\\n\t".join(calmd_cmd))
    j3.add_cmd("samtools index {}".format(obam2))
    stage3 = j3.sub(afterok=stage2)

    return stage3, obam2
def do_fold(p, rd, jobsize, sample, lib, runid, r1, r2):
    """Submit a ``foldreads`` job for one sequencing run.

    Outputs go under ``<sample>/<lib>/`` with the prefix
    ``<sample>_<lib>_<runid>``: a ``.metrics`` file and a gzip-compressed
    ``.folded.fastq.gz``.  The output directory is created via ``mkdir_p``
    before anything else.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "01:fold" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, lib, runid: used to build the output prefix and job name.
        r1, r2: input files given to ``foldreads -1`` / ``-2``; each is
            passed to ``filecheck`` first.

    Returns:
        ``None`` when both output files already exist, otherwise whatever
        ``j.sub()`` returns.
    """
    outdir = "{}/{}".format(sample, lib)
    run_pfx = "{}_{}_{}".format(sample, lib, runid)
    out_pfx = outdir + "/" + run_pfx
    metrics_file = out_pfx + ".metrics"
    folded_fq = out_pfx + ".folded.fastq.gz"

    mkdir_p(outdir)

    outputs_present = os.path.exists(metrics_file) \
        and os.path.exists(folded_fq)
    if outputs_present:
        # both outputs already on disk; no job required
        return None

    for reads in (r1, r2):
        filecheck(reads)

    resource = Resource(rd["01:fold"], jobsize)
    job = p.new_job("01:fold:{}".format(run_pfx), resource, force=True)

    # one shell pipeline, split over continuation lines for readability
    pipeline = ["foldreads", "-m", metrics_file]
    pipeline.append("-1 {}".format(r1))
    pipeline.append("-2 {}".format(r2))
    pipeline.append("| gzip -c -")
    pipeline.append("> {}".format(folded_fq))
    job.add_cmd(" \\\n\t".join(pipeline))

    return job.sub()
def do_samtools_stats(p, rd, jobsize, pfx, bam, deps):
    """Submit a job writing three ``samtools stats`` reports for ``bam``.

    The reports are ``<bam>.stats.MT.txt``, ``<bam>.stats.Aut.txt`` and
    ``<bam>.stats.chrX.txt``.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "10:stats" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        pfx: used only in the job name.
        bam: input BAM path.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``None`` when all three reports exist, each is newer (ctime) than
        ``bam`` and there are no pending deps; otherwise whatever
        ``j.sub()`` returns.
    """
    mt_out = "{}.stats.MT.txt".format(bam)
    aut_out = "{}.stats.Aut.txt".format(bam)
    x_out = "{}.stats.chrX.txt".format(bam)
    reports = (mt_out, aut_out, x_out)

    if all(os.path.exists(f) for f in reports) \
            and all(os.path.getctime(f) > os.path.getctime(bam)
                    for f in reports) \
            and not has_deps(deps):
        return None

    resource = Resource(rd["10:stats"], jobsize)
    job = p.new_job("10:stats:{}".format(pfx), resource, force=True)

    # Candidate autosome names 1..49 in three spellings: bare number,
    # "chr" prefix and "Chr" prefix.
    chrmax = 50
    aut_names = " ".join(
        "{}{}".format(prefix, num)
        for prefix in ("", "chr", "Chr")
        for num in range(1, chrmax)
    )

    # 'samtools stats' does not fail when given a nonexistent chromosome
    # name, so several common spellings are tried for each region.
    job.add_cmd("samtools stats {} MT Mt M > {}".format(bam, mt_out))
    job.add_cmd("samtools stats {} {} > {}".format(bam, aut_names, aut_out))
    job.add_cmd("samtools stats {} X chrX ChrX > {}".format(bam, x_out))

    return job.sub(afterok=deps)
def do_scanbp(p, rd, jobsize, sample, refid, ref, deps):
    """Submit a job that runs ``scanbp`` and plots its output.

    Reads ``<sample>.<refid>.bam`` and writes ``<sample>.<refid>.pairs.txt``
    (``scanbp`` stdout) and ``<sample>.<refid>.pairs.pdf``
    (``plot_nt_pairing.py``).

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "08:scanbp" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, refid: used to build file names and the job name.
        ref: not used by this function.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``None`` when both outputs exist, the text output is newer (ctime)
        than the BAM and there are no pending deps; otherwise whatever
        ``j.sub()`` returns.
    """
    name = "{}.{}".format(sample, refid)
    in_bam = name + ".bam"
    txt_out = name + ".pairs.txt"
    pdf_out = name + ".pairs.pdf"

    # Only the text file's ctime is compared against the BAM.
    fresh = (
        os.path.exists(txt_out)
        and os.path.exists(pdf_out)
        and os.path.getctime(txt_out) > os.path.getctime(in_bam)
        and not has_deps(deps)
    )
    if fresh:
        return None

    resource = Resource(rd["08:scanbp"], jobsize)
    job = p.new_job("08:scanbp:{}".format(name), resource, force=True)

    job.add_cmd("rm -f {} {}".format(txt_out, pdf_out))

    continuation = " \\\n\t"
    job.add_cmd(continuation.join(
        ["scanbp", in_bam, "> {}".format(txt_out)]))
    job.add_cmd(continuation.join(
        ["plot_nt_pairing.py", "--title {}".format(name), txt_out, pdf_out]))

    return job.sub(afterok=deps)
def do_dedup(p, rd, jobsize, sample, lib, refid, deps):
    """Submit a duplicate-removal job for one library's BAM.

    Reads ``<sample>/<lib>/<sample>_<lib>.<refid>.bam``, pipes it through
    ``rmdup_collapsed.py --remove-duplicates`` into the matching
    ``.dedup.bam`` and indexes the result.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "04:dedup" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, lib, refid: used to build file names and the job name.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``(None, obam)`` when the deduplicated BAM and its index exist, the
        BAM is newer (ctime) than the input, there are no pending deps and
        ``samtools quickcheck`` succeeds; otherwise ``(jobid, obam)`` where
        ``jobid`` is whatever ``j.sub()`` returns.
    """
    lib_pfx = "{}_{}.{}".format(sample, lib, refid)
    path_pfx = "{}/{}/{}".format(sample, lib, lib_pfx)
    src_bam = path_pfx + ".bam"
    dedup_bam = path_pfx + ".dedup.bam"
    dedup_bai = dedup_bam + ".bai"

    looks_current = (
        os.path.exists(dedup_bam)
        and os.path.exists(dedup_bai)
        and os.path.getctime(dedup_bam) > os.path.getctime(src_bam)
        and not has_deps(deps)
    )
    # quickcheck is only run when the cheap filesystem tests pass
    if looks_current and \
            subprocess.call(["samtools", "quickcheck", dedup_bam]) == 0:
        return None, dedup_bam

    resource = Resource(rd["04:dedup"], jobsize)
    job = p.new_job("04:dedup:{}".format(lib_pfx), resource, force=True)

    for old_output in (dedup_bai, dedup_bam):
        job.add_cmd("rm -f {}".format(old_output))

    job.add_cmd(" \\\n\t".join([
        "rmdup_collapsed.py --remove-duplicates",
        "< {}".format(src_bam),
        "> {}".format(dedup_bam),
    ]))
    job.add_cmd("samtools index {}".format(dedup_bam))

    return job.sub(afterok=deps), dedup_bam
def do_mark_5mC(p, rd, jobsize, sample, refid, ref, deps):
    """Submit a job that runs ``mark5mC`` and converts its output.

    Reads ``<sample>.<refid>.bam``.  ``mark5mC`` output is gzip-compressed
    into ``<sample>.<refid>.methlist.txt.gz``; ``frobmethlist.py`` is then
    run on that file with ``<sample>.<refid>`` as its final argument.  The
    up-to-date check expects six derived files named
    ``<sample>.<refid>.{methylkit,pileOmeth}.{CpG,CHG,CHH}.txt.gz``.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "07:mark5mC" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, refid: used to build file names and the job name.
        ref: reference passed to ``mark5mC``.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``None`` when the methlist and all six derived files exist, the
        methlist is newer (ctime) than the BAM and there are no pending
        deps; otherwise whatever ``j.sub()`` returns.
    """
    name = "{}.{}".format(sample, refid)
    in_bam = name + ".bam"
    methlist_gz = name + ".methlist.txt.gz"

    formats = ("methylkit", "pileOmeth")
    contexts = ("CpG", "CHG", "CHH")

    # Stops at the first missing file, in format-major order.
    derived_present = all(
        os.path.exists("{}.{}.{}.txt.gz".format(name, fmt, ctx))
        for fmt in formats
        for ctx in contexts
    )

    if os.path.exists(methlist_gz) and derived_present \
            and os.path.getctime(methlist_gz) > os.path.getctime(in_bam) \
            and not has_deps(deps):
        # everything is present and current
        return None

    resource = Resource(rd["07:mark5mC"], jobsize)
    job = p.new_job("07:mark5mC:{}".format(name), resource, force=True)

    job.add_cmd("rm -f {}".format(methlist_gz))
    mark_cmd = ["mark5mC", "-5 10", "-3 10", in_bam, ref]
    mark_cmd += ["| gzip -c -", "> {}".format(methlist_gz)]
    job.add_cmd(" \\\n\t".join(mark_cmd))

    # The doubled braces emit a literal {CpG,CHG,CHH} into the command,
    # leaving the brace expansion to the shell.
    for fmt in formats:
        job.add_cmd("rm -f {}.{}.{{CpG,CHG,CHH}}.txt.gz".format(name, fmt))
    job.add_cmd(" \\\n\t".join(
        ["frobmethlist.py", "--all", "--gzip", methlist_gz, name]))

    return job.sub(afterok=deps)
def do_merge_libs(p, rd, jobsize, sample, lib_list, sl_info, refid, deps):
    """Merge the per-library deduplicated BAMs of one sample.

    Inputs are ``<sample>/<lib>/<sample>_<lib>.<refid>.dedup.bam`` for each
    library; the output is ``<sample>/<sample>.<refid>.bam`` plus ``.bai``.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "05:mergelibs" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample: sample name.
        lib_list: sequence of library names.
        sl_info: mapping from ``(sample, lib)`` to an iterable of run ids.
        refid: reference identifier used in file names.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``(None, obam)`` when the existing output is usable: it is not
        older (ctime) than any input present on disk, passes
        ``samtools quickcheck`` and its read-group ids (``bam2rgids``)
        equal the expected ``<sample>_<lib>_<runid>`` set.
        ``(deps, obam)`` when there is a single library: the output and
        its index are replaced by symlinks immediately and no job is
        submitted.
        ``(jobid, obam)`` otherwise, with ``jobid`` from ``j.sub()``.
    """
    sample_pfx = "{}.{}".format(sample, refid)
    merged_bam = "{}/{}".format(sample, sample_pfx) + ".bam"
    merged_bai = merged_bam + ".bai"

    # one deduplicated BAM is expected per library
    lib_bams = [
        "{}/{}/{}_{}.{}.dedup.bam".format(sample, lib, sample, lib, refid)
        for lib in lib_list
    ]

    if os.path.exists(merged_bam) and os.path.exists(merged_bai) \
            and not has_deps(deps):
        # Inputs missing from disk are ignored; an input newer than the
        # merged BAM makes it stale.
        stale = any(
            os.path.exists(src)
            and os.path.getctime(merged_bam) < os.path.getctime(src)
            for src in lib_bams
        )
        if not stale and \
                subprocess.call(["samtools", "quickcheck", merged_bam]) == 0:
            # the header must carry exactly the expected read groups
            expected_rgs = {
                "{}_{}_{}".format(sample, lib, runid)
                for lib in lib_list
                for runid in sl_info[(sample, lib)]
            }
            if bam2rgids(merged_bam) == expected_rgs:
                return None, merged_bam

    if len(lib_list) == 1:
        # Single library: link instead of merging.  The link targets have
        # no leading sample directory, so they resolve relative to the
        # directory containing the links.
        only_lib = lib_list[0]
        target_bam = "{}/{}_{}.{}.dedup.bam".format(
            only_lib, sample, only_lib, refid)
        target_bai = target_bam + ".bai"
        rm_f([merged_bam, merged_bai])
        os.symlink(target_bam, merged_bam)
        os.symlink(target_bai, merged_bai)
        # the caller keeps waiting on the original dependencies
        return deps, merged_bam

    resource = Resource(rd["05:mergelibs"], jobsize)
    job = p.new_job("05:mergelibs:{}".format(sample_pfx), resource, force=True)

    for old_output in (merged_bai, merged_bam):
        job.add_cmd("rm -f {}".format(old_output))
    job.add_cmd(" \\\n\t".join(["samtools merge", merged_bam] + lib_bams))
    job.add_cmd("samtools index {}".format(merged_bam))

    return job.sub(afterok=deps), merged_bam
def do_merge_runs(p, rd, jobsize, sample, lib, runid_list, refid, deps):
    """Merge the per-run BAMs of one library.

    Inputs are ``<sample>/<lib>/<sample>_<lib>_<runid>.<refid>.bam`` for
    each run id; the output is ``<sample>/<lib>/<sample>_<lib>.<refid>.bam``
    plus ``.bai``.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "03:mergeruns" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, lib: used to build file names and the job name.
        runid_list: sequence of run ids.
        refid: reference identifier used in file names.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``(None, obam)`` when the existing output is usable: it is not
        older (ctime) than any input present on disk, passes
        ``samtools quickcheck`` and its read-group ids (``bam2rgids``)
        equal the expected ``<sample>_<lib>_<runid>`` set.
        ``(deps, obam)`` when there is a single run: the output and its
        index are replaced by symlinks immediately and no job is submitted.
        ``(jobid, obam)`` otherwise, with ``jobid`` from ``j.sub()``.
    """
    odir = "{}/{}".format(sample, lib)
    pfx = "{}_{}.{}".format(sample, lib, refid)
    opfx = "{}/{}".format(odir, pfx)
    obam = "{}.bam".format(opfx)
    obai = "{}.bai".format(obam)

    # Merge list: every expected run BAM, whether or not it exists yet.
    # Inputs still to be produced by the jobs in `deps` are not on disk
    # when the merge command is built, so filtering on existence here
    # would drop them from `samtools merge`.
    ibamlist = [
        "{}/{}_{}_{}.{}.bam".format(odir, sample, lib, runid, refid)
        for runid in runid_list
    ]

    if os.path.exists(obam) and os.path.exists(obai) and \
            not has_deps(deps):
        for ibam in ibamlist:
            # an input missing from disk cannot make the output stale
            if not os.path.exists(ibam):
                continue
            if os.path.getctime(obam) < os.path.getctime(ibam):
                break
        else:
            ret = subprocess.call(["samtools", "quickcheck", obam])
            if ret == 0:
                # check header for the expected readgroups
                exp_rgs = {
                    "{}_{}_{}".format(sample, lib, r) for r in runid_list
                }
                hdr_rgs = bam2rgids(obam)
                if hdr_rgs == exp_rgs:
                    # nothing to do
                    return None, obam

    if len(runid_list) == 1:
        # Nothing to merge, just symlink.  The link targets have no
        # directory component, so they resolve relative to the directory
        # containing the links.
        runid = runid_list[0]
        rbam = "{}_{}_{}.{}.bam".format(sample, lib, runid, refid)
        rbai = "{}.bai".format(rbam)
        rm_f([obam, obai])
        os.symlink(rbam, obam)
        os.symlink(rbai, obai)
        # pass through dependencies
        return deps, obam

    res = Resource(rd["03:mergeruns"], jobsize)
    j = p.new_job("03:mergeruns:{}".format(pfx), res, force=True)

    j.add_cmd("rm -f {}".format(obai))
    j.add_cmd("rm -f {}".format(obam))

    merge_cmd = ["samtools merge", obam]
    merge_cmd.extend(ibamlist)
    j.add_cmd(" \\\n\t".join(merge_cmd))

    j.add_cmd("samtools index {}".format(obam))
    jobid = j.sub(afterok=deps)

    return jobid, obam
def do_map(p, rd, jobsize, sample, lib, runid, refid, ref, deps):
    """Submit a ``bwa mem`` mapping job for one folded FASTQ.

    Reads ``<sample>/<lib>/<sample>_<lib>_<runid>.folded.fastq.gz`` and
    writes ``<sample>/<lib>/<sample>_<lib>_<runid>.<refid>.bam`` plus
    ``.bai``.  The job pipes ``bwa mem`` through ``samtools view -q 25``
    and ``samtools sort``, then indexes the BAM.  The read group ID is
    ``<sample>_<lib>_<runid>``.

    When a remap is needed, any existing BAM and index are unlinked
    immediately by this function, not by the submitted job.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "02:map" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        sample, lib, runid, refid: used to build file names, the read
            group and the job name.
        ref: reference passed to ``bwa mem``.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``None`` when the BAM and index exist, the BAM is newer (ctime)
        than the FASTQ, there are no pending deps and
        ``samtools quickcheck`` succeeds; otherwise whatever ``j.sub()``
        returns.
    """
    odir = "{}/{}".format(sample, lib)
    pfx = "{}_{}_{}".format(sample, lib, runid)
    opfx = "{}/{}".format(odir, pfx)
    fastq = "{}.folded.fastq.gz".format(opfx)
    bam = "{}.{}.bam".format(opfx, refid)
    bai = "{}.bai".format(bam)

    if os.path.exists(bam) and os.path.exists(bai) and \
            os.path.getctime(bam) > os.path.getctime(fastq) and \
            not has_deps(deps):
        ret = subprocess.call(["samtools", "quickcheck", bam])
        if ret == 0:
            # nothing to do
            return None

    # Remove files if they exist, we will remap.  lexists() is used so
    # that dangling symlinks are removed too.
    if os.path.lexists(bai):
        os.unlink(bai)
    if os.path.lexists(bam):
        os.unlink(bam)

    res = Resource(rd["02:map"], jobsize)
    j = p.new_job("02:map:{}.{}".format(pfx, refid), res, force=True)

    bwa_cmd = [
        "bwa mem",
        "-t {}".format(res.cpus),
        # Emit the two-character sequence backslash-t rather than a
        # literal TAB.  The bwa manual documents "\t" in the -R string as
        # the form it converts to a TAB; some bwa versions are believed
        # to reject literal TABs.
        "-R \"@RG\\tID:{}\\tSM:{}\\tLB:{}\\tPL:ILLUMINA\"".format(
            pfx, sample, lib),
        "-C",  # Carry through the fastq comments
        ref,
        fastq,
        "|samtools view",
        "-q 25",
        "-Sbu",
        "-",
        "|samtools sort",
        "-O bam",
        "-m 1G",
        "-@ 1",
        "-T tmp.sort.{}.{}".format(pfx, refid),
        "-",
        ">{}".format(bam),
    ]
    j.add_cmd(" \\\n\t".join(bwa_cmd))

    j.add_cmd("samtools index {}".format(bam))

    return j.sub(afterok=deps)
def do_samtools_flagstat(p, rd, jobsize, pfx, bam, deps):
    """Submit a job writing ``samtools flagstat`` output for ``bam``.

    The report is written to ``<bam>.flagstat.txt``.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "09:flagstat" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        pfx: used only in the job name.
        bam: input BAM path.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``None`` when the report exists, is newer (ctime) than ``bam`` and
        there are no pending deps; otherwise whatever ``j.sub()`` returns.
    """
    report = "{}.flagstat.txt".format(bam)

    if os.path.exists(report):
        is_newer = os.path.getctime(report) > os.path.getctime(bam)
        if is_newer and not has_deps(deps):
            # report is current; no job required
            return None

    resource = Resource(rd["09:flagstat"], jobsize)
    job = p.new_job("09:flagstat:{}".format(pfx), resource, force=True)
    job.add_cmd("samtools flagstat {} > {}".format(bam, report))
    return job.sub(afterok=deps)
def do_samtools_bedcov(p, rd, jobsize, pfx, ref_fa, bam, deps):
    """Submit a job writing ``samtools bedcov`` output for ``bam``.

    The job builds a temporary BED file ``<bam>.bed.tmp`` from the first
    two columns of ``<ref_fa>.fai`` (one ``name 0 value`` line per index
    record), runs ``samtools bedcov`` on it into ``<bam>.bedcov.txt`` and
    then removes the temporary BED file.

    Parameters:
        p: object providing ``new_job(name, resource, force=True)``.
        rd: mapping holding the "11:bedcov" entry handed to ``Resource``.
        jobsize: passed through to ``Resource``.
        pfx: used only in the job name.
        ref_fa: reference path; ``<ref_fa>.fai`` is read by the job.
        bam: input BAM path.
        deps: dependencies for the job; also tested with ``has_deps``.

    Returns:
        ``None`` when the bedcov output exists, is newer (ctime) than
        ``bam`` and there are no pending deps; otherwise whatever
        ``j.sub()`` returns.
    """
    fai = "{}.fai".format(ref_fa)
    bed = "{}.bed.tmp".format(bam)
    bedcov = "{}.bedcov.txt".format(bam)

    if os.path.exists(bedcov) and \
            os.path.getctime(bedcov) > os.path.getctime(bam) and \
            not has_deps(deps):
        return None

    res = Resource(rd["11:bedcov"], jobsize)
    j = p.new_job("11:bedcov:{}".format(pfx), res, force=True)

    # The awk program's own braces are doubled so that str.format() emits
    # them literally instead of parsing them as replacement fields.
    j.add_cmd(
        "awk 'BEGIN {{OFS=\"\t\"}} {{print $1, 0, $2}}' {} > {}".format(
            fai, bed))
    j.add_cmd("samtools bedcov {} {} > {}".format(bed, bam, bedcov))
    # Queue the temp-file removal as a real command.  Assigning a string
    # to ``j.add_cleanup_cmd`` only sets an attribute and runs nothing.
    # NOTE(review): if the job class offers a dedicated cleanup hook that
    # runs even on failure, calling it here may be preferable - confirm.
    j.add_cmd("rm -f {}".format(bed))

    return j.sub(afterok=deps)