Ejemplo n.º 1
0
def do_indel_realign(p, rd, jobsize, sample, refid, ref, deps):
    """Submit the three-job indel realignment chain for one sample/reference.

    The chain is: GATK RealignerTargetCreator -> GATK IndelRealigner ->
    ``samtools calmd`` + ``samtools index``.  Each job is submitted with
    ``afterok`` on the previous one; the first waits on ``deps``.

    Args:
        p: Object providing ``new_job(name, resource, force=True)``.
        rd: Mapping indexed by stage key ("06:realign1", "06:realign2",
            "06:realign3"); each value is handed to ``Resource``.
        jobsize: Second argument passed to ``Resource``.
        sample: Sample name; also the directory holding the input BAM.
        refid: Reference identifier used in file names.
        ref: Reference path given to GATK (``-R``) and ``samtools calmd``.
        deps: Upstream dependencies, passed to ``has_deps`` and to the
            first job's ``sub(afterok=...)``.

    Returns:
        ``(None, obam2)`` when the final BAM and its index already exist,
        the BAM's ctime is newer than the input BAM's, there are no pending
        deps and ``samtools quickcheck`` succeeds; otherwise
        ``(stage3, obam2)`` where ``stage3`` is whatever the last job's
        ``sub()`` returned.
    """
    odir = sample
    pfx = "{}.{}".format(sample, refid)
    opfx = "{}/{}".format(odir, pfx)
    ibam = "{}.bam".format(opfx)
    obam1 = "{}.realigned.bam".format(opfx)
    intervals = "{}.intervals".format(opfx)

    # The final BAM is written relative to the current directory, not
    # inside the sample directory.
    obam2 = "{}.{}.bam".format(sample, refid)
    obai2 = "{}.bai".format(obam2)

    if os.path.exists(obam2) and os.path.exists(obai2) and \
       os.path.getctime(obam2) > os.path.getctime(ibam) and \
       not has_deps(deps):
        ret = subprocess.call(["samtools", "quickcheck", obam2])
        if ret == 0:
            # nothing to do
            return None, obam2

    # train realigner
    res1 = Resource(rd["06:realign1"], jobsize)
    j1 = p.new_job("06:realign1:{}".format(pfx), res1, force=True)
    j1.add_cmd("rm -f {}".format(intervals))
    gatk_cmd1 = [
        "$gatk",  # must have this defined somewhere
        "-T RealignerTargetCreator",
        "-nt {}".format(res1.cpus),
        "-R {}".format(ref),
        "-I {}".format(ibam),
        "-o {}".format(intervals)
    ]
    j1.add_cmd(" \\\n\t".join(gatk_cmd1))
    stage1 = j1.sub(afterok=deps)

    # do indel realignment
    res2 = Resource(rd["06:realign2"], jobsize)
    j2 = p.new_job("06:realign2:{}".format(pfx), res2, force=True)
    j2.add_cmd("rm -f {}".format(obam1))
    gatk_cmd2 = [
        "$gatk", "-T IndelRealigner", "-R {}".format(ref),
        "-I {}".format(ibam), "--bam_compression 0", "--disable_bam_indexing",
        "-targetIntervals {}".format(intervals), "-o {}".format(obam1)
    ]
    j2.add_cmd(" \\\n\t".join(gatk_cmd2))
    j2.add_cmd("rm {}".format(intervals))
    stage2 = j2.sub(afterok=stage1)

    # regenerate MD tags to match realignments
    res3 = Resource(rd["06:realign3"], jobsize)
    # Use this stage's own resource request; the original passed res2 here,
    # leaving res3 unused.
    j3 = p.new_job("06:realign3:{}".format(pfx), res3, force=True)
    j3.add_cmd("rm -f {}".format(obam2))
    j3.add_cmd("rm -f {}".format(obai2))
    calmd_cmd = ["samtools calmd", "-b", obam1, ref, "> {}".format(obam2)]
    j3.add_cmd(" \\\n\t".join(calmd_cmd))

    j3.add_cmd("samtools index {}".format(obam2))
    stage3 = j3.sub(afterok=stage2)

    return stage3, obam2
Ejemplo n.º 2
0
def do_fold(p, rd, jobsize, sample, lib, runid, r1, r2):
    """Submit a ``foldreads`` job for one sequencing run.

    Creates the ``<sample>/<lib>`` output directory, then returns ``None``
    if both the metrics file and the folded fastq already exist.
    Otherwise both read files are passed through ``filecheck`` and a job
    is submitted that pipes ``foldreads`` output through ``gzip``.

    Returns:
        ``None`` when the outputs are already present, else the value
        returned by the job's ``sub()``.
    """
    out_dir = "{}/{}".format(sample, lib)
    run_tag = "{}_{}_{}".format(sample, lib, runid)
    out_base = "{}/{}".format(out_dir, run_tag)

    metrics_path = "{}.metrics".format(out_base)
    folded_fastq = "{}.folded.fastq.gz".format(out_base)

    mkdir_p(out_dir)

    outputs_present = (
        os.path.exists(metrics_path) and os.path.exists(folded_fastq)
    )
    if outputs_present:
        # both outputs already on disk
        return None

    for reads in (r1, r2):
        filecheck(reads)

    resource = Resource(rd["01:fold"], jobsize)
    job = p.new_job("01:fold:{}".format(run_tag), resource, force=True)

    # One shell command, split over continuation lines for readability.
    line_continuation = " \\\n\t"
    pieces = ["foldreads", "-m", metrics_path]
    pieces.append("-1 {}".format(r1))
    pieces.append("-2 {}".format(r2))
    pieces.append("| gzip -c -")
    pieces.append("> {}".format(folded_fastq))
    job.add_cmd(line_continuation.join(pieces))

    return job.sub()
Ejemplo n.º 3
0
def do_samtools_stats(p, rd, jobsize, pfx, bam, deps):
    """Submit a job running ``samtools stats`` on three region sets.

    Three reports are written next to ``bam``: mitochondrial
    (``.stats.MT.txt``), numbered chromosomes (``.stats.Aut.txt``) and
    chrX (``.stats.chrX.txt``).

    Returns:
        ``None`` if all three reports exist, each has a ctime newer than
        the BAM's, and there are no pending deps; otherwise the value
        returned by the job's ``sub(afterok=deps)``.
    """
    stats_mt = "{}.stats.MT.txt".format(bam)
    stats_aut = "{}.stats.Aut.txt".format(bam)
    stats_X = "{}.stats.chrX.txt".format(bam)
    reports = (stats_mt, stats_aut, stats_X)

    # Existence is checked for every report before any ctime is read.
    reports_current = (
        all(os.path.exists(path) for path in reports)
        and all(
            os.path.getctime(path) > os.path.getctime(bam)
            for path in reports
        )
        and not has_deps(deps)
    )
    if reports_current:
        return None

    resource = Resource(rd["10:stats"], jobsize)
    job = p.new_job("10:stats:{}".format(pfx), resource, force=True)

    # Numbered chromosome names 1..49 in three common spellings.
    chrmax = 50
    numbers = range(1, chrmax)
    names = []
    for template in ("{}", "chr{}", "Chr{}"):
        names.extend(template.format(n) for n in numbers)
    region_args = " ".join(names)

    # 'samtools stats' does not fail on an unknown chromosome name, so
    # several common spellings are passed for each region set.
    job.add_cmd("samtools stats {} MT Mt M > {}".format(bam, stats_mt))
    job.add_cmd(
        "samtools stats {} {} > {}".format(bam, region_args, stats_aut)
    )
    job.add_cmd("samtools stats {} X chrX ChrX > {}".format(bam, stats_X))

    return job.sub(afterok=deps)
Ejemplo n.º 4
0
def do_scanbp(p, rd, jobsize, sample, refid, ref, deps):
    """Submit a job that runs ``scanbp`` and plots its output.

    ``scanbp`` reads ``<sample>.<refid>.bam`` and its stdout is redirected
    to ``<sample>.<refid>.pairs.txt``; ``plot_nt_pairing.py`` then turns
    that text file into ``<sample>.<refid>.pairs.pdf``.  The ``ref``
    argument is accepted but not used here.

    Returns:
        ``None`` if both outputs exist, the text file's ctime is newer
        than the BAM's, and there are no pending deps; otherwise the value
        returned by the job's ``sub(afterok=deps)``.
    """
    base = "{}.{}".format(sample, refid)
    in_bam = "{}.bam".format(base)
    txt_out = "{}.pairs.txt".format(base)
    pdf_out = "{}.pairs.pdf".format(base)

    outputs_current = (
        os.path.exists(txt_out)
        and os.path.exists(pdf_out)
        and os.path.getctime(txt_out) > os.path.getctime(in_bam)
        and not has_deps(deps)
    )
    if outputs_current:
        # everything is already in place
        return None

    resource = Resource(rd["08:scanbp"], jobsize)
    job = p.new_job("08:scanbp:{}".format(base), resource, force=True)

    line_continuation = " \\\n\t"

    job.add_cmd("rm -f {} {}".format(txt_out, pdf_out))

    scan_parts = ["scanbp", in_bam, "> {}".format(txt_out)]
    job.add_cmd(line_continuation.join(scan_parts))

    plot_parts = [
        "plot_nt_pairing.py",
        "--title {}".format(base),
        txt_out,
        pdf_out,
    ]
    job.add_cmd(line_continuation.join(plot_parts))

    return job.sub(afterok=deps)
Ejemplo n.º 5
0
def do_dedup(p, rd, jobsize, sample, lib, refid, deps):
    """Submit a duplicate-removal job for one library BAM.

    The job streams ``<sample>/<lib>/<sample>_<lib>.<refid>.bam`` through
    ``rmdup_collapsed.py --remove-duplicates`` into the matching
    ``.dedup.bam`` and indexes the result.

    Returns:
        A ``(jobid, dedup_bam_path)`` pair.  ``jobid`` is ``None`` when the
        deduplicated BAM and index exist, the BAM's ctime is newer than
        the input's, there are no pending deps and
        ``samtools quickcheck`` exits 0.
    """
    lib_dir = "{}/{}".format(sample, lib)
    tag = "{}_{}.{}".format(sample, lib, refid)
    base = "{}/{}".format(lib_dir, tag)
    src_bam = "{}.bam".format(base)
    dedup_bam = "{}.dedup.bam".format(base)
    dedup_bai = "{}.bai".format(dedup_bam)

    looks_current = (
        os.path.exists(dedup_bam)
        and os.path.exists(dedup_bai)
        and os.path.getctime(dedup_bam) > os.path.getctime(src_bam)
        and not has_deps(deps)
    )
    # quickcheck only runs once the cheaper filesystem checks have passed
    if looks_current and \
            subprocess.call(["samtools", "quickcheck", dedup_bam]) == 0:
        return None, dedup_bam

    resource = Resource(rd["04:dedup"], jobsize)
    job = p.new_job("04:dedup:{}".format(tag), resource, force=True)

    # clear stale outputs, index first
    for stale in (dedup_bai, dedup_bam):
        job.add_cmd("rm -f {}".format(stale))

    pipeline = [
        "rmdup_collapsed.py --remove-duplicates",
        "< {}".format(src_bam),
        "> {}".format(dedup_bam),
    ]
    job.add_cmd(" \\\n\t".join(pipeline))
    job.add_cmd("samtools index {}".format(dedup_bam))

    return job.sub(afterok=deps), dedup_bam
Ejemplo n.º 6
0
def do_mark_5mC(p, rd, jobsize, sample, refid, ref, deps):
    """Submit a job that runs ``mark5mC`` and converts its output.

    ``mark5mC`` output is gzipped to ``<sample>.<refid>.methlist.txt.gz``;
    ``frobmethlist.py --all --gzip`` is then run on that file with the
    ``<sample>.<refid>`` prefix.  The per-format, per-context files
    (methylkit/pileOmeth x CpG/CHG/CHH) are removed before that step.

    Returns:
        ``None`` if the methlist and all six per-context files exist, the
        methlist's ctime is newer than the input BAM's, and there are no
        pending deps; otherwise the value returned by the job's
        ``sub(afterok=deps)``.
    """
    base = "{}.{}".format(sample, refid)
    in_bam = "{}.bam".format(base)
    methlist = "{}.methlist.txt.gz".format(base)

    formats = ("methylkit", "pileOmeth")
    contexts = ("CpG", "CHG", "CHH")

    # Stops probing at the first missing per-context file.
    derived_present = all(
        os.path.exists("{}.{}.{}.txt.gz".format(base, fmt, ctx))
        for fmt in formats
        for ctx in contexts
    )
    up_to_date = (
        os.path.exists(methlist)
        and derived_present
        and os.path.getctime(methlist) > os.path.getctime(in_bam)
        and not has_deps(deps)
    )
    if up_to_date:
        # every expected output is already there
        return None

    resource = Resource(rd["07:mark5mC"], jobsize)
    job = p.new_job("07:mark5mC:{}".format(base), resource, force=True)

    line_continuation = " \\\n\t"

    job.add_cmd("rm -f {}".format(methlist))
    mark_parts = ["mark5mC", "-5 10", "-3 10", in_bam, ref, "| gzip -c -"]
    mark_parts.append("> {}".format(methlist))
    job.add_cmd(line_continuation.join(mark_parts))

    # The doubled braces emit a literal {CpG,CHG,CHH} for the shell to expand.
    for fmt in formats:
        job.add_cmd(
            "rm -f {}.{}.{{CpG,CHG,CHH}}.txt.gz".format(base, fmt)
        )
    frob_parts = ["frobmethlist.py", "--all", "--gzip", methlist, base]
    job.add_cmd(line_continuation.join(frob_parts))

    return job.sub(afterok=deps)
Ejemplo n.º 7
0
def do_merge_libs(p, rd, jobsize, sample, lib_list, sl_info, refid, deps):
    """Merge the deduplicated per-library BAMs of a sample into one BAM.

    Args:
        p: Object providing ``new_job(name, resource, force=True)``.
        rd: Mapping indexed by stage key; ``rd["05:mergelibs"]`` is handed
            to ``Resource``.
        jobsize: Second argument passed to ``Resource``.
        sample: Sample name; also the output directory.
        lib_list: Library names belonging to the sample.
        sl_info: Mapping from ``(sample, lib)`` to an iterable of run ids,
            used to build the expected read-group ids.
        refid: Reference identifier used in file names.
        deps: Upstream dependencies, passed to ``has_deps`` and ``sub``.

    Returns:
        ``(None, obam)`` if the merged BAM is already current (exists with
        its index, no pending deps, no existing input has a newer ctime,
        ``samtools quickcheck`` passes and the header read groups equal
        the expected set); ``(deps, obam)`` if there is a single library
        and the output was symlinked; otherwise ``(jobid, obam)`` for the
        submitted merge job.
    """
    odir = sample
    pfx = "{}.{}".format(sample, refid)
    opfx = "{}/{}".format(odir, pfx)
    obam = "{}.bam".format(opfx)
    obai = "{}.bai".format(obam)

    # merge list
    ibamlist = [
        "{}/{}/{}_{}.{}.dedup.bam".format(sample, lib, sample, lib, refid)
        for lib in lib_list
    ]

    if os.path.exists(obam) and os.path.exists(obai) and \
       not has_deps(deps):
        for ibam in ibamlist:
            if not os.path.exists(ibam):
                continue
            if os.path.getctime(obam) < os.path.getctime(ibam):
                break
        else:
            # No existing input is newer than the output.
            ret = subprocess.call(["samtools", "quickcheck", obam])
            if ret == 0:
                # check header for the expected readgroups
                exp_rgs = set()
                for lib in lib_list:
                    for runid in sl_info[(sample, lib)]:
                        exp_rgs.add("{}_{}_{}".format(sample, lib, runid))
                hdr_rgs = bam2rgids(obam)
                if hdr_rgs == exp_rgs:
                    # nothing to do
                    return None, obam

    if len(lib_list) == 1:
        # nothing to merge, just symlink
        lib = lib_list[0]
        # Symlink targets are resolved relative to the link's directory
        # (the sample directory), so the sample component is omitted.
        lbam = "{}/{}_{}.{}.dedup.bam".format(lib, sample, lib, refid)
        lbai = "{}.bai".format(lbam)
        # This line was tab-indented in the original, which Python 3
        # rejects with TabError; it is now space-indented like the rest.
        rm_f([obam, obai])
        os.symlink(lbam, obam)
        os.symlink(lbai, obai)
        # pass through dependencies
        return deps, obam

    res = Resource(rd["05:mergelibs"], jobsize)
    j = p.new_job("05:mergelibs:{}".format(pfx), res, force=True)

    j.add_cmd("rm -f {}".format(obai))
    j.add_cmd("rm -f {}".format(obam))

    merge_cmd = ["samtools merge", obam]
    merge_cmd.extend(ibamlist)
    j.add_cmd(" \\\n\t".join(merge_cmd))

    j.add_cmd("samtools index {}".format(obam))

    jobid = j.sub(afterok=deps)

    return jobid, obam
Ejemplo n.º 8
0
def do_merge_runs(p, rd, jobsize, sample, lib, runid_list, refid, deps):
    """Merge the per-run BAMs of one library into a single library BAM.

    Args:
        p: Object providing ``new_job(name, resource, force=True)``.
        rd: Mapping indexed by stage key; ``rd["03:mergeruns"]`` is handed
            to ``Resource``.
        jobsize: Second argument passed to ``Resource``.
        sample: Sample name.
        lib: Library name; outputs go under ``<sample>/<lib>``.
        runid_list: Run ids whose BAMs make up the library.
        refid: Reference identifier used in file names.
        deps: Upstream dependencies, passed to ``has_deps`` and ``sub``.

    Returns:
        ``(None, obam)`` if the merged BAM is already current (exists with
        its index, no pending deps, no existing input has a newer ctime,
        ``samtools quickcheck`` passes and the header read groups equal
        the expected set); ``(deps, obam)`` if there is a single run and
        the output was symlinked; otherwise ``(jobid, obam)`` for the
        submitted merge job.
    """
    odir = "{}/{}".format(sample, lib)
    pfx = "{}_{}.{}".format(sample, lib, refid)
    opfx = "{}/{}".format(odir, pfx)
    obam = "{}.bam".format(opfx)
    obai = "{}.bai".format(obam)

    # merge list
    # Every expected run BAM is listed, whether or not it exists yet.
    # Run BAMs may still be pending in upstream jobs (deps) when this is
    # called; filtering by existence here, as the original did, would drop
    # them from the merge command.
    ibamlist = [
        "{}/{}_{}_{}.{}.bam".format(odir, sample, lib, runid, refid)
        for runid in runid_list
    ]

    if os.path.exists(obam) and os.path.exists(obai) and \
       not has_deps(deps):
        for ibam in ibamlist:
            # Missing inputs are skipped for the freshness test only.
            if not os.path.exists(ibam):
                continue
            if os.path.getctime(obam) < os.path.getctime(ibam):
                break
        else:
            ret = subprocess.call(["samtools", "quickcheck", obam])
            if ret == 0:
                # check header for the expected readgroups
                exp_rgs = {
                    "{}_{}_{}".format(sample, lib, r)
                    for r in runid_list
                }
                hdr_rgs = bam2rgids(obam)
                if hdr_rgs == exp_rgs:
                    # nothing to do
                    return None, obam

    if len(runid_list) == 1:
        # nothing to merge, just symlink
        runid = runid_list[0]
        # Symlink targets are resolved relative to the link's directory
        # (odir), so no directory component is needed.
        rbam = "{}_{}_{}.{}.bam".format(sample, lib, runid, refid)
        rbai = "{}.bai".format(rbam)
        rm_f([obam, obai])
        os.symlink(rbam, obam)
        os.symlink(rbai, obai)
        # pass through dependencies
        return deps, obam

    res = Resource(rd["03:mergeruns"], jobsize)
    j = p.new_job("03:mergeruns:{}".format(pfx), res, force=True)

    j.add_cmd("rm -f {}".format(obai))
    j.add_cmd("rm -f {}".format(obam))
    merge_cmd = ["samtools merge", obam]
    merge_cmd.extend(ibamlist)
    j.add_cmd(" \\\n\t".join(merge_cmd))

    j.add_cmd("samtools index {}".format(obam))
    jobid = j.sub(afterok=deps)

    return jobid, obam
Ejemplo n.º 9
0
def do_map(p, rd, jobsize, sample, lib, runid, refid, ref, deps):
    """Submit a ``bwa mem`` mapping job for one folded fastq.

    The job pipes ``bwa mem`` through ``samtools view -q 25`` and
    ``samtools sort`` into ``<sample>/<lib>/<sample>_<lib>_<runid>.<refid>.bam``
    and then indexes it.  Any existing BAM/index at that path is unlinked
    by this function itself, before the job is submitted.

    Args:
        p: Object providing ``new_job(name, resource, force=True)``.
        rd: Mapping indexed by stage key; ``rd["02:map"]`` is handed to
            ``Resource``.
        jobsize: Second argument passed to ``Resource``.
        sample: Sample name (also the read group ``SM`` value).
        lib: Library name (also the read group ``LB`` value).
        runid: Run identifier; ``<sample>_<lib>_<runid>`` is the read
            group ``ID``.
        refid: Reference identifier used in file names.
        ref: Reference path given to ``bwa mem``.
        deps: Upstream dependencies, passed to ``has_deps`` and ``sub``.

    Returns:
        ``None`` if the BAM and index exist, the BAM's ctime is newer than
        the fastq's, there are no pending deps and ``samtools quickcheck``
        passes; otherwise the value returned by the job's
        ``sub(afterok=deps)``.
    """
    odir = "{}/{}".format(sample, lib)
    pfx = "{}_{}_{}".format(sample, lib, runid)
    opfx = "{}/{}".format(odir, pfx)

    fastq = "{}.folded.fastq.gz".format(opfx)
    bam = "{}.{}.bam".format(opfx, refid)
    bai = "{}.bai".format(bam)

    if os.path.exists(bam) and os.path.exists(bai) and \
       os.path.getctime(bam) > os.path.getctime(fastq) and \
       not has_deps(deps):
        ret = subprocess.call(["samtools", "quickcheck", bam])
        if ret == 0:
            # nothing to do
            return None

    # remove files if they exist, we will remap
    # (lexists so that dangling symlinks are removed too)
    if os.path.lexists(bai):
        os.unlink(bai)
    if os.path.lexists(bam):
        os.unlink(bam)

    res = Resource(rd["02:map"], jobsize)
    j = p.new_job("02:map:{}.{}".format(pfx, refid), res, force=True)

    bwa_cmd = [
        "bwa mem",
        "-t {}".format(res.cpus),
        # Emit the two characters backslash-t rather than a literal TAB.
        # bwa converts the escape itself, and recent releases reject
        # read group lines that contain literal tab characters.
        "-R \"@RG\\tID:{}\\tSM:{}\\tLB:{}\\tPL:ILLUMINA\"".format(
            pfx, sample, lib),
        "-C",  # Carry through the fastq comments
        ref,
        fastq,
        "|samtools view",
        "-q 25",
        "-Sbu",
        "-",
        "|samtools sort",
        "-O bam",
        "-m 1G",
        "-@ 1",
        "-T tmp.sort.{}.{}".format(pfx, refid),
        "-",
        ">{}".format(bam)
    ]
    j.add_cmd(" \\\n\t".join(bwa_cmd))

    j.add_cmd("samtools index {}".format(bam))

    return j.sub(afterok=deps)
Ejemplo n.º 10
0
def do_samtools_flagstat(p, rd, jobsize, pfx, bam, deps):
    """Submit a job writing ``samtools flagstat`` output next to ``bam``.

    Returns:
        ``None`` if ``<bam>.flagstat.txt`` exists, its ctime is newer than
        the BAM's, and there are no pending deps; otherwise the value
        returned by the job's ``sub(afterok=deps)``.
    """
    report = "{}.flagstat.txt".format(bam)

    report_current = (
        os.path.exists(report)
        and os.path.getctime(report) > os.path.getctime(bam)
        and not has_deps(deps)
    )
    if report_current:
        return None

    resource = Resource(rd["09:flagstat"], jobsize)
    job_name = "09:flagstat:{}".format(pfx)
    job = p.new_job(job_name, resource, force=True)

    command = "samtools flagstat {} > {}".format(bam, report)
    job.add_cmd(command)

    return job.sub(afterok=deps)
Ejemplo n.º 11
0
def do_samtools_bedcov(p, rd, jobsize, pfx, ref_fa, bam, deps):
    """Submit a job computing per-sequence ``samtools bedcov`` for ``bam``.

    A temporary BED file (``<bam>.bed.tmp``) is built with awk from
    columns 1 and 2 of ``<ref_fa>.fai``, one ``name, 0, col2`` line per
    index entry, and ``samtools bedcov`` output is written to
    ``<bam>.bedcov.txt``.

    Args:
        p: Object providing ``new_job(name, resource, force=True)``.
        rd: Mapping indexed by stage key; ``rd["11:bedcov"]`` is handed to
            ``Resource``.
        jobsize: Second argument passed to ``Resource``.
        pfx: Label appended to the job name.
        ref_fa: Reference fasta path; its ``.fai`` index is read.
        bam: Input BAM path.
        deps: Upstream dependencies, passed to ``has_deps`` and ``sub``.

    Returns:
        ``None`` if the bedcov file exists, its ctime is newer than the
        BAM's, and there are no pending deps; otherwise the value returned
        by the job's ``sub(afterok=deps)``.
    """
    fai = "{}.fai".format(ref_fa)
    bed = "{}.bed.tmp".format(bam)
    bedcov = "{}.bedcov.txt".format(bam)

    if os.path.exists(bedcov) and \
       os.path.getctime(bedcov) > os.path.getctime(bam) and \
       not has_deps(deps):
        return None

    res = Resource(rd["11:bedcov"], jobsize)
    j = p.new_job("11:bedcov:{}".format(pfx), res, force=True)

    # awk's own braces are doubled so that str.format emits them literally;
    # unescaped, format() treats them as replacement fields and raises.
    j.add_cmd(
        "awk 'BEGIN {{OFS=\"\t\"}} {{print $1, 0, $2}}' {} > {}".format(
            fai, bed))
    j.add_cmd("samtools bedcov {} {} > {}".format(bed, bam, bedcov))
    # NOTE(review): assumes the job object exposes add_cleanup_cmd() as a
    # method, mirroring add_cmd() -- confirm against the job class.  The
    # original assigned the string to this attribute, which registered
    # no cleanup command at all.
    j.add_cleanup_cmd("rm -f {}".format(bed))

    return j.sub(afterok=deps)