Example #1
def combine_fastq_files(in_files, work_dir, config):
    if len(in_files) == 1:
        return in_files[0]

    else:
        cur1, cur2 = in_files[0]
        out1 = os.path.join(work_dir, os.path.basename(cur1))
        out2 = os.path.join(work_dir, os.path.basename(cur2)) if cur2 else None
        if not os.path.exists(out1):
            with open(out1, "a") as out_handle:
                for (cur1, _) in in_files:
                    with open(cur1) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)

        if out2 and not os.path.exists(out2):
            with open(out2, "a") as out_handle:
                for (_, cur2) in in_files:
                    with open(cur2) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)

        if config["algorithm"].get("upload_fastq", False):
            return out1, out2

        for f1, f2 in in_files:
            utils.save_diskspace(f1, "fastq merged to %s" % out1, config)
            if f2:
                utils.save_diskspace(f2, "fastq merged to %s" % out2, config)

        return out1, out2
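
The example above streams each input FASTQ into the merged output with shutil.copyfileobj, which copies in fixed-size chunks so memory use stays flat regardless of file size. A minimal standalone sketch of that concatenation technique (paths are hypothetical):

import shutil

def concat_files(in_files, out_file):
    # Append each input to the output by streaming, never slurping whole files.
    with open(out_file, "a") as out_handle:
        for fname in in_files:
            with open(fname) as in_handle:
                shutil.copyfileobj(in_handle, out_handle)

# concat_files(["lane1_1.fastq", "lane2_1.fastq"], "merged_1.fastq")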
Example #2
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name,
                    rg_name, lane_name, config):
    """Convert SAM file to merged and sorted BAM file.
    """
    picard = broad.runner_from_config(config)
    platform = config["algorithm"]["platform"]
    qual_format = config["algorithm"].get("quality_format", None)
    base_dir = os.path.dirname(sam_file)

    picard.run_fn("picard_index_ref", ref_file)
    out_fastq_bam = picard.run_fn("picard_fastq_to_bam", fastq1, fastq2,
                                  base_dir, platform, sample_name, rg_name, lane_name,
                                  qual_format)
    out_bam = picard.run_fn("picard_sam_to_bam", sam_file, out_fastq_bam, ref_file,
                            fastq2 is not None)
    sort_bam = picard.run_fn("picard_sort", out_bam)

    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
    utils.save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
    utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
    # remove merged FASTQ files, but only for barcoded samples in the work directory
    if (os.path.commonprefix([fastq1, sort_bam]) ==
             os.path.split(os.path.dirname(sort_bam))[0]
          and not config["algorithm"].get("upload_fastq", True)):
        utils.save_diskspace(fastq1, "Merged into output BAM %s" % out_bam, config)
        if fastq2:
            utils.save_diskspace(fastq2, "Merged into output BAM %s" % out_bam, config)
    return sort_bam
Example #3
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, names, config):
    """Convert SAM file to merged and sorted BAM file.
    """
    picard = broad.runner_from_config(config)
    base_dir = os.path.dirname(sam_file)

    picard.run_fn("picard_index_ref", ref_file)
    out_fastq_bam = picard.run_fn("picard_fastq_to_bam", fastq1, fastq2,
                                  base_dir, names)
    out_bam = picard.run_fn("picard_sam_to_bam", sam_file, out_fastq_bam,
                            ref_file, fastq2 is not None)
    sort_bam = picard.run_fn("picard_sort", out_bam)

    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
    utils.save_diskspace(out_fastq_bam,
                         "Combined into output BAM %s" % out_bam, config)
    utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
    # remove merged FASTQ files, but only for barcoded samples in the work directory
    if (os.path.commonprefix([fastq1, sort_bam]) == os.path.split(
            os.path.dirname(sort_bam))[0]
            and not config["algorithm"].get("upload_fastq", True)):
        utils.save_diskspace(fastq1, "Merged into output BAM %s" % out_bam,
                             config)
        if fastq2:
            utils.save_diskspace(fastq2, "Merged into output BAM %s" % out_bam,
                                 config)
    return sort_bam
Example #4
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.
    """
    realigner = data["config"]["algorithm"].get("realign", True)
    realigner = "gatk" if realigner is True else realigner
    realign_fn = _realign_approaches[realigner] if realigner else None

    if realign_fn:
        logger.info("Realigning %s with %s: %s %s" %
                    (data["name"], realigner, os.path.basename(
                        data["work_bam"]), region))
        sam_ref = data["sam_ref"]
        config = data["config"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file,
                                            data["config"])
        else:
            realign_bam = realign_fn(
                data["work_bam"], sam_ref, config,
                data["genome_resources"]["variation"]["dbsnp"], region,
                out_file)
        if region is None:
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
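
This variant selects the realignment function from a dispatch table keyed by the configured value, with True mapping to the "gatk" default. A minimal sketch of that lookup pattern; the realigner function here is a hypothetical stand-in:

def _gatk_realign(bam_file):
    # Hypothetical stand-in for the real realignment function.
    return "%s-gatkrealign.bam" % bam_file

_realign_approaches = {"gatk": _gatk_realign}

realigner = True  # as it would come from config["algorithm"]["realign"]
realigner = "gatk" if realigner is True else realigner
realign_fn = _realign_approaches[realigner] if realigner else None
if realign_fn:
    print(realign_fn("sample.bam"))  # -> sample.bam-gatkrealign.bam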
Example #5
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
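
The merge command above is a template string expanded with cmd.format(**locals()), substituting local variables such as bamtools, num_cores and tx_out_prefix by name. A runnable sketch of that expansion pattern, with echo standing in for the real tools:

import subprocess

samtools = "samtools"
num_cores = 4
max_mem = "1G"
cmd = "echo would run: {samtools} sort -@ {num_cores} -m {max_mem}"
# format(**locals()) pulls the values straight out of the local namespace.
subprocess.check_call(cmd.format(**locals()), shell=True)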
Example #6
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            resources = config_utils.get_resources("bamtools", config)
            max_mem = resources.get("memory", "2048")
            with file_transaction(out_file) as tx_out_file:
                with utils.tmpfile(dir=work_dir,
                                   prefix="bammergelist") as bam_file_list:
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{bamtools} sort -mem {max_mem} -out {tx_out_file}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #7
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
        shutil.copy(bam_files[0], out_file)
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(
                                config,
                                "%s.list" % os.path.splitext(out_file)[0]
                        ) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config,
                                                      "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    bam.index(out_file, config)
    return out_file
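
When the input count exceeds the guessed batch size, this version first merges fixed-size chunks recursively, then merges the intermediate results. A minimal sketch of that chunk-then-recurse pattern; partition_all mirrors the behavior of utils.partition_all, and merge_one is a hypothetical single-pass merge:

import itertools

def partition_all(n, iterable):
    # Yield successive chunks of at most n items, like toolz.partition_all.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def merge(files, batch_size):
    if len(files) > batch_size:
        # Reduce each oversized batch to one intermediate file, then recurse.
        files = [merge(xs, batch_size)
                 for xs in partition_all(batch_size, files)]
    return merge_one(files)  # hypothetical: merges a small list into one file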
Example #8
def apply_recal(data):
    """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM.
    """
    orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    had_work_bam = "work_bam" in data
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Applying BQSR recalibration with GATK: %s " %
                    str(dd.get_sample_name(data)))
        data["work_bam"] = _gatk_apply_bqsr(data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Applying BQSR recalibration with sentieon: %s " %
                    str(dd.get_sample_name(data)))
        data["work_bam"] = sentieon.apply_bqsr(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" %
                                  (dd.get_recalibrate(data)))
    # CWL does not have work/alignment BAM separation
    if not had_work_bam and dd.get_work_bam(data):
        data["align_bam"] = dd.get_work_bam(data)
    if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(
            data):
        utils.save_diskspace(orig_bam,
                             "BAM recalibrated to %s" % dd.get_work_bam(data),
                             data["config"])
    return data
Example #9
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True

    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []

    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(
                out_dir,
                "{base}_{pair}_{num}.fastq".format(
                    base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num
                ),
            )
            out += [fname, open(fname, "w")]
        return out

    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file, compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" % os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)

        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
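
With pipe=True the query-sorted BAM is written to a named pipe created with os.mkfifo, so the sorter streams straight into the pysam reader and the intermediate file never lands on disk. A runnable toy sketch of that pattern, with a shell echo standing in for the Picard sort:

import os
import subprocess
import tempfile

tmp_dir = tempfile.mkdtemp()
fifo = os.path.join(tmp_dir, "stream.txt")
os.mkfifo(fifo)
# The writer must run concurrently: opening a FIFO blocks until both a
# reader and a writer have attached.
writer = subprocess.Popen(["sh", "-c", "echo streamed-data > %s" % fifo])
with open(fifo) as in_handle:
    print(in_handle.read())  # -> streamed-data
writer.wait()
os.unlink(fifo)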
Example #10
def write_recal_bam(data, region=None, out_file=None):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    config = data["config"]
    if out_file is None:
        out_file = "%s-gatkrecal.bam" % os.path.splitext(data["work_bam"])[0]
    logger.info("Writing recalibrated BAM for %s to %s" %
                (data["name"], out_file))
    if region == "nochr":
        out_bam = write_nochr_reads(data["work_bam"], out_file)
    else:
        out_bam = _run_recal_bam(data["work_bam"], data["prep_recal"], region,
                                 data["sam_ref"], out_file, config)
    qual_bin = config["algorithm"].get("quality_bin", None)
    if ((qual_bin is True or qual_bin == "postrecal"
         or isinstance(qual_bin, list) and "postrecal" in qual_bin)
            and has_aligned_reads(out_bam)):
        binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"],
                                            os.path.dirname(out_bam), config)
        shutil.move(out_bam, out_bam + ".binned")
        shutil.move(binned_bam, out_bam)
        utils.save_diskspace(out_bam + ".binned",
                             "Quality binned to %s" % out_bam, config)
    data["work_bam"] = out_bam
    return [data]
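
The quality-binned BAM replaces the recalibrated one through a two-step rename: the original moves aside under a .binned suffix and the binned file takes over its name, keeping the output path stable before save_diskspace removes the leftover. A sketch of that swap with hypothetical files:

import shutil

out_bam = "sample-gatkrecal.bam"
binned_bam = "sample-gatkrecal-qualbin.bam"
open(out_bam, "w").close()    # stand-ins for the real BAM outputs
open(binned_bam, "w").close()
shutil.move(out_bam, out_bam + ".binned")  # keep the original under a suffix
shutil.move(binned_bam, out_bam)           # binned result takes the stable name
# out_bam + ".binned" is then handed to save_diskspace for removal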
Example #11
File: merge.py Project: jme9/wabio
def merge_bam_files(bam_files, work_dir, config, batch=0):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Avoids too many open file issues by merging large numbers of files in batches.
    """
    max_merge = 500
    bam_files.sort()
    i = 1
    while len(bam_files) > max_merge:
        bam_files = [merge_bam_files(xs, work_dir, config, batch + i)
                     for xs in utils.partition_all(max_merge, bam_files)]
        i += 1
    if batch > 0:
        out_dir = utils.safe_makedir(os.path.join(work_dir, "batchmerge%s" % batch))
    else:
        out_dir = work_dir
    out_file = os.path.join(out_dir, os.path.basename(sorted(bam_files)[0]))
    picard = broad.runner_from_config(config)
    if len(bam_files) == 1:
        if not os.path.exists(out_file):
            os.symlink(bam_files[0], out_file)
    else:
        picard.run_fn("picard_merge", bam_files, out_file)
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Example #12
File: merge.py Project: aminmg/bcbb
def combine_fastq_files(in_files, work_dir, config):
    if len(in_files) == 1:
        return in_files[0]

    else:
        cur1, cur2 = in_files[0]
        out1 = os.path.join(work_dir, os.path.basename(cur1))
        out2 = os.path.join(work_dir, os.path.basename(cur2)) if cur2 else None
        if not os.path.exists(out1):
            with open(out1, "a") as out_handle:
                for (cur1, _) in in_files:
                    with open(cur1) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)

        if out2 and not os.path.exists(out2):
            with open(out2, "a") as out_handle:
                for (_, cur2) in in_files:
                    with open(cur2) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)

        if config["algorithm"].get("upload_fastq", False):
            return out1, out2

        for f1, f2 in in_files:
            utils.save_diskspace(f1, "fastq merged to %s" % out1, config)
            if f2:
                utils.save_diskspace(f2, "fastq merged to %s" % out2, config)

        return out1, out2
Example #13
def main(config_file,
         align_sam,
         ref_file,
         fastq_one,
         fastq_pair=None,
         sample_name="",
         rg_name="",
         pu_name=""):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    picard = BroadRunner(config["program"]["picard"],
                         max_memory=config["algorithm"].get("java_memory", ""))
    platform = config["algorithm"]["platform"]
    if platform.lower() == "illumina":
        qual_format = "Illumina"
    else:
        raise ValueError("Need to specify quality format for %s" % platform)
    index_ref_file(picard, ref_file)
    base_dir = os.path.split(align_sam)[0]
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(picard, fastq_one, fastq_pair,
                                            base_dir, platform, qual_format,
                                            sample_name, rg_name, pu_name,
                                            tmp_dir)
        out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file,
                                   tmp_dir, fastq_pair is not None)
        sort_bam = picard_sort(picard, out_bam, tmp_dir)
    save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam,
                   config)
    save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
Example #14
def combine_bam(in_files, out_file, config):
    """Parallel target to combine multiple BAM files.
    """
    runner = broad.runner_from_path("picard", config)
    runner.run_fn("picard_merge", in_files, out_file)
    for in_file in in_files:
        save_diskspace(in_file, "Merged into {0}".format(out_file), config)
    bam.index(out_file, config)
    return out_file
Example #15
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True

    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []
    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(out_dir, "{base}_{pair}_{num}.fastq".format(
                base=os.path.splitext(os.path.basename(bam_file))[0], pair=pair, num=num))
            out += [fname, open(fname, "w")]
        return out
    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file,
                                compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)

        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
Example #16
def merge_bam_files(bam_files, work_dir, config):
    """Merge multiple BAM files from a sample into a single BAM for processing.
    """
    out_file = os.path.join(work_dir, os.path.basename(bam_files[0]))
    picard = broad.runner_from_config(config)
    picard.run_fn("picard_merge", bam_files, out_file)
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Example #17
def combine_bam(in_files, out_file, config):
    """Parallel target to combine multiple BAM files.
    """
    runner = broad.runner_from_path("picard", config)
    runner.run_fn("picard_merge", in_files, out_file)
    for in_file in in_files:
        save_diskspace(in_file, "Merged into {0}".format(out_file), config)
    bam.index(out_file, config)
    return out_file
Example #18
def _finalize_merge(out_file, bam_files, config):
    """Handle indexes and cleanups of merged BAM and input files.
    """
    # Ensure timestamps are up to date on output file and index
    # Works around issues on systems with inconsistent times
    for ext in ["", ".bai"]:
        if os.path.exists(out_file + ext):
            subprocess.check_call(["touch", out_file + ext])
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
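
The touch calls above refresh timestamps through the shell; a pure-Python alternative (not the original code) achieves the same with os.utime:

import os

out_file = "merged.bam"  # hypothetical path
for ext in ["", ".bai"]:
    path = out_file + ext
    if os.path.exists(path):
        os.utime(path, None)  # set atime and mtime to the current time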
Example #19
def merge_bam_files(bam_files, work_dir, config):
    """Merge multiple BAM files from a sample into a single BAM for processing.
    """
    bam_files.sort()
    out_file = os.path.join(work_dir, os.path.basename(bam_files[0]))
    picard = broad.runner_from_config(config)
    picard.run_fn("picard_merge", bam_files, out_file)
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Example #20
def _save_fastq_space(items):
    """Potentially save fastq space prior to merging, since alignments done.
    """
    to_cleanup = {}
    for data in (utils.to_single_data(x) for x in items):
        for fname in data.get("files", []):
            if os.path.realpath(fname).startswith(dd.get_work_dir(data)):
                to_cleanup[fname] = data["config"]
    for fname, config in to_cleanup.items():
        utils.save_diskspace(fname, "Cleanup prep files after alignment finished", config)
Example #21
def _finalize_merge(out_file, bam_files, config):
    """Handle indexes and cleanups of merged BAM and input files.
    """
    # Ensure timestamps are up to date on output file and index
    # Works around issues on systems with inconsistent times
    for ext in ["", ".bai"]:
        if os.path.exists(out_file + ext):
            subprocess.check_call(["touch", out_file + ext])
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
Example #22
def _save_fastq_space(items):
    """Potentially save fastq space prior to merging, since alignments done.
    """
    to_cleanup = {}
    for data in (utils.to_single_data(x) for x in items):
        for fname in data.get("files", []):
            if os.path.realpath(fname).startswith(dd.get_work_dir(data)):
                to_cleanup[fname] = data["config"]
    for fname, config in to_cleanup.items():
        utils.save_diskspace(fname, "Cleanup prep files after alignment finished", config)
Example #23
def recalibrate_sample(data):
    """Recalibrate quality values from aligned sample BAM file.
    """
    logger.info("Recalibrating %s with GATK" % str(data["name"]))
    if data["config"]["algorithm"]["recalibrate"]:
        recal_bam = recalibrate_quality(
            data["work_bam"], data["fastq1"], data["fastq2"], data["sam_ref"], data["dirs"], data["config"]
        )
        save_diskspace(data["work_bam"], "Recalibrated to %s" % recal_bam, data["config"])
        data["work_bam"] = recal_bam
    return [[data]]
Example #24
def recalibrate_sample(data):
    """Recalibrate quality values from aligned sample BAM file.
    """
    logger.info("Recalibrating %s with GATK" % str(data["name"]))
    if data["config"]["algorithm"]["recalibrate"]:
        recal_bam = recalibrate_quality(data["work_bam"], data["fastq1"],
                                        data["fastq2"], data["sam_ref"],
                                        data["dirs"], data["config"])
        save_diskspace(data["work_bam"], "Recalibrated to %s" % recal_bam,
                       data["config"])
        data["work_bam"] = recal_bam
    return [[data]]
Example #25
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name,
                    lane_name, config, config_file):
    """Convert SAM file to merged and sorted BAM file.
    """
    lane = lane_name.split("_")[0]
    cl = ["picard_sam_to_bam.py", "--name=%s" % sample_name,
            "--rg=%s" % lane, "--pu=%s" % lane_name,
            config_file, sam_file, ref_file, fastq1]
    if fastq2:
        cl.append(fastq2)
    subprocess.check_call(cl)
    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
Example #26
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [
                    merge_bam_files(xs, work_dir, config, out_file, i)
                    for i, xs in enumerate(
                        utils.partition_all(batch_size, bam_files))
                ]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" %
                                              os.path.splitext(out_file)[0]
                                              ) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (
                                merge_cl + " | "
                                "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                            )
                            do.run(
                                cmd.format(**locals()),
                                "Merge bam files to %s" %
                                os.path.basename(out_file), None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Example #27
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Example #28
File: merge.py Project: gturco/bcbb
def merge_bam_files(bam_files, work_dir, config):
    """Merge multiple BAM files from a sample into a single BAM for processing.
    """
    bam_files.sort()
    out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
    picard = broad.runner_from_config(config)
    if len(bam_files) == 1:
        if not os.path.exists(out_file):
            os.symlink(bam_files[0], out_file)
    else:
        picard.run_fn("picard_merge", bam_files, out_file)
        for b in bam_files:
            utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Example #29
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name, lane_name,
                    config, config_file):
    """Convert SAM file to merged and sorted BAM file.
    """
    lane = lane_name.split("_")[0]
    cl = [
        "picard_sam_to_bam.py",
        "--name=%s" % sample_name,
        "--rg=%s" % lane,
        "--pu=%s" % lane_name, config_file, sam_file, ref_file, fastq1
    ]
    if fastq2:
        cl.append(fastq2)
    subprocess.check_call(cl)
    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
Example #30
def mark_duplicates_sample(data):
    """Mark duplicate molecules in sample BAM file.
    """
    mark_duplicates = data["config"]["algorithm"].get("mark_duplicates", False)
    if not mark_duplicates:
        return [[data]]

    logger.info("Marking duplicates in {} with Picard".format(str(data["name"])))
    picard = broad.runner_from_config(data["config"])
    dup_bam, _ = picard_mark_duplicates(picard, data["work_bam"])
    reason = "Marked duplicates of {0} in {1}, so {0} is no longer needed" \
             "".format(data["work_bam"], dup_bam)
    save_diskspace(data["work_bam"], reason, data["config"])
    data["work_bam"] = dup_bam

    return [[data]]
Example #31
def merge_bam_files(bam_files, work_dir, config):
    """Merge multiple BAM files from a sample into a single BAM for processing.
    """
    out_file = os.path.join(work_dir, os.path.basename(bam_files[0]))
    if not os.path.exists(out_file):
        picard = BroadRunner(config["program"]["picard"],
                             max_memory=config["algorithm"].get(
                                 "java_memory", ""))
        with utils.curdir_tmpdir() as tmp_dir:
            opts = [("OUTPUT", out_file), ("SORT_ORDER", "coordinate"),
                    ("TMP_DIR", tmp_dir)]
            for b in bam_files:
                opts.append(("INPUT", b))
            picard.run("MergeSamFiles", opts)
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Example #32
def merge_bam_files(bam_files, work_dir, config):
    """Merge multiple BAM files from a sample into a single BAM for processing.
    """
    out_file = os.path.join(work_dir, os.path.basename(bam_files[0]))
    if not os.path.exists(out_file):
        picard = BroadRunner(config["program"]["picard"],
                             max_memory=config["algorithm"].get("java_memory", ""))
        with utils.curdir_tmpdir() as tmp_dir:
            opts = [("OUTPUT", out_file),
                    ("SORT_ORDER", "coordinate"),
                    ("TMP_DIR", tmp_dir)]
            for b in bam_files:
                opts.append(("INPUT", b))
            picard.run("MergeSamFiles", opts)
    for b in bam_files:
        utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
    return out_file
Example #33
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.
    """
    logger.info("Realigning %s with GATK: %s %s" % (data["name"], os.path.basename(data["work_bam"]), region))
    if data["config"]["algorithm"]["snpcall"]:
        sam_ref = data["sam_ref"]
        config = data["config"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file)
        else:
            realign_bam = gatk_realigner(
                data["work_bam"], sam_ref, config, configured_ref_file("dbsnp", config, sam_ref), region, out_file
            )
        if region is None:
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam, config)
        data["work_bam"] = realign_bam
    return [data]
Example #34
def mark_duplicates_sample(data):
    """Mark duplicate molecules in sample BAM file.
    """
    mark_duplicates = data["config"]["algorithm"].get("mark_duplicates", False)
    if not mark_duplicates:
        return [[data]]

    logger.info("Marking duplicates in {} with Picard".format(str(
        data["name"])))
    picard = broad.runner_from_config(data["config"])
    dup_bam, _ = picard_mark_duplicates(picard, data["work_bam"])
    reason = "Marked duplicates of {0} in {1}, so {0} is no longer needed" \
             "".format(data["work_bam"], dup_bam)
    save_diskspace(data["work_bam"], reason, data["config"])
    data["work_bam"] = dup_bam

    return [[data]]
Example #35
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir,
                                    os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file +
                                                                    ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError(
                    "More files to merge (%s) then available open file descriptors (%s)\n"
                    "See documentation on tips for changing file limits:\n"
                    "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                    "parallel.html#tuning-systems-for-scale" %
                    (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir,
                                   prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = (
                        "{bamtools} merge -list {bam_file_list} | "
                        "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                    )
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
Example #36
def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None, sample_name="", rg_name="", pu_name=""):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    picard = BroadRunner(config["program"]["picard"], max_memory=config["algorithm"].get("java_memory", ""))
    platform = config["algorithm"]["platform"]
    if platform.lower() == "illumina":
        qual_format = "Illumina"
    else:
        raise ValueError("Need to specify quality format for %s" % platform)
    index_ref_file(picard, ref_file)
    base_dir = os.path.split(align_sam)[0]
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(
            picard, fastq_one, fastq_pair, base_dir, platform, qual_format, sample_name, rg_name, pu_name, tmp_dir
        )
        out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file, tmp_dir, fastq_pair is not None)
        sort_bam = picard_sort(picard, out_bam, tmp_dir)
    save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
    save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
Example #37
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease")
            batch_size = system.open_file_limit() - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with utils.curdir_tmpdir({"config": config}) as tmpdir:
                with utils.chdir(tmpdir):
                    merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        with file_transaction("%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            tx_out_prefix = os.path.splitext(tx_out_file)[0]
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Example #38
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.
    """
    logger.info("Realigning %s with GATK: %s %s" %
                (data["name"], os.path.basename(data["work_bam"]), region))
    if data["config"]["algorithm"]["snpcall"]:
        sam_ref = data["sam_ref"]
        config = data["config"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file)
        else:
            realign_bam = gatk_realigner(
                data["work_bam"], sam_ref, config,
                configured_ref_file("dbsnp", config, sam_ref), region,
                out_file)
        if region is None:
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
Example #39
def apply_recal(data):
    """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM.
    """
    orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    had_work_bam = "work_bam" in data
    if dd.get_recalibrate(data) in [True, "gatk"]:
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data)))
            data["work_bam"] = _gatk_apply_bqsr(data)
    elif dd.get_recalibrate(data) == "sentieon":
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data)))
            data["work_bam"] = sentieon.apply_bqsr(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data)))
    # CWL does not have work/alignment BAM separation
    if not had_work_bam and dd.get_work_bam(data):
        data["align_bam"] = dd.get_work_bam(data)
    if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(data):
        utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"])
    return data
Example #40
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses samtools or bamtools for merging, both of which have some caveats.
    samtools can run into file system limits on command line length, while
    bamtools runs into open file handle issues.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            with utils.curdir_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    if len(bam_files) < 4096:
                        merge_cl = _samtools_cat(bam_files, tmpdir)
                    else:
                        merge_cl = _bamtools_merge(bam_files)
                    with file_transaction(out_file) as tx_out_file:
                        tx_out_prefix = os.path.splitext(tx_out_file)[0]
                        with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                            bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                            with open(bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            cmd = (merge_cl + " | "
                                   "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                            do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
Example #41
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".bai"):
            bamtools = config_utils.get_program("bamtools", config)
            samtools = config_utils.get_program("samtools", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = resources.get("memory", "1G")
            if len(bam_files) > system.open_file_limit():
                raise IOError("More files to merge (%s) then available open file descriptors (%s)\n"
                              "See documentation on tips for changing file limits:\n"
                              "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
                              "parallel.html#tuning-systems-for-scale"
                              % (len(bam_files), system.open_file_limit()))
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    bam_file_list = "%s.list" % os.path.splitext(out_file)[0]
                    with open(bam_file_list, "w") as out_handle:
                        for f in sorted(bam_files):
                            out_handle.write("%s\n" % f)
                    cmd = ("{bamtools} merge -list {bam_file_list} | "
                           "{samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                    do.run(cmd.format(**locals()), "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
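
system.open_file_limit is not shown in these examples; on POSIX systems the per-process descriptor ceiling it checks against can be read with the resource module. A sketch, assuming it reports something like the soft RLIMIT_NOFILE value:

import resource

def open_file_limit():
    # Soft limit on open file descriptors for the current process.
    return resource.getrlimit(resource.RLIMIT_NOFILE)[0]

print(open_file_limit())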
Example #42
File: metrics.py Project: vals/bcbb
    def report(self, align_bam, ref_file, is_paired, bait_file, target_file):
        """Produce report metrics using Picard with sorted aligned BAM file.
        """
        dup_bam, dup_metrics = self._get_current_dup_metrics(align_bam)
        align_metrics = self._collect_align_metrics(dup_bam, ref_file)
        # Prefer the GC metrics in FastQC instead of Picard
        # gc_graph, gc_metrics = self._gc_bias(dup_bam, ref_file)
        gc_graph = None
        insert_graph, insert_metrics, hybrid_metrics = (None, None, None)
        if is_paired:
            insert_graph, insert_metrics = self._insert_sizes(dup_bam)

        if bait_file and target_file:
            hybrid_metrics = self._hybrid_select_metrics(dup_bam, bait_file, target_file)

        vrn_vals = self._variant_eval_metrics(dup_bam)
        summary_info = self._parser.get_summary_metrics(
            align_metrics, dup_metrics, insert_metrics, hybrid_metrics, vrn_vals
        )
        pprint.pprint(summary_info)
        graphs = []
        if gc_graph and os.path.exists(gc_graph):
            graphs.append((gc_graph, "Distribution of GC content across reads"))

        if insert_graph and os.path.exists(insert_graph):
            graphs.append((insert_graph, "Distribution of paired end insert sizes"))

        # Attempt to clean up potential waste of space
        if dup_bam != align_bam:
            config = self._picard._config
            reason = (
                "Picard MarkDuplicates file {} only needed for metrics "
                "and has been removed to save space".format(dup_bam)
            )
            save_diskspace(dup_bam, reason, config)

        return summary_info, graphs
Example #43
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.
    """
    realigner = data["config"]["algorithm"].get("realign", True)
    realigner = "gatk" if realigner is True else realigner
    realign_fn = _realign_approaches[realigner] if realigner else None

    if realign_fn:
        logger.info("Realigning %s with %s: %s %s" % (data["name"], realigner,
                                                      os.path.basename(data["work_bam"]),
                                                      region))
        sam_ref = data["sam_ref"]
        config = data["config"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file)
        else:
            realign_bam = realign_fn(data["work_bam"], sam_ref, config,
                                     data["genome_resources"]["variation"]["dbsnp"],
                                     region, out_file)
        if region is None:
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
Example #44
def write_recal_bam(data, region=None, out_file=None):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    config = data["config"]
    if out_file is None:
        out_file = "%s-gatkrecal.bam" % os.path.splitext(data["work_bam"])[0]
    logger.info("Writing recalibrated BAM for %s to %s" % (data["name"], out_file))
    if region == "nochr":
        out_bam = write_nochr_reads(data["work_bam"], out_file, data["config"])
    else:
        out_bam = _run_recal_bam(data["work_bam"], data["prep_recal"],
                                 region, data["sam_ref"], out_file, config)
    qual_bin = config["algorithm"].get("quality_bin", None)
    if ((qual_bin is True or qual_bin == "postrecal" or
         isinstance(qual_bin, list) and "postrecal" in qual_bin)
         and has_aligned_reads(out_bam)):
        binned_bam = cram.illumina_qual_bin(out_bam, data["sam_ref"],
                                         os.path.dirname(out_bam), config)
        shutil.move(out_bam, out_bam + ".binned")
        shutil.move(binned_bam, out_bam)
        utils.save_diskspace(out_bam + ".binned",
                             "Quality binned to %s" % out_bam, config)
    data["work_bam"] = out_bam
    return [data]
Example #45
def merge_bam_files(bam_files, work_dir, config, out_file=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Uses bamtools for merging, which handles large numbers of input BAMs.
    """
    if len(bam_files) == 1:
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                with utils.tmpfile(dir=work_dir, prefix="bammergelist") as bam_file_list:
                    with open(bam_file_list, "w") as out_handle:
                        for f in bam_files:
                            out_handle.write("%s\n" % f)
                    cmd = [config_utils.get_program("bamtools", config),
                           "merge", "-list", bam_file_list, "-out", tx_out_file]
                    do.run(cmd, "Merge bam files", None)
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        picard = broad.runner_from_config(config)
        picard.run_fn("picard_index", out_file)
        return out_file
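
Passing the inputs through a list file with -list sidesteps shell command-length limits when merging very large numbers of BAMs. A sketch of building such a list file (inputs hypothetical):

bam_files = ["c.bam", "a.bam", "b.bam"]
with open("merge.list", "w") as out_handle:
    for f in sorted(bam_files):
        out_handle.write("%s\n" % f)
# bamtools merge -list merge.list -out merged.bam then reads the paths from it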
Example #46
    def report(self, align_bam, ref_file, is_paired, bait_file, target_file):
        """Produce report metrics using Picard with sorted aligned BAM file.
        """
        dup_bam, dup_metrics = self._get_current_dup_metrics(align_bam)
        align_metrics = self._collect_align_metrics(dup_bam, ref_file)
        # Prefer the GC metrics in FastQC instead of Picard
        # gc_graph, gc_metrics = self._gc_bias(dup_bam, ref_file)
        gc_graph = None
        insert_graph, insert_metrics, hybrid_metrics = (None, None, None)
        if is_paired:
            insert_graph, insert_metrics = self._insert_sizes(dup_bam)

        if bait_file and target_file:
            hybrid_metrics = self._hybrid_select_metrics(
                    dup_bam, bait_file, target_file)

        vrn_vals = self._variant_eval_metrics(dup_bam)
        summary_info = self._parser.get_summary_metrics(align_metrics,
                dup_metrics, insert_metrics, hybrid_metrics,
                vrn_vals)
        pprint.pprint(summary_info)
        graphs = []
        if gc_graph and os.path.exists(gc_graph):
            graphs.append((gc_graph, "Distribution of GC content across reads"))

        if insert_graph and os.path.exists(insert_graph):
            graphs.append((insert_graph, "Distribution of paired end insert sizes"))

        # Attempt to clean up potential waste of space
        if dup_bam != align_bam:
            config = self._picard._config
            reason = "Picard MarkDuplicates file {} only needed for metrics " \
            "and has been removed to save space".format(dup_bam)
            save_diskspace(dup_bam, reason, config)

        return summary_info, graphs
Example #47
def sam_to_sort_bam(sam_file, ref_file, fastq1, fastq2, sample_name,
                    lane_name, config):
    """Convert SAM file to merged and sorted BAM file.
    """
    rg_name = lane_name.split("_")[0]
    picard = broad.runner_from_config(config)
    platform = config["algorithm"]["platform"]
    base_dir = os.path.dirname(sam_file)

    picard.run_fn("picard_index_ref", ref_file)
    out_fastq_bam = picard.run_fn("picard_fastq_to_bam", fastq1, fastq2,
                                  base_dir, platform, sample_name, rg_name, lane_name)
    out_bam = picard.run_fn("picard_sam_to_bam", sam_file, out_fastq_bam, ref_file,
                            fastq2 is not None)
    sort_bam = picard.run_fn("picard_sort", out_bam)

    utils.save_diskspace(sam_file, "SAM converted to BAM", config)
    utils.save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
    utils.save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)

    return sort_bam
Example #48
def _save_diskspace(in_file, out_file, config):
    """Potentially remove input file to save space if configured and in work directory.
    """
    if (os.path.commonprefix([in_file, out_file]).rstrip("/") == os.path.split(
            os.path.dirname(out_file))[0]):
        save_diskspace(in_file, "Trimmed to {}".format(out_file), config)
Example #49
def _save_diskspace(in_file, out_file, config):
    """Potentially remove input file to save space if configured and in work directory.
    """
    if (os.path.commonprefix([in_file, out_file]).rstrip("/") ==
        os.path.split(os.path.dirname(out_file))[0]):
        save_diskspace(in_file, "Trimmed to {}".format(out_file), config)
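
The commonprefix test only permits cleanup when the input sits beside the output inside the same work directory. A worked example with hypothetical paths:

import os

in_file = "/work/trim/sample_1.fastq"
out_file = "/work/align/sample_1.bam"
# commonprefix compares character-wise, yielding "/work/" here.
prefix = os.path.commonprefix([in_file, out_file]).rstrip("/")  # "/work"
parent = os.path.split(os.path.dirname(out_file))[0]            # "/work"
print(prefix == parent)  # True -> safe to remove the trimmed input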