Esempio n. 1
0
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Build the samblaster de-duplication pipeline with samtools sorting.

    Returns a shell command string (nothing is executed here). samblaster's
    split-read and discordant-pair outputs are sent through ``>(...)``
    process substitution into their own ``samtools sort`` calls writing
    ``tx_sr_file`` and ``tx_disc_file``; its standard output is piped into a
    third sort that produces ``tx_out_file``, optionally followed by the
    maximum-coverage downsampling command.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    sort_template = "{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -"

    def _sort_cl(sort_opt, cores, mem, dext, out_file):
        # One samtools sort invocation reading from stdin ("-").
        return sort_template.format(samtools=samtools, sort_opt=sort_opt, cores=cores, mem=mem,
                                    tmp_prefix=tmp_prefix, dext=dext, out_file=out_file)

    # Full BAM -- resources requested with downscale=2
    full_cores, full_mem = _get_cores_memory(data, downscale=2)
    # Downsampling to maximum coverage is only considered when not splitting
    if data.get("align_split"):
        ds_cmd = None
    else:
        ds_cmd = bam.get_maxcov_downsample_cl(data, "samtools")
    # Name sort only for split alignments that will have duplicates marked
    if data.get("align_split") and dd.get_mark_duplicates(data):
        full_sort_opt = "-n"
    else:
        full_sort_opt = ""
    if ds_cmd:
        # Sort to stdout, hand over to the downsampling command, redirect to the output
        full_sort = _sort_cl(full_sort_opt, full_cores, full_mem, "full", "")
        dedup_cmd = "%s %s > %s" % (full_sort, ds_cmd, tx_out_file)
    else:
        dedup_cmd = _sort_cl(full_sort_opt, full_cores, full_mem, "full", "-o %s" % tx_out_file)

    # Split and discordant BAMs -- smaller files, resources requested with downscale=4
    side_cores, side_mem = _get_cores_memory(data, downscale=4)
    splitter_cmd = _sort_cl("", side_cores, side_mem, "spl", "-o %s" % tx_sr_file)
    discordant_cmd = _sort_cl("", side_cores, side_mem, "disc", "-o %s" % tx_disc_file)

    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    pipeline = ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                "| {dedup_cmd}")
    return pipeline.format(samblaster=samblaster, splitter_cmd=splitter_cmd,
                           discordant_cmd=discordant_cmd, dedup_cmd=dedup_cmd)
Esempio n. 2
0
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Assemble the command line for samblaster de-duplication plus sorting.

    The result is a single shell string: samblaster writes split reads and
    discordant pairs into process substitutions that sort them into
    ``tx_sr_file`` and ``tx_disc_file``, while the main stream is piped into a
    ``samtools sort`` producing ``tx_out_file``. The command is only built
    here, not run.
    """
    config = data["config"]
    samblaster = config_utils.get_program("samblaster", config)
    samtools = config_utils.get_program("samtools", config)
    # Fields shared by all three samtools sort invocations
    shared = {"samtools": samtools,
              "tmp_prefix": "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]}
    sort_tmpl = "{samtools} sort {sort_opt} -@ {cores} -m {mem} -T {tmp_prefix}-{dext} {out_file} -"

    # Main BAM: resources from downscale=2
    cores, mem = _get_cores_memory(data, downscale=2)
    # Maximum-coverage downsampling is only looked up when the alignment is not split
    ds_cmd = bam.get_maxcov_downsample_cl(data, "samtools") if not data.get("align_split") else None
    name_sort = data.get("align_split") and dd.get_mark_duplicates(data)
    full_args = dict(shared, sort_opt="-n" if name_sort else "", cores=cores, mem=mem, dext="full")
    if ds_cmd:
        # No -o here: sorted output flows on stdout into the downsampling command
        dedup_cmd = "%s %s > %s" % (sort_tmpl.format(out_file="", **full_args), ds_cmd, tx_out_file)
    else:
        dedup_cmd = sort_tmpl.format(out_file="-o %s" % tx_out_file, **full_args)

    # Split-read and discordant BAMs: never name sorted, resources from downscale=4
    cores, mem = _get_cores_memory(data, downscale=4)
    side_args = dict(shared, sort_opt="", cores=cores, mem=mem)
    splitter_cmd = sort_tmpl.format(out_file="-o %s" % tx_sr_file, dext="spl", **side_args)
    discordant_cmd = sort_tmpl.format(out_file="-o %s" % tx_disc_file, dext="disc", **side_args)

    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    return ("{samblaster} --addMateTags -M --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
            "| {dedup_cmd}").format(samblaster=samblaster, splitter_cmd=splitter_cmd,
                                    discordant_cmd=discordant_cmd, dedup_cmd=dedup_cmd)
Esempio n. 3
0
def _biobambam_merge_maxcov(data):
    """Return a bammerge command template with maximum-coverage downsampling.

    Merges the BAM files listed in ``{tx_bam_file_list}`` without
    de-duplication. The ``{tx_bam_file_list}`` and ``{tx_out_file}``
    placeholders are left unfilled in the returned string; presumably the
    caller formats them.
    """
    downsample_cl = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    template = "bammerge IL={tx_bam_file_list} tmpfile={tx_out_file}-merge %s > {tx_out_file}"
    # Splice the downsampling arguments in between the merge and the redirect
    return template % downsample_cl
Esempio n. 4
0
def _biobambam_merge_maxcov(data):
    """Build the bammerge command template, including coverage downsampling.

    No duplicate marking is done. The string that comes back still contains
    the ``{tx_bam_file_list}`` and ``{tx_out_file}`` format fields, which are
    not filled in here.
    """
    merge_template = "bammerge IL={tx_bam_file_list} tmpfile={tx_out_file}-merge %s > {tx_out_file}"
    # The downsampling piece comes from the shared helper in bamsormadup mode
    return merge_template % bam.get_maxcov_downsample_cl(data, "bamsormadup")
Esempio n. 5
0
def _biobambam_merge_dedup_maxcov(data):
    """Return a bamcat | bamsormadup command template with coverage downsampling.

    The template concatenates the files listed in ``{tx_bam_file_list}`` and
    streams them through bamsormadup, appending the maximum-coverage
    downsampling arguments before the redirect to ``{tx_out_file}``. The
    ``{tx_bam_file_list}``, ``{tx_out_file}`` and ``{num_cores}`` fields are
    left for later formatting.
    """
    downsample_cl = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    template = ("bamcat level=0 tmpfile={tx_out_file}-bammerge `cat {tx_bam_file_list}` | "
                "bamsormadup threads={num_cores} "
                "tmpfile={tx_out_file}-bamsormaduptmp %s > {tx_out_file}")
    return template % downsample_cl
Esempio n. 6
0
def _biobambam_merge_dedup_maxcov(data):
    """Build the merge + de-duplication command template for split BAM files.

    Produces a ``bamcat ... | bamsormadup ...`` string with the
    maximum-coverage downsampling arguments inserted ahead of the output
    redirect. ``{tx_bam_file_list}``, ``{tx_out_file}`` and ``{num_cores}``
    remain as unfilled format fields in the result.
    """
    stages = (
        # concatenate every BAM named in the list file, uncompressed
        "bamcat level=0 tmpfile={tx_out_file}-bammerge `cat {tx_bam_file_list}` | ",
        # sort and mark duplicates on the combined stream
        "bamsormadup threads={num_cores} ",
        "tmpfile={tx_out_file}-bamsormaduptmp %s > {tx_out_file}",
    )
    ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
    return "".join(stages) % ds_cmd
Esempio n. 7
0
def _biobambam_dedup_sort(data, tx_out_file):
    """Return the streaming sort command that writes ``tx_out_file``.

    Split alignments get a ``samtools sort -n`` name sort; everything else
    goes through bamsormadup with coordinate ordering and the
    maximum-coverage downsampling arguments.
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if not data.get("align_split"):
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        cmd = "bamsormadup inputformat=sam threads={cores} tmpfile={tmp_file}-markdup SO=coordinate %s > {tx_out_file}"
        # ds_cmd is spliced in before formatting so any fields it carries are expanded too
        cmd = cmd % ds_cmd
    else:
        cmd = ("{samtools} sort -n -@ {cores} -m {mem} -O bam "
               "-T {tmp_file}-namesort -o {tx_out_file} -")
    # Template fields are resolved from the local variable names above
    return cmd.format(**locals())
Esempio n. 8
0
def _biobambam_dedup_sort(data, tx_out_file):
    """Return the streaming sort command that writes ``tx_out_file``.

    For split alignments this is a ``samtools sort``, name sorted only when
    duplicate marking is enabled. Otherwise it is a bamsormadup coordinate
    sort with the maximum-coverage downsampling arguments appended.
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if not data.get("align_split"):
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        cmd = "bamsormadup inputformat=sam threads={cores} tmpfile={tmp_file}-markdup SO=coordinate %s > {tx_out_file}"
        # ds_cmd is spliced in before formatting so any fields it carries are expanded too
        cmd = cmd % ds_cmd
    else:
        # align_split is already known to be set on this path
        if dd.get_mark_duplicates(data):
            sort_opt = "-n"
        else:
            sort_opt = ""
        cmd = "{samtools} sort %s -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -"
        cmd = cmd % sort_opt
    # Template fields are resolved from the local variable names above
    return cmd.format(**locals())
Esempio n. 9
0
def _biobambam_dedup_sort(data, tx_out_file):
    """Return the streaming sort command that writes ``tx_out_file``.

    For split alignments this is a ``samtools sort``, name sorted only when
    ``_check_dedup`` reports that de-duplication applies. Otherwise it is a
    bamsormadup coordinate sort, run on a reduced thread count, with the
    maximum-coverage downsampling arguments appended.
    """
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=2)
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    if not data.get("align_split"):
        # scale core usage to avoid memory issues with larger WGS samples
        scaled = int(math.ceil(cores * 0.75))
        cores = max(1, scaled)
        ds_cmd = bam.get_maxcov_downsample_cl(data, "bamsormadup")
        bamsormadup = config_utils.get_program("bamsormadup", data)
        cmd = "{bamsormadup} inputformat=sam threads={cores} tmpfile={tmp_file}-markdup SO=coordinate %s > {tx_out_file}"
        # ds_cmd is spliced in before formatting so any fields it carries are expanded too
        cmd = cmd % ds_cmd
    else:
        # align_split is already known to be set on this path
        if _check_dedup(data):
            sort_opt = "-n"
        else:
            sort_opt = ""
        cmd = "{samtools} sort %s -@ {cores} -m {mem} -O bam -T {tmp_file}-namesort -o {tx_out_file} -"
        cmd = cmd % sort_opt
    # Template fields are resolved from the local variable names above
    return cmd.format(**locals())