Exemple #1
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign an input BAM with bwa mem inside a single shell pipeline.

    Stages are connected with pipes so no intermediate files are written:
     - samtools sort of the input BAM by queryname
     - bedtools bamtofastq writing interleaved FASTQ to stdout
     - bwa mem reading the interleaved reads from stdin
     - the post-alignment command line supplied by postalign.tobam_cl

    Returns the path to `<lane>-sort.bam` inside align_dir; the pipeline only
    runs when that file does not exist yet.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools_exe = config_utils.get_program("samtools", config)
    bedtools_exe = config_utils.get_program("bedtools", config)
    bwa_exe = config_utils.get_program("bwa", config)
    samtools_resources = config_utils.get_resources("samtools", config)
    cores = config["algorithm"].get("num_cores", 1)
    # samtools is used for both input and output, so its memory is decreased
    sort_mem = config_utils.adjust_memory(samtools_resources.get("memory", "1G"),
                                          3, "decrease")
    read_group = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    with utils.curdir_tmpdir():
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            # temporary prefix handed to the queryname sort
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            pipeline = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                        "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
            pipeline = pipeline.format(samtools=samtools_exe, num_cores=cores, max_mem=sort_mem,
                                       in_bam=in_bam, prefix1=sort_prefix, bedtools=bedtools_exe,
                                       bwa=bwa_exe, rg_info=read_group, ref_file=ref_file)
            # tobam_cl is appended after formatting so its content is never interpolated
            do.run(pipeline + tobam_cl, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Exemple #2
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run a non-piped BWA alignment and return the path of the SAM file.

    Each input read file is first passed through `_run_bwa_align` to produce
    a .sai file; `bwa samse` (single end) or `bwa sampe` (paired end) then
    writes the SAM output. Existing .sai and SAM files are reused instead of
    being regenerated. Split alignments are rejected by an assertion.
    """
    assert not data.get("align_split"), "Do not handle split alignments with non-piped bwa"
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % names["lane"])
    sai2_file = None
    if pair_file:
        sai2_file = os.path.join(align_dir, "%s_2.sai" % names["lane"])
    sam_file = os.path.join(align_dir, "%s.sam" % names["lane"])
    if utils.file_exists(sam_file):
        return sam_file
    # Build whichever suffix array index files are still missing
    for reads, sai_file in ((fastq_file, sai1_file), (pair_file, sai2_file)):
        if sai_file and not utils.file_exists(sai_file):
            with file_transaction(sai_file) as tx_sai_file:
                _run_bwa_align(reads, ref_file, tx_sai_file, config)
    if sai2_file:
        align_type = "sampe"
        inputs = [sai1_file, sai2_file, fastq_file, pair_file]
    else:
        align_type = "samse"
        inputs = [sai1_file, fastq_file]
    read_group = novoalign.get_rg_info(names)
    sam_cl = [config_utils.get_program("bwa", config), align_type, "-r",
              "'%s'" % read_group, ref_file] + inputs
    with file_transaction(sam_file) as tx_sam_file:
        cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
        do.run(cmd, "bwa {align_type}".format(align_type=align_type), None)
    return sam_file
Exemple #3
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with SNAP, piping its SAM output into the tobam command.

    The output BAM path is `<align_dir>/<lane>-sort.bam`; it is stored in
    data["work_bam"] and the updated data dictionary is returned. The
    alignment is skipped when the output file already exists. Split
    alignments are rejected by an assertion.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    config = data["config"]
    snap_exe = config_utils.get_program("snap", config)
    cores = data["config"]["algorithm"].get("num_cores", 1)
    # resources are looked up but the result is not used by this command
    config_utils.get_resources("snap", data["config"])
    read_group = novoalign.get_rg_info(names)
    # BAM inputs are inspected directly; for fastq the mate file decides
    if fastq_file.endswith(".bam"):
        is_paired = bam.is_paired(fastq_file)
    else:
        is_paired = pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            mode = "paired" if is_paired else "single"
            snap_cl = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                       "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            snap_cl = snap_cl.format(snap=snap_exe, cmd_name=mode, index_dir=index_dir,
                                     fastq_file=fastq_file, pair_file=pair_file,
                                     rg_info=read_group, num_cores=cores)
            do.run(snap_cl + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Exemple #4
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with SNAP, writing directly to a transactional BAM file.

    The output path is `<align_dir>/<lane>-sort.bam`; it is stored in
    data["work_bam"] and the updated data dictionary is returned. SNAP is
    only run when the output does not already exist. Split alignments are
    rejected by an assertion.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap_exe = config_utils.get_program("snap", data["config"])
    cores = data["config"]["algorithm"].get("num_cores", 1)
    snap_resources = config_utils.get_resources("snap", data["config"])
    sort_mem = snap_resources.get("memory", "1G")
    read_group = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data):
                # BAM inputs are inspected directly; for fastq the mate file decides
                if fastq_file.endswith(".bam"):
                    paired = bam.is_paired(fastq_file)
                else:
                    paired = bool(pair_file)
                mode = "paired" if paired else "single"
                snap_cl = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                           "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
                snap_cl = snap_cl.format(snap=snap_exe, cmd_name=mode, index_dir=index_dir,
                                         fastq_file=fastq_file, pair_file=pair_file,
                                         rg_info=read_group, num_cores=cores,
                                         max_mem=sort_mem, tx_out_file=tx_out_file)
                do.run(snap_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Exemple #5
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with minimap2, piping its SAM output into the tobam command.

    The output BAM is named after the sample, with a "-cumi" suffix when
    "umi_bam" is present in data. With "align_split" set, inputs and outputs
    are rewritten through alignprep for per-split processing. The final
    output path is stored in data["work_bam"] and data is returned.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    cores = data["config"]["algorithm"].get("num_cores", 1)
    read_group = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = (utils.file_exists(out_file) or
                    (final_file is not None and utils.file_exists(final_file)))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # When index_dir points at a single index file, look for a
            # preset-specific .mmi index alongside it
            mmi_file = None
            if index_dir and os.path.isfile(index_dir):
                mmi_name = "%s-%s.mmi" % (dd.get_genome_build(data), preset)
                mmi_file = os.path.join(os.path.dirname(index_dir), mmi_name)
            # Fall back to the reference file when no usable index exists
            if mmi_file and os.path.exists(mmi_file):
                target = mmi_file
            else:
                target = dd.get_ref_file(data)
            minimap_cl = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                          "{fastq_file} {pair_file} | ")
            minimap_cl = minimap_cl.format(preset=preset, rg_info=read_group, num_cores=cores,
                                           index_file=target, fastq_file=fastq_file,
                                           pair_file=pair_file)
            do.run(minimap_cl + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Exemple #6
0
def fixrg(in_bam, names, ref_file, dirs, data):
    """Replace the read group of a BAM file using samtools addreplacerg.

    addreplacerg does not remove the old read group, causing confusion when
    checking. To work around this, a header without @RG lines is written
    first and applied with samtools reheader before addreplacerg runs.

    Returns the path of the fixed BAM inside the per-sample "bamclean" work
    directory. Nothing is run when that file is already up to date relative
    to in_bam.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-fixrg.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        read_group = novoalign.get_rg_info(names)
        header_file = "%s-header.txt" % os.path.splitext(out_file)[0]
        # Step 1: dump the existing header without any @RG lines
        strip_cl = "samtools view -H {in_bam} | grep -v ^@RG > {new_header}"
        do.run(strip_cl.format(in_bam=in_bam, new_header=header_file),
               "Create empty RG header: %s" % sample_name)
        # Step 2: apply the cleaned header, then add the new read group
        fix_cl = (
            "samtools reheader {new_header} {in_bam} | "
            "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -"
        )
        do.run(fix_cl.format(new_header=header_file, in_bam=in_bam,
                             rg_info=read_group, tx_out_file=tx_out_file),
               "Fix read groups: %s" % sample_name)
    return out_file
Exemple #7
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa and record the output BAM in data.

    Inputs are rewritten as named-pipe command lines when "align_split" is
    set, or as quality-conversion command lines when the configured quality
    format is "illumina". The alignment uses bwa mem when `_can_use_mem`
    allows it and falls back to the older bwa aln approach otherwise.

    Returns data with "work_bam" set to the output BAM path.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    read_group = novoalign.get_rg_info(names)
    already_done = (utils.file_exists(out_file) or
                    (final_file is not None and utils.file_exists(final_file)))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        run_alignment = _align_mem if _can_use_mem(fastq_file, data) else _align_backtrack
        out_file = run_alignment(fastq_file, pair_file, ref_file, out_file,
                                 names, read_group, data)
    data["work_bam"] = out_file
    return data
Exemple #8
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. The read group header
    is also replaced, as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.

    Returns the path to the cleaned BAM in the per-sample "bamclean" work
    directory; nothing is run when that file is up to date relative to in_bam.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % base_name)
    # Use the sample-based name unless an input-based output already exists
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % sample_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        chrom_args = " ".join(_target_chroms_and_header(in_bam, data))
        read_group = novoalign.get_rg_info(data["rgnames"])
        python_exe = sys.executable
        ref_file = dd.get_ref_file(data)
        # Link the input into the transaction directory and index it there
        local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
        cores = dd.get_cores(data)
        utils.symlink_plus(in_bam, local_bam)
        bam.index(local_bam, data["config"])
        clean_cl = ("samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                    """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                    """cleanbam.fix_header("{ref_file}")' | """
                    "samtools view -@ {cores} -u - | "
                    "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        clean_cl = clean_cl.format(cores=cores, local_bam=local_bam, str_chroms=chrom_args,
                                   bcbio_py=python_exe, ref_file=ref_file,
                                   rg_info=read_group, tx_out_file=tx_out_file)
        do.run(clean_cl, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Exemple #9
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Realign an input BAM with bwa mem inside a single shell pipeline.

    Stages are connected with pipes so no intermediate files are written:
     - samtools sort of the input BAM by queryname
     - bedtools bamtofastq writing interleaved FASTQ to stdout
     - bwa mem reading the interleaved reads from stdin
     - samtools view converting the alignments to uncompressed BAM
     - samtools sort writing the final BAM

    Returns the path to `<lane>-sort.bam` inside align_dir; the pipeline only
    runs when that file does not exist yet.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools_exe = config_utils.get_program("samtools", config)
    bedtools_exe = config_utils.get_program("bedtools", config)
    bwa_exe = config_utils.get_program("bwa", config)
    samtools_resources = config_utils.get_resources("samtools", config)
    cores = config["algorithm"].get("num_cores", 1)
    sort_mem = samtools_resources.get("memory", "768M")
    read_group = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    _check_samtools_version()
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            # the final sort takes an output prefix, not a file name
            final_prefix = os.path.splitext(tx_out_file)[0]
            name_sort_prefix = "%s-in1" % final_prefix
            pipeline = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                        "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                        "| {samtools} view -b -S -u - "
                        "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
            pipeline = pipeline.format(samtools=samtools_exe, num_cores=cores, max_mem=sort_mem,
                                       in_bam=in_bam, prefix1=name_sort_prefix,
                                       bedtools=bedtools_exe, bwa=bwa_exe, rg_info=read_group,
                                       ref_file=ref_file, tx_out_prefix=final_prefix)
            do.run(pipeline, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file)])
    return out_file
Exemple #10
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with SNAP, writing directly to a transactional BAM file.

    The output path is `<align_dir>/<lane>-sort.bam`; it is stored in
    data["work_bam"] and the updated data dictionary is returned. SNAP is
    only run when the output does not already exist. Split alignments are
    rejected by an assertion.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap_exe = config_utils.get_program("snap", data["config"])
    cores = data["config"]["algorithm"].get("num_cores", 1)
    snap_resources = config_utils.get_resources("snap", data["config"])
    sort_mem = snap_resources.get("memory", "1G")
    read_group = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data):
                # BAM inputs are inspected directly; for fastq the mate file decides
                if fastq_file.endswith(".bam"):
                    paired = bam.is_paired(fastq_file)
                else:
                    paired = bool(pair_file)
                mode = "paired" if paired else "single"
                snap_cl = (
                    "{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                    "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}"
                )
                snap_cl = snap_cl.format(snap=snap_exe, cmd_name=mode, index_dir=index_dir,
                                         fastq_file=fastq_file, pair_file=pair_file,
                                         rg_info=read_group, num_cores=cores,
                                         max_mem=sort_mem, tx_out_file=tx_out_file)
                do.run(snap_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Exemple #11
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa and record the output BAM in data.

    Inputs are rewritten as named-pipe command lines when "align_split" is
    set, or as quality-conversion command lines when the configured quality
    format is "illumina". The older bwa aln approach is used when "bwa-mem"
    is listed in the algorithm's tools_off or `_can_use_mem` rejects the
    input; otherwise bwa mem is used.

    Returns data with "work_bam" set to the output BAM path.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    read_group = novoalign.get_rg_info(names)
    already_done = (utils.file_exists(out_file) or
                    (final_file is not None and utils.file_exists(final_file)))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        tools_off = tz.get_in(["config", "algorithm", "tools_off"], data, [])
        use_backtrack = "bwa-mem" in tools_off or not _can_use_mem(fastq_file, data)
        run_alignment = _align_backtrack if use_backtrack else _align_mem
        out_file = run_alignment(fastq_file, pair_file, ref_file, out_file,
                                 names, read_group, data)
    data["work_bam"] = out_file
    return data
Exemple #12
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Realign an input BAM with bwa mem inside a single shell pipeline.

    Stages are connected with pipes so no intermediate files are written:
     - samtools sort of the input BAM by queryname
     - bedtools bamtofastq writing interleaved FASTQ to stdout
     - bwa mem reading the interleaved reads from stdin
     - samtools view converting the alignments to uncompressed BAM
     - samtools sort writing the final BAM

    Returns the path to `<lane>-sort.bam` inside align_dir; the pipeline only
    runs when that file does not exist yet.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools_exe = config_utils.get_program("samtools", config)
    bedtools_exe = config_utils.get_program("bedtools", config)
    bwa_exe = config_utils.get_program("bwa", config)
    samtools_resources = config_utils.get_resources("samtools", config)
    cores = config["algorithm"].get("num_cores", 1)
    # samtools is used for both input and output, so its memory is decreased
    sort_mem = config_utils.adjust_memory(samtools_resources.get("memory", "1G"),
                                          3, "decrease")
    read_group = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    novoalign.check_samtools_version(config)
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            # the final sort takes an output prefix, not a file name
            final_prefix = os.path.splitext(tx_out_file)[0]
            name_sort_prefix = "%s-in1" % final_prefix
            pipeline = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                        "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                        "| {samtools} view -b -S -u - "
                        "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
            pipeline = pipeline.format(samtools=samtools_exe, num_cores=cores, max_mem=sort_mem,
                                       in_bam=in_bam, prefix1=name_sort_prefix,
                                       bedtools=bedtools_exe, bwa=bwa_exe, rg_info=read_group,
                                       ref_file=ref_file, tx_out_prefix=final_prefix)
            do.run(pipeline, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file)])
    return out_file
Exemple #13
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Align fastq inputs with bwa mem, piping into samtools view and sort.

    Returns the path to `<lane>-sort.bam` inside align_dir. The pipeline is
    only run when that file does not exist yet; a missing pair_file is passed
    to bwa as an empty argument.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools_exe = config_utils.get_program("samtools", config)
    bwa_exe = config_utils.get_program("bwa", config)
    samtools_resources = config_utils.get_resources("samtools", config)
    cores = config["algorithm"].get("num_cores", 1)
    # samtools runs alongside the alignment, so its memory is decreased
    sort_mem = config_utils.adjust_memory(samtools_resources.get("memory", "2G"),
                                          3, "decrease")
    read_group = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    novoalign.check_samtools_version(config)
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            # the final sort takes an output prefix, not a file name
            final_prefix = os.path.splitext(tx_out_file)[0]
            pipeline = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                        "{fastq_file} {pair_file} "
                        "| {samtools} view -b -S -u - "
                        "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
            pipeline = pipeline.format(bwa=bwa_exe, num_cores=cores, rg_info=read_group,
                                       ref_file=ref_file, fastq_file=fastq_file,
                                       pair_file=pair_file, samtools=samtools_exe,
                                       max_mem=sort_mem, tx_out_prefix=final_prefix)
            do.run(pipeline, "bwa mem alignment from fastq: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file)])
    return out_file
Exemple #14
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq (or .sdf) inputs with bwa and record the output BAM in data.

    Inputs are rewritten as named-pipe command lines when "align_split" is
    set or the input is an .sdf file, or as quality-conversion command lines
    when the configured quality format is "illumina". bwa mem is used when
    "bwa-mem" is in tools_on; otherwise the older bwa aln approach is used if
    "bwa-mem" is in tools_off or `_can_use_mem` rejects the input.

    Returns data with "work_bam" set to the output BAM path.
    """
    pair_file = pair_file or ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    final_file = None
    is_sdf = fastq_file.endswith(".sdf")
    if data.get("align_split") or is_sdf:
        if is_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    read_group = novoalign.get_rg_info(names)
    already_done = (utils.file_exists(out_file) or
                    (final_file is not None and utils.file_exists(final_file)))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        use_backtrack = ("bwa-mem" not in dd.get_tools_on(data) and
                         ("bwa-mem" in dd.get_tools_off(data) or
                          not _can_use_mem(fastq_file, data, min_size)))
        run_alignment = _align_backtrack if use_backtrack else _align_mem
        out_file = run_alignment(fastq_file, pair_file, ref_file, out_file,
                                 names, read_group, data)
    data["work_bam"] = out_file
    return data
Exemple #15
0
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Build the bwa mem command line, adding alt-aware post-processing if needed.

    When `<ref_file>.alt` exists, the bwakit `bwa-postalt.js` script is
    appended to the pipe, writing HLA output under an "hla" directory next to
    out_file. Returns the command as a string; nothing is executed here.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    alt_part = ""
    if utils.file_exists(alt_file):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla"))
        hla_base = os.path.join(hla_dir, os.path.basename(out_file) + ".hla")
        alt_part = (" | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}")
        alt_part = alt_part.format(bwakit_dir=bwakit_dir, hla_base=hla_base, alt_file=alt_file)
    bwa_exe = config_utils.get_program("bwa", data["config"])
    cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    # extra user-supplied options; an absent "options" entry yields an empty string
    extra_opts = " ".join(str(opt) for opt in bwa_resources.get("options", []))
    read_group = novoalign.get_rg_info(data["rgnames"])
    # without a second fastq, pass -p to bwa mem
    pairing = "" if fastq2 else "-p"
    # Restrict seed occurrences to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_part = ("{bwa} mem {pairing} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 "
                "{ref_file} {fastq1} {fastq2} ")
    bwa_part = bwa_part.format(bwa=bwa_exe, pairing=pairing, mem_usage=mem_usage,
                               num_cores=cores, bwa_params=extra_opts, rg_info=read_group,
                               ref_file=ref_file, fastq1=fastq1, fastq2=fastq2)
    return bwa_part + alt_part
Exemple #16
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. The read group header
    is also replaced, as in fixrg.

    Returns the path to the cleaned BAM in the per-sample "bamclean" work
    directory; nothing is run when that file is up to date relative to in_bam.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        # Keep only the reference contigs accepted by is_autosomal_or_sex
        target_chroms = []
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if chromhacks.is_autosomal_or_sex(contig.name):
                target_chroms.append(contig.name)
        read_group = novoalign.get_rg_info(data["rgnames"])
        clean_cl = (
            "samtools view -h {in_bam} {str_chroms} | "
            """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
            """cleanbam.fix_header("{comma_chroms}")' | """
            "samtools view -u - | "
            "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - "
        )
        clean_cl = clean_cl.format(in_bam=in_bam, str_chroms=" ".join(target_chroms),
                                   bcbio_py=sys.executable,
                                   comma_chroms=",".join(target_chroms),
                                   rg_info=read_group, tx_out_file=tx_out_file)
        do.run(clean_cl, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Exemple #17
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with minimap2, piping its SAM output into the tobam command.

    The output BAM is named after the sample, with a "-cumi" suffix when
    "umi_bam" is present in data. With "align_split" set, inputs and outputs
    are rewritten through alignprep for per-split processing. The final
    output path is stored in data["work_bam"] and data is returned.

    index_dir is accepted for interface compatibility but is not used: the
    alignment always runs against the reference file.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    cores = data["config"]["algorithm"].get("num_cores", 1)
    read_group = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = (utils.file_exists(out_file) or
                    (final_file is not None and utils.file_exists(final_file)))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built minimap2 indices are skipped: they provide only slight
            # speed-ups and give inconsistent outputs in BAM headers, so the
            # reference file is always the alignment target.
            target = dd.get_ref_file(data)
            minimap_cl = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                          "{fastq_file} {pair_file} | ")
            minimap_cl = minimap_cl.format(preset=preset, rg_info=read_group, num_cores=cores,
                                           index_file=target, fastq_file=fastq_file,
                                           pair_file=pair_file)
            do.run(minimap_cl + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Exemple #18
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. The read group header
    is also replaced, as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.

    Returns the path to the cleaned BAM in the per-sample "bamclean" work
    directory; nothing is run when that file is up to date relative to in_bam.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        chrom_args = " ".join(_target_chroms_and_header(in_bam, data))
        read_group = novoalign.get_rg_info(data["rgnames"])
        python_exe = sys.executable
        ref_file = dd.get_ref_file(data)
        clean_cl = ("samtools view -h {in_bam} {str_chroms} | "
                    """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                    """cleanbam.fix_header("{ref_file}")' | """
                    "samtools view -u - | "
                    "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        clean_cl = clean_cl.format(in_bam=in_bam, str_chroms=chrom_args, bcbio_py=python_exe,
                                   ref_file=ref_file, rg_info=read_group,
                                   tx_out_file=tx_out_file)
        do.run(clean_cl, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Exemple #19
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Realign an input BAM with bwa mem inside a single shell pipeline.

    Stages are connected with pipes so no intermediate files are written:
     - samtools sort of the input BAM by queryname
     - bedtools bamtofastq writing interleaved FASTQ to stdout
     - bwa mem reading the interleaved reads from stdin
     - samtools view converting the alignments to uncompressed BAM
     - samtools sort writing the final BAM

    The command is logged and then executed through the shell. Returns the
    path to `<lane>-sort.bam` inside align_dir; the pipeline only runs when
    that file does not exist yet.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools_exe = config_utils.get_program("samtools", config)
    bedtools_exe = config_utils.get_program("bedtools", config)
    bwa_exe = config_utils.get_program("bwa", config)
    samtools_resources = config_utils.get_resources("samtools", config)
    cores = config["algorithm"].get("num_cores", 1)
    sort_mem = samtools_resources.get("memory", "768M")
    read_group = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    _check_samtools_version()
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            # the final sort takes an output prefix, not a file name
            final_prefix = os.path.splitext(tx_out_file)[0]
            name_sort_prefix = "%s-in1" % final_prefix
            pipeline = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                        "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                        "| {samtools} view -b -S -u - "
                        "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
            shell_cmd = pipeline.format(samtools=samtools_exe, num_cores=cores, max_mem=sort_mem,
                                        in_bam=in_bam, prefix1=name_sort_prefix,
                                        bedtools=bedtools_exe, bwa=bwa_exe, rg_info=read_group,
                                        ref_file=ref_file, tx_out_prefix=final_prefix)
            logger.info(shell_cmd)
            subprocess.check_call(shell_cmd, shell=True)
    return out_file
Exemple #20
0
def align(fastq_file,
          pair_file,
          ref_file,
          names,
          align_dir,
          data,
          extra_args=None):
    """Align single or paired end reads with bowtie and pipe the SAM to BAM.

    The bowtie output is passed through ``samtools addreplacerg`` to set the
    read group and then into the command supplied by ``postalign.tobam_cl``.
    Nothing is run when the output (or, for split alignments, the final
    combined file) already exists. Returns the output BAM path.
    """
    is_smallrna = data["analysis"].lower().startswith("smallrna-seq")
    num_hits = 1000 if is_smallrna else 1
    config = data['config']
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    elif fastq_file.endswith(".gz"):
        # Feed gzipped inputs through shell process substitution.
        fastq_file = "<(gunzip -c %s)" % fastq_file
        if pair_file:
            pair_file = "<(gunzip -c %s)" % pair_file

    if utils.file_exists(out_file) or (final_file is not None
                                       and utils.file_exists(final_file)):
        return out_file

    with postalign.tobam_cl(data, out_file,
                            pair_file is not None) as (tobam_cl, tx_out_file):
        bowtie_cl = [config_utils.get_program("bowtie", config)]
        bowtie_cl.extend(_bowtie_args_from_config(data))
        if extra_args is not None:
            bowtie_cl.extend(extra_args)
        bowtie_cl.extend(["-q",
                          "-v", 2,
                          "-k", num_hits,
                          "-X", 2000,  # default is too selective for most data
                          "--best",
                          "--strata",
                          "--sam",
                          ref_file])
        if pair_file:
            bowtie_cl.extend(["-1", fastq_file, "-2", pair_file])
        else:
            bowtie_cl.append(fastq_file)
        bowtie_cmd = " ".join(str(arg) for arg in bowtie_cl)
        fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(names)
        cmd = " | ".join([bowtie_cmd, fix_rg_cmd, tobam_cl])
        do.run(cmd,
               "Running Bowtie on %s and %s." % (fastq_file, pair_file),
               data)
    return out_file
Exemple #21
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run a piped bwa alignment of fastq inputs and record the resulting BAM.

    Chooses the output name, prepares split or quality-converted inputs, then
    dispatches to the backtrack, bwa mem, or bwa mem + HLA helper. The output
    path is stored in ``data["work_bam"]``; when a separate HLA alignment is
    needed its BAM is stored in ``data["hla_bam"]``. Returns ``data``.
    """
    pair_file = pair_file or ""
    sample_name = dd.get_sample_name(data)
    # back compatible -- older files were named with lane information, use sample name now
    out_file = None
    if names["lane"] != sample_name:
        out_file = os.path.join(align_dir,
                                "{0}-sort.bam".format(names["lane"]))
    if not out_file or not utils.file_exists(out_file):
        umi_ext = "-cumi" if "umi_bam" in data else ""
        out_file = os.path.join(
            align_dir, "{0}-sort{1}.bam".format(sample_name, umi_ext))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    final_file = None
    is_sdf = fastq_file.endswith(".sdf")
    if data.get("align_split") or is_sdf:
        if is_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    already_aligned = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_aligned:
        # If we cannot do piping, use older bwa aln approach
        use_backtrack = ("bwa-mem" not in dd.get_tools_on(data)
                         and ("bwa-mem" in dd.get_tools_off(data)
                              or not _can_use_mem(fastq_file, data, min_size)))
        if use_backtrack:
            aligner = _align_backtrack
        elif is_precollapsed_bam(data) or not hla_on(data) or needs_separate_hla(data):
            aligner = _align_mem
        else:
            aligner = _align_mem_hla
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file

    # bwakit will corrupt the non-HLA alignments in a UMI collapsed BAM file
    # (see https://github.com/bcbio/bcbio-nextgen/issues/3069)
    if needs_separate_hla(data):
        hla_name = "HLA-" + os.path.basename(out_file)
        hla_file = os.path.join(os.path.dirname(out_file), hla_name)
        data["hla_bam"] = _align_mem_hla(fastq_file, pair_file, ref_file,
                                         hla_file, names, rg_info, data)
    return data
Exemple #22
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa mem, piping straight into a samtools sort.

    Inputs are first replaced by split or quality-converted command lines when
    configured. If ``can_pipe`` rejects the input, the result of the
    non-piped ``align`` function is returned directly; otherwise the output
    path is stored in ``data["work_bam"]`` and ``data`` is returned.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3,
                                         "decrease")
    rg_info = novoalign.get_rg_info(names)
    needs_alignment = not utils.file_exists(out_file) and (
        final_file is None or not utils.file_exists(final_file))
    if needs_alignment:
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir,
                         data)
        template = (
            "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
            "{fastq_file} {pair_file} "
            "| {samtools} view -b -S -u - "
            "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
        )
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                cmd = template.format(
                    bwa=bwa, samtools=samtools, num_cores=num_cores,
                    max_mem=max_mem, rg_info=rg_info, ref_file=ref_file,
                    fastq_file=fastq_file, pair_file=pair_file,
                    tx_out_prefix=os.path.splitext(tx_out_file)[0])
                checks = [
                    do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, fastq_file),
                ]
                do.run(cmd,
                       "bwa mem alignment from fastq: %s" % names["sample"],
                       None, checks)
    data["work_bam"] = out_file
    return data
Exemple #23
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Do standard or paired end alignment with bowtie.

    The bowtie SAM output is piped through ``samtools addreplacerg`` (read
    group built from ``data["rgnames"]``) and then into the command supplied
    by ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input; ``.gz`` inputs are streamed
            through ``gunzip -c``.
        pair_file: second fastq input for paired end data, or None.
        ref_file: reference passed to bowtie as its index argument.
        names: unused here; the read group is taken from ``data["rgnames"]``.
        align_dir: directory the output BAM is written to.
        data: sample dictionary; reads ``analysis``, ``config``,
            ``align_split`` and ``rgnames``.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path to the output BAM. Alignment is skipped if it, or the final
        combined file of a split alignment, already exists.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000,  # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # The read group command is always a non-empty string, so it is
            # always part of the pipeline (the former fallback was unreachable).
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
Exemple #24
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped SNAP alignment of fastq input files.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    The SAM output of snap-aligner is piped into the command supplied by
    ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` unless
            ``data["align_split"]`` is set.
        pair_file: second fastq input for paired end data, or a false value.
        index_dir: SNAP index directory.
        names: naming dictionary; ``names["sample"]`` labels the run and the
            whole dictionary is used to build the read group.
        align_dir: directory the output BAM is written to.
        data: sample dictionary.

    Returns:
        ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters
        # (presumably a "<(...)" process substitution -- confirm against
        # alignprep.split_namedpipe_cls) so the bare command can be embedded.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            # fastq_file is already unwrapped above; stripping a second time
            # would truncate the command itself.
            stream_input = "{fastq_file}"
    else:
        # Without this, the existence check below raised NameError.
        final_file = None
        assert fastq_file.endswith(".gz"), "SNAP piped input requires gzipped fastq: %s" % fastq_file
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            stream_input = "zcat {fastq_file}"

    # Decide pairing from the truthiness of pair_file: after normalising to ""
    # an "is not None" check would report every input as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            cmd = ("{stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(stream_input=stream_input, sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info, num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Exemple #25
0
def fixrg(in_bam, names, ref_file, dirs, data):
    """Replace the read group of a BAM file with samtools addreplacerg.

    addreplacerg does not remove the old read group, causing confusion when
    checking. To work around this the header is first rewritten without its
    @RG lines via samtools reheader, then the new read group is added.

    Returns the path of the ``-fixrg.bam`` file under the sample's
    ``bamclean`` work directory; it is only rebuilt when not up to date
    relative to ``in_bam``.
    """
    sample_name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "bamclean", sample_name))
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-fixrg.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        rg_info = novoalign.get_rg_info(names)
        new_header = "%s-header.txt" % os.path.splitext(out_file)[0]
        header_cmd = "samtools view -H {in_bam} | grep -v ^@RG > {new_header}"
        do.run(header_cmd.format(in_bam=in_bam, new_header=new_header),
               "Create empty RG header: %s" % sample_name)
        fix_cmd = ("samtools reheader {new_header} {in_bam} | "
                   "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -")
        do.run(fix_cmd.format(new_header=new_header, in_bam=in_bam,
                              rg_info=rg_info, tx_out_file=tx_out_file),
               "Fix read groups: %s" % sample_name)
    return out_file
Exemple #26
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign the reads of an input BAM with bwa mem in one shell pipeline.

    Data is streamed between the tools instead of being written to
    intermediate files:
     - samtools sort of the input BAM by query name
     - bedtools conversion to interleaved FASTQ
     - bwa mem alignment, with any extra options from the bwa resources
     - conversion and sorting by the command from ``postalign.tobam_cl``

    The pipeline is skipped when ``<align_dir>/<lane>-sort.bam`` already
    exists. Returns the path to that file.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    adjusted_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    max_mem = adjusted_mem.upper()
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    # A missing "options" entry joins to the empty string.
    bwa_params = " ".join(str(opt) for opt in bwa_resources.get("options", []))
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    pipeline = (
        "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
        "| {bwa} mem -p -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 {ref_file} - | "
    )
    with tx_tmpdir(data):
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            tx_out_prefix = os.path.splitext(tx_out_file)[0]
            # From here on the command-line form of the input is used, both in
            # the pipeline and in the size check.
            cl_in_bam = utils.remote_cl_input(in_bam)
            cmd = pipeline.format(samtools=samtools, bedtools=bedtools, bwa=bwa,
                                  num_cores=num_cores, max_mem=max_mem,
                                  in_bam=cl_in_bam, prefix1="%s-in1" % tx_out_prefix,
                                  bwa_params=bwa_params, rg_info=rg_info,
                                  ref_file=ref_file)
            checks = [do.file_nonempty(tx_out_file),
                      do.file_reasonable_size(tx_out_file, cl_in_bam)]
            do.run(cmd + tobam_cl, "bwa mem alignment from BAM: %s" % names["sample"],
                   None, checks)
    return out_file
Exemple #27
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with bwa mem, piping straight into a samtools sort.

    Split or quality-converted command lines replace the inputs when
    configured. When ``can_pipe`` rejects the input the result of the
    non-piped ``align`` function is returned as is; otherwise
    ``data["work_bam"]`` is set to the output path and ``data`` is returned.
    """
    if not pair_file:
        pair_file = ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    samtools = config_utils.get_program("samtools", data["config"])
    bwa = config_utils.get_program("bwa", data["config"])
    resources = config_utils.get_resources("samtools", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    have_output = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not have_output:
        # If we cannot do piping, use older bwa aln approach
        if not can_pipe(fastq_file, data):
            return align(fastq_file, pair_file, ref_file, names, align_dir, data)
        bwa_part = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                    "{fastq_file} {pair_file} ")
        sort_part = ("| {samtools} view -b -S -u - "
                     "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = (bwa_part + sort_part).format(
                    bwa=bwa, samtools=samtools, num_cores=num_cores,
                    rg_info=rg_info, ref_file=ref_file, fastq_file=fastq_file,
                    pair_file=pair_file, max_mem=max_mem,
                    tx_out_prefix=tx_out_prefix)
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Exemple #28
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align reads with SNAP and pipe its SAM output into a BAM conversion.

    The input is treated as paired when it is a paired BAM, or when a second
    fastq file is supplied. Split alignments are rejected. The output path is
    stored in ``data["work_bam"]`` and ``data`` is returned.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    if fastq_file.endswith(".bam"):
        is_paired = bam.is_paired(fastq_file)
    else:
        is_paired = pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                cmd_name = "paired"
            else:
                cmd_name = "single"
            template = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                        "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            snap_cmd = template.format(snap=snap, cmd_name=cmd_name, index_dir=index_dir,
                                       fastq_file=fastq_file, pair_file=pair_file,
                                       rg_info=rg_info, num_cores=num_cores)
            do.run(snap_cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Exemple #29
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Align fastq inputs with bwa mem, piping straight into a samtools sort.

    The pipeline is skipped when ``<align_dir>/<lane>-sort.bam`` already
    exists. Returns the path to that file.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file

    _check_samtools_version()
    template = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                "{fastq_file} {pair_file} "
                "| {samtools} view -b -S -u - "
                "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
    with utils.curdir_tmpdir():
        with file_transaction(out_file) as tx_out_file:
            cmd = template.format(bwa=bwa, samtools=samtools, num_cores=num_cores,
                                  max_mem=max_mem, rg_info=rg_info, ref_file=ref_file,
                                  fastq_file=fastq_file, pair_file=pair_file,
                                  tx_out_prefix=os.path.splitext(tx_out_file)[0])
            logger.info(cmd)
            subprocess.check_call(cmd, shell=True)
    return out_file
Exemple #30
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. The reads on the
    retained contigs are streamed through ``cleanbam.fix_header`` and the
    read group is replaced with samtools addreplacerg, as in fixrg.

    Returns the path of the ``-noextras.bam`` file under the sample's
    ``bamclean`` work directory; it is only rebuilt when not up to date
    relative to ``in_bam``.
    """
    bamclean_dir = os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data))
    work_dir = utils.safe_makedir(bamclean_dir)
    base_name = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % base_name)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = []
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if chromhacks.is_autosomal_or_sex(contig.name):
                target_chroms.append(contig.name)
        rg_info = novoalign.get_rg_info(data["rgnames"])
        template = ("samtools view -h {in_bam} {str_chroms} | "
                    """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                    """cleanbam.fix_header("{comma_chroms}")' | """
                    "samtools view -u - | "
                    "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        cmd = template.format(in_bam=in_bam,
                              str_chroms=" ".join(target_chroms),
                              comma_chroms=",".join(target_chroms),
                              bcbio_py=sys.executable,
                              rg_info=rg_info,
                              tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
Exemple #31
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped SNAP alignment of fastq input files.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names; awk appends them to the
    first whitespace-delimited token of the name when they are missing.

    The SAM output of snap-aligner is piped into the command supplied by
    ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` unless
            ``data["align_split"]`` is set.
        pair_file: second fastq input for paired end data, or a false value.
        index_dir: SNAP index directory.
        names: naming dictionary; ``names["sample"]`` labels the run and the
            whole dictionary is used to build the read group.
        align_dir: directory the output BAM is written to.
        data: sample dictionary.

    Returns:
        ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters
        # (presumably a "<(...)" process substitution -- confirm against
        # alignprep.split_namedpipe_cls) so the bare command can be embedded.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        else:
            # fastq_file is already unwrapped above; stripping a second time
            # would truncate the command itself.
            stream_input = "{fastq_file}"
    else:
        final_file = None
        assert fastq_file.endswith(".gz"), "SNAP piped input requires gzipped fastq: %s" % fastq_file
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        else:
            stream_input = "zcat {fastq_file}"

    # Decide pairing from the truthiness of pair_file: after normalising to ""
    # an "is not None" check would report every input as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            # Point TMPDIR at the transactional output directory for the pipeline.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && {stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input, sub_cmd=sub_cmd,
                             index_dir=index_dir, input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Exemple #32
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped SNAP alignment of fastq input files.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names; awk appends them to the
    first whitespace-delimited token of the name when they are missing.

    The SAM output of snap-aligner is piped into the command supplied by
    ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` unless
            ``data["align_split"]`` is set.
        pair_file: second fastq input for paired end data, or a false value.
        index_dir: SNAP index directory.
        names: naming dictionary; ``names["sample"]`` labels the run and the
            whole dictionary is used to build the read group.
        align_dir: directory the output BAM is written to.
        data: sample dictionary.

    Returns:
        ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters
        # (presumably a "<(...)" process substitution -- confirm against
        # alignprep.split_namedpipe_cls) so the bare command can be embedded.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (
                r"paste <({fastq_file} | paste - - - -) "
                r"<({pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            # fastq_file is already unwrapped above; stripping a second time
            # would truncate the command itself.
            stream_input = "{fastq_file}"
    else:
        final_file = None
        assert fastq_file.endswith(".gz"), "SNAP piped input requires gzipped fastq: %s" % fastq_file
        if pair_file:
            stream_input = (
                r"paste <(zcat {fastq_file} | paste - - - -) "
                r"<(zcat {pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = "zcat {fastq_file}"

    # Decide pairing from the truthiness of pair_file: after normalising to ""
    # an "is not None" check would report every input as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file,
                                is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file,
                                               pair_file=pair_file)
            # Point TMPDIR at the transactional output directory for the pipeline.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input,
                             sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(
                cmd + tobam_cl,
                "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data