Beispiel #1
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign an input BAM file with novoalign, using unix pipes to avoid IO.

    The input is passed through ``samtools sort -n``, streamed into novoalign,
    and the SAM output is piped to the command supplied by
    ``postalign.tobam_cl``.

    Args:
        in_bam: path of the BAM file to realign.
        ref_file: value given to novoalign's ``-d`` option.
        names: mapping providing the ``lane`` and ``sample`` keys (also passed
            to ``get_rg_info``).
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        Path of the ``<lane>-sort.bam`` file inside ``align_dir``.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    # A previous run already produced the output: nothing left to do.
    if file_exists(out_file):
        return out_file

    with utils.curdir_tmpdir(data, base_dir=align_dir):
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            rg_info = get_rg_info(names)
            # Prefix handed to samtools sort, derived from the transactional output.
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            template = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                        "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                        "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
            pipeline = template.format(samtools=samtools, num_cores=num_cores,
                                       max_mem=max_mem, in_bam=in_bam,
                                       prefix1=sort_prefix, novoalign=novoalign,
                                       rg_info=rg_info, ref_file=ref_file,
                                       extra_novo_args=extra_novo_args)
            do.run(pipeline + tobam_cl, "Novoalign: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Beispiel #2
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align fastq inputs with minimap2, piping the SAM output into BAM creation.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value is treated as single end.
        index_dir: not used by the current implementation, which always aligns
            against the reference file looked up from ``data``.
        names: read group details handed to ``novoalign.get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    bam_name = "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext)
    out_file = os.path.join(align_dir, bam_name)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built minimap2 indices are deliberately not used: they provide
            # only slight speed-ups and give inconsistent outputs in BAM headers,
            # so the reference file is always the alignment target.
            index_file = dd.get_ref_file(data)
            template = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                        "{fastq_file} {pair_file} | ")
            minimap_cmd = template.format(preset=preset, rg_info=rg_info,
                                          num_cores=num_cores, index_file=index_file,
                                          fastq_file=fastq_file, pair_file=pair_file)
            do.run(minimap_cmd + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Beispiel #3
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Align fastq inputs with bowtie2 and pipe the output into BAM creation.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input, or a falsy value for single end reads.
        ref_file: value given to bowtie2's ``-x`` option.
        names: optional mapping with read group details (``rg``, ``sample``,
            ``pl``, ``pu``, ``lb``); read group flags are only added when it
            contains ``rg``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary; ``data["analysis"]`` selects the entry of
            ``ANALYSIS`` that supplies extra parameters.
        extra_args: optional list of additional bowtie2 arguments.

    Returns:
        Path of the output BAM file.

    Raises:
        AssertionError: if ``data["analysis"]`` has no entry in ``ANALYSIS``.
    """
    config = data["config"]
    analysis_config = ANALYSIS.get(data["analysis"].lower())
    assert analysis_config, "Analysis %s is not supported by bowtie2" % (data["analysis"])
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))

    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    finished = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if finished:
        return out_file

    with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
        args = [config_utils.get_program("bowtie2", config)]
        args.extend(_bowtie2_args_from_config(config))
        if extra_args is not None:
            args.extend(extra_args)
        args.extend(["-q", "-x", ref_file])
        args.extend(analysis_config.get("params", []))
        # Paired inputs use -1/-2, a single input uses -U.
        if pair_file:
            args.extend(["-1", fastq_file, "-2", pair_file])
        else:
            args.extend(["-U", fastq_file])
        if names and "rg" in names:
            args.extend(["--rg-id", names["rg"]])
            rg_tags = [("sample", "SM"), ("pl", "PL"), ("pu", "PU"), ("lb", "LB")]
            for key, tag in rg_tags:
                if names.get(key):
                    args.extend(["--rg", "%s:%s" % (tag, names[key])])
        cmd = " ".join([str(arg) for arg in args]) + " | " + tobam_cl
        do.run(cmd, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file))
    return out_file
Beispiel #4
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run novoalign on fastq inputs, piping its SAM output into BAM creation.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value is treated as single end.
        ref_file: value given to novoalign's ``-d`` option.
        names: mapping providing ``lane`` and ``sample`` plus the read group
            details consumed by ``get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)

    config = data["config"]
    # samtools, resources and max_mem are looked up as before but are not
    # referenced by the command assembled below.
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config))
    rg_info = get_rg_info(names)

    finished = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not finished:
        with utils.curdir_tmpdir(data):
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                template = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                            "  -c {num_cores} {extra_novo_args} | ")
                novo_cmd = template.format(novoalign=novoalign, rg_info=rg_info,
                                           ref_file=ref_file, fastq_file=fastq_file,
                                           pair_file=pair_file, num_cores=num_cores,
                                           extra_novo_args=extra_novo_args)
                do.run(novo_cmd + tobam_cl, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Beispiel #5
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Align an input BAM file directly with BWA, streaming between processes.

    Disk IO is avoided by chaining these steps with pipes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate

    Args:
        in_bam: path of the BAM file to align.
        ref_file: reference argument given to ``bwa mem``.
        names: mapping providing ``lane`` and ``sample`` plus the read group
            details consumed by ``novoalign.get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        Path of the ``<lane>-sort.bam`` file inside ``align_dir``.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    configured_mem = resources.get("memory", "1G")
    max_mem = config_utils.adjust_memory(configured_mem, 3, "decrease")
    rg_info = novoalign.get_rg_info(names)

    if utils.file_exists(out_file):
        return out_file

    with utils.curdir_tmpdir():
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            template = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                        "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
            pipeline = template.format(samtools=samtools, num_cores=num_cores,
                                       max_mem=max_mem, in_bam=in_bam,
                                       prefix1=sort_prefix, bedtools=bedtools,
                                       bwa=bwa, rg_info=rg_info, ref_file=ref_file)
            do.run(pipeline + tobam_cl,
                   "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Beispiel #6
0
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Run bwa-mem on fastq input and pipe the result into BAM creation.

    ``pair_file`` is expected to be an empty string for single end input; any
    other value is reported to ``postalign.tobam_cl`` as paired. ``rg_info`` is
    accepted but not referenced here. Returns ``out_file``.
    """
    is_paired = pair_file != ""
    with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
        bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, fastq_file, pair_file)
        full_cmd = "%s | %s" % (bwa_cmd, tobam_cl)
        description = "bwa mem alignment from fastq: %s" % names["sample"]
        # Post-run checks: output exists, is non-empty and plausibly sized.
        checks = [do.file_nonempty(tx_out_file),
                  do.file_reasonable_size(tx_out_file, fastq_file)]
        do.run(full_cmd, description, None, checks)
    return out_file
Beispiel #7
0
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Run ``bwa mem`` on fastq input and pipe the result into BAM creation.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input, or an empty string for single end.
        ref_file: reference argument given to ``bwa mem``.
        out_file: final BAM path, written through ``postalign.tobam_cl``.
        names: mapping providing the ``sample`` key used in the log message.
        rg_info: read group string passed to the ``-R`` option.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        ``out_file``.
    """
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    with utils.curdir_tmpdir():
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            template = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                        "{fastq_file} {pair_file} | ")
            bwa_cmd = template.format(bwa=bwa, num_cores=num_cores, rg_info=rg_info,
                                      ref_file=ref_file, fastq_file=fastq_file,
                                      pair_file=pair_file)
            do.run(bwa_cmd + tobam_cl,
                   "bwa mem alignment from fastq: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, fastq_file)])
    return out_file
Beispiel #8
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Do standard or paired end alignment with bowtie.

    The bowtie SAM output is piped through ``samtools addreplacerg`` to set the
    read group and then into the command supplied by ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input; a ``.gz`` path is wrapped in a
            ``gunzip -c`` process substitution when alignment is not split.
        pair_file: second fastq input, or a falsy value for single end reads.
        ref_file: bowtie index argument.
        names: not referenced here; read group details are taken from
            ``data["rgnames"]``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path of the output BAM file.
    """
    # Value for bowtie's -k option; raised for small RNA analyses.
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000,  # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # The read group fix-up is always part of the pipeline: the command
            # is a non-empty literal, so the former "no fix-up" branch could
            # never be taken and has been removed.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
Beispiel #9
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` when the
            alignment is not split.
        pair_file: second fastq input, or a falsy value for single end reads.
        index_dir: SNAP index argument.
        names: mapping providing ``sample`` plus the read group details
            consumed by ``novoalign.get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.

    Raises:
        AssertionError: if the alignment is not split and ``fastq_file`` does
            not end in ``.gz``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    # Defined up front so the existence check below also works for unsplit
    # inputs (it previously raised NameError in that case).
    final_file = None

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the first two and the last character of each returned command --
        # presumably the "<(" ... ")" process-substitution wrapper (verify
        # against alignprep.split_namedpipe_cls) -- so it can be piped directly.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            # fastq_file is already unwrapped above; do not strip it again.
            stream_input = "{fastq_file}"
    else:
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            stream_input = "zcat {fastq_file}"

    pair_file = pair_file if pair_file else ""
    # After the normalisation above pair_file is never None, so paired status
    # has to be derived from it being non-empty.
    is_paired = pair_file != ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            cmd = ("{stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(stream_input=stream_input, sub_cmd=sub_cmd,
                             index_dir=index_dir, input_cmd=input_cmd,
                             rg_info=rg_info, num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Beispiel #10
0
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info,
               data):
    """Run bwa-mem (without HLA handling) and pipe the result into BAM creation.

    ``JAVA_HOME`` is unset in the shell before the pipeline starts.
    ``pair_file`` is expected to be an empty string for single end input.
    ``rg_info`` is accepted but not referenced here. Returns ``out_file``.
    """
    is_paired = pair_file != ""
    with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
        bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, fastq_file,
                                   pair_file, with_hla=False)
        cmd = "unset JAVA_HOME && " "%s | %s" % (bwa_cmd, tobam_cl)
        description = "bwa mem alignment from fastq: %s" % names["sample"]
        # Post-run checks: output exists, is non-empty and plausibly sized.
        checks = [do.file_nonempty(tx_out_file),
                  do.file_reasonable_size(tx_out_file, fastq_file)]
        do.run(cmd, description, None, checks)
    return out_file
Beispiel #11
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq inputs with hisat2 and record the outputs on ``data``.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input, or a falsy value for single end reads.
        ref_file: value given to hisat2's ``-x`` option.
        names: read group details handed to ``_get_rg_flags``.
        align_dir: directory for the output BAM and the novel splice site BED.
        data: sample dictionary.

    Returns:
        ``data`` updated through ``dd.set_work_bam`` and ``dd.set_junction_bed``.
    """
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        # Each template is formatted as it is added, so that text coming from
        # elsewhere (configured options, the BAM conversion command) is never
        # run through str.format and cannot break on literal braces.
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ").format(hisat2=hisat2, ref_file=ref_file, num_cores=num_cores,
                                     quality_flag=quality_flag, stranded_flag=stranded_flag,
                                     rg_flags=rg_flags)
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} ".format(fastq_file=fastq_file,
                                                            pair_file=pair_file)
        else:
            cmd += "-U {fastq_file} ".format(fastq_file=fastq_file)
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} ".format(splicesites=splicesites)
        novel_splicesite_file = os.path.join(align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} ".format(
            novel_splicesite_file=novel_splicesite_file)
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            do.run(cmd + " | " + tobam_cl, message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data
Beispiel #12
0
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info,
               data):
    """Run ``bwa mem`` on fastq input and pipe the result into BAM creation.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input, or an empty string for single end.
        ref_file: reference argument given to ``bwa mem``.
        out_file: final BAM path, written through ``postalign.tobam_cl``.
        names: mapping providing the ``sample`` key used in the log message.
        rg_info: read group string passed to the ``-R`` option.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        ``out_file``.
    """
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    is_paired = pair_file != ""
    with utils.curdir_tmpdir(data):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            template = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                        "{fastq_file} {pair_file} | ")
            bwa_cmd = template.format(bwa=bwa, num_cores=num_cores, rg_info=rg_info,
                                      ref_file=ref_file, fastq_file=fastq_file,
                                      pair_file=pair_file)
            description = "bwa mem alignment from fastq: %s" % names["sample"]
            checks = [do.file_nonempty(tx_out_file),
                      do.file_reasonable_size(tx_out_file, fastq_file)]
            do.run(bwa_cmd + tobam_cl, description, None, checks)
    return out_file
Beispiel #13
0
def _align_backtrack(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Align with BWA's 'aln' backtrack algorithm followed by samse/sampe.

    A ``.sai`` file is produced per fastq input (skipped when it already
    exists), then ``bwa sampe`` (paired) or ``bwa samse`` (single) output is
    piped into the command supplied by ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input, or an empty string for single end.
        ref_file: reference argument given to bwa.
        out_file: final BAM path; the ``.sai`` files share its base name.
        names: accepted but not referenced here.
        rg_info: read group string passed to the ``-r`` option.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        ``out_file``.
    """
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    out_base = os.path.splitext(out_file)[0]
    sai1_file = "%s_1.sai" % out_base
    sai2_file = "%s_2.sai" % out_base if pair_file else ""

    # sai2_file is empty for single end input, which skips the second pass.
    for reads, sai_file in [(fastq_file, sai1_file), (pair_file, sai2_file)]:
        if sai_file and not utils.file_exists(sai_file):
            with file_transaction(data, sai_file) as tx_sai_file:
                _run_bwa_align(reads, ref_file, tx_sai_file, config)

    with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
        align_type = "sampe" if sai2_file else "samse"
        template = ("{bwa} {align_type} -r '{rg_info}' {ref_file} {sai1_file} {sai2_file} "
                    "{fastq_file} {pair_file} | ")
        sam_cmd = template.format(bwa=bwa, align_type=align_type, rg_info=rg_info,
                                  ref_file=ref_file, sai1_file=sai1_file,
                                  sai2_file=sai2_file, fastq_file=fastq_file,
                                  pair_file=pair_file)
        do.run(sam_cmd + tobam_cl, "bwa %s" % align_type, data)
    return out_file
Beispiel #14
0
def _align_backtrack(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Align with BWA's 'aln' backtrack algorithm followed by samse/sampe.

    Split alignments are rejected up front. A ``.sai`` file is produced per
    fastq input (skipped when it already exists), then ``bwa sampe`` (paired)
    or ``bwa samse`` (single) output is piped into the command supplied by
    ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input, or an empty string for single end.
        ref_file: reference argument given to bwa.
        out_file: final BAM path; the ``.sai`` files share its base name.
        names: accepted but not referenced here.
        rg_info: read group string passed to the ``-r`` option.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        ``out_file``.

    Raises:
        AssertionError: if ``data`` requests a split alignment.
    """
    assert not data.get("align_split"), "Do not handle split alignments with non-piped bwa"
    bwa = config_utils.get_program("bwa", data["config"])
    config = data["config"]
    out_base = os.path.splitext(out_file)[0]
    sai1_file = "%s_1.sai" % out_base
    sai2_file = "%s_2.sai" % out_base if pair_file else ""

    def _ensure_sai(reads, sai_file):
        # Create the .sai file transactionally unless it is already present.
        if utils.file_exists(sai_file):
            return
        with file_transaction(data, sai_file) as tx_sai_file:
            _run_bwa_align(reads, ref_file, tx_sai_file, config)

    _ensure_sai(fastq_file, sai1_file)
    if sai2_file:
        _ensure_sai(pair_file, sai2_file)

    with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
        align_type = "sampe" if sai2_file else "samse"
        template = "{bwa} {align_type} -r '{rg_info}' {ref_file} {sai1_file} {sai2_file} " "{fastq_file} {pair_file} | "
        sam_cmd = template.format(bwa=bwa, align_type=align_type, rg_info=rg_info,
                                  ref_file=ref_file, sai1_file=sai1_file,
                                  sai2_file=sai2_file, fastq_file=fastq_file,
                                  pair_file=pair_file)
        do.run(sam_cmd + tobam_cl, "bwa %s" % align_type, data)
    return out_file
Beispiel #15
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Args:
        fastq_file: first (or only) input; a path ending in ``.sdf`` is routed
            through the split/named-pipe preparation like split alignments.
        pair_file: second fastq input; any falsy value is treated as single end.
        ref_file: value given to novoalign's ``-d`` option.
        names: mapping providing ``lane`` and ``sample`` plus the read group
            details consumed by ``get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(
            align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
    novoalign = config_utils.get_program("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        with tx_tmpdir(data):
            with postalign.tobam_cl(data, out_file,
                                    pair_file != "") as (tobam_cl,
                                                         tx_out_file):
                cmd = (
                    "unset JAVA_HOME && "
                    "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                    "  -c {num_cores} {extra_novo_args} | ")
                # Only the novoalign template is formatted; the BAM conversion
                # command is appended verbatim so any literal braces in it
                # cannot break or be substituted by str.format.
                cmd = cmd.format(novoalign=novoalign, rg_info=rg_info,
                                 ref_file=ref_file, fastq_file=fastq_file,
                                 pair_file=pair_file, num_cores=num_cores,
                                 extra_novo_args=extra_novo_args) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None, [
                    do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, fastq_file)
                ])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Run minimap2 on fastq inputs and pipe its SAM output into BAM creation.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value is treated as single end.
        index_dir: not used by the current implementation, which always aligns
            against the reference file looked up from ``data``.
        names: read group details handed to ``novoalign.get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(
        align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    if not pair_file:
        pair_file = ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)

    have_output = utils.file_exists(out_file)
    if not have_output and final_file is not None:
        have_output = utils.file_exists(final_file)

    if not have_output:
        with postalign.tobam_cl(data, out_file,
                                pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built minimap2 indices are intentionally skipped: they give
            # only slight speed-ups and inconsistent outputs in BAM headers,
            # so alignment always runs against the reference file.
            index_file = dd.get_ref_file(data)
            minimap_cmd = (
                "minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                "{fastq_file} {pair_file} | ").format(
                    preset=preset, rg_info=rg_info, num_cores=num_cores,
                    index_file=index_file, fastq_file=fastq_file,
                    pair_file=pair_file)
            do.run(minimap_cmd + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Beispiel #17
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Align an input BAM file directly with BWA, streaming between processes.

    Disk IO is avoided by chaining these steps with pipes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate

    Args:
        in_bam: path of the BAM file to align.
        ref_file: reference argument given to ``bwa mem``.
        names: mapping providing ``lane`` and ``sample`` plus the read group
            details consumed by ``novoalign.get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary; ``data["config"]`` supplies program settings.

    Returns:
        Path of the ``<lane>-sort.bam`` file inside ``align_dir``.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    adjusted_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3,
                                              "decrease")
    max_mem = adjusted_mem.upper()
    rg_info = novoalign.get_rg_info(names)

    if utils.file_exists(out_file):
        return out_file

    with utils.curdir_tmpdir(data):
        with postalign.tobam_cl(data, out_file,
                                bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            template = (
                "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | "
            )
            pipeline = template.format(samtools=samtools, num_cores=num_cores,
                                       max_mem=max_mem, in_bam=in_bam,
                                       prefix1=sort_prefix, bedtools=bedtools,
                                       bwa=bwa, rg_info=rg_info,
                                       ref_file=ref_file)
            description = "bwa mem alignment from BAM: %s" % names["sample"]
            checks = [do.file_nonempty(tx_out_file),
                      do.file_reasonable_size(tx_out_file, in_bam)]
            do.run(pipeline + tobam_cl, description, None, checks)
    return out_file
Beispiel #18
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Run SNAP on fastq or BAM input and pipe its SAM output into BAM creation.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.

    Args:
        fastq_file: first input; when it ends in ``.bam`` its paired status is
            read from the file via ``bam.is_paired``.
        pair_file: second fastq input; any falsy value is treated as absent.
        index_dir: SNAP index argument.
        names: mapping providing ``lane`` and ``sample`` plus the read group
            details consumed by ``novoalign.get_rg_info``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.

    Raises:
        AssertionError: if ``data`` requests a split alignment.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    config = data["config"]
    snap = config_utils.get_program("snap", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # Looked up as before; not referenced by the command below.
    resources = config_utils.get_resources("snap", config)
    rg_info = novoalign.get_rg_info(names)
    if fastq_file.endswith(".bam"):
        is_paired = bam.is_paired(fastq_file)
    else:
        is_paired = pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            cmd_name = "paired" if is_paired else "single"
            template = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                        "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            snap_cmd = template.format(snap=snap, cmd_name=cmd_name,
                                       index_dir=index_dir, fastq_file=fastq_file,
                                       pair_file=pair_file, rg_info=rg_info,
                                       num_cores=num_cores)
            do.run(snap_cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Beispiel #19
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Run bbmap on fastq inputs and pipe its SAM output into BAM creation.

    Args:
        fastq_file: first (or only) fastq input, passed as ``in1=``.
        pair_file: second fastq input, passed as ``in2=``; any falsy value is
            treated as single end.
        index_dir: bbmap index location; a trailing ``/ref`` component is
            removed before it is passed as ``path=``.
        names: mapping providing the ``rg``, ``pl``, ``pu`` and ``sample`` keys
            used to build the read group arguments.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.

    Raises:
        ValueError: if ``data`` requests a split alignment.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    bam_name = "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext)
    out_file = os.path.join(align_dir, bam_name)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = "rgid={rg} rgpl={pl} rgpu={pu} rgsm={sample}".format(**names)
    pair_file = pair_file or ""
    if data.get("align_split"):
        # BBMap does not accept input fastq streams
        raise ValueError("bbmap is not compatible with alignment splitting, set `align_split: false`")
    if pair_file:
        pair_arg = "in2=%s" % pair_file
    else:
        pair_arg = ""
    # There is never a combined final file here, so only out_file is checked.
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            if index_dir.endswith(("/ref", "/ref/")):
                index_dir = os.path.dirname(index_dir)
            # sam=1.3 required for compatibility with strelka2
            template = ("bbmap.sh sam=1.3 mdtag=t {rg_info} path={index_dir} in1={fastq_file} "
                        "{pair_arg} out=stdout.sam | ")
            bbmap_cmd = template.format(rg_info=rg_info, index_dir=index_dir,
                                        fastq_file=fastq_file, pair_arg=pair_arg)
            do.run(bbmap_cmd + tobam_cl,
                   "bbmap alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Beispiel #20
0
def align(fastq_file,
          pair_file,
          ref_file,
          names,
          align_dir,
          data,
          extra_args=None):
    """Do standard or paired end alignment with bowtie.

    The bowtie SAM output is piped through ``samtools addreplacerg`` to set the
    read group and then into the command supplied by ``postalign.tobam_cl``.

    Args:
        fastq_file: first (or only) fastq input; a ``.gz`` path is wrapped in a
            ``gunzip -c`` process substitution when alignment is not split.
        pair_file: second fastq input, or a falsy value for single end reads.
        ref_file: bowtie index argument.
        names: not referenced here; read group details are taken from
            ``data["rgnames"]``.
        align_dir: directory in which the output BAM is placed.
        data: sample dictionary.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path of the output BAM file.
    """
    # Value for bowtie's -k option; raised for small RNA analyses.
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file

    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file
                                is not None) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += [
                "-q",
                "-v",
                2,
                "-k",
                num_hits,
                "-X",
                2000,  # default is too selective for most data
                "--best",
                "--strata",
                "--sam",
                ref_file
            ]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # The read group fix-up stage is unconditional: its command string
            # is always non-empty, so the previous fallback branch without it
            # was unreachable and has been dropped.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(
                data["rgnames"])
            cmd = " | ".join([" ".join(cl), fix_rg_cmd, tobam_cl])
            do.run(cmd,
                   "Running Bowtie on %s and %s." % (fastq_file, pair_file),
                   data)
    return out_file
Beispiel #21
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate

    Args:
        in_bam: path of the BAM file to realign.
        ref_file: reference passed through to ``_get_bwa_mem_cmd``.
        names: naming dictionary; ``names["lane"]`` builds the output file
            name and ``names["sample"]`` is used in the log message.
        align_dir: directory the aligned BAM is written into.
        data: sample dictionary; ``data["config"]`` is read for programs
            and resources.

    Returns:
        ``data`` with ``data["work_bam"]`` set to the aligned BAM. When
        ``needs_separate_hla(data)`` is true and the HLA BAM did not exist
        yet, ``data["hla_bam"]`` is set to the newly produced HLA BAM.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3,
                                         "decrease").upper()

    def _realign_to(target_file, with_hla):
        """Run the sort | bamtofastq | bwa mem | to-BAM pipe into target_file."""
        with tx_tmpdir(data):
            with postalign.tobam_cl(
                    data, target_file,
                    bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, target_file, ref_file, "-",
                                           with_hla=with_hla)
                # temporary prefix for samtools sort, next to the
                # transactional output file
                prefix1 = "%s-in1" % os.path.splitext(tx_out_file)[0]
                cmd = (
                    "unset JAVA_HOME && "
                    "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                    "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                    "| {bwa_cmd} | ")
                cmd = cmd.format(samtools=samtools, num_cores=num_cores,
                                 max_mem=max_mem, in_bam=in_bam,
                                 prefix1=prefix1, bedtools=bedtools,
                                 bwa_cmd=bwa_cmd) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"],
                       None, [
                           do.file_nonempty(tx_out_file),
                           do.file_reasonable_size(tx_out_file, in_bam)
                       ])

    if not utils.file_exists(out_file):
        # HLA handling is folded into the main alignment only when HLA is
        # enabled and no separate HLA BAM is requested.
        inline_hla = bool(hla_on(data)) and not needs_separate_hla(data)
        _realign_to(out_file, with_hla=inline_hla)
    data["work_bam"] = out_file

    # NOTE(review): out_file already carries align_dir, so this prefixes the
    # whole path rather than the file name. Left unchanged in case other code
    # relies on the same naming -- confirm.
    hla_file = "HLA-" + out_file
    if needs_separate_hla(data) and not utils.file_exists(hla_file):
        # The piped realignment produces the HLA BAM directly from in_bam.
        # A previous follow-up call to _align_mem_hla() referenced
        # fastq_file, pair_file and rg_info, none of which exist for BAM
        # input, and failed with NameError; it has been removed.
        _realign_to(hla_file, with_hla=True)
        data["hla_bam"] = hla_file
    return data
Beispiel #22
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names; the awk step appends
    them to the first token of the read name when they are missing.

    Args:
        fastq_file: first (or only) fastq input; must be gzipped unless
            ``data["align_split"]`` is set.
        pair_file: second fastq input for paired data, or a false value.
        index_dir: SNAP index directory passed to ``snap-aligner``.
        names: naming dictionary; used for read group information and
            ``names["sample"]`` in the log message.
        align_dir: directory the aligned BAM is written into.
        data: sample dictionary.

    Returns:
        ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters exactly
        # once to get the bare command. The single-end path used to strip a
        # second time, cutting characters off the command itself.
        read1_cmd = fastq_file[2:-1]
        read2_cmd = pair_file[2:-1] if pair_file else None
    else:
        final_file = None
        assert fastq_file.endswith(".gz"), \
            "SNAP alignment expects gzipped fastq input: %s" % fastq_file
        read1_cmd = "zcat %s" % fastq_file
        read2_cmd = ("zcat %s" % pair_file) if pair_file else None

    is_paired = bool(read2_cmd)
    if is_paired:
        # Interleave both read streams into a single fastq stream on stdout.
        interleave = (r"paste <({read1} | paste - - - -) "
                      r"<({read2} | paste - - - -) | "
                      r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                      r"""{{ """
                      r"""split($1, P1, " "); split($5, P5, " "); """
                      r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                      r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                      r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        stream_input = interleave.format(read1=read1_cmd, read2=read2_cmd)
        sub_cmd = "paired"
        input_cmd = "-pairedInterleavedFastq -"
    else:
        stream_input = read1_cmd
        sub_cmd = "single"
        input_cmd = "-fastq -"

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # Pass a real boolean: the previous `pair_file is not None` check ran
        # after pair_file had been coerced to "", so it was always true.
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && {stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input,
                             sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Beispiel #23
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names; the awk step appends
    them to the first token of the read name when they are missing.

    Args:
        fastq_file: first (or only) fastq input; must be gzipped unless
            ``data["align_split"]`` is set.
        pair_file: second fastq input for paired data, or a false value.
        index_dir: SNAP index directory passed to ``snap-aligner``.
        names: naming dictionary; used for read group information and
            ``names["sample"]`` in the log message.
        align_dir: directory the aligned BAM is written into.
        data: sample dictionary.

    Returns:
        ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters exactly
        # once to get the bare command. The single-end path used to strip a
        # second time, cutting characters off the command itself.
        read1_cmd = fastq_file[2:-1]
        read2_cmd = pair_file[2:-1] if pair_file else None
    else:
        final_file = None
        assert fastq_file.endswith(".gz"), \
            "SNAP alignment expects gzipped fastq input: %s" % fastq_file
        read1_cmd = "zcat %s" % fastq_file
        read2_cmd = ("zcat %s" % pair_file) if pair_file else None

    is_paired = bool(read2_cmd)
    if is_paired:
        # Interleave both read streams into a single fastq stream on stdout.
        interleave = (
            r"paste <({read1} | paste - - - -) "
            r"<({read2} | paste - - - -) | "
            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
            r"""{{ """
            r"""split($1, P1, " "); split($5, P5, " "); """
            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        stream_input = interleave.format(read1=read1_cmd, read2=read2_cmd)
        sub_cmd = "paired"
        input_cmd = "-pairedInterleavedFastq -"
    else:
        stream_input = read1_cmd
        sub_cmd = "single"
        input_cmd = "-fastq -"

    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        # Pass a real boolean: the previous `pair_file is not None` check ran
        # after pair_file had been coerced to "", so it was always true.
        with postalign.tobam_cl(data, out_file,
                                is_paired) as (tobam_cl, tx_out_file):
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input,
                             sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data