def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign an input BAM with novoalign through a single shell pipeline.

    The input BAM is name-sorted with samtools, streamed to novoalign on
    stdin, and the SAM output is handed to the command supplied by
    ``postalign.tobam_cl``.

    Returns the output path ``<align_dir>/<lane>-sort.bam``. When that file
    already exists, the alignment is not re-run.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if file_exists(out_file):
        return out_file
    with utils.curdir_tmpdir(data, base_dir=align_dir):
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            rg_info = get_rg_info(names)
            # samtools sort writes its temporary chunks next to the transactional output
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            novo_cl = (
                "{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                " -F BAMPE -c {num_cores} {extra_novo_args} | "
            ).format(samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                     in_bam=in_bam, prefix1=sort_prefix, novoalign=novoalign,
                     rg_info=rg_info, ref_file=ref_file,
                     extra_novo_args=extra_novo_args)
            do.run(novo_cl + tobam_cl, "Novoalign: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align fastq input with minimap2 in a pipe, producing a sorted BAM.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value means single-end.
        index_dir: accepted for interface compatibility; currently unused
            because pre-built indices are deliberately skipped (see below).
        names: read group naming information passed to ``novoalign.get_rg_info``.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["work_bam"]`` is set on return.

    Returns:
        ``data`` with ``work_bam`` pointing at the output BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built .mmi indices are intentionally not used: they provide only
            # slight speed-ups and give inconsistent outputs in BAM headers, so the
            # reference file is always passed to minimap2 directly.
            index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            cmd = cmd.format(preset=preset, rg_info=rg_info, num_cores=num_cores,
                             index_file=index_file, fastq_file=fastq_file,
                             pair_file=pair_file)
            do.run(cmd + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Align reads with bowtie2, piping SAM output into the BAM conversion command.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value means single-end.
        ref_file: bowtie2 index passed via ``-x``.
        names: optional read group information (``rg``, ``sample``, ``pl``,
            ``pu``, ``lb``) translated into ``--rg-id`` / ``--rg`` arguments.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["analysis"]`` selects the parameter set.
        extra_args: optional list of additional bowtie2 arguments.

    Returns:
        Path to the output BAM file.

    Raises:
        AssertionError: if the analysis type has no entry in ``ANALYSIS``.
    """
    config = data["config"]
    analysis_config = ANALYSIS.get(data["analysis"].lower())
    assert analysis_config, "Analysis %s is not supported by bowtie2" % (data["analysis"])
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # Use truthiness for the paired flag so it always agrees with the
        # -1/-2 versus -U choice below (an empty string is single-end).
        with postalign.tobam_cl(data, out_file, bool(pair_file)) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie2", config)]
            cl += _bowtie2_args_from_config(config)
            cl += extra_args if extra_args is not None else []
            cl += ["-q", "-x", ref_file]
            cl += analysis_config.get("params", [])
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += ["-U", fastq_file]
            if names and "rg" in names:
                cl += ["--rg-id", names["rg"]]
                for key, tag in [("sample", "SM"), ("pl", "PL"), ("pu", "PU"), ("lb", "LB")]:
                    if names.get(key):
                        cl += ["--rg", "%s:%s" % (tag, names[key])]
            cl = [str(i) for i in cl]
            cmd = " ".join(cl) + " | " + tobam_cl
            do.run(cmd, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file))
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq input with novoalign in a pipe, producing a sorted BAM.

    With ``align_split`` set, the inputs are replaced by the named-pipe
    commands from ``alignprep.split_namedpipe_cl`` and the output path by the
    one from ``alignprep.setup_combine``. Stores the output path in
    ``data["work_bam"]`` and returns ``data``.
    """
    if not pair_file:
        pair_file = ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    config = data["config"]
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config))
    rg_info = get_rg_info(names)
    # Skip the run when either the split output or the final combined file exists.
    already_aligned = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_aligned:
        with utils.curdir_tmpdir(data):
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                novo_cl = (
                    "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                    " -c {num_cores} {extra_novo_args} | "
                ).format(novoalign=novoalign, rg_info=rg_info, ref_file=ref_file,
                         fastq_file=fastq_file, pair_file=pair_file,
                         num_cores=num_cores, extra_novo_args=extra_novo_args)
                do.run(novo_cl + tobam_cl, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Align an input BAM file with bwa mem using one shell pipeline.

    Piping between processes avoids intermediate files:

    - samtools sort of the input BAM to queryname order
    - bedtools conversion to interleaved FASTQ
    - bwa mem alignment
    - conversion and sorting via the command from ``postalign.tobam_cl``

    Returns the output path ``<align_dir>/<lane>-sort.bam``; an existing
    output file short-circuits the run.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # samtools runs on both the input and output side, so scale its memory down
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    with utils.curdir_tmpdir():
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            bwa_cl = (
                "{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | "
            ).format(samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                     in_bam=in_bam, prefix1=sort_prefix, bedtools=bedtools,
                     bwa=bwa, rg_info=rg_info, ref_file=ref_file)
            do.run(bwa_cl + tobam_cl, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Run bwa mem on fastq input, piped into the BAM conversion command.

    The bwa command line comes from ``_get_bwa_mem_cmd``; an empty-string
    ``pair_file`` marks the run as single-end. Returns ``out_file``.
    """
    is_paired = pair_file != ""
    with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
        bwa_cl = _get_bwa_mem_cmd(data, out_file, ref_file, fastq_file, pair_file)
        piped_cl = "{0} | {1}".format(bwa_cl, tobam_cl)
        do.run(piped_cl, "bwa mem alignment from fastq: %s" % names["sample"], None,
               [do.file_nonempty(tx_out_file),
                do.file_reasonable_size(tx_out_file, fastq_file)])
    return out_file
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Run ``bwa mem`` on fastq input and pipe the SAM into the BAM conversion command.

    An empty-string ``pair_file`` marks the run as single-end for
    ``postalign.tobam_cl``. Returns ``out_file``.
    """
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    with utils.curdir_tmpdir(), \
            postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
        bwa_cl = (
            "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
            "{fastq_file} {pair_file} | "
        ).format(bwa=bwa, num_cores=num_cores, rg_info=rg_info, ref_file=ref_file,
                 fastq_file=fastq_file, pair_file=pair_file)
        do.run(bwa_cl + tobam_cl, "bwa mem alignment from fastq: %s" % names["sample"], None,
               [do.file_nonempty(tx_out_file),
                do.file_reasonable_size(tx_out_file, fastq_file)])
    return out_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Do standard or paired end alignment with bowtie.

    Args:
        fastq_file: first (or only) fastq input; ``.gz`` inputs are streamed
            through ``gunzip -c`` via process substitution.
        pair_file: second fastq input; any falsy value means single-end.
        ref_file: bowtie index.
        names: unused here; read group details are taken from ``data["rgnames"]``.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; small RNA analyses report up to 1000 hits per read.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path to the output BAM file.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # Use truthiness for the paired flag so it always agrees with the
        # -1/-2 versus single-file choice below (an empty string is single-end).
        with postalign.tobam_cl(data, out_file, bool(pair_file)) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000,  # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # Read groups are attached after alignment with samtools addreplacerg.
            # The command string is never empty, so the pipeline always includes it.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` unless
            ``align_split`` is set.
        pair_file: second fastq input; any falsy value means single-end.
        index_dir: SNAP index directory.
        names: read group naming information; ``names["sample"]`` labels the run.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["work_bam"]`` is set on return.

    Returns:
        ``data`` with ``work_bam`` pointing at the output BAM.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the first two and the last character of the returned command
        # (presumably the "<(" ... ")" process-substitution wrapper -- confirm
        # against alignprep) so it can be embedded in the paste pipeline.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            stream_input = fastq_file[2:-1]
    else:
        # Without this binding the existence check below raised UnboundLocalError
        # for every non-split run.
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            stream_input = "zcat {fastq_file}"
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # pair_file has just been normalised to "", so test truthiness; an
        # `is not None` check would flag every run as paired.
        with postalign.tobam_cl(data, out_file, bool(pair_file)) as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(**locals())
            cmd = ("{stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(cmd.format(**locals()) + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Run bwa mem (without HLA typing) on fastq input, piped into BAM conversion.

    ``JAVA_HOME`` is unset in the shell before the pipeline starts. An
    empty-string ``pair_file`` marks the run as single-end. Returns ``out_file``.
    """
    is_paired = pair_file != ""
    with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
        bwa_cl = _get_bwa_mem_cmd(data, out_file, ref_file, fastq_file, pair_file,
                                  with_hla=False)
        piped_cl = "unset JAVA_HOME && {0} | {1}".format(bwa_cl, tobam_cl)
        do.run(piped_cl, "bwa mem alignment from fastq: %s" % names["sample"], None,
               [do.file_nonempty(tx_out_file),
                do.file_reasonable_size(tx_out_file, fastq_file)])
    return out_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with hisat2, piping SAM output into the BAM conversion command.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value means single-end.
        ref_file: hisat2 index passed via ``-x``.
        names: read group information used to build the read group flags.
        align_dir: directory for the output BAM and splice site files.
        data: sample dictionary; updated with the work BAM and junction BED.

    Returns:
        ``data`` updated through ``dd.set_work_bam`` and ``dd.set_junction_bed``.
    """
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        # NOTE(review): the collapsed source does not show whether the novel
        # splice site output belongs inside this rna-seq branch -- confirm.
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
            novel_splicesite_file = os.path.join(
                align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
            cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        # Reuse the `paired` flag so the BAM conversion agrees with the
        # -1/-2 versus -U choice above (an empty pair_file is single-end).
        with postalign.tobam_cl(data, out_file, paired) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            # The whole pipeline, including tobam_cl, is formatted against the
            # local names, so locals must keep their current names.
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data
def _align_mem(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Align fastq input with ``bwa mem`` and stream the result into a BAM.

    Runs inside a temporary directory created from ``data``. An empty-string
    ``pair_file`` marks the run as single-end. Returns ``out_file``.
    """
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    with utils.curdir_tmpdir(data), \
            postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
        mem_cl = (
            "{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
            "{fastq_file} {pair_file} | "
        ).format(bwa=bwa, num_cores=num_cores, rg_info=rg_info, ref_file=ref_file,
                 fastq_file=fastq_file, pair_file=pair_file)
        size_checks = [do.file_nonempty(tx_out_file),
                       do.file_reasonable_size(tx_out_file, fastq_file)]
        do.run(mem_cl + tobam_cl, "bwa mem alignment from fastq: %s" % names["sample"],
               None, size_checks)
    return out_file
def _align_backtrack(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Align with the BWA 'aln' backtrack algorithm, then ``sampe``/``samse``.

    Writes one ``.sai`` file per input next to ``out_file`` (reusing any that
    already exist), then pipes the SAM output into the BAM conversion command.
    Returns ``out_file``.
    """
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    out_base = os.path.splitext(out_file)[0]
    sai1_file = "%s_1.sai" % out_base
    sai2_file = "%s_2.sai" % out_base if pair_file else ""
    # An empty sai path means there is no second read file to align.
    for reads, sai_file in ((fastq_file, sai1_file), (pair_file, sai2_file)):
        if sai_file and not utils.file_exists(sai_file):
            with file_transaction(data, sai_file) as tx_sai_file:
                _run_bwa_align(reads, ref_file, tx_sai_file, config)
    with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
        if sai2_file:
            align_type = "sampe"
        else:
            align_type = "samse"
        sam_cl = (
            "{bwa} {align_type} -r '{rg_info}' {ref_file} {sai1_file} {sai2_file} "
            "{fastq_file} {pair_file} | "
        ).format(bwa=bwa, align_type=align_type, rg_info=rg_info, ref_file=ref_file,
                 sai1_file=sai1_file, sai2_file=sai2_file,
                 fastq_file=fastq_file, pair_file=pair_file)
        do.run(sam_cl + tobam_cl, "bwa %s" % align_type, data)
    return out_file
def _align_backtrack(fastq_file, pair_file, ref_file, out_file, names, rg_info, data):
    """Run the two-step BWA 'aln' backtrack alignment for non-split input.

    Produces ``<out base>_1.sai`` (and ``_2.sai`` when ``pair_file`` is set),
    skipping any that already exist, then runs ``bwa sampe`` or ``bwa samse``
    piped into the BAM conversion command. Returns ``out_file``.

    Raises AssertionError when ``data`` requests split alignment.
    """
    assert not data.get("align_split"), "Do not handle split alignments with non-piped bwa"
    config = data["config"]
    bwa = config_utils.get_program("bwa", config)
    sai_base = os.path.splitext(out_file)[0]
    sai1_file = "%s_1.sai" % sai_base
    if pair_file:
        sai2_file = "%s_2.sai" % sai_base
    else:
        sai2_file = ""

    def _ensure_sai(reads, sai_file):
        # Align one read file into its .sai unless a previous run left it behind.
        if not utils.file_exists(sai_file):
            with file_transaction(data, sai_file) as tx_sai_file:
                _run_bwa_align(reads, ref_file, tx_sai_file, config)

    _ensure_sai(fastq_file, sai1_file)
    if sai2_file:
        _ensure_sai(pair_file, sai2_file)
    with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
        align_type = "sampe" if sai2_file else "samse"
        template = ("{bwa} {align_type} -r '{rg_info}' {ref_file} {sai1_file} {sai2_file} "
                    "{fastq_file} {pair_file} | ")
        sam_cl = template.format(bwa=bwa, align_type=align_type, rg_info=rg_info,
                                 ref_file=ref_file, sai1_file=sai1_file,
                                 sai2_file=sai2_file, fastq_file=fastq_file,
                                 pair_file=pair_file)
        do.run(sam_cl + tobam_cl, "bwa %s" % align_type, data)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    The output is named after the lane when such a file already exists and
    after the sample name otherwise. Split alignment (``align_split`` set, or
    a fastq path ending in ``.sdf``) swaps the inputs for the commands from
    ``alignprep.split_namedpipe_cls`` and the output for the path from
    ``alignprep.setup_combine``.

    Stores the output path in ``data["work_bam"]`` and returns ``data``.
    """
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(
            align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    # Skip the run when either the split output or the final combined file exists.
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = (
                    "unset JAVA_HOME && "
                    "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                    " -c {num_cores} {extra_novo_args} | ")
                # NOTE(review): formatting is applied after tobam_cl is appended, so
                # any {placeholder} inside tobam_cl is also resolved against the
                # local names here -- do not rename or remove locals without
                # checking what postalign.tobam_cl emits.
                cmd = (cmd + tobam_cl).format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None, [
                    do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, fastq_file)
                ])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped minimap2 alignment of fastq input files, generating a sorted BAM.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value means single-end.
        index_dir: accepted for interface compatibility; currently unused
            because pre-built indices are deliberately skipped (see below).
        names: read group naming information passed to ``novoalign.get_rg_info``.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["work_bam"]`` is set on return.

    Returns:
        ``data`` with ``work_bam`` pointing at the output BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(
        align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built .mmi indices are skipped on purpose: they provide only
            # slight speed-ups and give inconsistent outputs in BAM headers.
            # The earlier index lookup could never select one, so the reference
            # file is used directly.
            index_file = dd.get_ref_file(data)
            cmd = (
                "minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                "{fastq_file} {pair_file} | ")
            cmd = cmd.format(preset=preset, rg_info=rg_info, num_cores=num_cores,
                             index_file=index_file, fastq_file=fastq_file,
                             pair_file=pair_file)
            do.run(cmd + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Realign a BAM file with bwa mem, streaming every step through pipes.

    Stages of the pipeline, none of which writes an intermediate file here:

    - samtools sort of the input BAM to queryname order
    - bedtools conversion to interleaved FASTQ
    - bwa mem alignment
    - BAM conversion and coordinate sort from ``postalign.tobam_cl``

    Returns the output path ``<align_dir>/<lane>-sort.bam``; nothing is run
    when that file already exists.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # samtools is used on both ends of the pipe, so its memory is scaled down
    base_mem = resources.get("memory", "1G")
    max_mem = config_utils.adjust_memory(base_mem, 3, "decrease").upper()
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir(data), \
                postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
            template = (
                "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | "
            )
            bwa_cl = template.format(samtools=samtools, num_cores=num_cores,
                                     max_mem=max_mem, in_bam=in_bam,
                                     prefix1=sort_prefix, bedtools=bedtools, bwa=bwa,
                                     rg_info=rg_info, ref_file=ref_file)
            do.run(bwa_cl + tobam_cl, "bwa mem alignment from BAM: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align fastq or BAM input with SNAP in a pipe, producing a sorted BAM.

    Split alignment is not supported and triggers an AssertionError. For BAM
    input the paired status is read from the file; otherwise it follows
    ``pair_file``. Stores the output path in ``data["work_bam"]`` and returns
    ``data``.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    if not pair_file:
        pair_file = ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    config = data["config"]
    snap = config_utils.get_program("snap", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", config)
    rg_info = novoalign.get_rg_info(names)
    if fastq_file.endswith(".bam"):
        is_paired = bam.is_paired(fastq_file)
    else:
        is_paired = pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                cmd_name = "paired"
            else:
                cmd_name = "single"
            snap_cl = (
                "{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | "
            ).format(snap=snap, cmd_name=cmd_name, index_dir=index_dir,
                     fastq_file=fastq_file, pair_file=pair_file,
                     rg_info=rg_info, num_cores=num_cores)
            do.run(snap_cl + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped bbmap alignment of fastq input files, generating a sorted BAM.

    Args:
        fastq_file: first (or only) fastq input.
        pair_file: second fastq input; any falsy value means single-end.
        index_dir: bbmap index location; a trailing ``ref`` directory
            component is stripped before it is passed as ``path=``.
        names: read group dictionary with ``rg``, ``pl``, ``pu`` and ``sample`` keys.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["work_bam"]`` is set on return.

    Returns:
        ``data`` with ``work_bam`` pointing at the output BAM.

    Raises:
        ValueError: if ``align_split`` is set, since bbmap cannot read fastq streams.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    rg_info = "rgid={rg} rgpl={pl} rgpu={pu} rgsm={sample}".format(**names)
    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        # BBMap does not accept input fastq streams
        raise ValueError("bbmap is not compatible with alignment splitting, set `align_split: false`")
    pair_arg = "in2=%s" % pair_file if pair_file else ""
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            if index_dir.endswith(("/ref", "/ref/")):
                # Strip the trailing slash first: os.path.dirname("x/ref/") is
                # "x/ref", which would leave the ref component in place.
                index_dir = os.path.dirname(index_dir.rstrip("/"))
            # sam=1.3 required for compatibility with strelka2
            cmd = ("bbmap.sh sam=1.3 mdtag=t {rg_info} path={index_dir} in1={fastq_file} "
                   "{pair_arg} out=stdout.sam | ")
            cmd = cmd.format(rg_info=rg_info, index_dir=index_dir,
                             fastq_file=fastq_file, pair_arg=pair_arg)
            do.run(cmd + tobam_cl, "bbmap alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Do standard or paired end alignment with bowtie.

    Args:
        fastq_file: first (or only) fastq input; ``.gz`` inputs are streamed
            through ``gunzip -c`` via process substitution.
        pair_file: second fastq input; any falsy value means single-end.
        ref_file: bowtie index.
        names: unused here; read group details are taken from ``data["rgnames"]``.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; small RNA analyses report up to 1000 hits per read.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path to the output BAM file.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # Truthiness keeps the paired flag consistent with the -1/-2 choice
        # below; `is not None` would treat an empty string as paired.
        with postalign.tobam_cl(data, out_file, bool(pair_file)) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += [
                "-q",
                "-v", 2,
                "-k", num_hits,
                "-X", 2000,  # default is too selective for most data
                "--best",
                "--strata",
                "--sam",
                ref_file,
            ]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # Read groups are added after alignment; this command string is
            # always non-empty, so no fallback pipeline without it is needed.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(
                data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate

    When ``needs_separate_hla(data)`` is true, a second pass with HLA typing
    enabled writes ``HLA-<name>`` next to the main output.

    Args:
        in_bam: input BAM file to realign.
        ref_file: bwa reference.
        names: naming dictionary; ``lane`` names the output, ``sample`` labels the run.
        align_dir: directory where the output BAM files are written.
        data: sample dictionary.

    Returns:
        ``data`` with ``work_bam`` set, plus ``hla_bam`` when a separate HLA
        alignment is required.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        # Type HLA inline only when it is enabled and no separate run is needed.
        inline_hla = hla_on(data) and not needs_separate_hla(data)
        _align_bam_pipe(in_bam, ref_file, out_file, names, data, inline_hla)
    data["work_bam"] = out_file
    if needs_separate_hla(data):
        # Prefix the file name, not the whole path: "HLA-" + out_file would
        # point into a directory that does not exist.
        hla_file = os.path.join(os.path.dirname(out_file),
                                "HLA-" + os.path.basename(out_file))
        if not utils.file_exists(hla_file):
            _align_bam_pipe(in_bam, ref_file, hla_file, names, data, True)
        # The earlier follow-up call to _align_mem_hla used fastq_file,
        # pair_file and rg_info, none of which exist in this function
        # (NameError); the piped run above already produces hla_file.
        data["hla_bam"] = hla_file
    return data


def _align_bam_pipe(in_bam, ref_file, out_file, names, data, with_hla):
    """Stream name-sorted reads from in_bam through bwa mem into out_file."""
    config = data["config"]
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    with tx_tmpdir(data):
        with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
            bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-", with_hla=with_hla)
            prefix1 = "%s-in1" % os.path.splitext(tx_out_file)[0]
            cmd = (
                "unset JAVA_HOME && "
                "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                "| {bwa_cmd} | ")
            cmd = cmd.format(samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                             in_bam=in_bam, prefix1=prefix1, bedtools=bedtools,
                             bwa_cmd=bwa_cmd) + tobam_cl
            do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None, [
                do.file_nonempty(tx_out_file),
                do.file_reasonable_size(tx_out_file, in_bam)
            ])
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.
    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` unless
            ``align_split`` is set.
        pair_file: second fastq input; any falsy value means single-end.
        index_dir: SNAP index directory.
        names: read group naming information; ``names["sample"]`` labels the run.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["work_bam"]`` is set on return.

    Returns:
        ``data`` with ``work_bam`` pointing at the output BAM.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the first two and the last character of the returned command
        # (presumably the "<(" ... ")" process-substitution wrapper -- confirm
        # against alignprep) so it can be embedded in the paste pipeline.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        else:
            stream_input = fastq_file[2:-1]
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        else:
            stream_input = "zcat {fastq_file}"
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # pair_file has just been normalised to "", so test truthiness; an
        # `is not None` check would flag every run as paired.
        with postalign.tobam_cl(data, out_file, bool(pair_file)) as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(**locals())
            # Point TMPDIR at the transactional output directory for the shell pipeline.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && {stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(cmd.format(**locals()) + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.
    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.

    Args:
        fastq_file: first (or only) fastq input; must end in ``.gz`` unless
            ``align_split`` is set.
        pair_file: second fastq input; any falsy value means single-end.
        index_dir: SNAP index directory.
        names: read group naming information; ``names["sample"]`` labels the run.
        align_dir: directory where the output BAM is written.
        data: sample dictionary; ``data["work_bam"]`` is set on return.

    Returns:
        ``data`` with ``work_bam`` pointing at the output BAM.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
        # Drop the first two and the last character of the returned command
        # (presumably the "<(" ... ")" process-substitution wrapper -- confirm
        # against alignprep) so it can be embedded in the paste pipeline.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (
                r"paste <({fastq_file} | paste - - - -) "
                r"<({pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = fastq_file[2:-1]
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (
                r"paste <(zcat {fastq_file} | paste - - - -) "
                r"<(zcat {pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = "zcat {fastq_file}"
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # pair_file has just been normalised to "", so test truthiness; an
        # `is not None` check would flag every run as paired.
        with postalign.tobam_cl(data, out_file, bool(pair_file)) as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(**locals())
            # Point TMPDIR at the transactional output directory for the shell pipeline.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(
                cmd.format(**locals()) + tobam_cl,
                "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data