def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Align reads with bowtie2, piping the output into a BAM file.

    The output is named ``<sample name>-sort.bam`` inside ``align_dir``. When
    ``data`` has ``align_split`` set, the inputs and output are replaced by
    the values returned from ``alignprep.setup_combine`` and
    ``alignprep.split_namedpipe_cls``. Alignment is skipped if the output file
    (or the final combined file for split runs) already exists.

    Args:
        fastq_file: first (or only) read input, passed to ``-1`` or ``-U``.
        pair_file: second read input, passed to ``-2``; falsy for single end.
        ref_file: value passed to bowtie2 ``-x``.
        names: optional mapping of read group details; ``rg`` sets
            ``--rg-id`` and ``sample``/``pl``/``pu``/``lb`` add SM/PL/PU/LB tags.
        align_dir: directory holding the output BAM.
        data: sample dictionary; the ``config`` and ``analysis`` keys are read.
        extra_args: optional list of additional bowtie2 arguments.

    Returns:
        Path to the output BAM file.

    Raises:
        AssertionError: if the analysis type has no entry in ``ANALYSIS``.
    """
    config = data["config"]
    analysis_config = ANALYSIS.get(data["analysis"].lower())
    assert analysis_config, "Analysis %s is not supported by bowtie2" % (data["analysis"])
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    # Nothing to do when this output, or the final combined output, is present.
    if utils.file_exists(out_file):
        return out_file
    if final_file is not None and utils.file_exists(final_file):
        return out_file

    with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
        cl = [config_utils.get_program("bowtie2", config)]
        if extra_args is not None:
            cl.extend(extra_args)
        cl.extend(["-q", "-x", ref_file])
        cl.extend(analysis_config.get("params", []))
        if pair_file:
            cl.extend(["-1", fastq_file, "-2", pair_file])
        else:
            cl.extend(["-U", fastq_file])
        if names and "rg" in names:
            cl.extend(["--rg-id", names["rg"]])
            rg_tags = (("sample", "SM"), ("pl", "PL"), ("pu", "PU"), ("lb", "LB"))
            for key, tag in rg_tags:
                if names.get(key):
                    cl.extend(["--rg", "%s:%s" % (tag, names[key])])
        # The config helper receives the command built so far.
        cl.extend(_bowtie2_args_from_config(config, cl))
        bowtie2_cmd = " ".join(str(arg) for arg in cl)
        cmd = "unset JAVA_HOME && " + bowtie2_cmd + " | " + tobam_cl
        do.run(cmd, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file))
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped minimap2 alignment of fastq inputs, writing a BAM file.

    The output is named ``<sample name>-sort[-cumi].bam`` in ``align_dir``;
    the ``-cumi`` suffix is added when ``data`` contains ``umi_bam``. When
    ``data`` has ``align_split`` set, inputs and output are replaced by the
    values from ``alignprep.setup_combine`` and
    ``alignprep.split_namedpipe_cls``. Alignment is skipped if the output file
    (or the final combined file for split runs) already exists.

    Args:
        fastq_file: first (or only) read input.
        pair_file: second read input; falsy for single end.
        index_dir: unused; kept for interface compatibility with callers.
        names: read group details passed to ``novoalign.get_rg_info``.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built minimap2 indices are deliberately not used: they provide
            # only slight speed-ups and give inconsistent outputs in BAM headers,
            # so always align directly against the reference file.
            index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ").format(
                       preset=preset, rg_info=rg_info, num_cores=num_cores,
                       index_file=index_file, fastq_file=fastq_file, pair_file=pair_file)
            do.run(cmd + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run a bwa alignment of fastq inputs, producing a sorted output BAM.

    Chooses between ``_align_backtrack`` and ``_align_mem`` based on the
    tools switched on/off for the sample and on ``_can_use_mem``. The
    alignment is skipped if the output file (or the final combined file for
    split/SDF inputs) already exists.

    Args:
        fastq_file: first (or only) read input; ``.sdf`` inputs are handled
            through the split code path.
        pair_file: second read input; falsy for single end.
        ref_file: reference passed on to the alignment helper.
        names: read group details; ``lane`` is used for legacy output naming.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    pair_file = pair_file or ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    is_sdf = fastq_file.endswith(".sdf")
    min_size = None
    final_file = None
    if data.get("align_split") or is_sdf:
        if is_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        use_backtrack = (
            "bwa-mem" not in dd.get_tools_on(data)
            and ("bwa-mem" in dd.get_tools_off(data)
                 or not _can_use_mem(fastq_file, data, min_size)))
        run_alignment = _align_backtrack if use_backtrack else _align_mem
        out_file = run_alignment(fastq_file, pair_file, ref_file, out_file,
                                 names, rg_info, data)
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run novoalign on fastq inputs, piping SAM output into a BAM file.

    The alignment is skipped if the output file (or the final combined file
    for split/SDF inputs) already exists. After the command runs, the
    transactional output is checked with ``do.file_nonempty`` and
    ``do.file_reasonable_size``.

    Args:
        fastq_file: first (or only) read input.
        pair_file: second read input; falsy for single end.
        ref_file: value passed to novoalign ``-d``.
        names: read group details; ``lane`` is used for legacy output naming
            and ``sample`` for the log message.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    if not pair_file:
        pair_file = ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    # NOTE: the full command (including ``tobam_cl``) is formatted with
    # ``locals()`` below, so these local names must keep their spelling even
    # when they look unused here.
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with tx_tmpdir(data) as work_dir, \
                postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            tx_out_prefix = os.path.splitext(tx_out_file)[0]
            cmd = ("unset JAVA_HOME && "
                   "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                   " -c {num_cores} {extra_novo_args} | ")
            cmd = (cmd + tobam_cl).format(**locals())
            do.run(cmd, "Novoalign: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Align reads with bowtie2, piping the output into a BAM file.

    The command is assembled in this order: program, arguments from
    ``_bowtie2_args_from_config``, ``extra_args``, ``-q -x ref_file``, the
    analysis specific parameters, read inputs and read group flags. The
    alignment is skipped if the output file (or the final combined file for
    split runs) already exists.

    Args:
        fastq_file: first (or only) read input, passed to ``-1`` or ``-U``.
        pair_file: second read input, passed to ``-2``; falsy for single end.
        ref_file: value passed to bowtie2 ``-x``.
        names: optional mapping of read group details; ``rg`` sets
            ``--rg-id`` and ``sample``/``pl``/``pu``/``lb`` add SM/PL/PU/LB tags.
        align_dir: directory holding the output BAM.
        data: sample dictionary; the ``config`` and ``analysis`` keys are read.
        extra_args: optional list of additional bowtie2 arguments.

    Returns:
        Path to the output BAM file.

    Raises:
        AssertionError: if the analysis type has no entry in ``ANALYSIS``.
    """
    config = data["config"]
    analysis_config = ANALYSIS.get(data["analysis"].lower())
    assert analysis_config, "Analysis %s is not supported by bowtie2" % (data["analysis"])
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if already_done:
        return out_file

    with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
        cl = [config_utils.get_program("bowtie2", config)]
        cl.extend(_bowtie2_args_from_config(config))
        if extra_args is not None:
            cl.extend(extra_args)
        cl.extend(["-q", "-x", ref_file])
        cl.extend(analysis_config.get("params", []))
        read_args = ["-1", fastq_file, "-2", pair_file] if pair_file else ["-U", fastq_file]
        cl.extend(read_args)
        if names and "rg" in names:
            cl.extend(["--rg-id", names["rg"]])
            for key, tag in (("sample", "SM"), ("pl", "PL"), ("pu", "PU"), ("lb", "LB")):
                if names.get(key):
                    cl.extend(["--rg", "%s:%s" % (tag, names[key])])
        cmd = " ".join(str(arg) for arg in cl) + " | " + tobam_cl
        do.run(cmd, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file))
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped minimap2 alignment of fastq inputs, writing a BAM file.

    The output is named ``<sample name>-sort[-cumi].bam`` in ``align_dir``;
    the ``-cumi`` suffix is added when ``data`` contains ``umi_bam``. A
    ``<genome build>-sr.mmi`` index next to ``index_dir`` is used when it
    exists, otherwise the reference file from ``dd.get_ref_file`` is used.
    Alignment is skipped if the output file (or the final combined file for
    split runs) already exists.

    Args:
        fastq_file: first (or only) read input.
        pair_file: second read input; falsy for single end.
        index_dir: path to a single index file, when present.
        names: read group details passed to ``novoalign.get_rg_info``.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    bam_name = "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext)
    out_file = os.path.join(align_dir, bam_name)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # If a single index present, index_dir points to that
            index_file = None
            if index_dir and os.path.isfile(index_dir):
                index_file = os.path.join(
                    os.path.dirname(index_dir),
                    "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            # Fall back to the plain reference when no usable index was found.
            if not (index_file and os.path.exists(index_file)):
                index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            minimap2_cmd = cmd.format(
                preset=preset, rg_info=rg_info, num_cores=num_cores,
                index_file=index_file, fastq_file=fastq_file, pair_file=pair_file)
            do.run(minimap2_cmd + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run novoalign on fastq inputs, piping SAM output into a BAM file.

    The output is named ``<lane>-sort.bam`` in ``align_dir``. The alignment
    is skipped if the output file (or the final combined file for split/SDF
    inputs) already exists. After the command runs, the transactional output
    is checked with ``do.file_nonempty`` and ``do.file_reasonable_size``.

    Args:
        fastq_file: first (or only) read input.
        pair_file: second read input; falsy for single end.
        ref_file: value passed to novoalign ``-d``.
        names: read group details; ``lane`` names the output and ``sample``
            is used for the log message.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.
    """
    if not pair_file:
        pair_file = ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    final_file = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    # NOTE: the full command (including ``tobam_cl``) is formatted with
    # ``locals()`` below, so these local names must keep their spelling even
    # when they look unused here.
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with tx_tmpdir(data) as work_dir, \
                postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            tx_out_prefix = os.path.splitext(tx_out_file)[0]
            cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                   " -c {num_cores} {extra_novo_args} | ")
            cmd = (cmd + tobam_cl).format(**locals())
            do.run(cmd, "Novoalign: %s" % names["sample"], None,
                   [do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Run a standard or paired end bowtie alignment, writing a BAM file.

    bowtie output is piped through ``samtools addreplacerg`` to attach the
    read group from ``names`` and then into the command supplied by
    ``postalign.tobam_cl``. Analyses whose name starts with ``smallrna-seq``
    report up to 1000 hits per read instead of 1. The alignment is skipped if
    the output file (or the final combined file for split runs) already
    exists.

    Args:
        fastq_file: first (or only) read input; ``.gz`` inputs are streamed
            through ``gunzip -c`` for non-split runs.
        pair_file: second read input; falsy for single end.
        ref_file: reference index passed to bowtie.
        names: read group details passed to ``novoalign.get_rg_info``.
        align_dir: directory holding the output BAM.
        data: sample dictionary; the ``config`` and ``analysis`` keys are read.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path to the output BAM file.
    """
    num_hits = 1000 if data["analysis"].lower().startswith("smallrna-seq") else 1
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif fastq_file.endswith(".gz"):
        fastq_file = "<(gunzip -c %s)" % fastq_file
        if pair_file:
            pair_file = "<(gunzip -c %s)" % pair_file

    if utils.file_exists(out_file):
        return out_file
    if final_file is not None and utils.file_exists(final_file):
        return out_file

    with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
        cl = [config_utils.get_program("bowtie", config)]
        cl.extend(_bowtie_args_from_config(data))
        if extra_args is not None:
            cl.extend(extra_args)
        cl.extend([
            "-q",
            "-v", 2,
            "-k", num_hits,
            "-X", 2000,  # default is too selective for most data
            "--best",
            "--strata",
            "--sam",
            ref_file,
        ])
        if pair_file:
            cl.extend(["-1", fastq_file, "-2", pair_file])
        else:
            cl.append(fastq_file)
        bowtie_cmd = " ".join(str(arg) for arg in cl)
        fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(names)
        cmd = bowtie_cmd + " | " + fix_rg_cmd + " | " + tobam_cl
        do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run a bwa alignment of fastq inputs, producing a sorted output BAM.

    Chooses between ``_align_backtrack``, ``_align_mem`` and
    ``_align_mem_hla`` based on the tools switched on/off for the sample,
    ``_can_use_mem`` and the HLA related checks. When ``needs_separate_hla``
    is true an additional ``HLA-`` prefixed BAM is produced and stored under
    ``hla_bam``. The main alignment is skipped if the output file (or the
    final combined file for split/SDF inputs) already exists.

    Args:
        fastq_file: first (or only) read input; ``.sdf`` inputs are handled
            through the split code path.
        pair_file: second read input; falsy for single end.
        ref_file: reference passed on to the alignment helper.
        names: read group details; ``lane`` is used for legacy output naming.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam`` and optionally
            ``hla_bam``.

    Returns:
        The updated ``data`` dictionary.
    """
    pair_file = pair_file or ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = None
    if names["lane"] != dd.get_sample_name(data):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not (out_file and utils.file_exists(out_file)):
        umi_ext = "-cumi" if "umi_bam" in data else ""
        out_file = os.path.join(
            align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    is_sdf = fastq_file.endswith(".sdf")
    min_size = None
    final_file = None
    if data.get("align_split") or is_sdf:
        if is_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        # If we cannot do piping, use older bwa aln approach
        use_backtrack = (
            "bwa-mem" not in dd.get_tools_on(data)
            and ("bwa-mem" in dd.get_tools_off(data)
                 or not _can_use_mem(fastq_file, data, min_size)))
        if use_backtrack:
            run_alignment = _align_backtrack
        elif is_precollapsed_bam(data) or not hla_on(data) or needs_separate_hla(data):
            run_alignment = _align_mem
        else:
            run_alignment = _align_mem_hla
        out_file = run_alignment(fastq_file, pair_file, ref_file, out_file,
                                 names, rg_info, data)
    data["work_bam"] = out_file

    # bwakit will corrupt the non-HLA alignments in a UMI collapsed BAM file
    # (see https://github.com/bcbio/bcbio-nextgen/issues/3069)
    if needs_separate_hla(data):
        out_dir, out_base = os.path.dirname(out_file), os.path.basename(out_file)
        hla_file = os.path.join(out_dir, "HLA-" + out_base)
        data["hla_bam"] = _align_mem_hla(fastq_file, pair_file, ref_file, hla_file,
                                         names, rg_info, data)
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with hisat2, piping the output into a BAM file.

    The command template is assembled piece by piece and formatted with the
    local variables of this function. The alignment is skipped if the output
    file (or the final combined file for split runs) already exists.

    Args:
        fastq_file: first (or only) read input, passed to ``-1`` or ``-U``.
        pair_file: second read input, passed to ``-2``; falsy for single end.
        ref_file: value passed to hisat2 ``-x``.
        names: read group details passed to ``_get_rg_flags``.
        align_dir: directory holding the output BAM and splice site files.
        data: sample dictionary.

    Returns:
        ``data`` updated through ``dd.set_work_bam`` and
        ``dd.set_junction_bed``.
    """
    # NOTE: the command is formatted with ``locals()`` below, so the local
    # names referenced by the template pieces must keep their spelling.
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    already_done = file_exists(out_file) or (
        final_file is not None and file_exists(final_file))
    if not already_done:
        cmd_parts = [
            "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} ",
            "{rg_flags} ",
        ]
        cmd_parts.append("-1 {fastq_file} -2 {pair_file} " if paired else "-U {fastq_file} ")
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd_parts.append("-k 1000 ")
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd_parts.append("--dta-cufflinks ")
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd_parts.append("--known-splicesite-infile {splicesites} ")
        novel_splicesite_file = os.path.join(
            align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd_parts.append("--novel-splicesite-outfile {novel_splicesite_file} ")
        # apply additional hisat2 options
        cmd_parts.append(" ".join(_get_options_from_config(data)))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd_parts.append(" | " + tobam_cl)
            cmd = "".join(cmd_parts)
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    return dd.set_junction_bed(data, junctionbed)
def align(fastq_file, pair_file, ref_file, names, align_dir, data, extra_args=None):
    """Run a standard or paired end bowtie alignment, writing a BAM file.

    bowtie output is piped through ``samtools addreplacerg`` to attach the
    read group built from ``data["rgnames"]`` and then into the command
    supplied by ``postalign.tobam_cl``. Analyses whose name starts with
    ``smallrna-seq`` report up to 1000 hits per read instead of 1. The
    alignment is skipped if the output file (or the final combined file for
    split runs) already exists.

    Args:
        fastq_file: first (or only) read input; ``.gz`` inputs are streamed
            through ``gunzip -c`` for non-split runs.
        pair_file: second read input; falsy for single end.
        ref_file: reference index passed to bowtie.
        names: read group details (not used directly; the read group is
            taken from ``data["rgnames"]``).
        align_dir: directory holding the output BAM.
        data: sample dictionary; the ``config``, ``analysis`` and
            ``rgnames`` keys are read.
        extra_args: optional list of additional bowtie arguments.

    Returns:
        Path to the output BAM file.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000,  # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # The read group command is always a non-empty string, so it is
            # unconditionally inserted between bowtie and the BAM conversion.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped SNAP alignment of fastq inputs, writing a BAM file.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    The alignment is skipped if the output file (or the final combined file
    for split runs) already exists.

    Args:
        fastq_file: first (or only) read input; must end in ``.gz`` for
            non-split runs.
        pair_file: second read input; falsy for single end.
        index_dir: SNAP index directory passed to ``snap-aligner``.
        names: read group details; ``sample`` is used for the log message.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.

    Raises:
        AssertionError: if a non-split ``fastq_file`` does not end in ``.gz``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    # Interleave two fastq producing commands into a single fastq stream.
    interleave_tmpl = (r"paste <({read1} | paste - - - -) "
                       r"<({read2} | paste - - - -) | tr '\t' '\n'")
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters to get the
        # bare command (presumably a "<(" ... ")" process substitution --
        # verify against alignprep.split_namedpipe_cls).
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = interleave_tmpl.format(read1=fastq_file, read2=pair_file)
        else:
            # fastq_file is already unwrapped above; stripping it a second
            # time would cut characters off the command itself.
            stream_input = fastq_file
    else:
        # Without this assignment the existence check below fails with an
        # unbound local on every non-split run.
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = interleave_tmpl.format(read1="zcat " + fastq_file,
                                                  read2="zcat " + pair_file)
        else:
            stream_input = "zcat " + fastq_file
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # pair_file is normalized to "" above, so test for emptiness rather
        # than None to report single end inputs correctly.
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            cmd = ("{stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ").format(
                       stream_input=stream_input, sub_cmd=sub_cmd, index_dir=index_dir,
                       input_cmd=input_cmd, rg_info=rg_info, num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with hisat2, piping the output into a BAM file.

    The command template is assembled piece by piece and formatted with the
    local variables of this function. The alignment is skipped if the output
    file (or the final combined file for split runs) already exists.

    Args:
        fastq_file: first (or only) read input, passed to ``-1`` or ``-U``.
        pair_file: second read input, passed to ``-2``; falsy for single end.
        ref_file: value passed to hisat2 ``-x``.
        names: read group details passed to ``_get_rg_flags``.
        align_dir: directory holding the output BAM and splice site files.
        data: sample dictionary.

    Returns:
        ``data`` updated through ``dd.set_work_bam`` and
        ``dd.set_junction_bed``.
    """
    # NOTE: the command is formatted with ``locals()`` below, so the local
    # names referenced by the template pieces must keep their spelling.
    paired = bool(pair_file)
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    already_done = file_exists(out_file) or (
        final_file is not None and file_exists(final_file))
    if not already_done:
        cmd_parts = [
            "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} ",
            "{rg_flags} ",
        ]
        if paired:
            cmd_parts.append("-1 {fastq_file} -2 {pair_file} ")
        else:
            cmd_parts.append("-U {fastq_file} ")
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd_parts.append("-k 1000 ")
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd_parts.append("--dta-cufflinks ")
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd_parts.append("--known-splicesite-infile {splicesites} ")
        novel_splicesite_file = os.path.join(
            align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd_parts.append("--novel-splicesite-outfile {novel_splicesite_file} ")
        # apply additional hisat2 options
        cmd_parts.append(" ".join(_get_options_from_config(data)))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd_parts.append(" | " + tobam_cl)
            cmd = "".join(cmd_parts)
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    return dd.set_junction_bed(data, junctionbed)
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped SNAP alignment of fastq inputs, writing a BAM file.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.

    The alignment is skipped if the output file (or the final combined file
    for split runs) already exists.

    Args:
        fastq_file: first (or only) read input; must end in ``.gz`` for
            non-split runs.
        pair_file: second read input; falsy for single end.
        index_dir: SNAP index directory passed to ``snap-aligner``.
        names: read group details; ``sample`` is used for the log message.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.

    Raises:
        AssertionError: if a non-split ``fastq_file`` does not end in ``.gz``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    # Interleave two fastq producing commands into a single fastq stream and
    # normalize the read names with awk (see docstring).
    interleave_tmpl = (
        r"paste <({read1} | paste - - - -) "
        r"<({read2} | paste - - - -) | "
        r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
        r"""{{ """
        r"""split($1, P1, " "); split($5, P5, " "); """
        r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
        r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
        r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters to get the
        # bare command (presumably a "<(" ... ")" process substitution --
        # verify against alignprep.split_namedpipe_cls).
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = interleave_tmpl.format(read1=fastq_file, read2=pair_file)
        else:
            # fastq_file is already unwrapped above; stripping it a second
            # time would cut characters off the command itself.
            stream_input = fastq_file
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = interleave_tmpl.format(read1="zcat " + fastq_file,
                                                  read2="zcat " + pair_file)
        else:
            stream_input = "zcat " + fastq_file
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # pair_file is normalized to "" above, so test for emptiness rather
        # than None to report single end inputs correctly.
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                   "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ").format(
                       tmp_dir=tmp_dir, stream_input=stream_input, sub_cmd=sub_cmd,
                       index_dir=index_dir, input_cmd=input_cmd, rg_info=rg_info,
                       num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped SNAP alignment of fastq inputs, writing a BAM file.

    Pipes in input, handling paired and split inputs, using interleaving magic from:
    https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names. If these are not present
    in the initial fastq may need to expand awk code to do this.

    The interleaved stream is passed through ``sponge`` before reaching
    ``snap-aligner``. The alignment is skipped if the output file (or the
    final combined file for split runs) already exists.

    Args:
        fastq_file: first (or only) read input; must end in ``.gz`` for
            non-split runs.
        pair_file: second read input; falsy for single end.
        index_dir: SNAP index directory passed to ``snap-aligner``.
        names: read group details; ``sample`` is used for the log message.
        align_dir: directory holding the output BAM.
        data: sample dictionary; updated with ``work_bam``.

    Returns:
        ``data`` with ``work_bam`` set to the output BAM path.

    Raises:
        AssertionError: if a non-split ``fastq_file`` does not end in ``.gz``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    # Interleave two fastq producing commands into a single fastq stream and
    # normalize the read names with awk (see docstring).
    interleave_tmpl = (
        r"paste <({read1} | paste - - - -) "
        r"<({read2} | paste - - - -) | "
        r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
        r"""{{ """
        r"""split($1, P1, " "); split($5, P5, " "); """
        r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
        r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
        r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters to get the
        # bare command (presumably a "<(" ... ")" process substitution --
        # verify against alignprep.split_namedpipe_cls).
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = interleave_tmpl.format(read1=fastq_file, read2=pair_file)
        else:
            # fastq_file is already unwrapped above; stripping it a second
            # time would cut characters off the command itself.
            stream_input = fastq_file
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = interleave_tmpl.format(read1="zcat " + fastq_file,
                                                  read2="zcat " + pair_file)
        else:
            stream_input = "zcat " + fastq_file
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # pair_file is normalized to "" above, so test for emptiness rather
        # than None to report single end inputs correctly.
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            if pair_file:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && {stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ").format(
                       tmp_dir=tmp_dir, stream_input=stream_input, sub_cmd=sub_cmd,
                       index_dir=index_dir, input_cmd=input_cmd, rg_info=rg_info,
                       num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data