def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run piped bwa alignment of fastq inputs, producing a sorted BAM.

    The fastq inputs are first converted into command line arguments:
    through ``alignprep.split_namedpipe_cl`` when ``data["align_split"]`` is
    set, through ``alignprep.fastq_convert_pipe_cl`` for "illumina" quality
    format, and through ``utils.remote_cl_input`` otherwise.

    Returns ``data`` with the BAM path stored under ``data["work_bam"]``.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    is_split = data.get("align_split")
    # For split alignments the lane-level BAM is the final combined target
    # and out_file is replaced by what setup_combine hands back.
    final_file = out_file if is_split else None
    if is_split:
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    else:
        fastq_file = utils.remote_cl_input(fastq_file)
        pair_file = utils.remote_cl_input(pair_file)
    rg_info = novoalign.get_rg_info(names)
    already_aligned = (utils.file_exists(out_file) or
                       (final_file is not None and utils.file_exists(final_file)))
    if not already_aligned:
        # If we cannot do piping, use older bwa aln approach
        tools_off = tz.get_in(["config", "algorithm", "tools_off"], data, [])
        if "bwa-mem" in tools_off or not _can_use_mem(fastq_file, data):
            aligner = _align_backtrack
        else:
            aligner = _align_mem
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run piped bwa alignment of fastq inputs, producing a sorted BAM.

    The fastq inputs are first converted into command line arguments:
    through ``alignprep.split_namedpipe_cl`` when ``data["align_split"]`` is
    set, through ``alignprep.fastq_convert_pipe_cl`` for "illumina" quality
    format, and through ``utils.remote_cl_input`` otherwise.

    Returns ``data`` with the BAM path stored under ``data["work_bam"]``.
    """
    pair_file = pair_file or ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    is_split = data.get("align_split")
    # For split alignments the lane-level BAM is the final combined target
    # and out_file is replaced by what setup_combine hands back.
    final_file = out_file if is_split else None
    if is_split:
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    elif qual_format == "illumina":
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    else:
        fastq_file = utils.remote_cl_input(fastq_file)
        pair_file = utils.remote_cl_input(pair_file)
    rg_info = novoalign.get_rg_info(names)
    already_aligned = (utils.file_exists(out_file) or
                       (final_file is not None and utils.file_exists(final_file)))
    if not already_aligned:
        # If we cannot do piping, use older bwa aln approach
        if _can_use_mem(fastq_file, data):
            aligner = _align_mem
        else:
            aligner = _align_backtrack
        out_file = aligner(fastq_file, pair_file, ref_file, out_file,
                           names, rg_info, data)
    data["work_bam"] = out_file
    return data
def _can_use_mem(fastq_file, data):
    """Check whether the reads are long enough to align with bwa-mem.

    bwa-mem handles longer (> 70bp) reads with improved piping. Randomly
    samples 5000 reads from the first two million (8,000,000 fastq lines).
    Defaults to no piping if more than 75% of the sampled reads are small.

    Returns True when bwa-mem can be used, False otherwise.
    Raises IOError when the shell pipeline produces no read length counts.
    """
    min_size = 70
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = utils.remote_cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    # Emits one "<count> <read length>" line per distinct read length.
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    cmd = cmd.format(fastq_file=fastq_file, head_count=head_count,
                     seqtk=seqtk, tocheck=tocheck)
    # Close the /dev/null handle once the pipeline finishes instead of leaking it.
    with open("/dev/null", "w") as devnull:
        count_out = subprocess.check_output(cmd, shell=True, executable="/bin/bash",
                                            stderr=devnull)
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd)
    size_counts = (l.strip().split() for l in count_out.strip().split("\n"))
    shorter = sum(int(count) for count, size in size_counts if int(size) < min_size)
    # NOTE(review): the fraction is taken against the requested sample size
    # (tocheck), not the number of reads actually sampled; inputs with fewer
    # than 5000 reads are therefore biased towards bwa-mem -- confirm intent.
    return (float(shorter) / float(tocheck)) <= thresh
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.

    Runs bamtofastq on ``cram_file`` (restricted to ``region`` when given),
    writing single, paired and orphan reads to gzipped fastq files in
    ``work_dir``. The conversion is skipped when the first paired output
    already exists.

    Returns a single-item list holding ``[pair1, pair2, single]`` paths.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    # Without a region the suffix is "full" with no leading dash; the names
    # are kept as-is so previously generated outputs are still recognised.
    rext = ("-%s" % region.replace(":", "_").replace("-", "_")) if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [
        os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
        for fext in ["s1", "p1", "p2", "o1", "o2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
             (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = utils.remote_cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            # Previously `"..." % region if region else ""` parsed as
            # `("..." % region) if region else ""`, giving an empty
            # description for whole-file conversions.
            descr = "CRAM to fastq %s" % region if region else "CRAM to fastq"
            do.run(cmd.format(**locals()), descr)
    return [[out_p1, out_p2, out_s]]
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    The output is written into the ``align_prep`` directory under
    ``dirs["work"]`` and nothing is run when it already exists.

    Returns the path to the prepared output file.
    Raises ValueError when the input neither needs bgzipping nor is remote.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = in_file.startswith(utils.SUPPORTED_REMOTES)
            in_file = utils.remote_cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            # The seqtk conversion pipe already emits uncompressed fastq, so
            # gunzipping its output would fail; only gunzip unconverted input.
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                if needs_convert:
                    # Converted reads are plain text: compress them so the
                    # content matches the .gz output name.
                    remote_cmd = "cat {in_file} | {bgzip} -c /dev/stdin > {tx_out_file}"
                else:
                    remote_cmd = "cat {in_file} > {tx_out_file}"
                do.run(remote_cmd.format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" %
                                 (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Align an input BAM file directly with BWA, streaming through pipes.

    Disk IO is avoided by chaining the processes:
      - samtools sort of input BAM to queryname
      - bedtools conversion to interleaved FASTQ
      - bwa-mem alignment
      - samtools conversion to BAM
      - samtools sort to coordinate

    Returns the path of the sorted output BAM, which is reused when it
    already exists.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    with utils.curdir_tmpdir(data), \
         postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
        sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
        cl_bam = utils.remote_cl_input(in_bam)
        pipeline = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                    "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                    "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
        full_cmd = pipeline.format(samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                                   in_bam=cl_bam, prefix1=sort_prefix, bedtools=bedtools,
                                   bwa=bwa, rg_info=rg_info, ref_file=ref_file) + tobam_cl
        descr = "bwa mem alignment from BAM: %s" % names["sample"]
        checks = [do.file_nonempty(tx_out_file),
                  do.file_reasonable_size(tx_out_file, cl_bam)]
        do.run(full_cmd, descr, None, checks)
    return out_file
def fastq_convert_pipe_cl(in_file, data):
    """Build a ``<(...)`` pipe argument converting Illumina 1.3-1.7 to Sanger.

    The returned string runs ``seqtk seq -Q64 -V`` on the input.
    Uses seqtk: https://github.com/lh3/seqtk
    """
    seqtk_cmd = config_utils.get_program("seqtk", data["config"])
    cl_input = utils.remote_cl_input(in_file)
    return "<({seqtk} seq -Q64 -V {in_file})".format(seqtk=seqtk_cmd, in_file=cl_input)
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.

    Only the first 50000 lines of ``sambamba view -h`` output are inspected.

    Returns True when at least one read passes the ``paired`` filter.
    """
    bam_file = utils.remote_cl_input(bam_file)
    cmd = ("sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin | head -1 | wc -l")
    # Close the /dev/null handle once the pipeline finishes instead of leaking it.
    with open("/dev/null", "w") as devnull:
        out = subprocess.check_output(cmd.format(bam_file=bam_file), shell=True,
                                      executable=do.find_bash(), stderr=devnull)
    return int(out) > 0
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd):
    """Build the single-end cutadapt command template.

    This has to use the -o option, not redirect to stdout, in order for
    gzipping to be honored. The ``{of1}`` placeholder is returned unformatted.
    """
    length_opt = " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    trim_cmd = base_cmd + length_opt
    in_fq = utils.remote_cl_input(fastq_files[0])
    # Not substituted here; presumably the caller fills in {of1} -- verify.
    of1 = out_files[0]
    return trim_cmd + " -o {of1} " + str(in_fq)
def _cutadapt_pe_nosickle(fastq_files, out_files, quality_format, base_cmd):
    """Build a paired-end cutadapt command template that avoids sickle.

    sickle has an issue with 0 length reads, here is the open issue for it:
    https://github.com/najoshi/sickle/issues/32
    Until that is resolved, this workaround runs cutadapt twice with the
    read order swapped, going through temporary fastq files. The
    ``{tmp_fq1}``, ``{tmp_fq2}``, ``{of1_tx}`` and ``{of2_tx}`` placeholders
    are returned unformatted.
    """
    in_fq1, in_fq2 = [utils.remote_cl_input(fq) for fq in fastq_files]
    of1, of2 = out_files
    trim_cmd = base_cmd + " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    first_pass = trim_cmd + " -o {tmp_fq1} -p {tmp_fq2} " + in_fq1 + " " + in_fq2
    second_pass = trim_cmd + " -o {of2_tx} -p {of1_tx} {tmp_fq2} {tmp_fq1}"
    cleanup = " rm {tmp_fq1} {tmp_fq2} "
    return first_pass + ";" + second_pass + ";" + cleanup
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.

    Runs bamtofastq on ``bam_file`` and pipes the reads through bgzip into
    the ``align_prep`` directory under ``dirs["work"]``. Paired inputs get a
    second ``-2.fq.gz`` file next to the first.

    Raises subprocess.CalledProcessError when the conversion fails, except
    for a first-attempt "deflate failed" error, which is only logged.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    # Checked once and reused below: the check runs an external pipeline.
    is_paired = bam.is_paired(bam_file)
    if is_paired:
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    # NOTE(review): needs_retry is set below but never read in the visible
    # code; presumably a retry call is meant to follow -- confirm.
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from earlier attempts before regenerating.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if is_paired:
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = utils.remote_cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    # logger.exception requires a message; calling it bare
                    # raised TypeError and hid the original failure.
                    logger.exception("bamtofastq failed preparing %s" % (bam_file))
                    raise
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Align an input BAM file directly with BWA, streaming through pipes.

    Disk IO is avoided by chaining the processes:
      - samtools sort of input BAM to queryname
      - bedtools conversion to interleaved FASTQ
      - bwa-mem alignment
      - samtools conversion to BAM
      - samtools sort to coordinate

    Extra bwa arguments are taken from the ``options`` entry of the bwa
    resources. Returns the path of the sorted output BAM, which is reused
    when it already exists.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    if "options" in bwa_resources:
        bwa_params = " ".join(str(opt) for opt in bwa_resources["options"])
    else:
        bwa_params = ""
    rg_info = novoalign.get_rg_info(names)
    if utils.file_exists(out_file):
        return out_file
    with tx_tmpdir(data), \
         postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
        sort_prefix = "%s-in1" % os.path.splitext(tx_out_file)[0]
        cl_bam = utils.remote_cl_input(in_bam)
        pipeline = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                    "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                    "| {bwa} mem -p -M -t {num_cores} {bwa_params} -R '{rg_info}' -v 1 {ref_file} - | ")
        full_cmd = pipeline.format(samtools=samtools, num_cores=num_cores, max_mem=max_mem,
                                   in_bam=cl_bam, prefix1=sort_prefix, bedtools=bedtools,
                                   bwa=bwa, bwa_params=bwa_params, rg_info=rg_info,
                                   ref_file=ref_file) + tobam_cl
        descr = "bwa mem alignment from BAM: %s" % names["sample"]
        checks = [do.file_nonempty(tx_out_file),
                  do.file_reasonable_size(tx_out_file, cl_bam)]
        do.run(full_cmd, descr, None, checks)
    return out_file
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.

    Runs bamtofastq on ``bam_file`` and pipes the reads through bgzip into
    the ``align_prep`` directory under ``dirs["work"]``. Paired inputs get a
    second ``-2.fq.gz`` file next to the first.

    Raises subprocess.CalledProcessError when the conversion fails, except
    for a first-attempt "deflate failed" error, which is only logged.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir,
                              "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    # Checked once and reused below: the check runs an external pipeline.
    is_paired = bam.is_paired(bam_file)
    if is_paired:
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    # NOTE(review): needs_retry is set below but never read in the visible
    # code; presumably a retry call is meant to follow -- confirm.
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from earlier attempts before regenerating.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if is_paired:
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = utils.remote_cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    # logger.exception requires a message; calling it bare
                    # raised TypeError and hid the original failure.
                    logger.exception("bamtofastq failed preparing %s" % (bam_file))
                    raise
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    The output is written into the ``align_prep`` directory under
    ``dirs["work"]`` and nothing is run when it already exists.

    Returns the path to the bgzipped output file.
    Raises ValueError when output must be created but ``needs_bgzip`` is
    not set, since this function only knows how to bgzip.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        # Explicit check instead of `assert`, which is stripped under -O.
        if not needs_bgzip:
            raise ValueError("Unexpected inputs: %s %s %s %s" %
                             (in_file, needs_bgzip, needs_gunzip, needs_convert))
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            in_file = utils.remote_cl_input(in_file)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            # The seqtk conversion pipe already emits uncompressed fastq, so
            # gunzipping its output would fail; only gunzip unconverted input.
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                   "bgzip input file")
    return out_file
def _bgzip_file(in_file, dirs, config, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    The output is written into the ``align_prep`` directory under
    ``dirs["work"]`` and nothing is run when it already exists.

    Returns the path to the prepared output file.
    Raises ValueError when the input neither needs bgzipping nor is remote.
    """
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file = os.path.join(
        work_dir,
        os.path.basename(in_file) + (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = in_file.startswith(utils.SUPPORTED_REMOTES)
            in_file = utils.remote_cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            # Converted input is already uncompressed, so gunzip is skipped.
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                # Compress converted reads with the configured bgzip command
                # rather than a hard-coded `bgzip` looked up on PATH.
                compress_cmd = "| {bgzip} -c".format(bgzip=bgzip) if needs_convert else ""
                do.run("cat {in_file} {compress_cmd} > {tx_out_file}".format(**locals()),
                       "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" %
                                 (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.

    Runs bamtofastq on ``cram_file`` (restricted to ``region`` when given),
    writing single and paired reads to gzipped fastq files in ``work_dir``
    and discarding orphans. The conversion is skipped when the first paired
    output already exists.

    Returns a single-item list holding ``[pair1, pair2, single]`` paths.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    # Without a region the suffix is "full" with no leading dash; the names
    # are kept as-is so previously generated outputs are still recognised.
    rext = ("-%s" % region.replace(":", "_").replace("-", "_")) if region else "full"
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
            cram_file = utils.remote_cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            # Previously `"..." % region if region else ""` parsed as
            # `("..." % region) if region else ""`, giving an empty
            # description for whole-file conversions.
            descr = "CRAM to fastq %s" % region if region else "CRAM to fastq"
            do.run(cmd.format(**locals()), descr)
    return [[out_p1, out_p2, out_s]]
def _can_use_mem(fastq_file, data):
    """Check whether the reads are long enough to align with bwa-mem.

    bwa-mem handles longer (> 70bp) reads with improved piping. Randomly
    samples 5000 reads from the first two million (8,000,000 fastq lines).
    Defaults to no piping if more than 75% of the sampled reads are small.

    Returns True when bwa-mem can be used, False otherwise.
    Raises IOError when the shell pipeline produces no read length counts.
    """
    min_size = 70
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = utils.remote_cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    # Emits one "<count> <read length>" line per distinct read length.
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    cmd = cmd.format(fastq_file=fastq_file, head_count=head_count,
                     seqtk=seqtk, tocheck=tocheck)
    # Close the /dev/null handle once the pipeline finishes instead of leaking it.
    with open("/dev/null", "w") as devnull:
        count_out = subprocess.check_output(cmd, shell=True, executable="/bin/bash",
                                            stderr=devnull)
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd)
    size_counts = (l.strip().split() for l in count_out.strip().split("\n"))
    shorter = sum(int(count) for count, size in size_counts if int(size) < min_size)
    # NOTE(review): the fraction is taken against the requested sample size
    # (tocheck), not the number of reads actually sampled; inputs with fewer
    # than 5000 reads are therefore biased towards bwa-mem -- confirm intent.
    return (float(shorter) / float(tocheck)) <= thresh