def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)  # fills the bgzip thread count below
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            adapters_args += " --no-default-adapters"  # Prevent GitHub queries
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data))),
                                 ("--nextseq-trim", [], "25")]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % dd.get_num_cores(data)
                           if dd.get_num_cores(data) > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file

def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)  # fills the bgzip thread count below
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                if adapters and len(adapters) <= 2:
                    aligner_args = "--aligner insert"
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            thread_args = ("--threads %s" % dd.get_num_cores(data)
                           if dd.get_num_cores(data) > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {aligner_args} {input_args} {output_args} {report_args}")
            cmd += " --quality-cutoff=5 --minimum-length=%s" % dd.get_min_read_length(data)
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file

def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in dd.get_adapters(data):
                adapters += ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25",
                                        ("polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file

def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in dd.get_adapters(data):
                adapters += ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25",
                                        ("polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file

def _can_use_mem(fastq_file, data):
    """bwa-mem handles longer (> 70bp) reads with improved piping.

    Randomly samples 5000 reads from the first two million. Defaults to
    no piping if more than 75% of the sampled reads are small.
    """
    min_size = 70
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    # decode so the line splitting below works on both Python 2 and 3
    count_out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                        executable="/bin/bash",
                                        stderr=open("/dev/null", "w")).decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals()))
    shorter = 0
    for count, size in (l.strip().split() for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh

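# Illustrative sketch (not part of the original module): `sort | uniq -c` in the
# pipeline above yields "count length" pairs, one per distinct read length, which
# the loop tallies against min_size. The numbers here are made up.
def _example_short_fraction():
    count_out = "   4200 100\n    800 35"  # e.g. 4200 reads of 100bp, 800 of 35bp
    shorter = sum(int(count) for count, size in
                  (l.strip().split() for l in count_out.strip().split("\n"))
                  if int(size) < 70)
    return float(shorter) / 5000.0  # 0.16, under the 0.75 threshold -> piping is OK
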
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2, out_o1, out_o2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                                          (base_name, rext, fext))
                                             for fext in ["s1", "p1", "p2", "o1", "o2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2, out_o1, out_o2) as \
                (tx_out_s, tx_out_p1, tx_out_p2, tx_out_o1, tx_out_o2):
            cram_file = objectstore.cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} exclude=SECONDARY,SUPPLEMENTARY "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O={tx_out_o1} O2={tx_out_o2} "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
    return [[out_p1, out_p2, out_s]]

def is_paired(bam_file):
    """Determine if a BAM file has paired reads.

    Works around issues with head closing the samtools pipe using signal trick from:
    http://stackoverflow.com/a/12451083/252589
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("set -o pipefail; "
           "samtools view -h {bam_file} | head -300000 | "
           "samtools view -S -f 1 /dev/stdin | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(**locals()), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    stdout = stdout.decode()
    stderr = stderr.decode()
    stderr = stderr.strip()
    if ((p.returncode == 0 or p.returncode == 141) and
            (stderr == "" or (stderr.startswith("gof3r") and stderr.endswith("broken pipe")))):
        return int(stdout) > 0
    else:
        raise ValueError("Failed to check paired status of BAM file: %s" % str(stderr))

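# Standalone sketch (illustrative, not part of the original module; relies on the
# same subprocess/signal imports used above) of the SIGPIPE trick: restoring
# SIG_DFL in the child lets `head` close the pipe early, so upstream writers exit
# with 141 instead of surfacing "broken pipe" errors.
def _sigpipe_demo():
    p = subprocess.Popen("yes | head -1", shell=True, executable="/bin/bash",
                         stdout=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, _ = p.communicate()
    return stdout, p.returncode  # (b"y\n", 0); 141 also counts as success above
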
def _can_use_mem(fastq_file, data, read_min_size=None):
    """bwa-mem handles longer (> 70bp) reads with improved piping.

    Randomly samples 5000 reads from the first two million. Defaults to
    no piping if more than 75% of the sampled reads are small.

    If we've previously calculated minimum read sizes (from rtg SDF output)
    we can skip the formal check.
    """
    min_size = 70
    if read_min_size and read_min_size >= min_size:
        return True
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")
    # decode so the line splitting below works on both Python 2 and 3
    count_out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                        executable="/bin/bash").decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals()))
    shorter = 0
    for count, size in (l.strip().split() for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh

def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file,
                                           unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()),
                       "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" %
                                 (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file

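# Summary of how the flags above combine (descriptive, derived from the branches):
#   needs_bgzip + needs_gunzip  -> gunzip -c {in_file} | bgzip -c /dev/stdin > out
#   needs_bgzip + needs_convert -> bgzip -c <(seqtk seq -Q64 -V ...) > out
#   needs_bgzip alone           -> bgzip -c {in_file} > out
#   is_remote, no bgzip needed  -> cat streamed input > out (piped through bgzip
#                                  only when converting qualities)
# Any other combination raises ValueError.
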
def fastq_size_output(fastq_file, tocheck):
    head_count = 8000000
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    cmd = (utils.local_path_export() + gzip_cmd + " | head -n {head_count} | "
           "seqtk sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")

    def fix_signal():
        """Avoid spurious 'cat: write error: Broken pipe' message due to head command.

        Work around from:
        https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    count_out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                        executable="/bin/bash", preexec_fn=fix_signal).decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals()))
    for count, size in (l.strip().split() for l in count_out.strip().split("\n")):
        yield count, size

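# Hypothetical consumer of the generator above (the helper name is illustrative,
# not part of the original module): rebuilds the short-read fraction used by
# _can_use_mem on top of fastq_size_output.
def _short_read_fraction(fastq_file, tocheck=5000, min_size=70):
    shorter = sum(int(count) for count, size in fastq_size_output(fastq_file, tocheck)
                  if int(size) < min_size)
    return float(shorter) / float(tocheck)
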
def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        gresources = get_resources(data["genome_build"], ref_file, data)
        if data.get("files") and population.do_db_build([data], need_bam=False, gresources=gresources):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)

def download_prepped_genome(genome_build, data, name, need_remap, out_dir=None):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand. Upgrades
    GEMINI in place if installed inside a Docker container with the biological data.
    GEMINI install requires write permissions to standard data directories -- works
    on AWS but not generalizable elsewhere.
    """
    from bcbio.variation import population
    from bcbio import install
    if not out_dir:
        out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                                  "inputs", "data", "genomes"))
    for target in REMAP_NAMES.get(name, [name]):
        ref_dir = os.path.join(out_dir, genome_build, target)
        if not os.path.exists(ref_dir):
            if target in INPLACE_INDEX:
                ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
                # Need to add genome resources so we can retrieve GTF files for STAR
                data["genome_resources"] = get_resources(data["genome_build"], ref_file, data)
                INPLACE_INDEX[target](ref_file, ref_dir, data)
            else:
                # XXX Currently only supports genomes from S3 us-east-1 bucket.
                # Need to assess how slow this is from multiple regions and generalize to non-AWS.
                fname = objectstore.BIODATA_INFO["s3"].format(build=genome_build, target=target)
                try:
                    objectstore.connect(fname)
                except:
                    raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
                with utils.chdir(out_dir):
                    cmd = objectstore.cl_input(fname, unpack=False, anonpipe=False) + " | pigz -d -c | tar -xvp"
                    do.run(cmd.format(**locals()), "Download pre-prepared genome data: %s" % genome_build)
    ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
    if data.get("genome_build"):
        if (data.get("files") and population.do_db_build([data], need_bam=False)
                and population.support_gemini_orig(data)):
            # symlink base GEMINI directory to work directory, avoiding write/space issues
            out_gemini_dir = utils.safe_makedir(os.path.join(os.path.dirname(ref_dir), "gemini_data"))
            orig_gemini_dir = install.get_gemini_dir()
            # Remove empty initial directory created by installer
            if os.path.isdir(orig_gemini_dir) and len(os.listdir(orig_gemini_dir)) == 0:
                if os.path.islink(orig_gemini_dir):
                    os.remove(orig_gemini_dir)
                else:
                    os.rmdir(orig_gemini_dir)
            if not os.path.exists(orig_gemini_dir):
                os.symlink(out_gemini_dir, orig_gemini_dir)
            cmd = [os.path.join(os.path.dirname(sys.executable), "gemini"), "update", "--dataonly"]
            do.run(cmd, "Download GEMINI data")
    genome_dir = os.path.join(out_dir, genome_build)
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    else:
        ref_dir = os.path.join(genome_dir, REMAP_NAMES.get(name, [name])[-1])
        base_name = os.path.commonprefix(os.listdir(ref_dir))
        while base_name.endswith("."):
            base_name = base_name[:-1]
        return os.path.join(ref_dir, base_name)

def fastq_convert_pipe_cl(in_file, data):
    """Create an anonymous pipe converting Illumina 1.3-1.7 to Sanger.

    Uses seqtk: https://github.com/lh3/seqtk
    """
    seqtk = config_utils.get_program("seqtk", data["config"])
    in_file = objectstore.cl_input(in_file)
    return "<({seqtk} seq -Q64 -V {in_file})".format(**locals())

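# Usage sketch (illustrative, not from the original module): the returned string
# is a bash process substitution, so any downstream command embedding it must run
# under bash rather than sh.
#   pipe = fastq_convert_pipe_cl("in_illumina.fastq", data)
#   # e.g. "<(seqtk seq -Q64 -V in_illumina.fastq)", consumed as a file argument:
#   do.run("wc -l %s" % pipe, "count converted lines")
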
def _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data):
    """
    run cutadapt in paired end mode
    """
    fq1, fq2 = [objectstore.cl_input(x) for x in fastq_files]
    of1, of2 = out_files
    base_cmd += " --minimum-length={min_length} ".format(min_length=dd.get_min_read_length(data))
    first_cmd = base_cmd + " -o {of1_tx} -p {of2_tx} " + fq1 + " " + fq2
    return first_cmd + "| tee > {log_tx};"

def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. "
                                "Retrying with single core." % (bam_file))
                    needs_retry = True
                else:
                    # logging's exception() requires a message argument
                    logger.exception(str(msg))
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]

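# Note on out_str above (descriptive, not new behavior): bamtofastq's F=/F2=
# (or S=) destinations are bash process substitutions (>(...)), so each read
# stream is bgzipped in flight and no uncompressed fastq ever lands on disk.
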
def is_paired(bam_file):
    """Determine if a BAM file has paired reads.
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin | head -1 | wc -l")
    out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                  executable=do.find_bash(),
                                  stderr=open("/dev/null", "w"))
    return int(out) > 0

def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. "
                                "Retrying with single core." % (bam_file))
                    needs_retry = True
                else:
                    # logging's exception() requires a message argument
                    logger.exception(str(msg))
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]

def _cutadapt_se_cmd(fastq_files, out_files, base_cmd):
    """
    this has to use the -o option, not redirect to stdout in order for
    gzipping to be honored
    """
    min_length = MINIMUM_LENGTH
    cmd = base_cmd + " --minimum-length={min_length} ".format(**locals())
    fq1 = objectstore.cl_input(fastq_files[0])
    of1 = out_files[0]
    cmd += " -o {of1} " + str(fq1)
    return cmd

def _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data):
    """
    this has to use the -o option, not redirect to stdout in order for
    gzipping to be supported
    """
    min_length = dd.get_min_read_length(data)
    cmd = base_cmd + " --minimum-length={min_length} ".format(**locals())
    fq1 = objectstore.cl_input(fastq_files[0])
    of1 = out_files[0]
    cmd += " -o {of1_tx} " + str(fq1)
    cmd = "%s | tee > {log_tx}" % cmd
    return cmd

def _cutadapt_pe_nosickle(fastq_files, out_files, quality_format, base_cmd):
    """
    sickle has an issue with 0 length reads, here is the open issue for it:
    https://github.com/najoshi/sickle/issues/32
    until that is resolved, this is a workaround which avoids using sickle
    """
    fq1, fq2 = [objectstore.cl_input(x) for x in fastq_files]
    of1, of2 = out_files
    base_cmd += " --minimum-length={min_length} ".format(min_length=MINIMUM_LENGTH)
    first_cmd = base_cmd + " -o {tmp_fq1} -p {tmp_fq2} " + fq1 + " " + fq2
    second_cmd = base_cmd + " -o {of2_tx} -p {of1_tx} {tmp_fq2} {tmp_fq1}"
    return first_cmd + ";" + second_cmd + "; rm {tmp_fq1} {tmp_fq2} "

def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)  # fills the bgzip thread count below
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % dd.get_num_cores(data)
                           if dd.get_num_cores(data) > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file

def _bgzip_file(finput, config, work_dir, needs_bgzip, needs_gunzip, needs_convert, data):
    """Handle bgzip of input file, potentially gunzipping an existing file.

    Handles cases where finput might be multiple files and need to be concatenated.
    """
    if isinstance(finput, six.string_types):
        in_file = finput
    else:
        assert not needs_convert, "Do not yet handle quality conversion with multiple inputs"
        return _bgzip_multiple_files(finput, work_dir, data)
    out_file = os.path.join(work_dir, os.path.basename(in_file).replace(".bz2", "") +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert
                                           or needs_bgzip or dd.get_trim_ends(data))
            if needs_convert or dd.get_trim_ends(data):
                in_file = fastq_convert_pipe_cl(in_file, data)
            if needs_gunzip and not (needs_convert or dd.get_trim_ends(data)):
                if in_file.endswith(".bz2"):
                    gunzip_cmd = "bunzip2 -c {in_file} |".format(**locals())
                else:
                    gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if (needs_convert or dd.get_trim_ends(data)) else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()),
                       "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" %
                                 (in_file, needs_bgzip, needs_gunzip, needs_convert))
    return out_file

def is_empty(bam_file):
    """Determine if a BAM file is empty.
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("set -o pipefail; "
           "samtools view {bam_file} | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(**locals()), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    # decode so the string comparisons below work on both Python 2 and 3
    stdout, stderr = stdout.decode(), stderr.decode()
    stderr = stderr.strip()
    if ((p.returncode == 0 or p.returncode == 141) and
            (stderr == "" or (stderr.startswith("gof3r") and stderr.endswith("broken pipe")))):
        return int(stdout) == 0
    else:
        raise ValueError("Failed to check empty status of BAM file: %s" % str(stderr))

def is_empty(bam_file):
    """Determine if a BAM file is empty.
    """
    bam_file = objectstore.cl_input(bam_file)
    sambamba = config_utils.get_program("sambamba", {})
    cmd = ("set -o pipefail; "
           "{sambamba} view {bam_file} | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(**locals()), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    # decode so the string comparisons below work on both Python 2 and 3
    stdout, stderr = stdout.decode(), stderr.decode()
    stderr = stderr.strip()
    if ((p.returncode == 0 or p.returncode == 141) and
            (stderr == "" or (stderr.startswith("gof3r") and stderr.endswith("broken pipe")))):
        return int(stdout) == 0
    else:
        raise ValueError("Failed to check empty status of BAM file: %s" % str(stderr))

def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" %
                              os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. "
                                "Retrying with single core." % (bam_file))
                    needs_retry = True
                else:
                    # logging's exception() requires a message argument
                    logger.exception(str(msg))
                    raise
    # Retry once on a single core if bgzip deflate failed; otherwise return
    # the generated fastq outputs.
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None]

def is_paired(bam_file):
    """Determine if a BAM file has paired reads.

    Works around issues with head closing the samtools pipe using signal trick from:
    http://stackoverflow.com/a/12451083/252589
    """
    bam_file = objectstore.cl_input(bam_file)
    cmd = ("set -o pipefail; "
           "sambamba view -h {bam_file} | head -50000 | "
           "sambamba view -S -F paired /dev/stdin | head -1 | wc -l")
    p = subprocess.Popen(cmd.format(**locals()), shell=True,
                         executable=do.find_bash(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         preexec_fn=lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL))
    stdout, stderr = p.communicate()
    stdout, stderr = stdout.decode(), stderr.decode()
    # parenthesize the returncode check: both 0 and 141 (SIGPIPE) count as success
    if (p.returncode == 0 or p.returncode == 141) and stderr.strip() == "":
        return int(stdout) > 0
    else:
        raise ValueError("Failed to check paired status of BAM file: %s" % str(stderr))

def _can_use_mem(fastq_file, data, read_min_size=None):
    """bwa-mem handles longer (> 70bp) reads with improved piping.

    Randomly samples 5000 reads from the first two million. Defaults to
    no piping if more than 75% of the sampled reads are small.

    If we've previously calculated minimum read sizes (from rtg SDF output)
    we can skip the formal check.
    """
    min_size = 70
    if read_min_size and read_min_size >= min_size:
        return True
    thresh = 0.75
    head_count = 8000000
    tocheck = 5000
    seqtk = config_utils.get_program("seqtk", data["config"])
    fastq_file = objectstore.cl_input(fastq_file)
    gzip_cmd = "zcat {fastq_file}" if fastq_file.endswith(".gz") else "cat {fastq_file}"
    cmd = (gzip_cmd + " | head -n {head_count} | "
           "{seqtk} sample -s42 - {tocheck} | "
           "awk '{{if(NR%4==2) print length($1)}}' | sort | uniq -c")

    def fix_signal():
        """Avoid spurious 'cat: write error: Broken pipe' message due to head command.

        Work around from:
        https://bitbucket.org/brodie/cram/issues/16/broken-pipe-when-heading-certain-output
        """
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    # decode so the line splitting below works on both Python 2 and 3
    count_out = subprocess.check_output(cmd.format(**locals()), shell=True,
                                        executable="/bin/bash", preexec_fn=fix_signal).decode()
    if not count_out.strip():
        raise IOError("Failed to check fastq file sizes with: %s" % cmd.format(**locals()))
    shorter = 0
    for count, size in (l.strip().split() for l in count_out.strip().split("\n")):
        if int(size) < min_size:
            shorter += int(count)
    return (float(shorter) / float(tocheck)) <= thresh

def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Provide a commandline for prep of fastq inputs with seqtk.

    Handles fast conversion of fastq quality scores and trimming.
    """
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    if in_file:
        in_file = objectstore.cl_input(in_file)
    else:
        in_file = "/dev/stdin"
    cmd = ""
    if needs_convert:
        cmd += "{seqtk} seq -Q64 -V {in_file}".format(**locals())
    if trim_ends:
        left_trim, right_trim = trim_ends[0:2] if data.get("read_num", read_num) == 0 else trim_ends[2:4]
        if left_trim or right_trim:
            trim_infile = "/dev/stdin" if needs_convert else in_file
            pipe = " | " if needs_convert else ""
            cmd += "{pipe}{seqtk} trimfq -b {left_trim} -e {right_trim} {trim_infile}".format(**locals())
    return cmd

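# Example commandlines the helper above produces (illustrative, assuming the
# seqtk program resolves to plain "seqtk" and in_file is "in.fq"):
#   illumina qualities + trim_ends [3, 0, 0, 0], read one:
#     seqtk seq -Q64 -V in.fq | seqtk trimfq -b 3 -e 0 /dev/stdin
#   sanger qualities, no trimming -> "" (callers skip the prep pipe entirely)
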