def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Args:
        fastq_files: input FASTQ paths; the first is passed as read 1
            (-i/-o), any following file as read 2 (-I/-O).
        adapters: adapter sequences to trim; when empty, adapter trimming
            is disabled.
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through the ``dd`` accessors.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            # Look up the configured adapter keywords once.
            configured = dd.get_adapters(data)
            if "polyx" in configured:
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            # polyx also switches on polyG trimming
            if "polyx" in configured or "polyg" in configured:
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Write the report to its transactional path so it is handled
            # together with the trimmed reads, instead of going straight to
            # the final location.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Args:
        fastq_files: input FASTQ paths; the first is passed as read 1
            (-i/-o), any following file as read 2 (-I/-O).
        adapters: adapter sequences to trim; when empty, adapter trimming
            is disabled.
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through the ``dd`` accessors.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            # polyG trimming is always requested in this variant
            cmd += ["--trim_poly_g", "--cut_by_quality3", "--cut_mean_quality", "5",
                    "--disable_quality_filtering",
                    "--length_required", str(dd.get_min_read_length(data))]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Write the report to its transactional path so it is handled
            # together with the trimmed reads, instead of going straight to
            # the final location.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences, passed as ``-a`` (and ``-A`` for pairs).
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` and for the
            ``atropos`` resource options in ``data["config"]``.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            num_cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count was previously left as a literal "%s"
                # because only str.format was applied to this template.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=num_cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            adapters_args += " --no-default-adapters"  # Prevent GitHub queries
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in
                config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add defaults only for options the user did not already supply
            # (by long name or short alias) in the resource options.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data))),
                                 ("--nextseq-trim", [], "25")]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = "--threads %s" % num_cores if num_cores > 1 else ""
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _cutadapt_pe_cmd(fastq_files, out_files, quality_format, base_cmd, data):
    """
    Build the cutadapt command template for paired end mode.

    The returned string still contains the ``{of1_tx}``, ``{of2_tx}`` and
    ``{log_tx}`` placeholders (presumably filled in by the caller).
    """
    read1, read2 = [objectstore.cl_input(fq) for fq in fastq_files]
    # Unpacking is kept so that exactly two output files remain required.
    _out1, _out2 = out_files
    min_length_opt = " --minimum-length={min_length} ".format(
        min_length=dd.get_min_read_length(data))
    paired_cmd = base_cmd + min_length_opt + " -o {of1_tx} -p {of2_tx} " + read1 + " " + read2
    return paired_cmd + "| tee > {log_tx};"
def _cutadapt_se_cmd(fastq_files, out_files, base_cmd, data):
    """
    Build the cutadapt command template for single end mode.

    Output goes through the -o option rather than a redirect of stdout,
    which is needed for gzipping to be supported. The returned string still
    contains the ``{of1_tx}`` and ``{log_tx}`` placeholders (presumably
    filled in by the caller).
    """
    min_length_opt = " --minimum-length={0} ".format(dd.get_min_read_length(data))
    read_input = objectstore.cl_input(fastq_files[0])
    # Indexing is kept so that at least one output file remains required.
    _first_out = out_files[0]
    trim_cmd = base_cmd + min_length_opt + " -o {of1_tx} " + str(read_input)
    return trim_cmd + " | tee > {log_tx}"
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences, passed as ``-a`` (and ``-A`` for pairs).
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` and for the
            ``atropos`` resource options in ``data["config"]``.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            num_cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count was previously left as a literal "%s"
                # because only str.format was applied to this template.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=num_cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                # Switch to the insert aligner for pairs with one or two adapters
                if adapters and len(adapters) <= 2:
                    aligner_args = "--aligner insert"
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in
                config_utils.get_resources("atropos", data["config"]).get("options", []))
            thread_args = "--threads %s" % num_cores if num_cores > 1 else ""
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {aligner_args} {input_args} {output_args} {report_args}")
            cmd += " --quality-cutoff=5 --minimum-length=%s" % dd.get_min_read_length(data)
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, aligner_args=aligner_args,
                              input_args=input_args, output_args=output_args,
                              report_args=report_args),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences, passed as ``-a`` (and ``-A`` for pairs).
            The list is not modified.
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` and for the
            ``atropos`` resource options in ``data["config"]``.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            config_adapters = dd.get_adapters(data)
            # polyX trimming: homopolymer stretches supplied as 3' adapters.
            # Extend a copy so the caller's list is not mutated and repeated
            # calls do not accumulate duplicate polyX entries.
            trim_adapters = list(adapters)
            if "polyx" in config_adapters:
                trim_adapters += ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in trim_adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            # Prevent GitHub queries and saving pickles
            adapters_args += " --no-default-adapters --no-cache-adapters"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                # Halve the core count, which is then used for atropos and
                # for each of the two bgzip processes.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(
                    ["-A '%s'" % a for a in trim_adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(
                                   cores=cores, tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in
                config_utils.get_resources("atropos", data["config"]).get("options", []))
            want_nextseq = "polyx" in config_adapters or "polyg" in config_adapters
            # Add defaults only when wanted and not already supplied (by long
            # name or short alias) in the resource options.
            extra_opts = []
            for k, alt_ks, v, want in [
                    ("--quality-cutoff", ["-q "], "5", True),
                    ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                    ("--nextseq-trim", [], "25", want_nextseq)]:
                if want and k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = "--threads %s" % cores if cores > 1 else ""
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences, passed as ``-a`` (and ``-A`` for pairs).
            The list is not modified.
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` and for the
            ``atropos`` resource options in ``data["config"]``.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            config_adapters = dd.get_adapters(data)
            # polyX trimming, anchored to the 3' ends of reads.
            # Extend a copy so the caller's list is not mutated and repeated
            # calls do not accumulate duplicate polyX entries.
            trim_adapters = list(adapters)
            if "polyx" in config_adapters:
                trim_adapters += ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in trim_adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            # Prevent GitHub queries and saving pickles
            adapters_args += " --no-default-adapters --no-cache-adapters"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                # Halve the core count, which is then used for atropos and
                # for each of the two bgzip processes.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(
                    ["-A '%s'" % a for a in trim_adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(
                                   cores=cores, tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in
                config_utils.get_resources("atropos", data["config"]).get("options", []))
            want_nextseq = "polyx" in config_adapters or "polyg" in config_adapters
            # Add defaults only when wanted and not already supplied (by long
            # name or short alias) in the resource options.
            extra_opts = []
            for k, alt_ks, v, want in [
                    ("--quality-cutoff", ["-q "], "5", True),
                    ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                    ("--nextseq-trim", [], "25", want_nextseq)]:
                if want and k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = "--threads %s" % cores if cores > 1 else ""
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Args:
        fastq_files: one (single-end) or two (paired-end) input FASTQ paths.
        adapters: adapter sequences, passed as ``-a`` (and ``-A`` for pairs).
        out_dir: directory receiving the trimmed reads and JSON report.
        data: sample dictionary, queried through ``dd`` and for the
            ``atropos`` resource options in ``data["config"]``.

    Returns:
        Tuple of (list of trimmed FASTQ paths, JSON report path). The
        command is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            num_cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # The thread count was previously left as a literal "%s"
                # because only str.format was applied to this template.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=num_cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                tx_out2 = tx_out[2]
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    tx_out1=tx_out1, tx_out2=tx_out2)
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(
                str(x) for x in
                config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Add defaults only for options the user did not already supply
            # (by long name or short alias) in the resource options.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = "--threads %s" % num_cores if num_cores > 1 else ""
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file