def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Builds a single ``fastp`` command for the inputs in ``fastq_files`` and
    runs it inside a file transaction. Nothing is run when the first trimmed
    output already exists.

    Args:
        fastq_files: input FASTQ paths; the first is passed with ``-i``/``-o``
            and every following file with ``-I``/``-O``.
        adapters: adapter sequences, each passed via ``--adapter_sequence``;
            when empty, ``--disable_adapter_trimming`` is added instead.
        out_dir: directory that receives the trimmed files and JSON report.
        data: sample dictionary read through the ``dd`` accessors.

    Returns:
        Tuple ``(out_files, report_file)`` with the final output paths.
    """
    first_base = utils.splitext_plus(os.path.basename(fastq_files[0]))[0]
    report_file = os.path.join(out_dir, "%s-report.json" % first_base)
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            # The transaction yields one temporary path per final path, in the
            # same order they were passed: report first, then the FASTQ outputs.
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            requested = dd.get_adapters(data)
            if "polyx" in requested:
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in requested or "polyg" in requested:
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Write the report to the transactional path so a failed run does
            # not leave a report behind at the final location.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Runs ``atropos trim`` on one (single end) or two (paired end) FASTQ files
    inside a file transaction, compressing output through ``bgzip``. Nothing
    is run when the first trimmed output already exists.

    Args:
        fastq_files: one or two input FASTQ paths.
        adapters: adapter sequences passed as ``-a`` (and also ``-A`` for
            paired input). The list is not modified.
        out_dir: directory that receives the trimmed files and JSON report.
        data: sample dictionary read through the ``dd`` accessors; atropos
            ``options`` come from ``config_utils.get_resources``.

    Returns:
        Tuple ``(out_files, report_file)`` with the final output paths.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            requested = dd.get_adapters(data)
            # Work on a copy so the polyX patterns do not leak into the caller's list.
            adapters = list(adapters)
            # polyX trimming at the 3' ends of reads (patterns carry no `$` anchor)
            if "polyx" in requested:
                adapters += ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                # Each of the two bgzip writers gets half of the cores.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(
                                   cores=cores, tx_out1=tx_out1, tx_out2=tx_out[2])
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file, sample_name)
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # (option, alternative spellings, default value, wanted); a default
            # is only added when the user options do not already mention it.
            defaults = [("--quality-cutoff", ["-q "], "5", True),
                        ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                        ("--nextseq-trim", [], "25", ("polyx" in requested or "polyg" in requested))]
            extra_opts = " ".join(
                "%s=%s" % (k, v) for k, alt_ks, v, want in defaults
                if want and k not in ropts and not any(alt_k in ropts for alt_k in alt_ks))
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Runs ``atropos trim`` on one (single end) or two (paired end) FASTQ files
    inside a file transaction, compressing output through ``bgzip``. Nothing
    is run when the first trimmed output already exists.

    Args:
        fastq_files: one or two input FASTQ paths.
        adapters: adapter sequences passed as ``-a`` (and also ``-A`` for
            paired input). The list is not modified.
        out_dir: directory that receives the trimmed files and JSON report.
        data: sample dictionary read through the ``dd`` accessors; atropos
            ``options`` come from ``config_utils.get_resources``.

    Returns:
        Tuple ``(out_files, report_file)`` with the final output paths.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            requested = dd.get_adapters(data)
            # Work on a copy so the polyX patterns do not leak into the caller's list.
            adapters = list(adapters)
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in requested:
                adapters += ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    cores=cores, tx_out1=tx_out1)
            else:
                assert len(fastq_files) == 2, fastq_files
                # Each of the two bgzip writers gets half of the cores.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(
                                   cores=cores, tx_out1=tx_out1, tx_out2=tx_out[2])
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file, sample_name)
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # (option, alternative spellings, default value, wanted); a default
            # is only added when the user options do not already mention it.
            defaults = [("--quality-cutoff", ["-q "], "5", True),
                        ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                        ("--nextseq-trim", [], "25", ("polyx" in requested or "polyg" in requested))]
            extra_opts = " ".join(
                "%s=%s" % (k, v) for k, alt_ks, v, want in defaults
                if want and k not in ropts and not any(alt_k in ropts for alt_k in alt_ks))
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(ropts=ropts, thread_args=thread_args, quality_base=quality_base,
                              adapters_args=adapters_args, input_args=input_args,
                              output_args=output_args, report_args=report_args,
                              extra_opts=extra_opts),
                   "Trimming with atropos: %s" % sample_name)
    return out_files, report_file
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    The first configured adapter is trimmed from ``data["files"][0]`` into
    ``<work_dir>/trimmed/<sample>/``. When ``trim_reads`` is disabled, or no
    adapter is configured, the input is linked to the output path instead.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
    filled in.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapters = dd.get_adapters(data)
    # Guard against an empty adapter list: indexing it would raise before any
    # output is produced, so fall through to the untrimmed path instead.
    if trim_reads and adapters:
        # NOTE: the locals below keep their names because the command template
        # from _cmd_cutadapt() is filled in from locals().
        adapter = adapters[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    Adapters come from the sample configuration or, when none are set and
    trimming is enabled, from ``_dnapi_prediction``. When trimming is disabled
    or no adapter is available the input is linked to the output path.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
    filled in.

    Raises:
        ValueError: when trimming needs adapter prediction but ``error_dnapi``
            is set, or when cutadapt options are present in the resources.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    # Only predict adapters when trimming will actually run; the skip branch
    # never uses them.
    adapters = adapter
    if trim_reads and not adapters:
        adapters = _dnapi_prediction(in_file)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        # NOTE: local names in this branch are kept as-is because the command
        # template from _cmd_atropos() is filled in from locals().
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path with the sample name in the log.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass applying the user supplied atropos options.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    The first configured adapter is trimmed from ``data["files"][0]`` into
    ``<work_dir>/trimmed/<sample>/``. When ``trim_reads`` is disabled, or no
    adapter is configured, the input is linked to the output path instead.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
    filled in.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapters = dd.get_adapters(data)
    # An empty adapter list cannot be indexed; treat it like disabled trimming
    # rather than failing before any output exists.
    if trim_reads and adapters:
        # NOTE: the locals below keep their names because the command template
        # from _cmd_cutadapt() is filled in from locals().
        adapter = adapters[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    Returns early when the cleaned FASTQ already exists. Otherwise adapters
    come from the sample configuration or, when none are set and trimming is
    enabled, from ``_dnapi_prediction``. When trimming is disabled or no
    adapter is available the input is linked to the output path.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
    filled in.

    Raises:
        ValueError: when trimming needs adapter prediction but ``error_dnapi``
            is set, or when cutadapt options are present in the resources.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    # Only predict adapters when trimming will actually run; the skip branch
    # never uses them.
    adapters = adapter
    if trim_reads and not adapters:
        adapters = _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        # NOTE: local names in this branch are kept as-is because the command
        # template from _cmd_atropos() is filled in from locals().
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path with the sample name in the log.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass applying the user supplied atropos options.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Trims the first configured adapter from ``data["files"][0]``; when user
    cutadapt ``options`` are configured a second cutadapt pass applies them.
    When trimming is disabled or no adapter is configured, the input is
    linked to the output path.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
    filled in.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        # NOTE: local names in this branch are kept as-is because the command
        # template from _cmd_cutadapt() is filled in from locals().
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(
            config_utils.get_resources("cutadapt", data['config']).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path with the sample name in the log.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass applying the user supplied cutadapt options.
                    in_file = tx_out_file + ".tmp.fastq"
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """Trim the first configured adapter from a small RNA sample with cutadapt.

    Reads the input from ``data["files"][0]``, writes the cleaned FASTQ under
    ``<work_dir>/trimmed/<sample>/`` and stores ``clean_fastq``, ``collapse``
    and ``size_stats`` on ``data``.

    Returns ``[[data]]``.
    """
    # NOTE(review): indexing [0] raises when no adapter is configured; there is
    # no fallback path in this version - confirm callers always set one.
    adapter = dd.get_adapters(data)[0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    in_file = data["files"][0]
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    # The locals below are not referenced again in this function; presumably
    # they are consumed by the _cmd_cutadapt() template through locals() -
    # verify against that helper before renaming or removing any of them.
    out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
    out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
    # cutadapt is looked up next to the running Python interpreter.
    cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
    cmd = _cmd_cutadapt()
    # Skip the run when the cleaned file already exists.
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(**locals()), "remove adapter")
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Trims the first configured adapter from ``data["files"][0]``; when user
    cutadapt ``options`` are present under ``data["resources"]`` a second
    cutadapt pass applies them. When trimming is disabled or no adapter is
    configured, the input is linked to the output path.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
    filled in.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        # NOTE: local names in this branch are kept as-is because the command
        # template from _cmd_cutadapt() is filled in from locals().
        adapter = adapter[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path with the sample name in the log.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if options:
                    # Second pass applying the user supplied cutadapt options.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    After ``umi_transform``, returns early when the cleaned FASTQ already
    exists. Otherwise adapters made only of ``N/A/T/G/C`` are taken from the
    configuration (which also forces trimming on), or predicted with
    ``_dnapi_prediction`` when trimming is enabled and none are configured.
    A ``4N`` adapter entry, or atropos options equal to ``-u 4 -u -4``,
    triggers a second atropos pass with ``-u 4 -u -4``. When trimming is
    disabled or no adapter is available the input is linked to the output.

    Returns ``[[data]]`` with ``files[0]``, ``clean_fastq``, ``collapse`` and
    ``size_stats`` updated; ``log_trimming`` is set when trimming applies or
    the output already existed.

    Raises:
        ValueError: when trimming needs adapter prediction but ``error_dnapi``
            is set, or when cutadapt options are present in the resources.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    # Compile the adapter patterns once instead of once per adapter.
    sequence_re = re.compile("^([NATGC]+)$")
    all_n_re = re.compile("^N+$")
    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    # Keep real sequences only; keywords such as "4N" are dropped here.
    adapter = [a for a in adapter if sequence_re.match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("Adapter is set up in config file, but trim_reads is not true."
                    "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        # NOTE: local names in this branch are kept as-is because the command
        # template from _cmd_atropos() is filled in from locals().
        adapter_cmd = " ".join("-a " + x for x in adapters)
        if any(all_n_re.match(a) for a in adapters):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            # These options are applied in the second pass below instead.
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path with the sample name in the log.
                    with open(log_out) as in_handle:
                        content = in_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as out_handle:
                        out_handle.write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "atropos with this parameters %s for %s" % (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _get_rv_adapters(data):
    """Return the flattened reverse adapters for a sample.

    Looks up each configured adapter name in ``RV_ADAPTERS`` and appends the
    sample's custom trim sequences before flattening.
    """
    # NOTE(review): membership is tested against FW_ADAPTERS while the lookup
    # uses RV_ADAPTERS, so a name found only in FW_ADAPTERS raises KeyError -
    # confirm both tables share the same keys.
    selected = []
    for name in dd.get_adapters(data):
        if name in FW_ADAPTERS:
            selected.append(RV_ADAPTERS[name])
    return flatten(selected + dd.get_custom_trim(data))
def _get_rv_adapters(data):
    """Return the flattened reverse adapters for a sample.

    Built-in entries are read from ``RV_ADAPTERS`` for every configured
    adapter name, followed by the sample's custom trim sequences.
    """
    # NOTE(review): names are filtered with FW_ADAPTERS but resolved through
    # RV_ADAPTERS; a key missing from RV_ADAPTERS would raise KeyError -
    # confirm both tables share the same keys.
    requested = dd.get_adapters(data)
    recognised = [key for key in requested if key in FW_ADAPTERS]
    reverse_builtin = [RV_ADAPTERS[key] for key in recognised]
    combined = reverse_builtin + dd.get_custom_trim(data)
    return flatten(combined)