def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    The first entry of ``data["files"]`` is processed and the result is
    placed under ``<work_dir>/trimmed/<sample name>/``. When trimming is
    disabled (``trim_reads`` is false in the algorithm config) or no adapter
    is configured, the input is linked to the output location instead of
    being trimmed.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and
    ``size_stats`` filled in.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    # Fetch the adapter list before indexing it: an empty or missing list
    # previously crashed on ``[0]`` instead of skipping the trimming step.
    adapters = dd.get_adapters(data) if trim_reads else None
    if trim_reads and adapters:
        # NOTE: the command template is filled from locals(), so the local
        # names below are presumably referenced by it -- verify against
        # _cmd_cutadapt() before renaming or removing any of them.
        adapter = adapters[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        # Nothing to trim: expose the raw reads under the "clean" name.
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Works on ``data["files"][0]`` and writes into
    ``<work_dir>/trimmed/<sample name>/``. If ``trim_reads`` is disabled in
    the algorithm config, or no adapter sequence is available, the input
    file is linked to the output path rather than trimmed.

    Returns ``[[data]]`` after setting ``clean_fastq``, ``collapse`` and
    ``size_stats`` on ``data``.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    # Only index the adapter list once we know it is non-empty; indexing an
    # empty list used to abort the whole sample.
    adapters = dd.get_adapters(data) if trim_reads else None
    if trim_reads and adapters:
        # NOTE: cmd is formatted with locals(); the names defined here are
        # presumably placeholders of the template -- confirm in
        # _cmd_cutadapt() before touching them.
        adapter = adapters[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        # No trimming requested/possible: link input as the clean file.
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    Adapters come from the configuration; when none are configured they are
    predicted with ``_dnapi_prediction``. If trimming is disabled or no
    adapter is found, the input file is linked to the output location.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and
    ``size_stats`` set.

    Raises ``ValueError`` when adapters must be predicted but
    ``error_dnapi`` is set, or when cutadapt options are still present in
    the resources section of the configuration.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        # NOTE: both command templates are filled from locals(); the local
        # names in this branch are presumably template placeholders, so keep
        # them stable (verify against _cmd_atropos()).
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data)
                 if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file. "
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path with the sample name
                    # in the log; use context managers so both handles are
                    # closed deterministically.
                    with open(log_out) as log_in:
                        content = log_in.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass with the user supplied options, reading
                    # the first-pass output moved aside as a temporary file.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    If the trimmed output already exists the function returns immediately,
    only recomputing the ``collapse`` and ``size_stats`` entries. Otherwise
    adapters are taken from the configuration or predicted with
    ``_dnapi_prediction``; when trimming is disabled or no adapter is found
    the input file is linked to the output location.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and
    ``size_stats`` set.

    Raises ``ValueError`` when adapters must be predicted but
    ``error_dnapi`` is set, or when cutadapt options are still present in
    the resources section of the configuration.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        # Output from a previous run: skip adapter detection and trimming.
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        # NOTE: both command templates are filled from locals(); the local
        # names in this branch are presumably template placeholders, so keep
        # them stable (verify against _cmd_atropos()).
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data)
                 if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file. "
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Swap the short-reads path for the sample name in the
                    # log, closing each handle as soon as it is done.
                    with open(log_out) as log_in:
                        content = log_in.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass applying the user supplied options to the
                    # first-pass output, which is moved aside first.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Only the first configured adapter is used. When trimming is disabled or
    no adapter is configured, the input file is linked to the output
    location instead. Extra cutadapt options from the resources
    configuration trigger a second cutadapt pass over the trimmed reads.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and
    ``size_stats`` set.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        # NOTE: both command templates are filled from locals(); the local
        # names in this branch are presumably template placeholders, so keep
        # them stable (verify against _cmd_cutadapt()).
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(
            config_utils.get_resources("cutadapt", data['config']).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads path with the sample name in
                    # the log; context managers close both handles.
                    with open(log_out) as log_in:
                        content = log_in.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass with the user options over the first-pass
                    # output, which is moved aside as a temporary fastq.
                    in_file = tx_out_file + ".tmp.fastq"
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Trim the first configured adapter from the sample's first input file.

    Output goes to ``<work_dir>/trimmed/<sample name>/``; the cutadapt
    command is only run when the ".clean" output does not exist yet.
    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and
    ``size_stats`` set.
    """
    # NOTE: the command template is filled from locals(); the local names
    # in this function are presumably its placeholders, so they are kept
    # as-is (verify against _cmd_cutadapt() before renaming).
    adapter = dd.get_adapters(data)[0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    in_file = data["files"][0]
    utils.safe_makedir(out_dir)

    out_file = replace_directory(
        append_stem(in_file, ".clean"), out_dir)
    out_noadapter_file = replace_directory(
        append_stem(in_file, ".fragments"), out_dir)
    out_short_file = replace_directory(
        append_stem(in_file, ".short"), out_dir)

    python_bin_dir = os.path.dirname(sys.executable)
    cutadapt = os.path.join(python_bin_dir, "cutadapt")
    cmd = _cmd_cutadapt()

    already_trimmed = utils.file_exists(out_file)
    if not already_trimmed:
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(**locals()), "remove adapter")

    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.

    Only the first configured adapter is used. When trimming is disabled or
    no adapter is configured, the input file is linked to the output
    location instead. Extra cutadapt options found under
    ``data["resources"]["cutadapt"]["options"]`` trigger a second cutadapt
    pass over the trimmed reads.

    Returns ``[[data]]`` with ``clean_fastq``, ``collapse`` and
    ``size_stats`` set.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        # NOTE: both command templates are filled from locals(); the local
        # names in this branch are presumably template placeholders, so keep
        # them stable (verify against _cmd_cutadapt()).
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(
            data.get('resources', {}).get('cutadapt', {}).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads path with the sample name in
                    # the log; context managers close both handles.
                    with open(log_out) as log_in:
                        content = log_in.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass with the user options over the first-pass
                    # output, which is moved aside as a temporary file.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _trim_adapters(fastq_files, out_dir, name, config):
    """
    Trim read-through adapter sequence from the given fastq files.

    With small inserts the read can be longer than the insert, so the
    reverse complement of the 3' adapter ends up being sequenced. Only that
    reverse complement is trimmed; polyA is left alone:
    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA

    Outputs are named with a "_<name>.trimmed" stem suffix and placed in
    ``out_dir``; the value returned by ``_cutadapt_trim`` is passed back.
    """
    qual_format = _get_quality_format(config)
    adapter_seqs = _get_sequences_to_trim(config, SUPPORTED_ADAPTERS)
    stemmed = append_stem(fastq_files, "_%s.trimmed" % name)
    trimmed_targets = replace_directory(stemmed, out_dir)
    return _cutadapt_trim(fastq_files, qual_format, adapter_seqs,
                          trimmed_targets, config)
def trim_adapters(fastq_files, out_dir, config):
    """
    Trim read-through adapter sequence from the given fastq files.

    With small inserts the read can be longer than the insert, so the
    reverse complement of the 3' adapter ends up being sequenced. Only that
    reverse complement is trimmed; polyA is left alone:
    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA

    Outputs get a ".trimmed" stem suffix and are placed in ``out_dir``; the
    value returned by ``_cutadapt_trim`` is passed back.
    """
    qual_format = _get_quality_format(config)
    adapter_seqs = _get_sequences_to_trim(config, SUPPORTED_ADAPTERS)
    stemmed = append_stem(fastq_files, ".trimmed")
    trimmed_targets = replace_directory(stemmed, out_dir)
    return _cutadapt_trim(fastq_files, qual_format, adapter_seqs,
                          trimmed_targets, config)
def _trim_adapters(fastq_files, out_dir, name, config):
    """
    for small insert sizes, the read length can be longer than the insert
    resulting in the reverse complement of the 3' adapter being sequenced.
    this takes adapter sequences and trims the only the reverse complement
    of the adapter

    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    Outputs are named with a "_<name>.trimmed" stem suffix inside
    ``out_dir``. The cutadapt log sits next to the first output and, when
    present, has the first input path replaced by ``name``. Returns the
    value produced by ``_cutadapt_trim``.
    """
    quality_format = _get_quality_format(config)
    to_trim = _get_sequences_to_trim(config, SUPPORTED_ADAPTERS)
    out_files = replace_directory(
        append_stem(fastq_files, "_%s.trimmed" % name), out_dir)
    log_file = "%s_log_cutadapt.txt" % splitext_plus(out_files[0])[0]
    out_files = _cutadapt_trim(fastq_files, quality_format, to_trim,
                               out_files, log_file, config)
    if file_exists(log_file):
        # Read fully and close before reopening for writing, so no file
        # handle is left for the garbage collector to clean up.
        with open(log_file) as in_handle:
            content = in_handle.read().replace(fastq_files[0], name)
        with open(log_file, 'w') as out_handle:
            out_handle.write(content)
    return out_files
def _trim_adapters(fastq_files, out_dir, name, config):
    """
    for small insert sizes, the read length can be longer than the insert
    resulting in the reverse complement of the 3' adapter being sequenced.
    this takes adapter sequences and trims the only the reverse complement
    of the adapter

    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    Outputs are named with a "_<name>.trimmed" stem suffix inside
    ``out_dir``. The cutadapt log sits next to the first output and, when
    present, has the first (and, if given, second) input path replaced by
    ``name``. Returns the value produced by ``_cutadapt_trim``.
    """
    quality_format = _get_quality_format(config)
    to_trim = _get_sequences_to_trim(config, SUPPORTED_ADAPTERS)
    out_files = replace_directory(
        append_stem(fastq_files, "_%s.trimmed" % name), out_dir)
    log_file = "%s_log_cutadapt.txt" % splitext_plus(out_files[0])[0]
    out_files = _cutadapt_trim(fastq_files, quality_format, to_trim,
                               out_files, log_file, config)
    if file_exists(log_file):
        # Read fully and close before reopening for writing, so no file
        # handle is left for the garbage collector to clean up.
        with open(log_file) as in_handle:
            content = in_handle.read().replace(fastq_files[0], name)
        if len(fastq_files) > 1:
            content = content.replace(fastq_files[1], name)
        with open(log_file, 'w') as out_handle:
            out_handle.write(content)
    return out_files
def test_replace_directory_of_string(self):
    """replace_directory moves a single path string into the new directory."""
    test_string = "/string/test/foo.txt"
    correct = "/new/dir/foo.txt"
    out_string = utils.replace_directory(test_string, "/new/dir")
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(correct, out_string)
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses atropos but with different parameters than for other pipelines.

    ``data`` is first passed through ``umi_transform``. If the trimmed
    output already exists the function returns early. Otherwise adapters
    are read from the configuration (entries made only of N/A/T/G/C are
    kept; a literal "4N" entry enables an extra pass that removes 4 bases
    from each read end with "-u 4 -u -4") or predicted with
    ``_dnapi_prediction``. Configuring an adapter forces trimming on even
    if ``trim_reads`` is false. With no trimming or no adapter, the input
    file is linked to the output location.

    Returns ``[[data]]`` with ``files[0]``, ``clean_fastq``, ``collapse``
    and ``size_stats`` set, plus ``log_trimming`` when trimming ran or the
    output was already present.

    Raises ``ValueError`` when adapters must be predicted but
    ``error_dnapi`` is set, or when cutadapt options are still present in
    the resources section of the configuration.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        # Output from a previous run: skip adapter detection and trimming.
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    # Treat a missing adapter setting as "no adapters" so the filters below
    # do not fail while iterating.
    adapter = dd.get_adapters(data) or []
    is_4n = any(a == "4N" for a in adapter)
    nucleotide_re = re.compile("^([NATGC]+)$")
    adapter = [a for a in adapter if nucleotide_re.match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info(
            "Adapter is set up in config file, but trim_reads is not true. "
            "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = ("" if not trim_reads or len(adapters) == 1
             else "--times %s" % len(adapters))
    if trim_reads and adapters:
        # NOTE: both command templates are filled from locals(); the local
        # names in this branch are presumably template placeholders, so keep
        # them stable (verify against _cmd_atropos()).
        adapter_cmd = " ".join("-a " + x for x in adapters)
        only_n_re = re.compile("^N+$")
        if any(only_n_re.match(a) for a in adapters):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(
            append_stem(in_file, ".short"), out_dir)
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            # These options are applied in the dedicated second pass below
            # rather than in the adapter removal command.
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data)
                 if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file. "
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Swap the short-reads path for the sample name in the
                    # log, closing each handle as soon as it is done.
                    with open(log_out) as log_in:
                        content = log_in.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if is_4n:
                    # Second pass over the first-pass output (moved aside)
                    # to clip 4 bases from both ends of every read.
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "atropos with this parameters %s for %s"
                           % (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _get_read_through_trimmed_outfiles(fastq_files, dirs):
    """
    Build the output path(s) for read-through trimmed fastq files.

    Ensures the "trim" directory exists under ``dirs["work"]`` and returns
    the input name(s) with a "_trimmed" stem suffix, relocated there.
    """
    trim_dir = os.path.join(dirs["work"], "trim")
    safe_makedir(trim_dir)
    renamed = append_stem(fastq_files, "_trimmed")
    return replace_directory(renamed, trim_dir)
def test_replace_directory_of_list(self):
    """replace_directory moves every path of a list into the new directory."""
    test_list = ["/list/test/bar.txt", "/list/test/foobar.txt"]
    correct = ["/new/dir/bar.txt", "/new/dir/foobar.txt"]
    out_list = utils.replace_directory(test_list, "/new/dir")
    # Compare whole sequences: a zip() loop would stop at the shorter one
    # and silently accept a result with missing or extra entries.
    # assertEquals is also a deprecated alias removed in Python 3.12.
    self.assertEqual(correct, list(out_list))
def _fix_output(in_file, stem, out_dir):
    """
    Build an output name from ``in_file`` with ``stem`` appended, moved to
    ``out_dir``, and with "fastq" in the extension replaced by "fq".
    """
    relocated = replace_directory(append_stem(in_file, stem), out_dir)
    name_parts = utils.splitext_plus(relocated)
    short_ext = name_parts[1].replace("fastq", "fq")
    return "%s%s" % (name_parts[0], short_ext)