def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """Run macs2 for chip and input samples avoiding errors due to samples.

    The command line is built from the template returned by ``_macs2_cmd()``
    and filled in with this function's local variables through
    ``cmd.format(**locals())``; local names are therefore part of the
    template contract and must not be renamed.

    Args:
        name: sample name, used as prefix for the output files.
        chip_bam: not referenced directly here; exposed to the command
            template through ``locals()``.
        input_bam: not referenced directly here; exposed to the command
            template through ``locals()``.
        genome_build: key looked up in ``HS`` to get the ``-g`` value.
        out_dir: directory where macs2 runs and where outputs are written.
        config: configuration used to locate the macs2 program and its
            resources options.

    Returns:
        Path to ``<out_dir>/<name>_peaks_macs2.xls``.

    Raises:
        ValueError: if ``genome_build`` is not in ``HS`` and the configured
            options do not contain ``-g``.
        RuntimeWarning: if running macs2 raises
            ``subprocess.CalledProcessError``.
    """
    # output file name need to have the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and "-g" not in options:
        # Interpolate the genome name: the message used to be raised with a
        # literal, unformatted "%s".
        raise ValueError("This %s genome doesn't have a pre-set value. "
                         "You can add specific values using resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check Chip-seq configuration in "
                         "bcbio-nextgen documentation." % genome_build)
    # A user-supplied -g wins over the pre-set value.
    genome_size = "" if "-g" in options else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            # macs2 writes <name>_peaks.xls; rename it to carry the caller name.
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """Parse miraligner files to create count matrix.

    Args:
        data: list of samples; each item is indexed with ``[0]`` to get the
            sample dictionary.
        srna_type: key of the sample dictionary holding the miraligner file.
        out_dir: output directory; defaults to ``<work_dir>/mirbase``.
        stem: suffix passed to ``append_stem`` for both output file names.

    Returns:
        A two-item list of count files, or ``None`` when no sample
        produced any reads.
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = out_dir or op.join(work_dir, "mirbase")
    isomir_counts = append_stem(op.join(out_dir, "counts.tsv"), stem)
    mirna_counts = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    logger.debug("Create %s count data at %s." % (srna_type, out_dir))

    # NOTE(review): this cached path returns [mirna, isomir] while the fresh
    # path below returns [isomir, mirna] -- confirm which order callers expect.
    if file_exists(mirna_counts):
        return [mirna_counts, isomir_counts]

    tables = []
    for sample in data:
        info = sample[0]
        if not info.get(srna_type):
            continue
        miraligner_fn = info[srna_type]
        reads = _read_miraligner(miraligner_fn)
        if not reads:
            logger.debug("WARNING::%s has NOT miRNA annotated for %s. "
                         "Check if fasta files is small or species value."
                         % (dd.get_sample_name(info), srna_type))
            continue
        # Only the second element of the returned triple is kept.
        _, counts, _ = _tab_output(reads, miraligner_fn + ".back",
                                   dd.get_sample_name(info))
        tables.append(counts)

    if not tables:
        logger.debug("WARNING::any samples have miRNA annotated for %s. "
                     "Check if fasta files is small or species value."
                     % srna_type)
        return None

    created = _create_counts(tables, out_dir)
    final_isomir = move_safe(created[0], isomir_counts)
    final_mirna = move_safe(created[1], mirna_counts)
    return [final_isomir, final_mirna]
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Runs atropos with the command template from ``_cmd_atropos()``, which is
    filled in from this function's local variables through
    ``cmd.format(**locals())``; local names must therefore stay as they are.

    When no adapters are configured they are taken from
    ``_dnapi_prediction``. If trimming is disabled or no adapter is
    available, the input file is linked to the output location instead.

    Args:
        data: sample dictionary; reads ``files``, ``rgnames``, ``config``
            and optionally ``resources``.

    Returns:
        ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
        set.

    Raises:
        ValueError: if trimming is requested without adapters while
            ``error_dnapi`` is set, or if cutadapt options are present in
            the resources.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data)
                 if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file."
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name in
                    # the log; context managers make sure both handles are
                    # closed (and the rewrite flushed) before continuing.
                    with open(log_out) as log_handle:
                        content = log_handle.read().replace(
                            out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass applying the user options on the output of
                    # the first pass.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """Run macs2 for chip and input samples avoiding errors due to samples.

    The command line is built from the template returned by
    ``_macs2_cmd(method)`` and filled in with this function's local
    variables through ``cmd.format(**locals())``; local names are part of
    the template contract and must not be renamed.

    Args:
        name: sample name, used as prefix for the output files.
        chip_bam: ChIP BAM file; also checked for paired-end reads.
        input_bam: not referenced directly here; exposed to the command
            template through ``locals()``.
        genome_build: key looked up in ``HS`` to get the ``-g`` value.
        out_dir: directory where macs2 runs and where outputs are written.
        method: passed to ``_macs2_cmd`` to select the command template.
        resources: dictionary read for ``macs2 -> options``.
        data: sample dictionary, source of the config and reference file.

    Returns:
        The result of ``_get_output_files(out_dir)``.

    Raises:
        RuntimeWarning: if running macs2 raises
            ``subprocess.CalledProcessError``.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    if "-g" in options:
        # A user-supplied -g wins; nothing to compute.
        genome_size = ""
    else:
        # Only fall back to measuring the reference when the build has no
        # pre-set size: a dict.get default would be evaluated on every call.
        if genome_build in HS:
            genome_size = HS[genome_build]
        else:
            genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
        genome_size = "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            # macs2 writes <name>_peaks.xls; rename it to carry the caller name.
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error.\n"
                "Please, check the message and report "
                "error if it is related to bcbio.\n"
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """Call peaks with macs2 for a ChIP/input pair.

    Skips the macs2 call when the renamed peaks file already exists. In
    both cases the bedgraph files are compressed and the output files of
    ``out_dir`` are returned.

    The command template from ``_macs2_cmd(method)`` is filled in with the
    local variables of this function, so their names must stay stable.

    Raises:
        RuntimeWarning: if running macs2 raises
            ``subprocess.CalledProcessError``.
    """
    config = dd.get_config(data)
    # The final file carries the caller name; macs2 itself writes _peaks.xls.
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")

    if not utils.file_exists(out_file):
        macs2 = config_utils.get_program("macs2", config)
        macs2_resources = resources.get("macs2", {})
        options = " ".join(macs2_resources.get("options", ""))

        genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
        if "-g" in options:
            # The user already provides the genome size in the options.
            genome_size = ""
        else:
            genome_size = "-g %s" % genome_size

        if bam.is_paired(chip_bam):
            paired = "-f BAMPE"
        else:
            paired = ""

        with utils.chdir(out_dir):
            cmd = _macs2_cmd(method)
            try:
                do.run(cmd.format(**locals()), "macs2 for %s" % name)
                utils.move_safe(macs2_file, out_file)
            except subprocess.CalledProcessError:
                raise RuntimeWarning(
                    "macs2 terminated with an error.\n"
                    "Please, check the message and report "
                    "error if it is related to bcbio.\n"
                    "You can add specific options for the sample "
                    "setting resources as explained in docs: "
                    "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")

    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Runs atropos with the command template from ``_cmd_atropos()``, which is
    filled in from this function's local variables through
    ``cmd.format(**locals())``; local names must therefore stay as they are.

    If the cleaned file already exists, trimming is skipped and only the
    derived entries are recomputed. When no adapters are configured they
    are taken from ``_dnapi_prediction``. If trimming is disabled or no
    adapter is available, the input file is linked to the output location.

    Args:
        data: sample dictionary; reads ``files``, ``rgnames``, ``config``
            and optionally ``resources``.

    Returns:
        ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
        set.

    Raises:
        ValueError: if trimming is requested without adapters while
            ``error_dnapi`` is set, or if cutadapt options are present in
            the resources.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join("-a " + x for x in adapters)
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name in
                    # the log; context managers make sure both handles are
                    # closed (and the rewrite flushed) before continuing.
                    with open(log_out) as log_handle:
                        content = log_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass applying the user options on the output of
                    # the first pass.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """Call peaks with macs2 for a ChIP/input pair.

    Peak settings appended to the command depend on the antibody set for
    the sample: an unsupported antibody is reported and replaced by
    'narrow'. Without an antibody a single blank is appended.

    Skips the macs2 call when the renamed peaks file already exists. In
    both cases the bedgraph files are compressed and sorted and the output
    files of ``out_dir`` are returned.

    The command template from ``_macs2_cmd(data)`` is filled in with the
    local variables of this function, so their names must stay stable.

    Raises:
        RuntimeWarning: if running macs2 raises
            ``subprocess.CalledProcessError``.
    """
    config = dd.get_config(data)
    # The final file carries the caller name; macs2 itself writes _peaks.xls.
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")

    if not utils.file_exists(out_file):
        macs2 = config_utils.get_program("macs2", config)

        # Default used by both the ATAC and the no-antibody cases.
        peaksettings = " "
        antibody = dd.get_antibody(data)
        if antibody:
            antibody = antibody.lower()
            supported = antibody in antibodies.SUPPORTED_ANTIBODIES
            if not supported:
                logger.error(
                    f"{antibody} specified, but not listed as a supported antibody. "
                    f"Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your antibody "
                    f"should be called with narrow or broad peaks, supply 'narrow' or 'broad' as the antibody."
                    f"It will run 'narrow' if the antibody is not supported.")
                antibody = 'narrow'
            antibody = antibodies.ANTIBODIES[antibody]
            logger.info(
                f"{antibody.name} specified, using {antibody.peaktype} peak settings.")
            peaksettings = select_peak_parameters(antibody)
        elif method == "atac":
            logger.info(f"ATAC-seq specified, using narrow peak settings.")

        macs2_resources = resources.get("macs2", {})
        options = " ".join(macs2_resources.get("options", ""))

        genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
        if "-g" in options:
            # The user already provides the genome size in the options.
            genome_size = ""
        else:
            genome_size = "-g %s" % genome_size

        if bam.is_paired(chip_bam):
            paired = "-f BAMPE"
        else:
            paired = ""

        with utils.chdir(out_dir):
            cmd = _macs2_cmd(data)
            cmd += peaksettings
            try:
                do.run(cmd.format(**locals()), "macs2 for %s" % name)
                utils.move_safe(macs2_file, out_file)
            except subprocess.CalledProcessError:
                raise RuntimeWarning(
                    "macs2 terminated with an error. "
                    "Please, check the message and report "
                    "error if it is related to bcbio. "
                    "You can add specific options for the sample "
                    "setting resources as explained in docs: "
                    "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")

    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Uses cutadapt but with different parameters than for other pipelines.
    The command template from ``_cmd_cutadapt()`` is filled in from this
    function's local variables through ``cmd.format(**locals())``; local
    names must therefore stay as they are.

    Only the first configured adapter is used. If trimming is disabled or
    no adapter is configured, the input file is linked to the output
    location instead.

    Args:
        data: sample dictionary; reads ``files``, ``rgnames`` and
            ``config``.

    Returns:
        ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
        set.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        # cutadapt is taken from the directory of the running interpreter.
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(
            config_utils.get_resources("cutadapt", data['config']).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name in
                    # the log; context managers make sure both handles are
                    # closed (and the rewrite flushed) before continuing.
                    with open(log_out) as log_handle:
                        content = log_handle.read().replace(
                            out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass applying the user options on the output of
                    # the first pass.
                    in_file = tx_out_file + ".tmp.fastq"
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Uses cutadapt but with different parameters than for other pipelines.
    The command template from ``_cmd_cutadapt()`` is filled in from this
    function's local variables through ``cmd.format(**locals())``; local
    names must therefore stay as they are.

    Only the first configured adapter is used. If trimming is disabled or
    no adapter is configured, the input file is linked to the output
    location instead.

    Args:
        data: sample dictionary; reads ``files``, ``rgnames``, ``config``
            and optionally ``resources``.

    Returns:
        ``[[data]]`` with ``clean_fastq``, ``collapse`` and ``size_stats``
        set.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        # cutadapt is taken from the directory of the running interpreter.
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        options = " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", ""))
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name in
                    # the log; context managers make sure both handles are
                    # closed (and the rewrite flushed) before continuing.
                    with open(log_out) as log_handle:
                        content = log_handle.read().replace(out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if options:
                    # Second pass applying the user options on the output of
                    # the first pass.
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{cutadapt} {options} {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" % (options, names))
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def run(name, chip_bam, rep_bam, input_bam, gtf_file, out_dir, rlength, rpair, config):
    """Run rmats for mutant and control samples avoiding errors due to samples.

    The command line is built from the template returned by ``_rmats_cmd()``
    and filled in with this function's local variables through
    ``cmd.format(**locals())``; local names (including ``libType``) are
    part of the template contract and must not be renamed.

    Args:
        name: sample name, used as prefix for the output files.
        chip_bam: BAM file; joined with ``rep_bam`` by a comma when a
            replicate is given.
        rep_bam: replicate BAM file, or an empty value when there is none.
        input_bam, gtf_file, rlength, rpair: not referenced directly here;
            exposed to the command template through ``locals()``.
        out_dir: directory where rMATS runs and where outputs are written.
        config: configuration used to locate the rmats program, its
            resources options and the strandedness flag.

    Returns:
        Path to ``<out_dir>/<name>_summary.txt``.

    Raises:
        RuntimeWarning: if running rMATS raises
            ``subprocess.CalledProcessError``.
    """
    # output file name need to have the caller name
    MATS_output = os.path.join(out_dir, name + "_MATS_output")
    MATS_dir = os.path.join(out_dir, "MATS_output")
    rmats_file = os.path.join(out_dir, "summary.txt")
    out_file = os.path.join(out_dir, name + "_summary.txt")
    libType = _get_stranded_flag(config)
    # Truthiness check so that a missing replicate given as None is skipped
    # the same way as an empty string.
    if rep_bam:
        chip_bam = chip_bam + "," + rep_bam
    if utils.file_exists(out_file):
        return out_file
    rmats = config_utils.get_program("rmats", config)
    options = " ".join(config_utils.get_resources("rmats", config).get("options", ""))
    with utils.chdir(out_dir):
        cmd = _rmats_cmd()
        try:
            do.run(cmd.format(**locals()), "rmats for %s" % name)
            utils.move_safe(rmats_file, out_file)
            utils.move_safe(MATS_dir, MATS_output)
            # Remove the per-sample directories from the output directory.
            repdir_dir = os.path.join(out_dir, "SAMPLE_1")
            utils.remove_safe(repdir_dir)
            repdir_dir = os.path.join(out_dir, "SAMPLE_2")
            utils.remove_safe(repdir_dir)
            # Parenthesized form is valid on both Python 2 and Python 3.
            print(repdir_dir)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("rMATS terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
def trim_srna_sample(data):
    """Remove 3' adapter for smallRNA-seq.

    Runs atropos with the command template from ``_cmd_atropos()``, which is
    filled in from this function's local variables through
    ``cmd.format(**locals())``; local names must therefore stay as they are.

    Behaviour visible in this function:

    * ``data`` first goes through ``umi_transform``.
    * If the cleaned file already exists, trimming is skipped.
    * Configured adapters force trimming even if ``trim_reads`` is false.
    * Without configured adapters they come from ``_dnapi_prediction``.
    * A "4N" adapter entry, or atropos options equal to ``-u 4 -u -4``,
      triggers a second atropos pass with ``-u 4 -u -4``.
    * If trimming is disabled or no adapter is available, the input file
      is linked to the output location.

    Args:
        data: sample dictionary; reads ``files``, ``rgnames``, ``config``
            and optionally ``resources``.

    Returns:
        ``[[data]]`` with ``files[0]``, ``clean_fastq``, ``collapse`` and
        ``size_stats`` updated; ``log_trimming`` is set when trimming ran
        or the cleaned file already existed.

    Raises:
        ValueError: if trimming is requested without adapters while
            ``error_dnapi`` is set, or if cutadapt options are present in
            the resources.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]
    # Treat a missing adapter setting as "no adapters" instead of failing
    # while iterating over it.
    adapter = dd.get_adapters(data) or []
    is_4n = any(a == "4N" for a in adapter)
    # Keep only entries made of nucleotide letters; this drops "4N".
    nucleotide_re = re.compile("^([NATGC]+)$")
    adapter = [a for a in adapter if nucleotide_re.match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info(
            "Adapter is set up in config file, but trim_reads is not true."
            "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    # `adapters` only exists when trim_reads is true; the short-circuit
    # below keeps it from being read otherwise.
    times = "" if not trim_reads or len(
        adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join("-a " + x for x in adapters)
        only_n_re = re.compile("^N+$")
        if any(only_n_re.match(a) for a in adapters):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            # These options are applied in the second pass, not the first.
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data)
                 if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file."
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    # Replace the short-reads file path by the sample name in
                    # the log; context managers make sure both handles are
                    # closed (and the rewrite flushed) before continuing.
                    with open(log_out) as log_handle:
                        content = log_handle.read().replace(
                            out_short_file, names)
                    with open(log_out, 'w') as log_handle:
                        log_handle.write(content)
                if is_4n:
                    # Second pass on the output of the first one.
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(
                        cmd.format(**locals()),
                        "atropos with this parameters %s for %s" %
                        (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]