def run(in_file, ref, blastn_config, config):
    """
    blast in_file against the reference described by ref and write the
    hits to a tab-delimited file with a header row.

    The reference file and a nucleotide ("nucl") blast database are prepared
    first. blast_search is handed "<out_file>.tmp" as its output path; the
    file it returns is then copied row by row into the final file, underneath
    a header built by splitting HEADER_FIELDS on single spaces.

    Returns the path of the final hits file inside the results directory.
    """
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    reference_path = prepare_ref_file(ref, config)

    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    database = prepare_blast_db(reference_path, "nucl")

    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))

    hits_dir = build_results_dir(blastn_config, config)
    utils.safe_makedir(hits_dir)

    # name of the output: input basename with its suffix swapped for
    # "<reference name>hits.tsv"
    hits_name = replace_suffix(os.path.basename(in_file),
                               ref.get("name") + "hits.tsv")
    out_file = os.path.join(hits_dir, hits_name)
    raw_hits = blast_search(in_file, database, out_file + ".tmp")

    # (length-based filtering of the results is disabled)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." %(0.5*100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" %(filtered_results))

    with open(raw_hits) as raw_handle:
        rows = csv.reader(raw_handle, delimiter="\t")
        with open(out_file, "w") as final_handle:
            tsv = csv.writer(final_handle, delimiter="\t")
            tsv.writerow(HEADER_FIELDS.split(" "))
            tsv.writerows(rows)

    return out_file
def run(in_file, ref, blastn_config, config):
    """
    run a blast search of in_file against the reference ref.

    Steps, in order: prepare the reference file, prepare a "nucl" blast
    database from it, make sure the results directory exists, run
    blast_search with "<out_file>.tmp" as its output path, then copy the
    returned results into out_file below a header row taken from
    HEADER_FIELDS (split on single spaces).

    Returns out_file, the tab-delimited hits file.
    """
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    prepared_ref = prepare_ref_file(ref, config)
    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    nucl_db = prepare_blast_db(prepared_ref, "nucl")
    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))

    destination = build_results_dir(blastn_config, config)
    utils.safe_makedir(destination)

    new_suffix = ref.get("name") + "hits.tsv"
    out_file = os.path.join(
        destination, replace_suffix(os.path.basename(in_file), new_suffix))
    scratch_file = out_file + ".tmp"

    search_output = blast_search(in_file, nucl_db, scratch_file)

    # (length-based filtering of the results is disabled)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." %(0.5*100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" %(filtered_results))

    with open(search_output) as source:
        hit_rows = csv.reader(source, delimiter="\t")
        with open(out_file, "w") as sink:
            hit_writer = csv.writer(sink, delimiter="\t")
            # header first, then every blast row unchanged
            hit_writer.writerow(HEADER_FIELDS.split(" "))
            hit_writer.writerows(hit_rows)

    return out_file
def _build_output_prefix(input_file, jellyfish_config, config):
    """
    build the output prefix for input_file: the results directory returned
    by build_results_dir joined with input_file after its suffix has been
    replaced with "count"
    """
    results_dir = build_results_dir(jellyfish_config, config)
    count_name = replace_suffix(input_file, "count")
    # earlier naming scheme, kept for reference:
    #out_prefix = "_".join([jellyfish_config["name"],
    #                       remove_suffix(input_file)])
    return os.path.join(results_dir, count_name)
def _sam_to_bam(in_file):
    """
    convert in_file to BAM with "samtools view -Sb", writing to in_file with
    its suffix replaced by "bam". If that file already exists the conversion
    is skipped. Returns the BAM file name.
    """
    # the dependencies are imported inside the function body, as in the
    # original
    import sh
    from bipy.utils import replace_suffix
    from bcbio.utils import file_exists

    converted = replace_suffix(in_file, "bam")
    if not file_exists(converted):
        sh.samtools.view("-Sb", in_file, "-o", converted)
    return converted
def sam2bam(in_file, out_file=None):
    """
    convert a SAM file to a BAM file with "samtools view -Sb".

    out_file defaults to in_file with its suffix replaced by "bam". An
    already existing out_file is returned without running samtools.
    """
    bam_path = replace_suffix(in_file, "bam") if out_file is None else out_file
    if not file_exists(bam_path):
        sh.samtools.view("-Sb", in_file, "-o", bam_path)
    return bam_path
def bam_name_sort(in_file, out_prefix=None):
    """
    sort a bam file by read name ("samtools sort -n").

    out_prefix defaults to in_file with its suffix replaced by "name_sorted";
    the returned file name is out_prefix + ".bam". If that file already
    exists, samtools is not run.
    """
    if out_prefix is None:
        prefix = replace_suffix(in_file, "name_sorted")
    else:
        prefix = out_prefix

    sorted_bam = prefix + ".bam"
    if not file_exists(sorted_bam):
        sh.samtools.sort("-n", in_file, prefix)
    return sorted_bam
def main(config_file):
    """
    run the pipeline described by the YAML file config_file.

    The stages named in config["run"] are executed in order; each stage
    that produces new files replaces curr_files, which is the input of the
    following stage. Work is dispatched with view.map and the cluster is
    stopped once every stage has been handled.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            # submitted with block=False; the result is not used
            view.map(stage_runner, curr_files, block=False)

        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            # only the mapped files are carried on to later stages
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # constant arguments are repeated once per input file
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)

    stop_cluster()
def bamsort(in_file, out_prefix=None):
    """
    sort a BAM file with "samtools sort".

    out_prefix defaults to in_file with its suffix replaced by "sorted"; the
    returned file name is out_prefix + ".bam". If that file already exists,
    samtools is not run.
    """
    prefix = out_prefix
    if prefix is None:
        prefix = replace_suffix(in_file, "sorted")

    sorted_bam = prefix + ".bam"
    if not file_exists(sorted_bam):
        sh.samtools.sort(in_file, prefix)
    return sorted_bam
def main(config_file):
    """
    run the pipeline described by the YAML file config_file.

    Each stage listed in config["run"] is handled in turn. curr_files holds
    the files the next stage works on and is replaced by the outputs of the
    cutadapt and bowtie stages. Jobs are dispatched through view.map and the
    cluster is stopped at the end.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            # submitted with block=False; the result is not used
            view.map(stage_runner, curr_files, block=False)

        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            # only the mapped files are carried on to later stages
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # constant arguments are repeated once per input file
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)

    stop_cluster()
def bam2sam(in_file, out_file=None):
    """
    convert a BAM file to a SAM file with "samtools view -h".

    If is_sam(in_file) is true, in_file is returned unchanged. out_file
    defaults to in_file with its suffix replaced by "sam"; an existing
    out_file is returned as is. The output is written through
    file_transaction.
    """
    if is_sam(in_file):
        return in_file

    sam_path = out_file
    if sam_path is None:
        sam_path = replace_suffix(in_file, "sam")

    if not file_exists(sam_path):
        with file_transaction(sam_path) as tx_sam_path:
            # h=True keeps the header; stdout goes to the transaction file
            view_with_header = sh.samtools.view.bake(h=True, _out=tx_sam_path)
            view_with_header(in_file)

    return sam_path
def sam2bam(in_file, out_file=None):
    """
    convert a SAM file to a BAM file with "samtools view -S -b -o".

    If is_bam(in_file) is true the input name is returned unchanged.
    out_file defaults to in_file with its suffix replaced by "bam"; an
    existing out_file is returned as is. The output is written through
    file_transaction.
    """
    if is_bam(in_file):
        return in_file

    bam_path = out_file
    if bam_path is None:
        bam_path = replace_suffix(in_file, "bam")

    if not file_exists(bam_path):
        with file_transaction(bam_path) as tx_bam_path:
            convert = sh.samtools.view.bake(S=True, b=True, o=tx_bam_path)
            convert(in_file)

    return bam_path
def count_overlaps(in_file, bed, out_file=None):
    """
    calculates coverage across the features in the bedfile bed

    Runs "coverageBed -abam in_file -b bed" and captures its standard
    output in out_file, which defaults to
    replace_suffix(in_file, ".counts"). An already existing out_file is
    returned without running anything.

    If coverageBed cannot be found an error is logged and the process exits
    with status -1. If the command fails, the partially written out_file is
    removed (so a later call does not mistake it for a finished result) and
    the exception is re-raised.

    Returns the name of the output file.
    """
    # imported locally so the block does not depend on a module-level import
    import sys

    if not which("coverageBed"):
        logger.error("Cannot find coverageBed. Make sure it is in your "
                     "path or install bedtools.")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. python -S)
        sys.exit(-1)

    if not out_file:
        # NOTE(review): this suffix has a leading dot, unlike the "counts"
        # suffix passed to replace_suffix elsewhere -- confirm it is intended
        out_file = replace_suffix(in_file, ".counts")

    if os.path.exists(out_file):
        return out_file

    cmd = ["coverageBed", "-abam", in_file, "-b", bed]
    try:
        with open(out_file, "w") as out_handle:
            subprocess.check_call(cmd, stdout=out_handle)
    except BaseException:
        # do not leave a truncated file behind: the existence check above
        # would treat it as a completed run next time
        if os.path.exists(out_file):
            os.remove(out_file)
        raise

    return out_file
def run(in_file, bin_size=30, covariate=None, out_file=None):
    """
    takes a sorted BAM input file and runs Piranha on it with the
    specified bin size

    out_file defaults to in_file with its suffix replaced by "piranha.bed";
    if it already exists it is returned without running Piranha. The
    covariate file is only passed to Piranha when it is given and exists.
    The arguments of the run are printed before Piranha is called.

    Returns the name of the output file.
    """
    if not out_file:
        out_file = replace_suffix(in_file, "piranha.bed")

    if file_exists(out_file):
        return out_file

    # print(...) with one pre-formatted string gives the same output under
    # Python 2 and is valid syntax under Python 3
    if covariate and file_exists(covariate):
        print("%s, %s, %s, %s" % (in_file, covariate, str(bin_size),
                                  out_file))
        Piranha("-s", in_file, covariate, b=bin_size, o=out_file)
    else:
        print("%s, %s, %s" % (in_file, str(bin_size), out_file))
        Piranha("-s", in_file, b=bin_size, o=out_file)

    return out_file
def run(in_file, bin_size=30, covariate=None, out_file=None):
    """
    takes a sorted BAM input file and runs Piranha on it with the
    specified bin size

    out_file defaults to in_file with its suffix replaced by "piranha.bed";
    if it already exists it is returned without running piranha. The
    covariate file is only passed to piranha when it is given and exists.
    The arguments of the run are printed before piranha is called.

    Returns the name of the output file.
    """
    if not out_file:
        out_file = replace_suffix(in_file, "piranha.bed")

    if file_exists(out_file):
        return out_file

    # print(...) with one pre-formatted string gives the same output under
    # Python 2 and is valid syntax under Python 3
    if covariate and file_exists(covariate):
        print("%s, %s, %s, %s" % (in_file, covariate, str(bin_size),
                                  out_file))
        sh.piranha("-s", in_file, covariate, b=bin_size, o=out_file)
    else:
        print("%s, %s, %s" % (in_file, str(bin_size), out_file))
        sh.piranha("-s", in_file, b=bin_size, o=out_file)

    return out_file
def _get_outfilename(input_file):
    """
    return the basename of input_file with its suffix replaced by "counts"
    """
    base_name = os.path.basename(input_file)
    return replace_suffix(base_name, "counts")
def chr_out(chrom):
    """
    return the output name for chrom: in_file with chrom appended to its
    stem, placed in break_dir, with the suffix replaced by "vcf".

    break_dir and in_file are not parameters; they are read from the
    surrounding scope.
    """
    stemmed = append_stem(in_file, chrom)
    return replace_suffix(os.path.join(break_dir, stemmed), "vcf")
def main(config_file):
    """
    run the pipeline described by the YAML file config_file, once per
    condition group, then run deseq on the combined htseq-count output.

    Input files are grouped with _group_input_by_condition. For every group
    config["dir"]["results"] is pointed at a per-condition directory and
    the stages in config["run"] are executed in order, with curr_files
    carrying the files from one stage to the next. htseq-count outputs are
    collected per condition and afterwards combined and handed to deseq for
    every cell type and every comparison in the deseq stage config. The
    cluster is stopped at the end.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        # stages read their results directory from the config
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(
                    *product(curr_files, [fastqc_config], [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(
                    *product(curr_files, [cutadapt_config], [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))

                #tophat_args = zip(*product(first, second, [config["ref"]],
                #                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config,
                                          first, second,
                                          [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [
                    replace_suffix(os.path.basename(x), "metrics")
                    for x in curr_files
                ]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # materialised because it is unpacked several times below;
                # a lazy zip (Python 3) would be empty after the first use
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                # disabled annotation step:
                #annotate_args = zip(*product(RPKM_count_fixed,
                #                             ["gene_id"],
                #                             ["ensembl_gene_id"],
                #                             ["human"]))
                #view.map(annotate.annotate_table_with_biomart,
                #         *annotate_args)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)

    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir,
                                    comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing ot %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")
            #annotated_file = view.map(annotate.annotate_table_with_biomart,
            #                          [deseq_out],
            #                          ["id"],
            #                          ["ensembl_gene_id"],
            #                          ["human"])

    # end gracefully
    stop_cluster()
def main(config_file):
    """
    run the pipeline described by the YAML file config_file, once per
    condition group, then run deseq on the combined htseq-count output.

    Input files are grouped with _group_input_by_condition. For every group
    config["dir"]["results"] is pointed at a per-condition directory and
    the stages in config["run"] are executed in order, with curr_files
    carrying the files from one stage to the next. htseq-count outputs are
    collected per condition and afterwards combined and handed to deseq for
    every cell type and every comparison in the deseq stage config. The
    cluster is stopped at the end.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" %(conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        # stages read their results directory from the config
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s." % (curr_files))
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)

            if stage == "cutadapt":
                logger.info("Running cutadapt on %s." % (curr_files))
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)

            if stage == "tophat":
                logger.info("Running tophat on %s." % (curr_files))
                stage_runner = Tophat(config)
                tophat_outputs = view.map(stage_runner, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # materialised because it is unpacked several times below;
                # a lazy zip (Python 3) would be empty after the first use
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                # disabled annotation step:
                #annotate_args = zip(*product(RPKM_count_fixed,
                #                             ["gene_id"],
                #                             ["ensembl_gene_id"],
                #                             ["human"]))
                #view.map(annotate.annotate_table_with_biomart,
                #         *annotate_args)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)

    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir,
                                    comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing ot %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")

    # end gracefully
    stop_cluster()
def main(config_file):
    """
    run the pipeline described by the YAML file config_file.

    Logging is set up and the cluster started before the logger and the
    cluster view are imported. Input files are the entries of
    config["input"] joined onto config["dir"]["data"]. The stages listed in
    config["run"] are executed in order; curr_files carries the files from
    one stage to the next. The deseq stage uses the htseq_outputs and
    column_names produced by the htseq-count stage, and the conditions are
    taken from the part of each input basename before the first "_". The
    cluster is stopped at the end.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)

    from bipy.cluster import view
    # view.push({'logger': logger})

    input_files = [
        os.path.join(config["dir"]["data"], x) for x in config["input"]
    ]
    results_dir = config["dir"]["results"]

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run,
                                        curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     curr_files,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)

        if stage == "deseq":
            conditions = [
                os.path.basename(x).split("_")[0] for x in input_files
            ]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this
                # comparison
                indexes = [
                    x for x, y in enumerate(conditions) if y in comparison
                ]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(
                    htseq_files, htseq_columns, out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(
                    annotate.annotate_table_with_biomart,
                    deseq_out,
                    ["id"],
                    ["ensembl_gene_id"],
                    ["human"])

    stop_cluster()
def main(config_file):
    """
    run the pipeline described by the YAML file config_file.

    this assumes that we are keeping the same order of the files
    throughout: the keys of config["input"] become the initial files and
    its values the per-file metadata, and the deseq stage looks up
    htseq-count outputs by the index of the matching metadata entry.

    The stages listed in config["run"] are executed in order, with
    curr_files carrying the files from one stage to the next. The cluster
    is stopped at the end.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    input_dict = config["input"]
    # lists, not dict views: input_meta is indexed by position below
    curr_files = _make_current_files(list(input_dict.keys()))
    input_meta = list(input_dict.values())

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(
                *product(curr_files, [cutadapt_config], [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            # both argument sets are unpacked more than once below, so they
            # are materialised; a lazy zip (Python 3) would be empty after
            # the first use
            rseq_args = list(zip(*product(curr_files, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = list(zip(*product(final_bamfiles, [config])))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [
                    _find_file_index_for_test(input_meta, condition)
                    for condition in test
                ]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(
                    out_dir, "_".join(conditions) + "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(
                    files, None, out_file=combined_out)
                out_file = os.path.join(out_dir,
                                        "_".join(conditions) + "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file, conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)

    # end gracefully
    stop_cluster()
def _build_output_file(input_file, config):
    """
    return the output path for input_file inside config["dir"]["ref"]: the
    basename of input_file with its suffix replaced by "nix". The directory
    is created with safe_makedir before the path is built.
    """
    safe_makedir(config["dir"]["ref"])
    nix_name = os.path.basename(replace_suffix(input_file, "nix"))
    return os.path.join(config["dir"]["ref"], nix_name)
def main(config_file):
    """
    run the pipeline described by the YAML file config_file.

    Input files are found with input_files_from_dir in
    config["dir"]["data"]. The stages listed in config["run"] are executed
    in order; curr_files carries the files from one stage to the next. The
    rseqc stage uses tophat_outputs and final_bamfiles, which are only set
    by the tophat stage.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects from the file (and newer PyYAML releases
        # require a Loader argument); consider yaml.safe_load if the config
        # only holds plain data.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3 and would create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]

    curr_files = input_files_from_dir(in_dir)

    for stage in config["run"]:
        if stage == "fastqc":
            stage_runner = fastqc.FastQCStage(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            # the first argument used to be the undefined name `first`;
            # every companion list is sized by curr_files and the mate
            # argument is None, so curr_files is what is mapped over
            tophat_outputs = view.map(tophat.run_with_config,
                                      curr_files,
                                      [None] * len(curr_files),
                                      [config["ref"]] * len(curr_files),
                                      ["tophat"] * len(curr_files),
                                      [config] * len(curr_files))
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s."
                        % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            # `results_dir` was never assigned in this function; the results
            # directory is read from the config instead
            # NOTE(review): confirm no module-level results_dir was intended
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            # materialised because it is unpacked several times below;
            # a lazy zip (Python 3) would be empty after the first use
            rseq_args = list(zip(*product(curr_files, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            # disabled annotation step:
            #annotate_args = zip(*product(RPKM_count_fixed,
            #                             ["gene_id"],
            #                             ["ensembl_gene_id"],
            #                             ["human"]))
            #view.map(annotate.annotate_table_with_biomart,
            #         *annotate_args)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs
def main(config_file):
    """Run the small-RNA pipeline described by a YAML configuration.

    Loads ``config_file``, sets up logging, starts the cluster and then
    executes the stages named in ``config["run"]`` in order.  Parallel
    stages are dispatched through ``bipy.cluster.view``; ``curr_files``
    carries the output of one stage into the next, so stage order
    matters.  The cluster is stopped once every stage has run.

    Recognised stages: ``fastqc``, ``trim``, ``tagdust``,
    ``filter_length``, ``count_ends``, ``tophat``, ``novoalign``,
    ``new_coverage``, ``coverage``, ``htseq-count``,
    ``bedtools_intersect`` and ``piranha``.  Several stages read names
    bound by an earlier stage (``tagdust_outputs``, ``sorted_bf``,
    ``aligned_outputs``) and raise ``NameError`` if that stage did not
    run first.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can build arbitrary objects; acceptable
        # for a trusted config file, otherwise prefer yaml.safe_load.
        config = yaml.load(in_handle)

    setup_logging(config)
    start_cluster(config)

    # the view can only be imported once the cluster is up
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories (explicit loop: called for side effect)
    for directory in config["dir"].values():
        safe_makedir(directory)

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            # note: rebinding input_files also changes the column names
            # later derived from it in the htseq-count stage
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            # each tagdust result is indexable; element 0 is carried on
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate (both bounds are exclusive)
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map(),
            # so the filtering below runs serially in this process
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """Reduce step: bump the tally for key ``y`` in dict ``x``."""
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                """Write per-file end-nucleotide fractions; return the path.

                An existing output file is left untouched.
                """
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                # if the count_file already exists, skip
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {})
                          for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=[_short_name(x) for x in curr_files])
                df = df.astype(float)
                # turn raw counts into per-file fractions, keeping the
                # raw row total in an extra column
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")
                # return the path on this branch too, matching the
                # early-exit branch above
                return count_file

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses (copying them read-only into the data dir is
            # currently disabled)
            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

        if stage == "htseq-count":
            # NOTE(review): nfiles is taken from curr_files while the
            # mapped inputs are aligned_outputs; the two are assumed to
            # have the same length -- confirm.
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)

        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                # <out_dir>/<bam stem>_vs_<bed stem>.bam
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                # run serially; out_files has one entry per sorted bam
                test_out = [bedtools.intersectbam2bed(bam, bedfile, False,
                                                      out)
                            for bam, out in zip(sorted_bf, out_files)]
                count_files = [replace_suffix(x, "stats") for x in out_files]
                for bam, out, stats in zip(sorted_bf, out_files,
                                           count_files):
                    write_ratios(bam, out, stats)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
def out_file(self, in_file):
    """Return the metrics output path for ``in_file``.

    The file is placed in ``<results>/<self.stage>``, where ``<results>``
    is ``self.config["dir"]["results"]`` (default ``"results"``).  That
    directory is passed to ``safe_makedir`` before the path is returned.
    """
    stage_dir = os.path.join(self.config["dir"].get("results", "results"),
                             self.stage)
    safe_makedir(stage_dir)
    metrics_name = replace_suffix(os.path.basename(in_file), "metrics")
    return os.path.join(stage_dir, metrics_name)
def main(config_file):
    """Run the configured stages once per cell type.

    Loads the YAML configuration, creates the directories under
    ``config["dir"]`` and, for each entry of ``config["cell_types"]``,
    points ``config["dir"]["results"]`` at a per-cell-type subdirectory
    and runs the stages of ``config["run"]`` over every file found in
    ``<data dir>/<cell_type>/``.  After all cell types are processed the
    function waits for outstanding jobs and stops the cluster.

    Recognised stages: ``fastqc``, ``cutadapt``, ``tophat``, ``rseqc``,
    ``htseq-count`` and ``coverage``.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can build arbitrary objects; acceptable
        # for a trusted config file, otherwise prefer yaml.safe_load.
        config = yaml.load(in_handle)

    # make the needed directories (explicit loop: called for side effect)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # not consulted below; kept because it was part of the original
    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}

    # overwritten per cell type below; the lookup still requires the key
    curr_files = config["encode_file"]

    # captured before config["dir"]["results"] is redirected in the loop
    results_dir = config["dir"].get("results", "results")

    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        # stages read the results dir from config, so redirect it
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"],
                                          cell_type, "*"))
        curr_files = in_files
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = list(zip(*product(curr_files, [fastqc_config],
                                                [config])))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = list(zip(*product(curr_files,
                                                  [cutadapt_config],
                                                  [config])))
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            *cutadapt_args)
                curr_files = cutadapt_outputs

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                tophat_args = list(zip(*product(curr_files, [None],
                                                [config["ref"]],
                                                ["tophat"], [config])))
                tophat_outputs = view.map(tophat.run_with_config,
                                          *tophat_args)
                picard = BroadRunner(config["program"]["picard"])
                # convert to bam, sort and index (picard-based conversion
                # is disabled in favour of the sam helpers)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # materialised because the argument columns are unpacked
                # for every rseqc tool below
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args, block=False)
                view.map(rseqc.junction_saturation, *rseq_args, block=False)
                # called without block=False: the results are used below
                RPKM_count_files = view.map(rseqc.RPKM_count, *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files))
                logger.info("dirnames to process: %s" % (dirs_to_process))
                RPKM_merged = view.map(rseqc.merge_RPKM, dirs_to_process)

                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                # requires the tophat stage to have run for this cell type
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = list(zip(*product(curr_files, [config],
                                               [stage])))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                # the raw input paths are used as column names
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

    # end gracefully, wait for jobs to finish, then exit
    view.wait()
    stop_cluster()
def main(config_file):
    """Merge, filter and profile the BAM files of each condition.

    For every entry of ``config["conditions"]`` the BAM files whose path
    contains the condition name are merged, sorted and indexed with
    samtools; unmapped reads are dropped (``-F 4``); reads intersecting
    ``config["ribo"]`` are removed with bedtools; Picard RNA-seq metrics
    are computed on the result; and the filtered reads are converted to
    fastq and counted with jellyfish for every configured k-mer length.

    NOTE(review): the configuration is only loaded when ``config_file``
    is truthy; a falsy value leads to a ``NameError`` on the first use
    of ``config``.
    """
    if config_file:
        with open(config_file) as in_handle:
            # NOTE(review): yaml.load can build arbitrary objects;
            # acceptable for a trusted config file.
            config = yaml.load(in_handle)

    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]

    files = list(flatten([glob.glob(os.path.join(x, glob_string))
                          for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)

    for condition in conditions:
        # membership is a plain substring test on the file path
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        print("Combining %s into %s." % (condition_files, out_file))
        sh.samtools.merge(list(flatten([out_file, condition_files])))
        # (submission of the merge through bsub is disabled)
        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        # -F 4 drops unmapped reads, -b writes BAM
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)

        # find the reads that don't intersect with the rrna
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print("Filtering %s for rRNA in %s into %s."
              % (in_file, ribo, out_file))
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo,
                              _out=out_file)
        filtered_file = out_file

        print("Calculating RNASeq metrics on %s." % (out_file))
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        # kept separate from out_dir so the next condition's merged BAMs
        # still land in the results directory
        metrics_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(metrics_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(metrics_dir, out_file)
        metrics_file = picardrun.picard_rnaseq_metrics(picard, in_file, ref,
                                                       ribo, out_file)

        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(
            jelly_dir,
            os.path.basename(replace_suffix(filtered_file, "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            base, _ = os.path.splitext(os.path.basename(fastq_file))
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            # skip k-mer sizes that were already counted
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
def main(config_file):
    """Run the RNA-seq stages listed in the configuration.

    Input files are every ``*.fq`` file in ``config["input_dir"]``; the
    condition of a file is the part of its basename before the first
    underscore.  The stages of ``config["run"]`` are executed in order
    and ``curr_files`` carries the output of one stage into the next.

    This assumes the order of the files is kept the same throughout:
    the deseq stage pairs ``htseq_outputs`` with ``conditions`` purely
    by position.

    Recognised stages: ``fastqc``, ``cutadapt``, ``tophat``, ``rseqc``,
    ``coverage``, ``htseq-count`` and ``deseq``.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can build arbitrary objects; acceptable
        # for a trusted config file, otherwise prefer yaml.safe_load.
        config = yaml.load(in_handle)

    # make the needed directories (explicit loop: called for side effect)
    for directory in config["dir"].values():
        safe_makedir(directory)

    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = list(zip(*product(curr_files, [fastqc_config],
                                            [config])))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = list(zip(*product(curr_files, [cutadapt_config],
                                              [config])))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = list(zip(*product(curr_files, [None],
                                            [config["ref"]],
                                            ["tophat"], [config])))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            # materialised because the argument columns are unpacked
            # for every rseqc tool below
            rseq_args = list(zip(*product(curr_files, [config])))
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            # requires the tophat stage to have run earlier
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = list(zip(*product(curr_files, [config], [stage])))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # positions of the input files that belong to one of the
                # conditions being compared
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # requires the htseq-count stage to have run earlier
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                # report the conditions actually handed to deseq, not the
                # conditions of every input file
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (combined_out, deseq_conds,
                                                   deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds],
                         [deseq_out])
                annotated_file = view.map(
                    annotate.annotate_table_with_biomart,
                    [deseq_out],
                    ["id"],
                    ["ensembl_gene_id"],
                    ["zebrafish"])

    # end gracefully
    stop_cluster()
def main(config_file):
    """Run the RNA-seq stages listed in the configuration.

    ``config["input"]`` maps each input file to a metadata mapping that
    includes a ``"condition"`` entry.  The stages of ``config["run"]``
    are executed in order and ``curr_files`` carries the output of one
    stage into the next.

    This assumes the order of the files is kept the same throughout:
    the deseq stage indexes ``htseq_outputs`` and the input metadata by
    position.

    Recognised stages: ``fastqc``, ``cutadapt``, ``tophat``,
    ``htseq-count``, ``rseqc``, ``coverage`` and ``deseq``.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can build arbitrary objects; acceptable
        # for a trusted config file, otherwise prefer yaml.safe_load.
        config = yaml.load(in_handle)

    # make the needed directories (explicit loop: called for side effect)
    for directory in config["dir"].values():
        safe_makedir(directory)

    input_dict = config["input"]
    curr_files = _make_current_files(list(input_dict.keys()))
    # a real list, because the deseq stage indexes it by position
    input_meta = list(input_dict.values())

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = list(zip(*product(curr_files, [fastqc_config],
                                            [config])))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = list(zip(*product(curr_files, [cutadapt_config],
                                              [config])))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = list(zip(*product(curr_files, [None],
                                            [config["ref"]],
                                            ["tophat"], [config])))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            # remembered for the RPKM steps of the rseqc stage
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = list(zip(*product(curr_files, [config], [stage])))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            # materialised because the argument columns are unpacked
            # several times below
            rseq_args = list(zip(*product(curr_files, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            # requires the tophat stage to have run earlier
            RPKM_args = list(zip(*product(final_bamfiles, [config])))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = list(zip(*product(RPKM_count_fixed,
                                              ["gene_id"],
                                              ["ensembl_transcript_id"],
                                              ["mouse"])))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [_find_file_index_for_test(input_meta, condition)
                           for condition in test]
                # requires the htseq-count stage to have run earlier
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(
                    out_dir, "_".join(conditions) + "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(
                    files, None, out_file=combined_out)
                out_file = os.path.join(out_dir,
                                        "_".join(conditions) + "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file, conditions,
                                                   out_file))
                # dispatched as a one-element map so it runs on the cluster
                view.map(deseq.run, [count_file], [conditions], [out_file])

    # end gracefully
    stop_cluster()
def setUp(self):
    """Load the test configuration and derive the index path.

    Sets ``self.config`` (parsed ``CONFIG_FILE``), ``self.input_files``
    (``config["input"]``) and ``self.db``: the basename of the reference
    with its suffix replaced by ``"nix"``, joined onto
    ``config["dir"]["ref"]``.
    """
    with open(CONFIG_FILE) as handle:
        loaded = yaml.load(handle)
    self.config = loaded
    self.input_files = loaded["input"]
    index_name = os.path.basename(replace_suffix(loaded["ref"], "nix"))
    self.db = os.path.join(loaded["dir"]["ref"], index_name)
def main(config_file):
    """Run the RNA-seq stages listed in the configuration.

    Input files are every ``*.fq`` file in ``config["input_dir"]``; the
    condition of a file is the part of its basename before the first
    underscore.  The stages of ``config["run"]`` are executed in order
    and ``curr_files`` carries the output of one stage into the next.

    This assumes the order of the files is kept the same throughout:
    the deseq stage pairs ``htseq_outputs`` with ``conditions`` purely
    by position.

    Recognised stages: ``fastqc``, ``cutadapt``, ``tophat``, ``rseqc``,
    ``coverage``, ``htseq-count`` and ``deseq``.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can build arbitrary objects; acceptable
        # for a trusted config file, otherwise prefer yaml.safe_load.
        config = yaml.load(in_handle)

    # make the needed directories (explicit loop: called for side effect)
    for directory in config["dir"].values():
        safe_makedir(directory)

    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = list(zip(*product(curr_files, [fastqc_config],
                                            [config])))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = list(zip(*product(curr_files, [cutadapt_config],
                                              [config])))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = list(zip(*product(curr_files, [None],
                                            [config["ref"]],
                                            ["tophat"], [config])))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            # materialised because the argument columns are unpacked
            # for every rseqc tool below
            rseq_args = list(zip(*product(curr_files, [config])))
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            # requires the tophat stage to have run earlier
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = list(zip(*product(curr_files, [config], [stage])))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # positions of the input files that belong to one of the
                # conditions being compared
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # requires the htseq-count stage to have run earlier
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                # report the conditions actually handed to deseq, not the
                # conditions of every input file
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (combined_out, deseq_conds,
                                                   deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds],
                         [deseq_out])
                annotated_file = view.map(
                    annotate.annotate_table_with_biomart,
                    [deseq_out],
                    ["id"],
                    ["ensembl_gene_id"],
                    ["zebrafish"])

    # end gracefully
    stop_cluster()
def main(config_file):
    """Run the novoalign-based RNA-seq pipeline from a YAML configuration.

    Input files are every ``*_rep*`` entry below
    ``<data dir>/<input_dir>`` for each ``config["input_dirs"]`` entry.
    The stages of ``config["run"]`` are executed in order, with
    ``curr_files`` carrying the output of one stage into the next, and
    the cluster is stopped afterwards.

    Recognised stages: ``fastqc``, ``cutadapt``, ``novoalign``,
    ``htseq-count``, ``coverage``, ``deseq`` and ``dss``.  ``deseq`` and
    ``dss`` read ``htseq_outputs`` and ``column_names``, which are only
    bound by the ``htseq-count`` stage.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can build arbitrary objects; acceptable
        # for a trusted config file, otherwise prefer yaml.safe_load.
        config = yaml.load(in_handle)
    setup_logging(config)
    # imported only after setup_logging(config) has run
    from bipy.log import logger
    start_cluster(config)
    data_dir = config["dir"]["data"]
    # imported only after the cluster has been started
    from bipy.cluster import view
    input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*"))
                   for x in config["input_dirs"]]
    input_files = list(flatten(input_files))
    logger.info("Input files to process: %s" % (input_files))
    results_dir = config["dir"]["results"]

    # make the needed directories (explicit loop: called for side effect)
    for directory in config["dir"].values():
        safe_makedir(directory)

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run,
                                        curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            # the index is taken from the config (building it with
            # novoindex is disabled)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            # downstream stages continue from the novoalign outputs,
            # not from the sorted bam files
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s" %(curr_files))
            # no counting is launched here: the current files are taken
            # to be the htseq outputs already
            htseq_outputs = curr_files
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            # condition = basename up to the first underscore
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this
                # comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                # dispatched as a one-element map so it runs on the cluster
                deseq_out = view.map(deseq.run,
                                     [combined_out],
                                     [deseq_conds],
                                     [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(
                    annotate.annotate_table_with_biomart,
                    deseq_out,
                    ["id"],
                    ["ensembl_gene_id"],
                    ["human"])

        if stage == "dss":
            # condition = basename up to the first underscore
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            dss_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in dss_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this
                # comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                dss_conds = [conditions[index] for index in indexes]
                dss_prefix = os.path.join(out_dir, comparison_name)
                logger.info("Running DSS on %s with conditions %s and comparison %s." % (combined_out, dss_conds, comparison))
                # runs in this process rather than through the view
                dss_out = dss.run(combined_out, dss_conds, comparison,
                                  dss_prefix)

    stop_cluster()
def _gtf2bed(gtf):
    """Return the path of the bed counterpart of ``gtf``, creating it if needed.

    The target path is whatever ``replace_suffix(gtf, "bed")`` yields. When
    ``file_exists`` reports that the target is already there, the conversion
    is skipped; otherwise ``sh.gtf2bigbed`` is run on ``gtf`` with ``_out``
    set to the target path.
    """
    bed_path = replace_suffix(gtf, "bed")
    if file_exists(bed_path):
        # a previous run already produced the file, reuse it
        return bed_path
    sh.gtf2bigbed(gtf, _out=bed_path)
    return bed_path
def _build_output_file(input_file, novoalign_config, config):
    """Build the path of the sam output for ``input_file``.

    The directory comes from ``build_results_dir(novoalign_config, config)``
    and is created with ``safe_makedir`` before the path is returned. The
    file name is the basename of ``replace_suffix(input_file, "sam")``.
    """
    results_dir = build_results_dir(novoalign_config, config)
    safe_makedir(results_dir)
    sam_name = os.path.basename(replace_suffix(input_file, "sam"))
    return os.path.join(results_dir, sam_name)
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    The configuration is read from ``config_file``. Logging is set up, the
    cluster is started, and then every entry of ``config["run"]`` is
    handled in order. ``curr_files`` starts as ``config["input"]`` and is
    replaced by the outputs of the stages that produce new files, so later
    stages operate on the results of earlier ones.

    Some stages depend on values produced by earlier stages and fail with
    a ``NameError`` if those stages have not run:

    * ``filter_length`` reads ``tagdust_outputs`` (set by ``tagdust``).
    * ``coverage`` and ``bedtools_intersect`` read ``sorted_bf`` (set by
      ``novoalign``).
    * ``htseq-count`` reads ``aligned_outputs`` (set by ``novoalign``).

    The cluster is stopped when the stages finish, including when one of
    them raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    # functools.reduce is available on both Python 2.6+ and Python 3,
    # where reduce is no longer a builtin
    from functools import reduce

    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects and is rejected by recent PyYAML
        # releases; consider yaml.safe_load if the configuration only
        # contains plain data.
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    try:
        # the view is imported only after the cluster has been started
        from bipy.cluster import view
        input_files = config["input"]
        results_dir = config["dir"]["results"]

        # make the needed directories; an explicit loop is used because
        # map() is lazy on Python 3 and would create nothing
        for directory in config["dir"].values():
            safe_makedir(directory)

        curr_files = input_files

        ## qc steps
        for stage in config["run"]:
            if stage == "fastqc":
                # run the basic fastqc
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = config["stage"][stage]
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * len(curr_files),
                                          [config] * len(curr_files))
                # this does nothing for now, not implemented yet
                summary_file = _combine_fastqc(fastqc_outputs)

            if stage == "trim":
                logger.info("Trimming poor quality ends "
                            " from %s" % (str(curr_files)))
                nlen = len(curr_files)
                # minimum read length to keep, taken from the stage
                # configuration (20 when not set)
                min_length = str(config["stage"][stage].get("min_length",
                                                            20))
                # trim low quality ends of reads
                # do this dirty for now
                out_dir = os.path.join(results_dir, "trimmed")
                safe_makedir(out_dir)
                out_files = [append_stem(os.path.basename(x), "trim")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(sickle.run, curr_files,
                                     ["se"] * nlen,
                                     ["sanger"] * nlen,
                                     [min_length] * nlen,
                                     out_files)
                curr_files = out_files

            if stage == "tagdust":
                input_files = curr_files
                # remove tags matching the other miRNA tested
                logger.info("Running %s on %s." % (stage, input_files))
                tagdust_config = config["stage"][stage]
                tagdust_outputs = view.map(tagdust.run, input_files,
                                           [tagdust_config] * len(input_files),
                                           [config] * len(input_files))
                # the first element of each tagdust result is used as the
                # input of the following stages
                curr_files = [x[0] for x in tagdust_outputs]

            if stage == "filter_length":
                # filter out reads below or above a certain length
                filter_config = config["stage"][stage]
                min_length = filter_config.get("min_length", 0)
                max_length = filter_config.get("max_length", MAX_READ_LENGTH)

                # length predicate (both bounds are exclusive)
                def length_filter(x):
                    return min_length < len(x.seq) < max_length

                # filter the input reads based on length
                # parallelizing this doesn't seem to work
                # ipython can't accept closures as an argument to view.map()
                # so the filtering runs serially instead of through
                # view.map(filter_seqio, tagdust_outputs, ...)
                out_files = [append_stem(os.path.basename(input_file[0]),
                                         "filt")
                             for input_file in tagdust_outputs]
                out_dir = os.path.join(config["dir"]["results"],
                                       "length_filtered")
                safe_makedir(out_dir)
                out_files = [os.path.join(out_dir, x) for x in out_files]
                filtered_fastq = [filter_seqio(x[0], length_filter, y,
                                               "fastq")
                                  for x, y in zip(tagdust_outputs, out_files)]
                curr_files = filtered_fastq

            if stage == "count_ends":
                logger.info("Compiling nucleotide counts at 3' and 5' ends.")

                # count the nucleotide at the end of each read
                def count_ends(x, y):
                    """ keeps a running count of an arbitrary set of keys
                    during the reduce step """
                    x[y] = x.get(y, 0) + 1
                    return x

                def get_3prime_end(x):
                    return str(x.seq[-1])

                def get_5prime_end(x):
                    return str(x.seq[0])

                def output_counts(end_function, count_file):
                    """Write the per-file end counts and return the path."""
                    outdir = os.path.join(config["dir"]["results"], stage)
                    safe_makedir(outdir)
                    count_file = os.path.join(outdir, count_file)
                    # if the count_file already exists, skip
                    if os.path.exists(count_file):
                        return count_file
                    # outputs a tab file of the counts at the end
                    # of the fastq files
                    counts = [reduce(count_ends,
                                     apply_seqio(x, end_function,
                                                 kind="fastq"), {})
                              for x in curr_files]
                    # a real list is passed as the index; on Python 3
                    # map() would hand pandas a one-shot iterator
                    df = pd.DataFrame(counts,
                                      index=[_short_name(x)
                                             for x in curr_files])
                    df = df.astype(float)
                    # convert the counts to fractions of the row total and
                    # keep the total in its own column
                    total = df.sum(axis=1)
                    df = df.div(total, axis=0)
                    df["total"] = total
                    df.to_csv(count_file, sep="\t")
                    return count_file

                output_counts(get_3prime_end, "3prime_counts.tsv")
                output_counts(get_5prime_end, "5prime_counts.tsv")

            if stage == "tophat":
                tophat_config = config["stage"][stage]
                logger.info("Running tophat on %s" % (str(curr_files)))
                nlen = len(curr_files)
                pair_file = None
                ref_file = tophat_config["annotation"]
                out_base = os.path.join(results_dir, "mirna")
                align_dir = os.path.join(results_dir, "tophat")
                tophat_files = view.map(tophat.align, curr_files,
                                        [pair_file] * nlen,
                                        [ref_file] * nlen,
                                        [out_base] * nlen,
                                        [align_dir] * nlen,
                                        [config] * nlen)
                curr_files = tophat_files

            if stage == "novoalign":
                logger.info("Running novoalign on %s" % (str(curr_files)))
                # align
                ref = config["genome"]["file"]
                novoalign_config = config["stage"][stage]
                aligned_outputs = view.map(novoalign.run, curr_files,
                                           [ref] * len(curr_files),
                                           [novoalign_config] * len(curr_files),
                                           [config] * len(curr_files))
                # convert sam to bam, sort and index
                picard = BroadRunner(config["program"]["picard"], None, {})
                bamfiles = view.map(picardrun.picard_formatconverter,
                                    [picard] * len(aligned_outputs),
                                    aligned_outputs)
                sorted_bf = view.map(picardrun.picard_sort,
                                     [picard] * len(bamfiles),
                                     bamfiles)
                view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                         sorted_bf)
                # these files are the new starting point for the downstream
                # analyses, so copy them over into the data dir and setting
                # them to read only
                #data_dir = os.path.join(config["dir"]["data"], stage)
                #safe_makedir(data_dir)
                #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
                #new_files = [os.path.join(data_dir, x) for x in
                #             map(os.path.basename, sorted_bf)]
                #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
                # index the bam files for later use
                #view.map(picardrun.picard_index, [picard] * len(new_files),
                #         new_files)
                curr_files = sorted_bf

            if stage == "new_coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"], None, {})
                out_dir = os.path.join(results_dir, "new_coverage")
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)
                curr_files = out_files

            if stage == "coverage":
                gtf = blastn.prepare_ref_file(config["annotation"], config)
                logger.info("Calculating coverage of features in %s for %s"
                            % (gtf, str(sorted_bf)))
                out_files = [replace_suffix(x, "counts.bed")
                             for x in sorted_bf]
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                logger.info(out_files)
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                logger.info(out_files)
                view.map(bedtools.count_overlaps, sorted_bf,
                         [gtf] * len(sorted_bf),
                         out_files)

            if stage == "htseq-count":
                # the repeated arguments must line up with the sequence
                # that is actually mapped over
                nfiles = len(aligned_outputs)
                htseq_config = _get_stage_config(config, stage)
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         aligned_outputs,
                                         [config] * nfiles,
                                         [stage] * nfiles)
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)

            if stage == "bedtools_intersect":
                # no configured bed files means there is nothing to do
                bedfiles = (config["stage"]["bedtools_intersect"]
                            .get("bed", None) or [])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for bedfile in bedfiles:
                    bedbase, bedext = os.path.splitext(bedfile)
                    # output name: <out_dir>/<bam stem>_vs_<bed stem>.bam
                    out_files = [remove_suffix(x) for x in sorted_bf]
                    out_files = [os.path.join(out_dir, os.path.basename(x))
                                 for x in out_files]
                    out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                                 for x in out_files]
                    out_files = [".".join([x, "bam"]) for x in out_files]
                    # run eagerly; a bare map() is lazy on Python 3 and
                    # would never perform the intersection
                    test_out = [bedtools.intersectbam2bed(bam, bedfile,
                                                          False, out)
                                for bam, out in zip(sorted_bf, out_files)]
                    count_files = [replace_suffix(x, "stats")
                                   for x in out_files]
                    for bam, out, stats in zip(sorted_bf, out_files,
                                               count_files):
                        write_ratios(bam, out, stats)

            if stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                out_files = view.map(piranha_runner, curr_files)
    finally:
        # always release the cluster, even when a stage raised
        stop_cluster()