def __init__(self, config):
    self.config = config
    # self.stage is assumed to be set as a class attribute (e.g. stage = "coverage")
    self.stage_config = config["stage"][self.stage]
    self.ribo = self.stage_config["ribo"]
    self.picard = BroadRunner(config["program"]["picard"], None,
                              {"algorithm": {}})
    self.ref = prepare_ref_file(self.stage_config["ref"], self.config)
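# Hedged sketch (not part of the original class): a possible __call__ that mirrors
# how the "coverage" stage is driven in the pipeline mains below, producing one
# Picard RNA-seq metrics file per alignment via picardrun.picard_rnaseq_metrics().
# It assumes the same module-level helpers used there (os, safe_makedir,
# replace_suffix, picardrun).
def __call__(self, in_file):
    out_dir = os.path.join(self.config["dir"]["results"], self.stage)
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir,
                            replace_suffix(os.path.basename(in_file), "metrics"))
    return picardrun.picard_rnaseq_metrics(self.picard, in_file, self.ref,
                                           self.ribo, out_file)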
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            view.map(stage_runner, curr_files, block=False)
        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
    stop_cluster()
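# Illustrative sketch of the YAML config layout the main() above reads; the keys
# are inferred from the lookups in the code, but the values shown here are
# placeholders, not the project's actual settings:
#
#   dir:
#     data: data            # in_dir, scanned by input_files_from_dir()
#     results: results
#   id_file: sample_ids.txt
#   run: [fastqc, cutadapt, bowtie, coverage]
#   program:
#     picard: /path/to/picard
#   stage:
#     bowtie: {}
#     coverage:
#       ref: <reference entry consumed by prepare_ref_file()>
#       ribo: <rRNA interval file passed to picard_rnaseq_metrics()>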
def run_with_config(input_file, config, stage, out_file=None):
    if out_file is None:
        out_dir = os.path.join(config["dir"].get("results", "results"), stage)
        out_file = os.path.join(out_dir, _get_outfilename(input_file))
        safe_makedir(out_dir)
    if "annotation" not in config:
        logger.error("annotation must appear in the config file, see example "
                     "configuration files.")
        exit(1)
    ref = prepare_ref_file(config["annotation"], config)
    out_file = run(input_file, ref, out_file)
    return out_file
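# Illustrative usage (assuming this is the per-sample counting runner invoked as
# htseq_count.run_with_config() in the pipelines below; the file name is a
# placeholder and config is the loaded YAML dictionary):
#
#   counts_file = run_with_config("results/tophat/sample1.sam", config,
#                                 "htseq-count")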
def main(config_file):
    """ this assumes that we are keeping the same order of the files throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    input_dict = config["input"]
    curr_files = _make_current_files(input_dict.keys())
    input_meta = input_dict.values()
    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config], [config]))
            view.map(fastqc.run, *fastqc_args)
        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)
        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs
        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)
        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [_find_file_index_for_test(input_meta, condition)
                           for condition in test]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(out_dir,
                                            "_".join(conditions) +
                                            "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(files, None,
                                                        out_file=combined_out)
                out_file = os.path.join(out_dir,
                                        "_".join(conditions) + "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file, conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)
    # end gracefully
    stop_cluster()
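# Illustrative sketch of the "input" and deseq sections read by the main() above
# (keys inferred from the code; file names and condition labels are placeholders):
#
#   input:
#     data/control_rep1.fq: {condition: control}
#     data/mutant_rep1.fq:  {condition: mutant}
#   run: [fastqc, cutadapt, tophat, htseq-count, rseqc, coverage, deseq]
#   stage:
#     deseq:
#       tests:
#         - [control, mutant]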
def main(config_file):
    """ this assumes that we are keeping the same order of the files throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]
    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config], [config]))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))
        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)
        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf
        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            curr_files = tophat_outputs
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)
        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (combined_out, conditions,
                                                   deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds], [deseq_out])
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          [deseq_out],
                                          ["id"],
                                          ["ensembl_gene_id"],
                                          ["zebrafish"])
    # end gracefully
    stop_cluster()
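# Illustrative config sketch for the main() above (keys inferred from the code;
# values are placeholders). The condition label for each sample is taken from
# the file name prefix before the first underscore, e.g. data/wt_1.fq -> "wt".
#
#   input_dir: data
#   dir:
#     results: results
#   run: [fastqc, cutadapt, tophat, rseqc, coverage, htseq-count, deseq]
#   ref: <reference entry passed to tophat.run_with_config()>
#   program:
#     picard: /path/to/picard
#   stage:
#     coverage:
#       ref: <reference for picard_rnaseq_metrics()>
#       ribo: <rRNA interval file>
#     deseq:
#       comparisons:
#         - [wt, mut]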
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s." % (curr_files))
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)
            if stage == "cutadapt":
                logger.info("Running cutadapt on %s." % (curr_files))
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)
            if stage == "tophat":
                logger.info("Running tophat on %s." % (curr_files))
                stage_runner = Tophat(config)
                tophat_outputs = view.map(stage_runner, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs
            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs
            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)
            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart, *annotate_args)
                """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)

    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")
    # end gracefully
    stop_cluster()
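# Note on the output layout produced by the main() above (derived from the
# os.path.join calls in the code): per-condition stage output is written under
# results/<condition>/ (config["dir"]["results"] is repointed there for each
# condition), while the differential-expression results go to
# results/deseq/<cell_type>/<comparison>/, containing <comparison>.counts.txt
# plus the DESeq output named after the comparison.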
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}
    curr_files = config["encode_file"]
    results_dir = config["dir"].get("results", "results")

    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"],
                                          cell_type, "*"))
        curr_files = in_files
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)
            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
                picard = BroadRunner(config["program"]["picard"])
                # convert to bam
                #args = zip(*product([picard], tophat_outputs))
                #bamfiles = view.map(picardrun.picard_formatconverter,
                #                    *args)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf
            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args, block=False)
                view.map(rseqc.junction_saturation, *rseq_args, block=False)
                RPKM_count_files = view.map(rseqc.RPKM_count, *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files))
                logger.info("dirnames to process: %s" % (dirs_to_process))
                RPKM_merged = view.map(rseqc.merge_RPKM, dirs_to_process)
                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                curr_files = tophat_outputs
            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(combined_out,
                                                  config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")
            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)

    # end gracefully, wait for jobs to finish, then exit
    view.wait()
    stop_cluster()
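# Illustrative config sketch for the ENCODE cell-type loop above (keys inferred
# from the lookups in the code; cell-type names and paths are placeholders):
#
#   cell_types: [Gm12878, K562]
#   encode_file: <listing of ENCODE files used by the download_encode stage>
#   dir:
#     data: data        # expects one sub-directory of input files per cell type
#     results: results
#   run: [fastqc, cutadapt, tophat, rseqc, htseq-count, coverage]
#   ref: <reference entry passed to tophat.run_with_config()>
#   annotation:
#     file: <GTF used by htseq_count.calculate_rpkm()>
#   program:
#     picard: /path/to/picard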
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    results_dir = config["dir"].get("results", "results")
    curr_files = input_files_from_dir(in_dir)
    for stage in config["run"]:
        if stage == "fastqc":
            stage_runner = fastqc.FastQCStage(config)
            view.map(stage_runner, curr_files)
        if stage == "cutadapt":
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_outputs = view.map(tophat.run_with_config,
                                      curr_files,
                                      [None] * len(curr_files),
                                      [config["ref"]] * len(curr_files),
                                      ["tophat"] * len(curr_files),
                                      [config] * len(curr_files))
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs
        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)
        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)
            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))
            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))
            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))
                #tophat_args = zip(*product(first, second, [config["ref"]],
                #                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config,
                                          first, second,
                                          [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs
            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs
            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)
            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart, *annotate_args)
                """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)

    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")
            #annotated_file = view.map(annotate.annotate_table_with_biomart,
            #                          [deseq_out],
            #                          ["id"],
            #                          ["ensembl_gene_id"],
            #                          ["human"])
    # end gracefully
    stop_cluster()