def main(config_file):
    """Run the configured analysis stages on replicate input files.

    Loads the YAML configuration at ``config_file``, sets up logging, starts
    the cluster, and collects every path matching ``*_rep*`` inside each
    ``config["input_dirs"]`` entry of the data directory.  The stages named
    in ``config["run"]`` are then executed in order.  Handled stage names
    are ``fastqc``, ``cutadapt``, ``novoalign``, ``htseq-count``,
    ``coverage``, ``deseq`` and ``dss``; any other name is skipped silently.

    The ``deseq`` and ``dss`` stages read ``htseq_outputs`` and
    ``column_names``, which are only bound by the ``htseq-count`` stage, so
    ``htseq-count`` has to appear earlier in ``config["run"]``.

    The cluster is stopped when this function exits, whether the stages
    finished normally or one of them raised.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; prefer yaml.safe_load if the config file
        # may come from an untrusted source.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        data_dir = config["dir"]["data"]
        from bipy.cluster import view
        input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*"))
                       for x in config["input_dirs"]]
        input_files = list(flatten(input_files))
        logger.info("Input files to process: %s" % (input_files))
        results_dir = config["dir"]["results"]
        # Plain loop instead of map(): map() is lazy on Python 3, so using it
        # purely for side effects would create no directories at all.
        for directory in config["dir"].values():
            safe_makedir(directory)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                nfiles = len(curr_files)
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = _get_stage_config(config, stage)
                view.map(fastqc.run, curr_files,
                         [fastqc_config] * nfiles,
                         [config] * nfiles)

            elif stage == "cutadapt":
                nfiles = len(curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            curr_files,
                                            [cutadapt_config] * nfiles,
                                            [config] * nfiles)
                curr_files = cutadapt_outputs

            elif stage == "novoalign":
                nfiles = len(curr_files)
                novoalign_config = _get_stage_config(config, stage)
                # Index building via novoindex.run is disabled here; the
                # database path is read straight from the configuration.
                db = config["genome"]["file"]
                novoalign_outputs = view.map(novoalign.run, curr_files,
                                             [db] * nfiles,
                                             [novoalign_config] * nfiles,
                                             [config] * nfiles)
                picard = BroadRunner(config["program"]["picard"])
                # convert to bam
                args = zip(*itertools.product([picard], novoalign_outputs))
                bamfiles = view.map(picardrun.picard_formatconverter, *args)
                # sort bam
                args = zip(*itertools.product([picard], bamfiles))
                sorted_bf = view.map(picardrun.picard_sort, *args)
                # index bam
                args = zip(*itertools.product([picard], sorted_bf))
                view.map(picardrun.picard_index, *args)
                # Later stages continue from the aligner output, not from
                # the sorted BAM files.
                curr_files = novoalign_outputs

            elif stage == "htseq-count":
                logger.info("Running htseq-count on %s" %(curr_files))
                # This variant does not launch a counting job: the current
                # files are used directly as the per-sample count files.
                htseq_outputs = curr_files
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(combined_out,
                                                  config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         "rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                view.map(picardrun.picard_rnaseq_metrics,
                         [picard] * nrun, curr_files, [ref] * nrun,
                         [ribo] * nrun, out_files)

            elif stage == "deseq":
                # The condition label is the part of each input file name
                # before the first underscore.
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                deseq_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in deseq_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # indexes of the conditions that match this comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # find the htseq files to combine and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index] for index in indexes]
                    logger.info(htseq_files)
                    logger.info(htseq_columns)
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    deseq_conds = [conditions[index] for index in indexes]
                    deseq_prefix = os.path.join(out_dir, comparison_name)
                    deseq_out = view.map(deseq.run,
                                         [combined_out],
                                         [deseq_conds],
                                         [deseq_prefix])
                    logger.info("Annotating %s." % (deseq_out))
                    view.map(annotate.annotate_table_with_biomart,
                             deseq_out,
                             ["id"],
                             ["ensembl_gene_id"],
                             ["human"])

            elif stage == "dss":
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                dss_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in dss_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # indexes of the conditions that match this comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # find the htseq files to combine and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index] for index in indexes]
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    dss_conds = [conditions[index] for index in indexes]
                    dss_prefix = os.path.join(out_dir, comparison_name)
                    logger.info("Running DSS on %s with conditions %s and comparison %s." % (combined_out, dss_conds, comparison))
                    # Unlike the deseq stage, DSS is called directly rather
                    # than dispatched through view.map.
                    dss.run(combined_out, dss_conds, comparison, dss_prefix)
    finally:
        # Always shut the cluster down, even when a stage fails part-way.
        stop_cluster()
def main(config_file):
    """Run the configured analysis stages on the listed input files.

    Loads the YAML configuration at ``config_file``, sets up logging, starts
    the cluster, and builds the input file list by joining each entry of
    ``config["input"]`` onto the data directory.  The stages named in
    ``config["run"]`` are then executed in order.  Handled stage names are
    ``fastqc``, ``cutadapt``, ``novoalign``, ``htseq-count``, ``coverage``
    and ``deseq``; any other name is skipped silently.

    The ``deseq`` stage reads ``htseq_outputs`` and ``column_names``, which
    are only bound by the ``htseq-count`` stage, so ``htseq-count`` has to
    appear earlier in ``config["run"]``.

    The cluster is stopped when this function exits, whether the stages
    finished normally or one of them raised.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; prefer yaml.safe_load if the config file
        # may come from an untrusted source.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        from bipy.cluster import view
        input_files = [os.path.join(config["dir"]["data"], x)
                       for x in config["input"]]
        results_dir = config["dir"]["results"]
        # Plain loop instead of map(): map() is lazy on Python 3, so using it
        # purely for side effects would create no directories at all.
        for directory in config["dir"].values():
            safe_makedir(directory)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                nfiles = len(curr_files)
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = _get_stage_config(config, stage)
                view.map(fastqc.run, curr_files,
                         [fastqc_config] * nfiles,
                         [config] * nfiles)

            elif stage == "cutadapt":
                nfiles = len(curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            curr_files,
                                            [cutadapt_config] * nfiles,
                                            [config] * nfiles)
                curr_files = cutadapt_outputs

            elif stage == "novoalign":
                nfiles = len(curr_files)
                novoalign_config = _get_stage_config(config, stage)
                # Index building via novoindex.run is disabled here; the
                # database path is read straight from the configuration.
                db = config["genome"]["file"]
                novoalign_outputs = view.map(novoalign.run, curr_files,
                                             [db] * nfiles,
                                             [novoalign_config] * nfiles,
                                             [config] * nfiles)
                picard = BroadRunner(config["program"]["picard"])
                # convert to bam
                args = zip(*itertools.product([picard], novoalign_outputs))
                bamfiles = view.map(picardrun.picard_formatconverter, *args)
                # sort bam
                args = zip(*itertools.product([picard], bamfiles))
                sorted_bf = view.map(picardrun.picard_sort, *args)
                # index bam
                args = zip(*itertools.product([picard], sorted_bf))
                view.map(picardrun.picard_index, *args)
                # Later stages continue from the aligner output, not from
                # the sorted BAM files.
                curr_files = novoalign_outputs

            elif stage == "htseq-count":
                nfiles = len(curr_files)
                # NOTE(review): htseq_config is not used below; the call is
                # kept in case _get_stage_config validates the stage entry.
                htseq_config = _get_stage_config(config, stage)
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         curr_files,
                                         [config] * nfiles,
                                         [stage] * nfiles)
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(combined_out,
                                                  config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         "rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                view.map(picardrun.picard_rnaseq_metrics,
                         [picard] * nrun, curr_files, [ref] * nrun,
                         [ribo] * nrun, out_files)

            elif stage == "deseq":
                # The condition label is the part of each input file name
                # before the first underscore.
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                deseq_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in deseq_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # indexes of the conditions that match this comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # find the htseq files to combine and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index] for index in indexes]
                    logger.info(htseq_files)
                    logger.info(htseq_columns)
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    deseq_conds = [conditions[index] for index in indexes]
                    deseq_prefix = os.path.join(out_dir, comparison_name)
                    deseq_out = view.map(deseq.run,
                                         [combined_out],
                                         [deseq_conds],
                                         [deseq_prefix])
                    logger.info("Annotating %s." % (deseq_out))
                    view.map(annotate.annotate_table_with_biomart,
                             deseq_out,
                             ["id"],
                             ["ensembl_gene_id"],
                             ["human"])
    finally:
        # Always shut the cluster down, even when a stage fails part-way.
        stop_cluster()
def main(config_file):
    """Run the configured analysis stages once per cell type.

    Loads the YAML configuration at ``config_file`` and creates the
    configured directories.  For every entry of ``config["cell_types"]`` it
    creates a results sub-directory, points ``config["dir"]["results"]`` at
    it, and takes every path directly under the matching data
    sub-directory as input.  The stages named in ``config["run"]`` are then
    executed in order.  Handled stage names are ``fastqc``, ``cutadapt``,
    ``tophat``, ``rseqc``, ``htseq-count`` and ``coverage``; any other name
    is skipped silently.

    The ``rseqc`` stage ends by resetting the current files to
    ``tophat_outputs``, so ``tophat`` has to appear earlier in
    ``config["run"]``.

    ``view`` and ``logger`` are not bound inside this function; presumably
    they are module-level names set up elsewhere in the file.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; prefer yaml.safe_load if the config file
        # may come from an untrusted source.
        config = yaml.load(in_handle)

    # make the needed directories
    # Plain loop instead of map(): map() is lazy on Python 3, so using it
    # purely for side effects would create no directories at all.
    for directory in config["dir"].values():
        safe_makedir(directory)

    # NOTE(review): this value is replaced by the per-cell-type glob below
    # before it is ever read; the lookup is kept so a config without
    # "encode_file" still fails the same way as before.
    curr_files = config["encode_file"]

    results_dir = config["dir"].get("results", "results")

    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        # Stages read config["dir"]["results"], so redirect it to this cell
        # type's directory for the duration of the iteration.
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"],
                                          cell_type, "*"))
        curr_files = in_files

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)

            elif stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs

            elif stage == "tophat":
                _emit_stage_message(stage, curr_files)
                # NOTE(review): tophat_config and picard are not used in
                # this stage; the calls are kept in case they validate the
                # configuration.
                tophat_config = _get_stage_config(config, stage)
                tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
                picard = BroadRunner(config["program"]["picard"])
                # convert to bam, sort and index with the sam helpers (the
                # Picard format-converter call is disabled here)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf

            elif stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                # NOTE(review): rseqc_config is not used below.
                rseqc_config = _get_stage_config(config, stage)
                # Materialise the argument columns: they are unpacked into
                # several view.map calls, and a bare zip() iterator would be
                # exhausted after the first one on Python 3.
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args, block=False)
                view.map(rseqc.junction_saturation, *rseq_args, block=False)
                RPKM_count_files = view.map(rseqc.RPKM_count, *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files))
                logger.info("dirnames to process: %s" % (dirs_to_process))
                RPKM_merged = view.map(rseqc.merge_RPKM, dirs_to_process)
                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                # Following stages continue from the tophat output rather
                # than from the sorted BAM files.
                curr_files = tophat_outputs

            elif stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                # NOTE(review): htseq_config is not used below.
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                # The raw input paths double as the column labels here.
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(combined_out,
                                                  config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun,
                                     out_files)

    # end gracefully, wait for jobs to finish, then exit
    view.wait()
    stop_cluster()