def main(config_file, out_base, ref_file, read1, read2=None, sample_name=""):
    """Align reads with MAQ, calibrate the scores, then align a second time.

    The fastq input is converted to BAM, aligned once with ``limit=1e6``,
    passed through ``calibrate_scores`` together with that first alignment,
    and the calibrated reads are aligned again using the "-cal" extension.

    config_file -- YAML file with "program" and "algorithm" sections.
    read2 -- second fastq of a pair; None means single-end.
    """
    # NOTE(review): yaml.load builds arbitrary objects; only use trusted configs.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    barcode, lane = _get_sample_details(read1)
    programs = config["program"]
    maq_cmd = programs["maq"]
    algorithm = config["algorithm"]
    stringency = algorithm["stringency"]
    picard = BroadRunner(programs["picard"],
                         programs.get("gatk", ""),
                         max_memory=algorithm.get("java_memory", ""))
    is_paired = read2 is not None
    bam_reads = fastq_to_bam(picard, sample_name,
                             algorithm["quality_format"], read1, read2)
    # First pass: its only consumer is the calibration step below.
    first_pass = picard_run_maq(picard, maq_cmd, bam_reads, ref_file,
                                barcode, lane, out_base, stringency,
                                is_paired, limit=1e6)
    calibrated_reads = calibrate_scores(picard, bam_reads, first_pass,
                                        ref_file)
    # Second pass on the calibrated reads.
    picard_run_maq(picard, maq_cmd, calibrated_reads, ref_file, barcode,
                   lane, out_base, stringency, is_paired, ext="-cal")
def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None,
         sample_name="", rg_name="", pu_name=""):
    """Combine fastq reads with an alignment SAM file into a sorted BAM.

    config_file -- YAML file with "program" and "algorithm" sections.
    fastq_pair -- second fastq of a pair; None means single-end.

    Raises ValueError when the configured platform is not Illumina, the only
    platform for which a quality format is known here.
    """
    # NOTE(review): yaml.load builds arbitrary objects; only use trusted configs.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    picard = BroadRunner(config["program"]["picard"],
                         max_memory=config["algorithm"].get("java_memory", ""))
    platform = config["algorithm"]["platform"]
    if platform.lower() != "illumina":
        raise ValueError("Need to specify quality format for %s" % platform)
    qual_format = "Illumina"
    index_ref_file(picard, ref_file)
    base_dir = os.path.dirname(align_sam)
    is_paired = fastq_pair is not None
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(picard, fastq_one, fastq_pair,
                                            base_dir, platform, qual_format,
                                            sample_name, rg_name, pu_name,
                                            tmp_dir)
        out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file,
                                   tmp_dir, is_paired)
        sort_bam = picard_sort(picard, out_bam, tmp_dir)
    # The intermediates are superseded once the sorted BAM exists.
    save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam,
                   config)
    save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
def __init__(self, config):
    """Store the config and set up the stage settings, Picard runner and ref.

    The stage section is looked up under ``config["stage"]`` using the
    ``self.stage`` attribute, which must already exist on the instance.
    """
    self.config = config
    stage_settings = config["stage"][self.stage]
    self.stage_config = stage_settings
    self.ribo = stage_settings["ribo"]
    picard_dir = config["program"]["picard"]
    # BroadRunner is given no GATK directory and an empty algorithm section.
    self.picard = BroadRunner(picard_dir, None, {"algorithm": {}})
    self.ref = prepare_ref_file(stage_settings["ref"], config)
def main(gatk_dir, vcf_info, ref_file, dbsnp, intervals=None):
    """Run variant evaluation on one VCF or a directory of them and print stats.

    vcf_info -- either a single VCF file or a directory; for a directory,
                every file matching "*-filter.vcf" is processed in sorted
                order.
    """
    picard = BroadRunner(gatk_dir)
    if os.path.isdir(vcf_info):
        pattern = os.path.join(vcf_info, "*-filter.vcf")
        vcf_files = sorted(glob.glob(pattern))
    else:
        vcf_files = [vcf_info]
    # Passed to print_stats as its third argument for every file.
    is_single = len(vcf_files) == 1
    for vcf_in in vcf_files:
        eval_file = variant_eval(vcf_in, ref_file, dbsnp, intervals, picard)
        stats = extract_eval_stats(eval_file)
        print_stats(vcf_in, stats['called'], is_single)
def main(config_file):
    """Run the configured stages (fastqc, cutadapt, bowtie, coverage) in order.

    config_file -- YAML file providing "dir", "id_file", "run", "stage" and
                   "program" entries.

    Each stage works on the current file list; cutadapt and bowtie replace
    that list with their own outputs. The cluster is stopped at the end.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and is rejected by PyYAML >= 6; consider safe_load.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # Explicit loop: map() is lazy on Python 3 and would create nothing.
    for dirname in config["dir"].values():
        safe_makedir(dirname)
    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            # Non-blocking: the file list is not changed by this stage.
            view.map(stage_runner, curr_files, block=False)
        elif stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
        elif stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            # Result is not used below; the call is kept for its side effects.
            view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
        elif stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                os.path.join(out_dir,
                             replace_suffix(os.path.basename(x), "metrics"))
                for x in curr_files
            ]
            view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun,
                     curr_files, [ref] * nrun, [ribo] * nrun, out_files)
    stop_cluster()
def main(picard_dir, align_bam, ref_file, is_paired, bait_file=None,
         target_file=None, do_sort=False, sample_name="", config=None):
    """Build a LaTeX/PDF summary of Picard and FastQC metrics for a BAM file.

    picard_dir -- directory handed to BroadRunner.
    align_bam -- alignment BAM; sorted first when do_sort is true.
    sample_name -- display name; when empty or None the "Filename" entry of
                   the FastQC statistics is used instead.
    config -- optional YAML file supplying "program" parameters and the
              "java_memory" algorithm setting.

    Writes "<base>-summary.tex" next to the BAM and runs pdflatex on it.
    """
    with utils.curdir_tmpdir() as tmp_dir:
        work_dir = os.getcwd()
        params = {}
        java_memory = ""
        if config:
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input and rejected by PyYAML >= 6.
            with open(config) as in_handle:
                info = yaml.load(in_handle)
            params = info["program"]
            java_memory = info["algorithm"].get("java_memory", "")
        picard = BroadRunner(picard_dir, max_memory=java_memory)
        if do_sort:
            align_bam = picard_sort(picard, align_bam, tmp_dir)
        metrics = PicardMetrics(picard, tmp_dir)
        summary_table, metrics_graphs = metrics.report(
            align_bam, ref_file, is_paired, bait_file, target_file)
        # Extend each (path, caption) pair with the constant 0.75 so the
        # tuples match the three-element form used for the figures below.
        metrics_graphs = [(p, c, 0.75) for p, c in metrics_graphs]
        base, ext = os.path.splitext(align_bam)
        base = base.replace(".", "-")
        fastqc_graphs, fastqc_stats, fastqc_overrep = fastqc_report(
            align_bam, params)
        all_graphs = fastqc_graphs + metrics_graphs
        summary_table = _update_summary_table(summary_table, ref_file,
                                              fastqc_stats)
        tmpl = Template(section_template)
        # The default is "", so test for emptiness rather than only None;
        # otherwise the fallback never triggers for callers using the default.
        if not sample_name:
            sample_name = fastqc_stats["Filename"]
        # "\\_" is the two characters backslash + underscore (LaTeX escape).
        sample_name = "%s (%s)" % (sample_name.replace("_", "\\_"),
                                   base.replace("_", " "))
        section = tmpl.render(
            name=sample_name,
            summary=None,
            summary_table=summary_table,
            figures=[(f, c, i) for (f, c, i) in all_graphs if f],
            overrep=fastqc_overrep,
            recal_figures=_get_recal_plots(work_dir, align_bam))
        out_file = "%s-summary.tex" % base
        out_tmpl = Template(base_template)
        with open(out_file, "w") as out_handle:
            out_handle.write(out_tmpl.render(parts=[section]))
        run_pdflatex(out_file, params)
def main(picard_dir, align_bam, ref_file, fastq_one, fastq_pair=None,
         bait_file=None, target_file=None, do_sort=False, sample_name="",
         config=None):
    """Build a LaTeX/PDF summary of Picard, fastq and SolexaQA results.

    picard_dir -- directory handed to BroadRunner.
    align_bam -- alignment BAM; sorted first when do_sort is true.
    fastq_pair -- second fastq of a pair; None means single-end.
    config -- optional YAML file whose "program" section supplies parameters;
              PARAM_DEFAULTS is used when it is not given.

    Writes "<base>-summary.tex" next to the BAM and runs pdflatex on it. The
    temporary directory is removed even when a step fails.
    """
    tmp_dir = _make_tmpdir()
    try:
        work_dir = os.getcwd()
        if config:
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input and rejected by PyYAML >= 6.
            with open(config) as in_handle:
                params = yaml.load(in_handle)["program"]
        else:
            params = PARAM_DEFAULTS
        picard = BroadRunner(picard_dir)
        if do_sort:
            align_bam = picard_sort(picard, align_bam, tmp_dir)
        metrics = PicardMetrics(picard, tmp_dir)
        summary_table, metrics_graphs = metrics.report(
            align_bam, ref_file, fastq_pair is not None, bait_file,
            target_file)
        base, ext = os.path.splitext(align_bam)
        base = base.replace(".", "-")
        fastq_files = [fastq_one, fastq_pair]
        total_count, read_size, fastq_graphs = plot_fastq_stats(
            fastq_files, base, params)
        qa_graphs = solexaqa_plots(fastq_files, params, work_dir)
        # add read_size to the total summary table
        first_row = summary_table[0]
        summary_table[0] = (first_row[0], first_row[1],
                            "%sbp %s" % (read_size, first_row[-1]))
        ref_org = os.path.splitext(os.path.split(ref_file)[-1])[0]
        summary_table.insert(0, ("Reference organism",
                                 ref_org.replace("_", " "), ""))
        tmpl = Template(section_template)
        # "\\_" is the two characters backslash + underscore (LaTeX escape).
        sample_name = "%s (%s)" % (sample_name.replace("_", "\\_"),
                                   base.replace("_", " "))
        all_graphs = metrics_graphs + fastq_graphs + qa_graphs
        section = tmpl.render(
            name=sample_name,
            summary=None,
            summary_table=summary_table,
            figures=[(f, c) for (f, c) in all_graphs if f],
            recal_figures=_get_recal_plots(work_dir, align_bam))
        out_file = "%s-summary.tex" % base
        out_tmpl = Template(base_template)
        with open(out_file, "w") as out_handle:
            out_handle.write(out_tmpl.render(parts=[section]))
        run_pdflatex(out_file, params)
    finally:
        # Previously skipped whenever an earlier step raised.
        shutil.rmtree(tmp_dir)
def merge_bam_files(bam_files, work_dir, config):
    """Merge the BAM files of one sample into a single BAM inside work_dir.

    The merged file takes the basename of the first input. If that file
    already exists nothing is run and its path is returned directly.

    Returns the path of the merged BAM.
    """
    merged_bam = os.path.join(work_dir, os.path.basename(bam_files[0]))
    if os.path.exists(merged_bam):
        return merged_bam
    picard = BroadRunner(config["program"]["picard"])
    with utils.curdir_tmpdir() as tmp_dir:
        opts = [("OUTPUT", merged_bam),
                ("SORT_ORDER", "coordinate"),
                ("TMP_DIR", tmp_dir)]
        # One INPUT option per BAM, in the order given.
        opts.extend(("INPUT", bam) for bam in bam_files)
        picard.run("MergeSamFiles", opts)
    return merged_bam
def main(config_file, ref_file, align_bam, snp_file=None):
    """Sort, mark duplicates and recalibrate an alignment BAM.

    config_file -- YAML file with "program" (picard, optional gatk) and
                   "algorithm" (platform) sections.
    snp_file -- optional file passed on to the covariate counting step.
    """
    # NOTE(review): yaml.load builds arbitrary objects; only use trusted configs.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    programs = config["program"]
    picard = BroadRunner(programs["picard"], programs.get("gatk", ""))
    platform = config["algorithm"]["platform"]
    # The returned dictionary is not needed; SNP file indexing is disabled.
    index_ref_file(picard, ref_file)
    sorted_bam = picard_sort(picard, align_bam)
    dedup_bam = mark_duplicates(picard, sorted_bam)
    recal_file = count_covariates(picard, dedup_bam, ref_file, platform,
                                  snp_file)
    gatk_recalibrate(picard, dedup_bam, ref_file, recal_file, platform)
def main(config_file, ref_file, align_bam, dbsnp=None):
    """Realign around indels, call variants and filter them.

    config_file -- YAML file whose "program" section names picard, samtools
                   and optionally gatk.
    dbsnp -- optional file passed to target creation and genotyping.
    """
    # NOTE(review): yaml.load builds arbitrary objects; only use trusted configs.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    programs = config["program"]
    picard = BroadRunner(programs["picard"], programs.get("gatk", ""))
    index_ref_file(picard, ref_file)
    index_bam(align_bam, programs["samtools"])
    targets = realigner_targets(picard, align_bam, ref_file, dbsnp)
    realigned_bam = indel_realignment(picard, align_bam, ref_file, targets)
    fixed_bam = picard_fixmate(picard, realigned_bam)
    # The fixmate output is indexed before genotyping.
    index_bam(fixed_bam, programs["samtools"])
    raw_variants = unified_genotyper(picard, fixed_bam, ref_file, dbsnp)
    variant_filtration(picard, raw_variants, ref_file)
def main(config_file):
    """Run the configured stages of the novoalign / htseq-count / deseq pipeline.

    config_file -- YAML file providing "dir", "input", "run", "program",
                   "genome", "annotation" and per-stage settings.

    Stages run in the order listed under "run". The "deseq" stage uses the
    outputs and column names produced by an earlier "htseq-count" stage.
    The cluster is started after logging is set up and stopped at the end.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and is rejected by PyYAML >= 6; consider safe_load.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # These imports follow their setup calls, so they stay inside the function.
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    from bipy.cluster import view

    input_files = [
        os.path.join(config["dir"]["data"], x) for x in config["input"]
    ]
    results_dir = config["dir"]["results"]
    # Explicit loop: map() is lazy on Python 3 and would create nothing.
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    curr_files = input_files
    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            view.map(fastqc.run, curr_files, [fastqc_config] * nfiles,
                     [config] * nfiles)
        elif stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            curr_files = view.map(cutadapt_tool.run, curr_files,
                                  [cutadapt_config] * nfiles,
                                  [config] * nfiles)
        elif stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            # The index path is read straight from the config.
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            # convert to bam
            args = zip(*itertools.product([picard], novoalign_outputs))
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            # sort bam
            args = zip(*itertools.product([picard], bamfiles))
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs
        elif stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config, curr_files,
                                     [config] * nfiles, [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")
        elif stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [
                os.path.join(out_dir,
                             replace_suffix(os.path.basename(x), "metrics"))
                for x in curr_files
            ]
            view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun,
                     curr_files, [ref] * nrun, [ribo] * nrun, out_files)
        elif stage == "deseq":
            # Condition = text before the first "_" of each input basename.
            conditions = [
                os.path.basename(x).split("_")[0] for x in input_files
            ]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # indexes of the inputs whose condition is in this comparison
                indexes = [
                    x for x, y in enumerate(conditions) if y in comparison
                ]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(
                    htseq_files, htseq_columns, out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)
                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                view.map(annotate.annotate_table_with_biomart, deseq_out,
                         ["id"], ["ensembl_gene_id"], ["human"])
    stop_cluster()
def main(config_file):
    """Run the configured stages of the tophat / htseq-count / deseq pipeline.

    This assumes that we are keeping the same order of the files throughout.

    config_file -- YAML file providing "dir", "input", "run", "ref",
                   "program" and per-stage settings. "input" maps each input
                   file to its metadata (including "condition").

    The "rseqc" stage uses the sorted BAM files from an earlier "tophat"
    stage, and "deseq" uses the outputs of an earlier "htseq-count" stage.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and is rejected by PyYAML >= 6; consider safe_load.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # Explicit loop: map() is lazy on Python 3 and would create nothing.
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    input_dict = config["input"]
    curr_files = _make_current_files(list(input_dict.keys()))
    # A real list, because the deseq stage indexes into it.
    input_meta = list(input_dict.values())

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config], [config]))
            view.map(fastqc.run, *fastqc_args)
        elif stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(
                *product(curr_files, [cutadapt_config], [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)
        elif stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs
        elif stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)
        elif stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            # The RPKM steps take the sorted BAMs from the tophat stage.
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)
        elif stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                os.path.join(out_dir,
                             replace_suffix(os.path.basename(x), "metrics"))
                for x in curr_files
            ]
            view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun,
                     curr_files, [ref] * nrun, [ribo] * nrun, out_files)
        elif stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [
                    _find_file_index_for_test(input_meta, condition)
                    for condition in test
                ]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                label = "_".join(conditions)
                combined_out = os.path.join(out_dir,
                                            label + "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(
                    files, None, out_file=combined_out)
                out_file = os.path.join(out_dir, label + "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file, conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
    # end gracefully
    stop_cluster()
def main(config_file):
    """Run the paired-end pipeline per condition, then deseq per cell type.

    config_file -- YAML file providing "dir", "input_dirs", "run", "ref",
                   "program" and per-stage settings.

    Input files are grouped by condition; for each group the configured
    stages run with ``config["dir"]["results"]`` pointed at that condition's
    directory. After all groups are done, the collected htseq-count outputs
    are grouped by cell type and deseq is run for every configured
    comparison.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and is rejected by PyYAML >= 6; consider safe_load.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # Explicit loop: map() is lazy on Python 3 and would create nothing.
    for dirname in config["dir"].values():
        safe_makedir(dirname)
    # specific for thesis pipeline
    # Not used below; kept so a config without this key still fails here.
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))

    htseq_outdict = {}
    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        # Stages read their results directory from the config.
        config["dir"]["results"] = condition_dir
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(
                    *product(curr_files, [fastqc_config], [config]))
                view.map(fastqc.run, *fastqc_args)
            elif stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(
                    *product(curr_files, [cutadapt_config], [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config, first,
                                 second, [config] * len(first))
                curr_files = list(flatten(fixed))
            elif stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config, first, second,
                                 [config] * len(first))
                curr_files = list(flatten(fixed))
            elif stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))
                npairs = len(first)
                tophat_outputs = view.map(tophat.run_with_config, first,
                                          second, [config["ref"]] * npairs,
                                          ["tophat"] * npairs,
                                          [config] * npairs)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs
            elif stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs
            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [
                    os.path.join(out_dir,
                                 replace_suffix(os.path.basename(x),
                                                "metrics"))
                    for x in curr_files
                ]
                view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun,
                         curr_files, [ref] * nrun, [ribo] * nrun, out_files)
            elif stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                # Biomart annotation of the fixed RPKM counts is disabled.
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    _, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir,
                                    comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            # Log the conditions actually handed to deseq.run (c).
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, c, deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id", "human")
    # end gracefully
    stop_cluster()
def main(config_file):
    """Run the configured stages on every "*.fq" file of the input directory.

    This assumes that we are keeping the same order of the files throughout.

    config_file -- YAML file providing "dir", "input_dir", "run", "ref",
                   "program" and per-stage settings.

    The condition of each input is the part of its basename before the first
    "_". The "deseq" stage uses the outputs of an earlier "htseq-count"
    stage.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and is rejected by PyYAML >= 6; consider safe_load.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # Explicit loop: map() is lazy on Python 3 and would create nothing.
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config], [config]))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))
        elif stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(
                *product(curr_files, [cutadapt_config], [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)
        elif stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf
        elif stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            # All rseqc jobs are submitted without waiting for results.
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            curr_files = tophat_outputs
        elif stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [
                os.path.join(out_dir,
                             replace_suffix(os.path.basename(x), "metrics"))
                for x in curr_files
            ]
            view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun,
                     curr_files, [ref] * nrun, [ribo] * nrun, out_files)
        elif stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)
        elif stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                indexes = [
                    x for x, y in enumerate(conditions) if y in comparison
                ]
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(
                    htseq_files, htseq_columns, out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                # Log the conditions of this comparison, not of all inputs.
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (combined_out, deseq_conds,
                                                   deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds],
                         [deseq_out])
                view.map(annotate.annotate_table_with_biomart, [deseq_out],
                         ["id"], ["ensembl_gene_id"], ["zebrafish"])
    # end gracefully
    stop_cluster()
def main(config_file):
    """Merge BAMs per condition, drop rRNA reads, then run metrics and k-mers.

    config_file -- YAML file providing "in_dir", "conditions", "glob_string",
                   "dir", "ribo", "program" and "stage" settings.

    For every condition: the matching BAM files are merged, sorted and
    indexed; mapped reads are extracted; reads intersecting the rRNA file are
    removed; Picard RNA-seq metrics are written under "new_coverage"; and the
    filtered reads are converted to fastq and counted with jellyfish for each
    configured mer length.
    """
    # NOTE(review): when config_file is falsy, config is never bound and the
    # next statement fails; confirm whether that case should be supported.
    if config_file:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input and rejected by PyYAML >= 6.
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)

    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]
    files = list(flatten([glob.glob(os.path.join(x, glob_string))
                          for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)

    for condition in conditions:
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        # Parenthesised single-argument print behaves the same on 2 and 3.
        print("Combining %s into %s." % (condition_files, out_file))
        sh.samtools.merge(list(flatten([out_file, condition_files])))

        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)

        # find the reads that don't intersect with the rrna
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print("Filtering %s for rRNA in %s into %s." % (in_file, ribo,
                                                        out_file))
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo,
                              _out=out_file)
        filtered_file = out_file

        print("Calculating RNASeq metrics on %s." % (out_file))
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        # Separate name: reusing out_dir here sent the next condition's
        # merged BAMs into the new_coverage directory.
        coverage_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(coverage_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(coverage_dir, out_file)
        picardrun.picard_rnaseq_metrics(picard, in_file, ref, ribo, out_file)

        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(
            jelly_dir,
            os.path.basename(replace_suffix(filtered_file, "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        base, _ = os.path.splitext(os.path.basename(fastq_file))
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            # Skip mer lengths whose output already exists.
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
def coordinate_sort_sam(in_file, config, out_file=None):
    """Coordinate-sort a SAM/BAM file with Picard and return the output path.

    in_file -- file to sort.
    config -- configuration with the Picard directory under
              ``config["program"]["picard"]``.
    out_file -- destination path; when None, the input name with a "sorted"
                stem appended is used.
    """
    # Previously the argument was overwritten unconditionally, so a
    # caller-supplied out_file was silently ignored.
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    picard = BroadRunner(config["program"]["picard"], None, {"algorithm": {}})
    picardrun.picard_sort(picard, in_file, sort_order="coordinate",
                          out_file=out_file)
    return out_file
def main(config_file):
    """Run the configured stages on the files of the data directory.

    config_file -- YAML file providing "dir", "run", "ref", "program" and
                   per-stage settings.

    Supported stages: fastqc, cutadapt, tophat, coverage and rseqc. The
    "rseqc" stage uses the outputs of an earlier "tophat" stage.
    """
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and is rejected by PyYAML >= 6; consider safe_load.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # Explicit loop: map() is lazy on Python 3 and would create nothing.
    for dirname in config["dir"].values():
        safe_makedir(dirname)
    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    curr_files = input_files_from_dir(in_dir)

    for stage in config["run"]:
        if stage == "fastqc":
            stage_runner = fastqc.FastQCStage(config)
            view.map(stage_runner, curr_files)
        elif stage == "cutadapt":
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
        elif stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            nfiles = len(curr_files)
            # Single-end run: the current files are the first reads (the
            # original referenced an undefined name `first` here) and no
            # mate is supplied.
            tophat_outputs = view.map(tophat.run_with_config, curr_files,
                                      [None] * nfiles,
                                      [config["ref"]] * nfiles,
                                      ["tophat"] * nfiles,
                                      [config] * nfiles)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs
        elif stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            # results_dir was never defined in this function; take the
            # results directory from the config.
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                os.path.join(out_dir,
                             replace_suffix(os.path.basename(x), "metrics"))
                for x in curr_files
            ]
            view.map(picardrun.picard_rnaseq_metrics, [picard] * nrun,
                     curr_files, [ref] * nrun, [ribo] * nrun, out_files)
        elif stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            # Biomart annotation of the fixed RPKM counts is disabled.
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs
def main(config_file):
    """Run the stages named in ``config["run"]`` once per cell type.

    The YAML file at ``config_file`` is loaded and every directory listed
    under ``config["dir"]`` is created.  Then, for each entry of
    ``config["cell_types"]``:

    * a sub-directory named after the cell type is created under the
      results directory and ``config["dir"]["results"]`` is re-pointed at
      it (the config dict is mutated in place);
    * every path matching ``<config["dir"]["data"]>/<cell_type>/*`` becomes
      the input of the recognised stages: "fastqc", "cutadapt", "tophat",
      "rseqc", "htseq-count" and "coverage".

    The "rseqc" stage ends by resetting the current files to
    ``tophat_outputs``, so "tophat" has to run before it.

    Jobs are dispatched through ``view``, which is not defined in this
    function and must already be available at module level.  Once all cell
    types are processed the function waits on ``view`` and stops the
    cluster.

    config_file -- path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # only feed it trusted configuration (or move to yaml.safe_load).
        config = yaml.load(in_handle)

    # Make the needed directories.  An explicit loop is used because map()
    # is lazy on Python 3 and would never call safe_makedir there.
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    results_dir = config["dir"].get("results", "results")

    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        # The htseq-count and coverage stages below build their output
        # paths from config["dir"]["results"], so point it at this cell
        # type's directory.
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"],
                                          cell_type, "*"))
        curr_files = in_files

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = list(zip(*product(curr_files,
                                                [fastqc_config],
                                                [config])))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = list(zip(*product(curr_files,
                                                  [cutadapt_config],
                                                  [config])))
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            *cutadapt_args)
                curr_files = cutadapt_outputs

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                tophat_args = list(zip(*product(curr_files,
                                                [None],
                                                [config["ref"]],
                                                ["tophat"],
                                                [config])))
                tophat_outputs = view.map(tophat.run_with_config,
                                          *tophat_args)
                # convert to bam, then sort and index
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # Materialise the argument columns: they are unpacked
                # several times below and a bare zip() is single-use on
                # Python 3.
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args,
                         block=False)
                view.map(rseqc.junction_saturation, *rseq_args,
                         block=False)
                # This map is not submitted with block=False because its
                # results are needed immediately below.
                RPKM_count_files = view.map(rseqc.RPKM_count, *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files))
                logger.info("dirnames to process: %s" % (dirs_to_process))
                view.map(rseqc.merge_RPKM, dirs_to_process)
                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = list(zip(*product(curr_files,
                                               [config],
                                               [stage])))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"],
                                       config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

    # end gracefully, wait for jobs to finish, then exit
    view.wait()
    stop_cluster()
def main(config_file):
    """Start the cluster and run the stages named in ``config["run"]``.

    The YAML file at ``config_file`` is loaded, logging is set up, the
    cluster is started and ``config["input"]`` is fed through the
    recognised stages: "fastqc", "trim", "tagdust", "filter_length",
    "count_ends", "tophat", "novoalign", "new_coverage", "coverage",
    "htseq-count", "bedtools_intersect" and "piranha".  Stage names that
    match none of these are ignored.

    Some stages read variables assigned by earlier ones:

    * "filter_length" reads ``tagdust_outputs`` (set by "tagdust");
    * "coverage" and "bedtools_intersect" read ``sorted_bf`` (set by
      "novoalign");
    * "htseq-count" reads ``aligned_outputs`` (set by "novoalign").

    The cluster is stopped when the function exits, including when a stage
    raises.

    config_file -- path to the YAML configuration file.
    """
    # reduce() is only a builtin on Python 2; functools has it on both.
    from functools import reduce

    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # only feed it trusted configuration (or move to yaml.safe_load).
        config = yaml.load(in_handle)

    setup_logging(config)
    start_cluster(config)

    try:
        # the view can only be imported once the cluster is up
        from bipy.cluster import view
        input_files = config["input"]
        results_dir = config["dir"]["results"]

        # Make the needed directories.  An explicit loop is used because
        # map() is lazy on Python 3 and would never call safe_makedir.
        for dirname in config["dir"].values():
            safe_makedir(dirname)

        curr_files = input_files

        ## qc steps
        for stage in config["run"]:
            if stage == "fastqc":
                # run the basic fastqc
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = config["stage"][stage]
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * len(curr_files),
                                          [config] * len(curr_files))
                # this does nothing for now, not implemented yet
                summary_file = _combine_fastqc(fastqc_outputs)

            if stage == "trim":
                logger.info("Trimming poor quality ends "
                            " from %s" % (str(curr_files)))
                nlen = len(curr_files)
                # minimum read length to keep; taken from the stage config
                # and defaulting to 20
                min_length = str(config["stage"][stage].get("min_length",
                                                            20))

                # trim low quality ends of reads
                # do this dirty for now
                out_dir = os.path.join(results_dir, "trimmed")
                safe_makedir(out_dir)
                out_files = [append_stem(os.path.basename(x), "trim")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(sickle.run, curr_files,
                                     ["se"] * nlen,
                                     ["sanger"] * nlen,
                                     [min_length] * nlen,
                                     out_files)
                curr_files = out_files

            if stage == "tagdust":
                # NOTE(review): this rebinds input_files, which the
                # "htseq-count" stage later uses for its column names --
                # confirm that is intended.
                input_files = curr_files
                # remove tags matching the other miRNA tested
                logger.info("Running %s on %s." % (stage, input_files))
                tagdust_config = config["stage"][stage]
                tagdust_outputs = view.map(tagdust.run, input_files,
                                           [tagdust_config]
                                           * len(input_files),
                                           [config] * len(input_files))
                curr_files = [x[0] for x in tagdust_outputs]

            if stage == "filter_length":
                # filter out reads below or above a certain length
                filter_config = config["stage"][stage]
                min_length = filter_config.get("min_length", 0)
                max_length = filter_config.get("max_length",
                                               MAX_READ_LENGTH)

                # length predicate (both bounds are exclusive)
                def length_filter(x):
                    return min_length < len(x.seq) < max_length

                # filter the input reads based on length
                # parallelizing this doesn't seem to work
                # ipython can't accept closures as an argument to
                # view.map(), so the filtering runs in this process
                out_files = [append_stem(os.path.basename(input_file[0]),
                                         "filt")
                             for input_file in tagdust_outputs]
                out_dir = os.path.join(config["dir"]["results"],
                                       "length_filtered")
                safe_makedir(out_dir)
                out_files = [os.path.join(out_dir, x) for x in out_files]
                filtered_fastq = [filter_seqio(x[0], length_filter, y,
                                               "fastq")
                                  for x, y in zip(tagdust_outputs,
                                                  out_files)]
                curr_files = filtered_fastq

            if stage == "count_ends":
                logger.info("Compiling nucleotide counts at 3' and 5' "
                            "ends.")

                # count the nucleotide at the end of each read
                def count_ends(x, y):
                    """Keep a running count of arbitrary keys during the
                    reduce step."""
                    x[y] = x.get(y, 0) + 1
                    return x

                def get_3prime_end(x):
                    return str(x.seq[-1])

                def get_5prime_end(x):
                    return str(x.seq[0])

                def output_counts(end_function, count_file):
                    """Write per-file end-nucleotide fractions to a
                    tab-separated file and return its path."""
                    outdir = os.path.join(config["dir"]["results"], stage)
                    safe_makedir(outdir)
                    count_file = os.path.join(outdir, count_file)
                    # if the count_file already exists, skip
                    if os.path.exists(count_file):
                        return count_file
                    # one dict of counts per fastq file
                    counts = [reduce(count_ends,
                                     apply_seqio(x, end_function,
                                                 kind="fastq"),
                                     {})
                              for x in curr_files]
                    df = pd.DataFrame(counts,
                                      index=[_short_name(x)
                                             for x in curr_files])
                    df = df.astype(float)
                    # turn counts into per-file fractions and keep the
                    # row total in its own column
                    total = df.sum(axis=1)
                    df = df.div(total, axis=0)
                    df["total"] = total
                    df.to_csv(count_file, sep="\t")
                    return count_file

                output_counts(get_3prime_end, "3prime_counts.tsv")
                output_counts(get_5prime_end, "5prime_counts.tsv")

            if stage == "tophat":
                tophat_config = config["stage"][stage]
                logger.info("Running tophat on %s" % (str(curr_files)))
                nlen = len(curr_files)
                pair_file = None
                ref_file = tophat_config["annotation"]
                out_base = os.path.join(results_dir, "mirna")
                align_dir = os.path.join(results_dir, "tophat")
                tophat_files = view.map(tophat.align,
                                        curr_files,
                                        [pair_file] * nlen,
                                        [ref_file] * nlen,
                                        [out_base] * nlen,
                                        [align_dir] * nlen,
                                        [config] * nlen)
                curr_files = tophat_files

            if stage == "novoalign":
                logger.info("Running novoalign on %s" % (str(curr_files)))
                # align
                ref = config["genome"]["file"]
                novoalign_config = config["stage"][stage]
                aligned_outputs = view.map(novoalign.run, curr_files,
                                           [ref] * len(curr_files),
                                           [novoalign_config]
                                           * len(curr_files),
                                           [config] * len(curr_files))
                # convert sam to bam, sort and index
                picard = BroadRunner(config["program"]["picard"], None, {})
                bamfiles = view.map(picardrun.picard_formatconverter,
                                    [picard] * len(aligned_outputs),
                                    aligned_outputs)
                sorted_bf = view.map(picardrun.picard_sort,
                                     [picard] * len(bamfiles),
                                     bamfiles)
                view.map(picardrun.picard_index,
                         [picard] * len(sorted_bf),
                         sorted_bf)
                # NOTE: copying the sorted BAM files into the data dir,
                # marking them read-only and re-indexing the copies is
                # currently disabled; downstream stages use sorted_bf
                # directly.
                curr_files = sorted_bf

            if stage == "new_coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"], None, {})
                out_dir = os.path.join(results_dir, "new_coverage")
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)
                curr_files = out_files

            if stage == "coverage":
                gtf = blastn.prepare_ref_file(config["annotation"], config)
                logger.info("Calculating coverage of features in %s for %s"
                            % (gtf, str(sorted_bf)))
                out_files = [replace_suffix(x, "counts.bed")
                             for x in sorted_bf]
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                logger.info(out_files)
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                logger.info(out_files)
                view.map(bedtools.count_overlaps, sorted_bf,
                         [gtf] * len(sorted_bf),
                         out_files)

            if stage == "htseq-count":
                # size the broadcast lists from the sequence being mapped
                nfiles = len(aligned_outputs)
                htseq_config = _get_stage_config(config, stage)
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         aligned_outputs,
                                         [config] * nfiles,
                                         [stage] * nfiles)
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)

            if stage == "bedtools_intersect":
                # a missing or empty "bed" entry means there is nothing to
                # intersect against
                bedfiles = config["stage"]["bedtools_intersect"].get(
                    "bed") or []
                if not bedfiles:
                    logger.info("No bed files configured for %s; skipping."
                                % (stage))
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for bedfile in bedfiles:
                    bedbase = os.path.splitext(bedfile)[0]
                    # <out_dir>/<bam stem>_vs_<bed stem>.bam
                    out_files = [remove_suffix(x) for x in sorted_bf]
                    out_files = [os.path.join(out_dir, os.path.basename(x))
                                 for x in out_files]
                    out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                                 for x in out_files]
                    out_files = [".".join([x, "bam"]) for x in out_files]
                    # Explicit loops: map() is lazy on Python 3 and would
                    # never perform these calls.
                    for bam_file, intersect_out in zip(sorted_bf,
                                                       out_files):
                        bedtools.intersectbam2bed(bam_file, bedfile, False,
                                                  intersect_out)
                    count_files = [replace_suffix(x, "stats")
                                   for x in out_files]
                    for bam_file, intersect_out, stats_out in zip(
                            sorted_bf, out_files, count_files):
                        write_ratios(bam_file, intersect_out, stats_out)

            if stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                out_files = view.map(piranha_runner, curr_files)
    finally:
        # always release the cluster started above, even if a stage failed
        stop_cluster()