def modisco_export_seqlets(modisco_dir, output_dir, trim_frac=0.08):
    """Export the seqlets of a modisco run via `ModiscoFile.export_seqlets_bed`.

    Args:
      modisco_dir: modisco directory. `modisco.h5` is read from it and it is
        also handed to `load_ranges` to obtain the example intervals.
      output_dir: directory passed to `export_seqlets_bed` (and used for the
        file log of this command).
      trim_frac: forwarded unchanged to `export_seqlets_bed`.
    """
    from pybedtools import Interval
    from bpnet.modisco.files import ModiscoFile
    add_file_logging(output_dir, logger, 'modisco_export_seqlets')

    # One genomic interval per row of the ranges table; seqlets are exported
    # with position='absolute', which needs these intervals.
    ranges = load_ranges(modisco_dir)
    example_intervals = [Interval(row.chrom, row.start, row.end)
                         for _, row in ranges.iterrows()]

    r = ModiscoFile(os.path.join(modisco_dir, "modisco.h5"))
    try:
        r.export_seqlets_bed(output_dir,
                             example_intervals=example_intervals,
                             position='absolute',
                             trim_frac=trim_frac)
    finally:
        # Release the file handle even if the export fails.
        r.close()
def chip_nexus_analysis(modisco_dir, trim_frac=0.08, num_workers=20, run_cwm_scan=False, force=False,
                        footprint_width=200):
    """Compute all the results for modisco specific for ChIP-nexus/exo data.

    Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_export_patterns
    - cwm_scan (only when `run_cwm_scan` is True)
    - modisco_export_seqlets

    Note:
      All the sub-commands are only executed if they have not been run before.
      Use --force to override this. Whether a command has been run before
      is determined by checking if the following file exists:
      `{modisco_dir}/.modisco_report_all/{command}.done`.

    Args:
      modisco_dir: modisco directory; must contain `modisco.h5` and
        `modisco-run.kwargs.json`. All outputs are written below it.
      trim_frac: forwarded to `cwm_scan` and `modisco_export_seqlets`.
      num_workers: forwarded to `cwm_scan`.
      run_cwm_scan: if True, also run `cwm_scan`.
      force: forwarded to `ConditionalRun`.
      footprint_width: forwarded to `modisco_plot` (as `heatmap_width`)
        and to `modisco_table`.
    """
    plt.switch_backend('agg')
    from bpnet.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the contribution scores used
    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    contrib_scores = kwargs["contrib_file"]

    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    try:
        all_patterns = mf.pattern_names()
    finally:
        # Always release the file handle, even if reading the patterns fails.
        mf.close()

    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch the expected output files for snakemake. Opening in append
        # mode creates missing files without modifying existing ones.
        with open(modisco_dir / 'modisco-chip.html', 'a'):
            pass
        # The seqlets directory may not exist yet when nothing was exported.
        (modisco_dir / 'seqlets').mkdir(parents=True, exist_ok=True)
        with open(modisco_dir / 'seqlets/scored_regions.bed', 'a'):
            pass
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    # NOTE(review): 'modisco_enrich_patterns' is never executed or marked as
    # written in this function, so this condition may always be true - confirm.
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ContribFile and pass it to all the functions
        logger.info("Loading ContribFile")
        contribsf = ContribFile.from_modisco_dir(modisco_dir)
        contribsf.cache()
    else:
        contribsf = None

    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     heatmap_width=footprint_width,
                     figsize=(10, 10), contribsf=contribsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("modisco-chip.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir, contrib_scores, modisco_dir, report_url=None, contribsf=contribsf,
                      footprint_width=footprint_width)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_export_patterns').done():
        modisco_export_patterns(modisco_dir,
                                output_file=modisco_dir / 'patterns.pkl',
                                contribsf=contribsf)
        cr.write()
    sync.append("patterns.pkl")

    # --------------------------------------------
    # Finding new instances
    if run_cwm_scan:
        if not cr.set_cmd('cwm_scan').done():
            cwm_scan(modisco_dir,
                     modisco_dir / 'instances.bed.gz',
                     trim_frac=trim_frac,
                     contrib_file=None,
                     num_workers=num_workers)
            cr.write()

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco_export_seqlets').done():
        modisco_export_seqlets(str(modisco_dir), str(modisco_dir / 'seqlets'), trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
def modisco_plot(modisco_dir,
                 output_dir,
                 heatmap_width=200,
                 figsize=(10, 10),
                 contribsf=None):
    """Plot the results of a modisco run

    Args:
      modisco_dir: modisco directory
      output_dir: Output directory for writing the results
      heatmap_width: passed to `write_heatmap_pngs` as `resize_width`
      figsize: Output figure size
      contribsf: [optional] modisco contribution score file (ContribFile).
        If None, it is loaded from `modisco_dir` and cached here.
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from bpnet.plot.vdom import write_heatmap_pngs
    from bpnet.plot.profiles import plot_profiles
    from bpnet.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    try:
        if contribsf is not None:
            d = contribsf
        else:
            d = ContribFile.from_modisco_dir(modisco_dir)
            logger.info("Loading the contribution scores")
            d.cache()  # load all

        thr_one_hot = d.get_seq()
        tracks = d.get_profiles()

        thr_hypothetical_contribs = dict()
        thr_contrib_scores = dict()
        # TODO - generalize this
        thr_hypothetical_contribs['profile'] = d.get_hyp_contrib()
        thr_contrib_scores['profile'] = d.get_contrib()

        tasks = d.get_tasks()

        # Count contribution (if it exists). "counts/pre-act" takes
        # precedence over "count"; if neither is present, nothing is added.
        for count_contrib_score in ("counts/pre-act", "count"):
            if d.contains_contrib_score(count_contrib_score):
                thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
                    contrib_score=count_contrib_score)
                thr_contrib_scores['count'] = d.get_contrib(
                    contrib_score=count_contrib_score)
                break

        thr_hypothetical_contribs = OrderedDict(
            flatten(thr_hypothetical_contribs, separator='/'))
        thr_contrib_scores = OrderedDict(
            flatten(thr_contrib_scores, separator='/'))
        # -------------------------------------------------

        all_seqlets = mf.seqlets()
        all_patterns = mf.pattern_names()
        if len(all_patterns) == 0:
            print("No patterns found")
            return

        # 1. Plots with tracks and contrib scores
        print("Writing results for contribution scores")
        plot_profiles(all_seqlets,
                      thr_one_hot,
                      tracks=tracks,
                      contribution_scores=thr_contrib_scores,
                      legend=False,
                      flip_neg=True,
                      rotate_y=0,
                      seq_height=.5,
                      patterns=all_patterns,
                      n_bootstrap=100,
                      fpath_template=str(output_dir / "{pattern}/agg_profile_contribcores"),
                      mkdir=True,
                      figsize=figsize)

        # 2. Plots only with hypothetical contrib scores
        print("Writing results for hypothetical contribution scores")
        plot_profiles(all_seqlets,
                      thr_one_hot,
                      tracks={},
                      contribution_scores=thr_hypothetical_contribs,
                      legend=False,
                      flip_neg=True,
                      rotate_y=0,
                      seq_height=1,
                      patterns=all_patterns,
                      n_bootstrap=100,
                      fpath_template=str(output_dir / "{pattern}/agg_profile_hypcontribscores"),
                      figsize=figsize)

        print("Plotting heatmaps")
        for pattern in tqdm(all_patterns):
            write_heatmap_pngs(all_seqlets[pattern],
                               d,
                               tasks,
                               pattern,
                               output_dir=str(output_dir / pattern),
                               resize_width=heatmap_width)
    finally:
        # Close the modisco file on every exit path, including the early
        # return above and any exception raised while plotting.
        mf.close()