def modisco2bed(modisco_dir, output_dir, trim_frac=0.08):
    """Export the seqlets of a modisco run as BED files

    Args:
      modisco_dir: modisco directory containing modisco.h5
      output_dir: output directory for the BED files
      trim_frac: how much to trim the pattern
    """
    from pybedtools import Interval
    from basepair.modisco.results import ModiscoResult
    add_file_logging(output_dir, logger, 'modisco2bed')

    ranges = load_ranges(modisco_dir)
    example_intervals = [Interval(row.chrom, row.start, row.end)
                         for i, row in ranges.iterrows()]

    r = ModiscoResult(os.path.join(modisco_dir, "modisco.h5"))
    r.export_seqlets_bed(output_dir,
                         example_intervals=example_intervals,
                         position='absolute',
                         trim_frac=trim_frac)
    r.close()
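# Usage sketch (illustrative only; "output/modisco" is a hypothetical path,
# not something defined in this module):
#
#   modisco2bed("output/modisco", "output/modisco/seqlets", trim_frac=0.08)
#
# This exports the (trimmed) seqlet coordinates as BED files, mapped back to
# absolute genomic positions via the example intervals loaded from the ranges
# of the modisco run.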
def modisco_plot(modisco_dir,
                 output_dir,
                 # filter_npy=None,
                 # ignore_dist_filter=False,
                 figsize=(10, 10),
                 impsf=None):
    """Plot the results of a modisco run

    Args:
      modisco_dir: modisco directory
      output_dir: Output directory for writing the results
      figsize: Output figure size
      impsf: [optional] modisco importance score file (ImpScoreFile)
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from basepair.plot.vdom import write_heatmap_pngs
    from basepair.plot.profiles import plot_profiles
    from basepair.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")

    if impsf is not None:
        d = impsf
    else:
        d = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading the importance scores")
        d.cache()  # load all

    thr_one_hot = d.get_seq()
    # thr_hypothetical_contribs
    tracks = d.get_profiles()
    thr_hypothetical_contribs = dict()
    thr_contrib_scores = dict()
    # TODO - generalize this
    thr_hypothetical_contribs['weighted'] = d.get_hyp_contrib()
    thr_contrib_scores['weighted'] = d.get_contrib()
    tasks = d.get_tasks()

    # Count importance (if it exists)
    if d.contains_imp_score("counts/pre-act"):
        count_imp_score = "counts/pre-act"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(imp_score=count_imp_score)
        thr_contrib_scores['count'] = d.get_contrib(imp_score=count_imp_score)
    elif d.contains_imp_score("count"):
        count_imp_score = "count"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(imp_score=count_imp_score)
        thr_contrib_scores['count'] = d.get_contrib(imp_score=count_imp_score)
    else:
        # Don't do anything
        pass

    thr_hypothetical_contribs = OrderedDict(flatten(thr_hypothetical_contribs, separator='/'))
    thr_contrib_scores = OrderedDict(flatten(thr_contrib_scores, separator='/'))

    # # load importance scores
    # modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    # d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    # if 'hyp_imp' not in d:
    #     # backcompatibility
    #     d['hyp_imp'] = d['grads']
    # tasks = list(d['targets']['profile'])
    # if isinstance(d['inputs'], dict):
    #     one_hot = d['inputs']['seq']
    # else:
    #     one_hot = d['inputs']
    # # load used strand distance filter
    # included_samples = load_included_samples(modisco_dir)
    # grad_type = "count,weighted"  # always plot both importance scores
    # thr_hypothetical_contribs = OrderedDict([(f"{gt}/{task}", mean(d['hyp_imp'][task][gt])[included_samples])
    #                                          for task in tasks
    #                                          for gt in grad_type.split(",")])
    # thr_one_hot = one_hot[included_samples]
    # thr_contrib_scores = OrderedDict([(f"{gt}/{task}", thr_hypothetical_contribs[f"{gt}/{task}"] * thr_one_hot)
    #                                   for task in tasks
    #                                   for gt in grad_type.split(",")])
    # tracks = OrderedDict([(task, d['targets']['profile'][task][included_samples]) for task in tasks])
    # -------------------------------------------------

    all_seqlets = mr.seqlets()
    all_patterns = mr.patterns()
    if len(all_patterns) == 0:
        print("No patterns found")
        return

    # 1. Plots with tracks and contrib scores
    print("Writing results for contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks=tracks,
                  importance_scores=thr_contrib_scores,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=.5,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir / "{pattern}/agg_profile_contribcores"),
                  mkdir=True,
                  figsize=figsize)

    # 2. Plots only with hypothetical contrib scores
    print("Writing results for hypothetical contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks={},
                  importance_scores=thr_hypothetical_contribs,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=1,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir / "{pattern}/agg_profile_hypcontribscores"),
                  figsize=figsize)

    print("Plotting heatmaps")
    for pattern in tqdm(all_patterns):
        write_heatmap_pngs(all_seqlets[pattern],
                           d,
                           tasks,
                           pattern,
                           output_dir=str(output_dir / pattern))

    mr.close()
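# Usage sketch (the path below is a hypothetical example). Passing a
# pre-cached ImpScoreFile via `impsf` avoids re-loading the importance scores
# when several modisco_* commands are run in a row, which is exactly what
# modisco_report_all does below:
#
#   impsf = ImpScoreFile.from_modisco_dir("output/modisco")
#   impsf.cache()
#   modisco_plot("output/modisco", "output/modisco/plots",
#                figsize=(10, 10), impsf=impsf)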
def modisco_report_all(modisco_dir, trim_frac=0.08, n_jobs=20,
                       scan_instances=False, force=False):
    """Compute all the results for modisco. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_centroid_seqlet_matches
    - modisco_score2
    - modisco2bed
    - modisco_instances_to_bed

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json
      trim_frac: how much to trim the pattern
      n_jobs: number of parallel jobs to use
      scan_instances: if True, also scan for new motif instances
        (modisco_centroid_seqlet_matches and modisco_score2)
      force: if True, commands will be re-run regardless of whether they have
        already been computed

    Note:
      All the sub-commands are only executed if they have not been run before.
      Use --force to override this. Whether a command has been run before is
      determined by checking if the following file exists:
      `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from basepair.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the importance scores used
    kwargs = read_json(modisco_dir / "kwargs.json")
    imp_scores = kwargs["imp_scores"]

    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
    mr.open()
    all_patterns = mr.patterns()
    mr.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch results.html for snakemake
        open(modisco_dir / 'results.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)
    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_cluster_patterns').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ImpScoreFile and pass it to all the functions
        logger.info("Loading ImpScoreFile")
        impsf = ImpScoreFile.from_modisco_dir(modisco_dir)
        impsf.cache()
    else:
        impsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     figsize=(10, 10),
                     impsf=impsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("results.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir, modisco_dir, report_url=None, impsf=impsf)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_cluster_patterns').done():
        modisco_cluster_patterns(modisco_dir, modisco_dir)
        cr.write()
    sync.append("patterns.pkl")
    sync.append("cluster-patterns.*")
    sync.append("motif_clustering")

    if not cr.set_cmd('modisco_enrich_patterns').done():
        modisco_enrich_patterns(modisco_dir / 'patterns.pkl',
                                modisco_dir,
                                modisco_dir / 'patterns.pkl',
                                impsf=impsf)
        cr.write()
    # sync.append("patterns.pkl")

    # TODO - run modisco align
    # - [ ] add the motif clustering step (as ipynb) and export the aligned tables
    #   - save the final table as a result to CSV (ready to be imported in excel)
    # --------------------------------------------
    # Finding new instances
    if scan_instances:
        if not cr.set_cmd('modisco_centroid_seqlet_matches').done():
            modisco_centroid_seqlet_matches(modisco_dir,
                                            imp_scores,
                                            modisco_dir,
                                            trim_frac=trim_frac,
                                            n_jobs=n_jobs,
                                            impsf=impsf)
            cr.write()

        # TODO - this would not work with the per-TF importance score file....
        if not cr.set_cmd('modisco_score2').done():
            modisco_score2(modisco_dir,
                           modisco_dir / 'instances.parq',
                           trim_frac=trim_frac,
                           imp_scores=None,  # Use the default one
                           importance=None,  # Use the default one
                           n_jobs=n_jobs)
            cr.write()
    # TODO - update the pattern table -> compute the fraction of other motifs etc

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco2bed').done():
        modisco2bed(str(modisco_dir), str(modisco_dir / 'seqlets'), trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # Scanned instances
    # if not cr.set_cmd('modisco_instances_to_bed').done():
    #     modisco_instances_to_bed(str(modisco_dir / 'modisco.h5'),
    #                              instances_parq=str(modisco_dir / 'instances.parq'),
    #                              imp_score_h5=imp_scores,
    #                              output_dir=str(modisco_dir / 'instances_bed/'),
    #                              )
    #     cr.write()
    # sync.append("instances_bed")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
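# Usage sketch (illustrative; "output/modisco" is a hypothetical path):
#
#   modisco_report_all("output/modisco", trim_frac=0.08, n_jobs=20)
#
# Sub-commands that already have a
# `{modisco_dir}/.modisco_report_all/{command}.done` marker are skipped on
# subsequent runs; pass force=True to recompute everything.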