def cwm_scan_seqlets(modisco_dir,
                     output_file,
                     trim_frac=0.08,
                     num_workers=1,
                     contribsf=None,
                     verbose=False):
    """Compute the CWM scanning scores of the original modisco seqlets."""
    from bpnet.modisco.table import ModiscoData
    modisco_dir = Path(modisco_dir)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    add_file_logging(os.path.dirname(output_file), logger, 'cwm_scan_seqlets')

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    if contribsf is None:
        contrib = ContribFile.from_modisco_dir(modisco_dir)
    else:
        contrib = contribsf

    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    dfi_list = []
    for pattern_name in tqdm(mf.pattern_names()):
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        seqlets = mf._get_seqlets(pattern_name, trim_frac=trim_frac)

        # scan only the existing locations of the seqlets instead of the
        # full sequences to obtain the score distribution
        stacked_seqlets = contrib.extract(seqlets)

        match, contribution = pattern.scan_contribution(stacked_seqlets.contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False,
                                                        pad_mode=None)
        seq_match = pattern.scan_seq(stacked_seqlets.seq,
                                     n_jobs=num_workers,
                                     verbose=False,
                                     pad_mode=None)
        dfm = pattern.get_instances(tasks,
                                    match,
                                    contribution,
                                    seq_match,
                                    fdr=1,
                                    verbose=verbose,
                                    plot=verbose)
        # keep only instances with a positive sequence match
        dfm = dfm[dfm.seq_match > 0]
        dfi_list.append(dfm)
    df = pd.concat(dfi_list)
    df.to_csv(output_file)
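# Usage sketch (hypothetical paths): score the original seqlets of a finished
# modisco run; the resulting CSV is what `cwm_scan` below consumes as the
# normalization table (`norm_df`).
def _example_cwm_scan_seqlets():  # illustrative only, not part of the CLI
    cwm_scan_seqlets('models/Oct4/modisco',
                     output_file='models/Oct4/modisco/cwm-scan-seqlets.csv.gz',
                     trim_frac=0.08,
                     num_workers=4)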
def modisco_export_seqlets(modisco_dir, output_dir, trim_frac=0.08):
    """Export the seqlet locations of each pattern to BED files in `output_dir`."""
    from pybedtools import Interval
    from bpnet.modisco.files import ModiscoFile
    add_file_logging(output_dir, logger, 'modisco_export_seqlets')

    ranges = load_ranges(modisco_dir)
    example_intervals = [
        Interval(row.chrom, row.start, row.end)
        for i, row in ranges.iterrows()
    ]

    r = ModiscoFile(os.path.join(modisco_dir, "modisco.h5"))
    r.export_seqlets_bed(output_dir,
                         example_intervals=example_intervals,
                         position='absolute',
                         trim_frac=trim_frac)
    r.close()
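# Usage sketch (hypothetical paths): export the seqlet BED files of a
# finished modisco run.
def _example_export_seqlets():  # illustrative only
    modisco_export_seqlets('models/Oct4/modisco',
                           output_dir='models/Oct4/modisco/seqlets',
                           trim_frac=0.08)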
def modisco_export_patterns(modisco_dir, output_file, contribsf=None):
    """Export patterns to a pkl file without clustering them.

    Adds `stacked_seqlet_contrib` and `n_seqlets` to each pattern's `attrs`.

    Args:
      modisco_dir: directory containing the modisco run (`modisco.h5`)
      output_file: output file path for patterns.pkl
      contribsf: [optional] pre-loaded ContribFile; loaded from
        `modisco_dir` if None
    """
    from bpnet.cli.contrib import ContribFile
    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)

    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    patterns = [mf.get_pattern(pname) for pname in mf.pattern_names()]

    if contribsf is None:
        contrib_file = ContribFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ContribFile into memory")
        contrib_file.cache()
    else:
        logger.info("Using the provided ContribFile")
        contrib_file = contribsf

    logger.info("Extracting profile and contribution scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()

        # get the seqlets of this pattern
        valid_seqlets = mf._get_seqlets(p.name)

        # extract the contribution scores at the seqlet locations
        sti = contrib_file.extract(valid_seqlets, profile_width=None)
        sti.dfi = mf.get_seqlet_intervals(p.name, as_df=True)
        p.attrs['stacked_seqlet_contrib'] = sti
        p.attrs['n_seqlets'] = mf.n_seqlets(p.name)
        extended_patterns.append(p)
    write_pkl(extended_patterns, output_file)
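# Sketch (hypothetical path): the exported patterns can be loaded back with
# the standard pickle module, assuming `write_pkl` writes a plain pickle.
def _example_load_patterns():  # illustrative only
    import pickle
    with open('models/Oct4/modisco/patterns.pkl', 'rb') as f:
        patterns = pickle.load(f)
    # each pattern carries the number of supporting seqlets in its attrs
    return {p.name: p.attrs['n_seqlets'] for p in patterns}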
parser.add_argument(
    '--task',
    dest='task',
    help='name of the BPNet task whose predictions are being analyzed; '
    'typically the name of a TF')
args = parser.parse_args()

model_dir = args.model_dir
tasks = [args.task]
model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'

## ModiscoFileGroup is a convenience wrapper around multiple ModiscoFile objects
mf = ModiscoFileGroup(
    {t: ModiscoFile(modisco_dir / t / 'modisco.h5') for t in tasks})

## create directories for plots: each dir corresponds to a different task/metacluster
task_metacluster_dirs = []
for pattern_name in mf.pattern_names():
    # drop the trailing 'pattern_*' component to obtain '<task>/<metacluster>'
    task_metacluster_dirs.append('/'.join(pattern_name.split('/')[:-1]))
task_metacluster_dirs = list(set(task_metacluster_dirs))

for t in task_metacluster_dirs:
    Path(f"{modisco_dir}/modisco_plots/{t}").mkdir(parents=True, exist_ok=True)

## Plot all the patterns
fig_names = []
for p in mf.patterns():
    '--task',
    dest='task',
    help='name of the BPNet task whose predictions are being analyzed; '
    'typically the name of a TF')
args = parser.parse_args()

model_dir = args.model_dir
tasks = [args.task]
model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'

for task in tasks:
    tomtom_dir = modisco_dir / task / 'tomtom'
    tomtom_dir.mkdir(parents=True, exist_ok=True)
    modisco_file = modisco_dir / task / 'modisco.h5'
    mf = ModiscoFile(modisco_file)
    for pattern_name in mf.pattern_names():
        pattern = mf.get_pattern(pattern_name)
        matches = pattern.fetch_tomtom_matches(
            motifs_db='meme_db/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme')
        # collect the per-match dicts into a single DataFrame
        matches_df = pd.DataFrame(
            matches, columns=['Target ID', 'p-value', 'E-value', 'q-value'])
        # pattern names contain '/'; flatten them for use as file names
        pattern_name = pattern_name.replace('/', '_')
        matches_df.to_csv(tomtom_dir / f'{pattern_name}.tsv', sep='\t')
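# Sketch (hypothetical paths): pull the best TomTom hit per pattern out of
# the per-pattern TSVs written above.
def _example_top_tomtom_hits():  # illustrative only
    import glob
    hits = {}
    for f in glob.glob('models/Oct4/modisco/Oct4/tomtom/*.tsv'):
        df = pd.read_csv(f, sep='\t', index_col=0)
        if not df.empty:
            hits[Path(f).stem] = df.sort_values('q-value').iloc[0]['Target ID']
    return hits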
from bpnet.modisco.files import ModiscoFile
import pandas as pd
from pathlib import Path
import sys

model_dir = sys.argv[1]
model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'
tasks = sys.argv[2:]

# export the PSSM of every pattern for all tasks
for task in tasks:
    modisco_task_dir = modisco_dir / task
    modisco_file = modisco_task_dir / 'modisco.h5'
    mf = ModiscoFile(modisco_file)
    for pattern_name in mf.pattern_names():
        pssm = pd.DataFrame(mf.get_pssm(pattern_name))
        # pattern names contain '/'; flatten them for use as file names
        fname = pattern_name.replace('/', '_')
        pssm.to_csv(f'{modisco_task_dir}/{fname}.tsv',
                    sep='\t',
                    index=False,
                    header=False)
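# Usage sketch (hypothetical script name):
#     python export_pssm.py <model_dir> Oct4 Sox2
# Each exported PSSM can be loaded back as a (pattern_length x 4) array:
#     pssm = pd.read_csv('models/modisco/Oct4/metacluster_0_pattern_0.tsv',
#                        sep='\t', header=None).values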
def pattern(modisco_dir):
    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    return mf.get_pattern("metacluster_0/pattern_0")
def mf(modisco_dir):
    """ModiscoFile"""
    from bpnet.modisco.files import ModiscoFile
    return ModiscoFile(modisco_dir / 'modisco.h5')
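# Sketch of how these helpers could be exercised in a pytest-style test
# (assumes `modisco_dir` points to a small test modisco run):
def _example_check_first_pattern(modisco_dir):  # illustrative only
    p = pattern(modisco_dir)
    assert p.name == "metacluster_0/pattern_0"
    assert p.name in mf(modisco_dir).pattern_names()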
def chip_nexus_analysis(modisco_dir,
                        trim_frac=0.08,
                        num_workers=20,
                        run_cwm_scan=False,
                        force=False,
                        footprint_width=200):
    """Compute all the modisco results specific to ChIP-nexus/exo data.

    Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_export_patterns
    - cwm_scan
    - modisco_export_seqlets

    Note: each sub-command is only executed if it has not been run before.
    Use --force to override this. Whether a command has been run before is
    determined by checking if the following file exists:
    `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from bpnet.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the contribution scores used
    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    contrib_scores = kwargs["contrib_file"]

    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    all_patterns = mf.pattern_names()
    mf.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch modisco-chip.html for snakemake
        open(modisco_dir / 'modisco-chip.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ContribFile and pass it to all the functions
        logger.info("Loading ContribFile")
        contribsf = ContribFile.from_modisco_dir(modisco_dir)
        contribsf.cache()
    else:
        contribsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     heatmap_width=footprint_width,
                     figsize=(10, 10),
                     contribsf=contribsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("modisco-chip.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir,
                      contrib_scores,
                      modisco_dir,
                      report_url=None,
                      contribsf=contribsf,
                      footprint_width=footprint_width)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_export_patterns').done():
        modisco_export_patterns(modisco_dir,
                                output_file=modisco_dir / 'patterns.pkl',
                                contribsf=contribsf)
        cr.write()
    sync.append("patterns.pkl")
    # --------------------------------------------
    # Finding new instances
    if run_cwm_scan:
        if not cr.set_cmd('cwm_scan').done():
            cwm_scan(modisco_dir,
                     modisco_dir / 'instances.bed.gz',
                     trim_frac=trim_frac,
                     contrib_file=None,
                     num_workers=num_workers)
            cr.write()

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco_export_seqlets').done():
        modisco_export_seqlets(str(modisco_dir),
                               str(modisco_dir / 'seqlets'),
                               trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
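# Usage sketch (hypothetical path): run the full post-modisco pipeline for a
# ChIP-nexus model, including CWM scanning.
def _example_chip_nexus_analysis():  # illustrative only
    chip_nexus_analysis('models/Oct4/modisco',
                        trim_frac=0.08,
                        num_workers=8,
                        run_cwm_scan=True)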
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning."""
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not any(output_file.endswith(suffix) for suffix in valid_suffixes):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. "
            f"Valid file suffixes are: {valid_suffixes}")

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file),
                                    'cwm-scan.kwargs.json')
    write_json(
        dict(modisco_dir=os.path.abspath(str(modisco_dir)),
             output_file=str(output_file),
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(
        os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]
    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        # cache it since it will be re-used in `cwm_scan_seqlets`
        cf.cache()
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []
    # patterns to scan; `longer_pattern` makes sure the patterns are in the
    # long format
    scan_patterns = (patterns.split(",")
                     if patterns != 'all' else mf.pattern_names())
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip patterns that were not requested
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            contribution,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Appending the ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(f"Writing the resulting pd.DataFrame of shape {dfp.shape} "
                f"to {output_file}")

    # put the first 7 columns into bed6-like order
    # (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write to the output file
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file,
                       partition_on=['pattern_short'],
                       engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        logger.warning("File suffix not recognized. Using the .csv.gz format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
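# Sketch (hypothetical paths): load a BED output of `cwm_scan` back and keep
# high-confidence instances of a single pattern. The column names mirror
# `bed_columns` above since BED files are written without a header.
def _example_load_instances():  # illustrative only
    dfp = pd.read_csv('models/Oct4/modisco/instances.bed.gz',
                      sep='\t',
                      header=None,
                      names=['example_chrom', 'pattern_start_abs',
                             'pattern_end_abs', 'pattern',
                             'contrib_weighted_p', 'strand',
                             'match_weighted_p'])
    return dfp[(dfp.pattern == 'metacluster_0/pattern_0')
               & (dfp.match_weighted_p >= 0.5)]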
def modisco_plot(
        modisco_dir,
        output_dir,
        # filter_npy=None,
        # ignore_dist_filter=False,
        heatmap_width=200,
        figsize=(10, 10),
        contribsf=None):
    """Plot the results of a modisco run.

    Args:
      modisco_dir: modisco directory
      output_dir: output directory for writing the results
      heatmap_width: width of the heatmaps (in bp)
      figsize: output figure size
      contribsf: [optional] modisco contribution score file (ContribFile)
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from bpnet.plot.vdom import write_heatmap_pngs
    from bpnet.plot.profiles import plot_profiles
    from bpnet.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")

    if contribsf is not None:
        d = contribsf
    else:
        d = ContribFile.from_modisco_dir(modisco_dir)
        logger.info("Loading the contribution scores")
        d.cache()  # load all

    thr_one_hot = d.get_seq()
    # thr_hypothetical_contribs
    tracks = d.get_profiles()
    thr_hypothetical_contribs = dict()
    thr_contrib_scores = dict()
    # TODO - generalize this
    thr_hypothetical_contribs['profile'] = d.get_hyp_contrib()
    thr_contrib_scores['profile'] = d.get_contrib()

    tasks = d.get_tasks()

    # Count contribution (if it exists)
    if d.contains_contrib_score("counts/pre-act"):
        count_contrib_score = "counts/pre-act"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
            contrib_score=count_contrib_score)
        thr_contrib_scores['count'] = d.get_contrib(
            contrib_score=count_contrib_score)
    elif d.contains_contrib_score("count"):
        count_contrib_score = "count"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
            contrib_score=count_contrib_score)
        thr_contrib_scores['count'] = d.get_contrib(
            contrib_score=count_contrib_score)
    else:
        # no count contribution scores available
        pass

    thr_hypothetical_contribs = OrderedDict(
        flatten(thr_hypothetical_contribs, separator='/'))
    thr_contrib_scores = OrderedDict(
        flatten(thr_contrib_scores, separator='/'))
    # -------------------------------------------------

    all_seqlets = mf.seqlets()
    all_patterns = mf.pattern_names()
    if len(all_patterns) == 0:
        print("No patterns found")
        return

    # 1. Plots with tracks and contrib scores
    print("Writing results for contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks=tracks,
                  contribution_scores=thr_contrib_scores,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=.5,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir /
                                     "{pattern}/agg_profile_contribcores"),
                  mkdir=True,
                  figsize=figsize)

    # 2. Plots only with hypothetical contrib scores
    print("Writing results for hypothetical contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks={},
                  contribution_scores=thr_hypothetical_contribs,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=1,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir /
                                     "{pattern}/agg_profile_hypcontribscores"),
                  figsize=figsize)

    print("Plotting heatmaps")
    for pattern in tqdm(all_patterns):
        write_heatmap_pngs(all_seqlets[pattern],
                           d,
                           tasks,
                           pattern,
                           output_dir=str(output_dir / pattern),
                           resize_width=heatmap_width)

    mf.close()
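# Usage sketch (hypothetical paths): regenerate only the plots of an existing
# modisco run.
def _example_modisco_plot():  # illustrative only
    modisco_plot('models/Oct4/modisco',
                 'models/Oct4/modisco/plots',
                 heatmap_width=200,
                 figsize=(10, 10))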