def cwm_scan_seqlets(modisco_dir,
                     output_file,
                     trim_frac=0.08,
                     num_workers=1,
                     contribsf=None,
                     verbose=False):
    """Compute the CWM scanning scores of the original modisco seqlets."""
    from bpnet.modisco.table import ModiscoData
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    add_file_logging(os.path.dirname(output_file), logger, 'cwm_scan_seqlets')

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    if contribsf is None:
        contrib = ContribFile.from_modisco_dir(modisco_dir)
    else:
        contrib = contribsf

    tasks = mf.tasks()
    # HACK: strip the contribution-score suffix from the task names (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    dfi_list = []
    for pattern_name in tqdm(mf.pattern_names()):
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        seqlets = mf._get_seqlets(pattern_name, trim_frac=trim_frac)

        # scan only the existing seqlet locations instead of the full sequences
        # to obtain the score distribution
        stacked_seqlets = contrib.extract(seqlets)

        match, contribution = pattern.scan_contribution(stacked_seqlets.contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False,
                                                        pad_mode=None)
        seq_match = pattern.scan_seq(stacked_seqlets.seq,
                                     n_jobs=num_workers,
                                     verbose=False,
                                     pad_mode=None)
        dfm = pattern.get_instances(tasks, match, contribution, seq_match,
                                    fdr=1, verbose=verbose, plot=verbose)
        dfm = dfm[dfm.seq_match > 0]
        dfi_list.append(dfm)

    df = pd.concat(dfi_list)
    df.to_csv(output_file)
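# Illustrative usage sketch (not part of the original module). The directory and file
# names below are hypothetical; `cwm_scan_seqlets` writes a CSV of per-seqlet match,
# contribution and sequence-match scores which `cwm_scan` later uses for normalization.
def _example_cwm_scan_seqlets():
    from pathlib import Path
    modisco_dir = Path("output/modisco/Oct4")  # hypothetical modisco output directory
    cwm_scan_seqlets(modisco_dir,
                     output_file=str(modisco_dir / "cwm-scan-seqlets.trim-frac=0.08.csv.gz"),
                     trim_frac=0.08,
                     num_workers=4)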
def modisco_export_patterns(modisco_dir, output_file, contribsf=None):
    """Export patterns to a pkl file. Don't cluster them.

    Adds `stacked_seqlet_contrib` and `n_seqlets` to pattern `attrs`.

    Args:
      modisco_dir: modisco directory containing modisco.h5
      output_file: output file path for patterns.pkl
    """
    from bpnet.cli.contrib import ContribFile

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)
    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    patterns = [mf.get_pattern(pname) for pname in mf.pattern_names()]

    if contribsf is None:
        contrib_file = ContribFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ContribFile into memory")
        contrib_file.cache()
    else:
        logger.info("Using the provided ContribFile")
        contrib_file = contribsf

    logger.info("Extracting profile and contribution scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()

        # get the seqlets supporting this pattern
        valid_seqlets = mf._get_seqlets(p.name)

        # extract the contribution scores at the seqlet locations
        sti = contrib_file.extract(valid_seqlets, profile_width=None)
        sti.dfi = mf.get_seqlet_intervals(p.name, as_df=True)

        p.attrs['stacked_seqlet_contrib'] = sti
        p.attrs['n_seqlets'] = mf.n_seqlets(p.name)
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
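# Illustrative usage sketch (not part of the original module), assuming `write_pkl`
# produces a standard pickle file. Paths are hypothetical; the loop shows the extra
# attributes attached to each exported pattern above.
def _example_export_and_reload_patterns():
    import pickle
    modisco_dir = "output/modisco/Oct4"                 # hypothetical
    patterns_pkl = "output/modisco/Oct4/patterns.pkl"   # hypothetical
    modisco_export_patterns(modisco_dir, patterns_pkl)

    with open(patterns_pkl, "rb") as f:
        exported = pickle.load(f)
    for p in exported:
        # `n_seqlets` and `stacked_seqlet_contrib` were added to `attrs` during export
        print(p.name, p.attrs['n_seqlets'])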
parser.add_argument(
    '--task',
    dest='task',
    help='name of the BPNet task whose predictions are being analyzed; '
         'typically the name of a TF')
args = parser.parse_args()

model_dir = args.model_dir
tasks = [args.task]

model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'

for task in tasks:
    tomtom_dir = modisco_dir / task / 'tomtom'
    tomtom_dir.mkdir(parents=True, exist_ok=True)

    modisco_file = modisco_dir / task / 'modisco.h5'
    mf = ModiscoFile(modisco_file)
    for pattern_name in mf.pattern_names():
        pattern = mf.get_pattern(pattern_name)
        matches = pattern.fetch_tomtom_matches(
            motifs_db='meme_db/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme')

        # collect the TomTom matches for this pattern into a single table
        matches_df = pd.DataFrame(
            columns=['Target ID', 'p-value', 'E-value', 'q-value'])
        for i, match in enumerate(matches):
            new_row = pd.DataFrame(match, index=[i])
            matches_df = pd.concat([matches_df, new_row])

        # '/' is not allowed in file names
        pattern_name = pattern_name.replace('/', '_')
        matches_df.to_csv(tomtom_dir / f'{pattern_name}.tsv', sep='\t')
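# Illustrative follow-up sketch (not part of the original script): collect the per-pattern
# TomTom tables written above into a single DataFrame and keep only confident matches.
# The q-value cutoff of 0.05 is an arbitrary example, not a value used by the pipeline.
def _example_summarize_tomtom(tomtom_dir):
    from pathlib import Path
    import pandas as pd
    dfs = []
    for tsv in Path(tomtom_dir).glob("*.tsv"):
        df = pd.read_csv(tsv, sep='\t', index_col=0)
        df['pattern'] = tsv.stem  # pattern name with '/' replaced by '_' (see above)
        dfs.append(df)
    summary = pd.concat(dfs, ignore_index=True)
    return summary[summary['q-value'] < 0.05]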
def pattern(modisco_dir):
    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    return mf.get_pattern("metacluster_0/pattern_0")
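# Illustrative test sketch (not part of the original file), assuming `pattern` above is
# used as a pytest fixture: trimming by information content (as done throughout the
# CWM-scanning code) should still return a pattern object.
def _example_test_pattern_trim(pattern):
    trimmed = pattern.trim_seq_ic(0.08)
    assert trimmed is not None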
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning."""
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not any(output_file.endswith(suffix) for suffix in valid_suffixes):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}"
        )

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file), 'cwm-scan.kwargs.json')
    write_json(
        dict(modisco_dir=os.path.abspath(str(modisco_dir)),
             output_file=str(output_file),
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out which contribution score type (contrib_wildcard) was used for modisco
    modisco_kwargs = read_json(os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK: strip the contribution-score suffix from the task names (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]
    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        cf.cache()  # cache it since it can be re-used in `cwm_scan_seqlets` below
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    scan_patterns = patterns.split(",") if patterns != 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip patterns that were not requested
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(tasks,
                                    match,
                                    contribution,
                                    seq_match,
                                    norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                    verbose=False,
                                    plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Appending ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute genomic coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(f"Writing the resulting pd.DataFrame of shape {dfp.shape} to {output_file}")

    # put these 7 columns first so that the table complies with the BED6(+1) format
    # (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write the results
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file, partition_on=['pattern_short'], engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        logger.warning("File suffix not recognized. Using the .csv.gz file format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
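# Illustrative usage sketch (not part of the original module): run the full CWM scan and
# load the resulting motif-instance table. Paths are hypothetical; the filter string uses
# the same `pd.DataFrame.query` syntax applied inside `cwm_scan`.
def _example_cwm_scan():
    import pandas as pd
    modisco_dir = "output/modisco/Oct4"                          # hypothetical
    output_file = "output/modisco/Oct4/motif-instances.csv.gz"   # hypothetical
    cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             num_workers=4)
    dfi = pd.read_csv(output_file)
    # absolute genomic coordinates of each motif instance, computed above
    print(dfi[['example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern']].head())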