import numpy as np
import pandas as pd
from tqdm import tqdm

from bpnet.modisco.utils import shorten_pattern, longer_pattern


def append_logo_cluster(pattern_table, patterns, cluster_order, cluster,
                        align_track='contrib/mean', logo_len=30, **kwargs):
    # setup patterns
    pattern_names = np.array([shorten_pattern(p.name) for p in patterns])
    patterns_nte_dict = {shorten_pattern(p.name): p for p in patterns}  # organize as a dict

    pattern_table = pattern_table.set_index('pattern')
    pattern_table_nte = pattern_table.loc[pattern_names]
    pattern_table = pattern_table.reset_index()
    pattern_table_nte['cluster'] = cluster
    pattern_table_nte['cluster_order'] = cluster_order
    # pattern_table_nte = pattern_table_nte.iloc[cluster_order]  # sort the whole table

    out = []
    for cluster_id in tqdm(pattern_table_nte.cluster.unique()):
        dfg = pattern_table_nte[pattern_table_nte.cluster == cluster_id].copy()
        # identify the major pattern: the one with the most seqlets
        major_pattern_name = dfg['n seqlets'].idxmax()
        major_pattern = patterns_nte_dict[major_pattern_name]
        # align w.r.t. the track used for clustering
        logo_contrib = [patterns_nte_dict[p]
                        .align(major_pattern, track=align_track)
                        .resize(logo_len)
                        .vdom_plot('contrib', as_html=True, **kwargs)
                        for p in dfg.index]
        logo_seq = [patterns_nte_dict[p]
                    .align(major_pattern, track=align_track)
                    .resize(logo_len)
                    .vdom_plot('seq', as_html=True, **kwargs)
                    for p in dfg.index]
        dfg['logo_contrib'] = logo_contrib
        dfg['logo_seq'] = logo_seq
        out.append(dfg)
    return pd.concat(out, axis=0).reset_index()
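
# A minimal usage sketch for `append_logo_cluster` (hypothetical variable names, not part
# of the original module): `patterns` are the TF-MoDISco pattern objects, `pattern_table`
# is the per-pattern summary table with 'pattern' and 'n seqlets' columns, and
# `cluster` / `cluster_order` come from a prior clustering step.
def _example_append_logo_cluster(pattern_table, patterns, cluster_order, cluster):
    # build the table with inline HTML logos aligned to each cluster's major pattern
    df = append_logo_cluster(pattern_table, patterns, cluster_order, cluster,
                             align_track='contrib/mean', logo_len=30)
    # the logo columns hold raw <img> HTML, so render without escaping in a notebook
    from IPython.display import HTML
    return HTML(df.to_html(escape=False))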
def dfi_row2seqlet(row, short_name=False):
    return Seqlet(row.example_idx,
                  row.pattern_start,
                  row.pattern_end,
                  name=shorten_pattern(row.pattern) if short_name else row.pattern,
                  strand=row.strand)
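
# Hedged example (not part of the original module): convert each row of a pattern-instance
# table (`dfi`, as produced by `load_instances` below) into a Seqlet object. The column
# names follow the dfi convention used throughout this module.
def _example_dfi_to_seqlets(dfi):
    # one Seqlet per motif instance, keeping the short pattern name
    return [dfi_row2seqlet(row, short_name=True) for _, row in dfi.iterrows()]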
def footprint_df(footprints, dfl=None, width=120, **kwargs):
    """Draw footprint sparklines into a pandas.DataFrame

    Args:
      footprints: footprint dict with `<pattern>/<task>` nested structure.
        Each leaf contains an array of shape (seq_len, 2)
      dfl: optional pandas.DataFrame of labels. Contains columns: pattern, <task>/l
      width: width of the final plot
      **kwargs: additional kwargs to pass to vdom_footprint
    """
    from tqdm import tqdm
    from bpnet.modisco.utils import shorten_pattern

    def map_label(l):
        """Label -> short name"""
        # TODO - get rid of this function
        if l is None:
            return "/"
        else:
            return l[0].upper()

    tasks = list(footprints[list(footprints)[0]].keys())
    profile_max_median = {
        task: np.median([np.max(v[task]) for v in footprints.values()])
        for task in tasks
    }
    out = []
    for p, arr_d in tqdm(footprints.items()):
        try:
            labels = dfl[dfl.pattern == shorten_pattern(p)].iloc[0].to_dict()
        except Exception:
            labels = {t + "/l": None for t in tasks}
        d = {
            task: vdom_footprint(arr_d[task],
                                 r_height=profile_max_median[task],
                                 text=map_label(labels[task + "/l"]),
                                 **kwargs).to_html().replace("<img", f"<img width={width}")
            for task in tasks
        }
        d['pattern'] = shorten_pattern(p)
        out.append(d)
    return pd.DataFrame(out)
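
# Usage sketch for `footprint_df` (hypothetical inputs): `footprints` maps each long
# pattern name (e.g. 'metacluster_0/pattern_0') to a dict of per-task arrays of shape
# (seq_len, 2); `dfl` optionally carries a 'pattern' column plus '<task>/l' label columns.
def _example_footprint_table(footprints, dfl=None):
    df = footprint_df(footprints, dfl=dfl, width=120)
    # the per-task columns contain <img> HTML sparklines; render with escape=False
    return df.to_html(escape=False)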
def plot_pattern(self, pattern_name, kind='all', rc=False, trim_frac=None,
                 letter_width=0.2, height=0.8, rotate_y=0, ylab=True):
    pattern = self.get_pattern(pattern_name)
    pattern = pattern.trim_seq_ic(trim_frac)
    ns = self.n_seqlets(pattern_name)
    pattern.name = shorten_pattern(pattern_name) + f" ({ns})"
    if rc:
        pattern = pattern.rc()
    return pattern.plot(kind,
                        letter_width=letter_width,
                        height=height,
                        rotate_y=rotate_y,
                        ylab=ylab)
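
# Usage sketch for `plot_pattern`, assuming it is a method of the same class that provides
# `get_pattern` / `n_seqlets` (e.g. `ModiscoFile`, as constructed in `cwm_scan` below);
# the path is hypothetical:
#
#   mf = ModiscoFile("modisco_dir/modisco.h5")
#   pattern_name = mf.pattern_names()[0]
#   mf.plot_pattern(pattern_name,
#                   kind='all',       # plot all available tracks
#                   rc=False,         # set True for the reverse complement
#                   trim_frac=0.08)   # trim low-information flanks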
def load_instances(parq_file, motifs=None, dedup=True, verbose=True):
    """Load pattern instances from the parquet file

    Args:
      parq_file: parquet file of motif instances
      motifs: dictionary of motifs of interest.
        key = custom motif name, value = short pattern name (e.g. {'Nanog': 'm0_p3'})
    """
    if motifs is not None:
        incl_motifs = {longer_pattern(m) for m in motifs.values()}
    else:
        incl_motifs = None

    if isinstance(parq_file, pd.DataFrame):
        dfi = parq_file
    else:
        if motifs is not None:
            from fastparquet import ParquetFile
            # Selectively load only the relevant patterns
            pf = ParquetFile(str(parq_file))
            patterns = [shorten_pattern(pn) for pn in incl_motifs]
            dfi = pf.to_pandas(filters=[("pattern_short", "in", patterns)])
        else:
            dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
        if 'pattern' not in dfi:
            # assumes a hive-partitioned file
            dfi['pattern'] = dfi['dir0'].str.replace("pattern=", "").astype(str) \
                + "/" + dfi['dir1'].astype(str)

    # filter
    if motifs is not None:
        dfi = dfi[dfi.pattern.isin(incl_motifs)]  # NOTE this should already be removed
        if 'pattern_short' not in dfi:
            dfi['pattern_short'] = dfi['pattern'].map(
                {k: shorten_pattern(k) for k in incl_motifs})
        dfi['pattern_name'] = dfi['pattern_short'].map(
            {v: k for k, v in motifs.items()})
    else:
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k) for k in dfi.pattern.unique()})

    # add some columns if they don't yet exist
    if 'pattern_start_abs' not in dfi:
        dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start']
    if 'pattern_end_abs' not in dfi:
        dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end']

    if dedup:
        # deduplicate
        dfi_dedup = dfi.drop_duplicates(['pattern',
                                         'example_chrom',
                                         'pattern_start_abs',
                                         'pattern_end_abs',
                                         'strand'])
        # number of removed duplicates
        d = len(dfi) - len(dfi_dedup)
        if verbose:
            print("number of duplicate instances removed:", d,
                  f"({d / len(dfi) * 100:.2f}%)")
        # use de-duplicated instances from now on
        dfi = dfi_dedup
    return dfi
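
# Hedged example: load the CWM-scan output produced by `cwm_scan` below and keep only
# selected motifs. The parquet path and the motif mapping are hypothetical.
def _example_load_instances():
    dfi = load_instances("motif-instances.parq",
                         motifs={'Nanog': 'm0_p3', 'Oct4': 'm0_p0'},  # name -> short pattern
                         dedup=True,
                         verbose=True)
    # dfi now contains absolute coordinates and the custom `pattern_name` column
    return dfi[['pattern_name', 'example_chrom',
                'pattern_start_abs', 'pattern_end_abs', 'strand']]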
def cwm_scan(modisco_dir, output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning."""
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = ['.csv', '.csv.gz', '.tsv', '.tsv.gz', '.parq', '.bed', '.bed.gz']
    if not any(output_file.endswith(suffix) for suffix in valid_suffixes):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}")

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file), 'cwm-scan.kwargs.json')
    write_json(dict(modisco_dir=os.path.abspath(str(modisco_dir)),
                    output_file=str(output_file),
                    cwm_scan_seqlets_path=str(cm_path),
                    trim_frac=trim_frac,
                    patterns=patterns,
                    filters=filters,
                    contrib_file=contrib_file,
                    add_profile_features=add_profile_features,
                    num_workers=num_workers),
               str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the contribution suffix from the tasks (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]
    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        cf.cache()  # cache it since it can be re-used in `modisco_centroid_seqlet_matches`
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    scan_patterns = patterns.split(",") if patterns != 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip scanning this pattern
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(tasks, match, contribution, seq_match,
                                    norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                    verbose=False,
                                    plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm, pattern_name, mf, profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Appending ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(f"Writing the resulting pd.DataFrame of shape {dfp.shape} to {output_file}")

    # set the first 7 columns to comply with the BED6 format
    # (chrom, start, end, name, score, strand, ...)
    bed_columns = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs',
                   'pattern', 'contrib_weighted_p', 'strand', 'match_weighted_p']
    dfp = pd_first_cols(dfp, bed_columns)

    # write to the output file
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file, partition_on=['pattern_short'], engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file, sep='\t',
                                compression='infer', index=False, header=False)
    else:
        logger.warning("File suffix not recognized. Using the .csv.gz file format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
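
# End-to-end usage sketch for `cwm_scan` (hypothetical paths; assumes a finished
# TF-MoDISco run in `modisco_dir` containing modisco.h5 and modisco-run.kwargs.json):
def _example_cwm_scan():
    cwm_scan(modisco_dir="output/nanog/modisco",
             output_file="output/nanog/modisco/motif-instances.parq",
             trim_frac=0.08,
             patterns='all',  # or a comma-separated list of short pattern names, e.g. 'm0_p0,m0_p3'
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             add_profile_features=False,
             num_workers=10)
    # the resulting parquet file can be loaded back with `load_instances` above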