def append_logo_cluster(pattern_table, patterns, cluster_order, cluster,
                        align_track='contrib/mean', logo_len=30, **kwargs):
    """Append per-cluster logo (HTML) columns to a pattern table.

    For each cluster, the pattern with the most seqlets is chosen as the
    "major" pattern; every pattern in the cluster is aligned to it, resized
    to `logo_len` and rendered as HTML contribution/sequence logos.

    Args:
        pattern_table: pd.DataFrame with at least 'pattern' and 'n seqlets' columns
        patterns: list of pattern objects exposing .name, .align, .resize, .vdom_plot
        cluster_order: per-row ordering values, stored as the 'cluster_order' column
        cluster: per-row cluster assignment, stored as the 'cluster' column
        align_track: track passed to pattern.align for the alignment
        logo_len: length (number of positions) each aligned pattern is resized to
        **kwargs: forwarded to pattern.vdom_plot

    Returns:
        pd.DataFrame: rows of `pattern_table` (restricted to `patterns`) with
        added 'cluster', 'cluster_order', 'logo_imp' and 'logo_seq' columns.
    """
    # setup patterns
    pattern_names = np.array([shorten_pattern(p.name) for p in patterns])
    patterns_nte_dict = {shorten_pattern(p.name): p for p in patterns}  # organize as a dict
    pattern_table = pattern_table.set_index('pattern')
    pattern_table_nte = pattern_table.loc[pattern_names]
    pattern_table = pattern_table.reset_index()
    pattern_table_nte['cluster'] = cluster
    pattern_table_nte['cluster_order'] = cluster_order
    # pattern_table_nte = pattern_table_nte.iloc[cluster_order]  # sort the whole table
    out = []
    for cluster_id in tqdm(pattern_table_nte.cluster.unique()):
        # .copy() so the logo column assignments below write to a real frame,
        # not a view of pattern_table_nte (avoids SettingWithCopyWarning)
        dfg = pattern_table_nte[pattern_table_nte.cluster == cluster_id].copy()
        # identify the major pattern. idxmax() returns the index *label*
        # (a short pattern name keying patterns_nte_dict); Series.argmax()
        # returns a positional integer in modern pandas and would raise
        # KeyError in the dict lookup below.
        max_seqlets = dfg['n seqlets'].idxmax()
        major_pattern = patterns_nte_dict[max_seqlets]
        # align w.r.t. the thing used for clustering
        logo_imp = [patterns_nte_dict[p]
                    .align(major_pattern, track=align_track)
                    .resize(logo_len)
                    .vdom_plot('contrib', as_html=True, **kwargs)
                    for p in dfg.index]
        logo_seq = [patterns_nte_dict[p]
                    .align(major_pattern, track=align_track)
                    .resize(logo_len)
                    .vdom_plot('seq', as_html=True, **kwargs)
                    for p in dfg.index]
        dfg['logo_imp'] = logo_imp
        dfg['logo_seq'] = logo_seq
        out.append(dfg)
    return pd.concat(out, axis=0).reset_index()
def dfi_row2seqlet(row, short_name=False):
    """Build a Seqlet from one row of a motif-instance DataFrame.

    Args:
        row: row with example_idx, pattern_start, pattern_end, pattern
            and strand fields
        short_name: if True, shorten the pattern name with shorten_pattern

    Returns:
        Seqlet covering [pattern_start, pattern_end) of the given example.
    """
    pattern_name = row.pattern
    if short_name:
        pattern_name = shorten_pattern(pattern_name)
    return Seqlet(row.example_idx,
                  row.pattern_start,
                  row.pattern_end,
                  name=pattern_name,
                  strand=row.strand)
def footprint_df(footprints, dfl=None, width=120, **kwargs):
    """Draw footprints sparklines into a pandas.DataFrame

    Args:
        footprints: footprint dict with `<pattern>/<task>` nested structure
            each node contains an array of shape (seq_len, 2)
        dfl: optional pandas.DataFrame of labels. Contains columns: pattern <task>/l
        width: width of the final plot
        **kwargs: additional kwargs to pass to vdom_footprint
    """
    from tqdm import tqdm
    from basepair.modisco.utils import shorten_pattern

    def map_label(l):
        """Label -> short-name
        """
        # TODO - get rid of this function
        if l is None:
            return "/"
        return l[0].upper()

    # task list taken from the first pattern's entry
    first_pattern = list(footprints)[0]
    tasks = list(footprints[first_pattern].keys())

    # per-task median of the per-pattern maxima; used as a common row height
    profile_max_median = {}
    for task in tasks:
        profile_max_median[task] = np.median([np.max(v[task])
                                              for v in footprints.values()])

    rows = []
    for pattern, arr_d in tqdm(footprints.items()):
        # best-effort label lookup; fall back to empty labels when dfl is
        # missing or the pattern has no matching row
        try:
            labels = dfl[dfl.pattern == shorten_pattern(pattern)].iloc[0].to_dict()
        except Exception:
            labels = {t + "/l": None for t in tasks}
        row = {}
        for task in tasks:
            html = vdom_footprint(arr_d[task],
                                  r_height=profile_max_median[task],
                                  text=map_label(labels[task + "/l"]),
                                  **kwargs).to_html()
            # force a fixed display width on the rendered <img> tag
            row[task] = html.replace("<img", f"<img width={width}")
        row['pattern'] = shorten_pattern(pattern)
        rows.append(row)
    return pd.DataFrame(rows)
def pattern_features(pattern, data):
    """Assemble the summary-feature row for one pattern.

    Returns:
        OrderedDict of columns for pandas dataFrame
    """
    columns = [
        ("pattern", shorten_pattern(pattern)),
        ("logo pwm", logo_pwm(pattern, data)),
        ("logo imp", logo_imp(pattern, data)),
        ("n seqlets", len(data.get_seqlets(pattern))),
        ("ic pwm mean", pwm_mean_ic(pattern, data)),
    ]
    # per-task feature columns
    for task in data.get_tasks():
        columns.extend(pattern_task_features(pattern, task, data))
    columns.append(("consensus", consensus(pattern, data)))
    return OrderedDict(columns)
def plot_pattern(self, pattern_name, kind='all', rc=False, trim_frac=None,
                 letter_width=0.2, height=0.8, rotate_y=0, ylab=True):
    """Plot the logo of a single pattern, annotated with its seqlet count.

    Args:
        pattern_name: full pattern identifier ('<metacluster>/<pattern>')
        kind: which tracks to plot (passed through to pattern.plot)
        rc: if True, plot the reverse complement
        trim_frac: passed to trim_seq_ic for information-content trimming
        letter_width, height, rotate_y, ylab: plotting options forwarded
            to pattern.plot
    """
    pattern = self.get_pattern(pattern_name).trim_seq_ic(trim_frac)
    n_seqlets = self.n_seqlets(*pattern_name.split("/"))
    pattern.name = f"{shorten_pattern(pattern_name)} ({n_seqlets})"
    if rc:
        pattern = pattern.rc()
    return pattern.plot(kind,
                        letter_width=letter_width,
                        height=height,
                        rotate_y=rotate_y,
                        ylab=ylab)
def shorten_te_pattern(s):
    """Shorten a '<tf>/<pattern>' identifier, keeping the TF prefix as-is."""
    tf, rest = s.split("/", 1)
    return "/".join([tf, shorten_pattern(rest)])
def load_instances(parq_file, motifs=None, dedup=True):
    """Load pattern instances from the parquet file

    Args:
        parq_file: parquet file of motif instances (an already-loaded
            pandas.DataFrame is also accepted)
        motifs: dictionary of motifs of interest.
            key=custom motif name, value=short pattern name (e.g. 'm0_p3')
        dedup: if True, drop instances mapping to the same absolute interval
            (pattern, chrom, start, end, strand)

    Returns:
        pandas.DataFrame of instances with added columns:
        pattern_short, pattern_name (only when `motifs` is given),
        pattern_start_abs, pattern_end_abs
    """
    if motifs is not None:
        incl_motifs = {longer_pattern(m) for m in motifs.values()}
    else:
        incl_motifs = None

    if isinstance(parq_file, pd.DataFrame):
        dfi = parq_file
    else:
        if motifs is not None:
            from fastparquet import ParquetFile
            # Selectively load only the relevant patterns
            pf = ParquetFile(str(parq_file))
            if 'dir0' in pf.cats:
                # fix the wrong patterns: hive partitioning split
                # '<metacluster>/<pattern>' across dir0/dir1
                metaclusters = list(
                    {'pattern=' + x.split("/")[0] for x in incl_motifs})
                patterns = list({x.split("/")[1] for x in incl_motifs})
                dfi = pf.to_pandas(
                    filters=[("dir0", "in", metaclusters),
                             ("dir1", "in", patterns)])
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)
                del dfi['dir0']
                del dfi['dir1']
            else:
                dfi = pf.to_pandas(filters=[('pattern', 'in', list(incl_motifs))])
        else:
            dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
            if 'pattern' not in dfi:
                # assumes a hive-stored file
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)

    # filter
    if motifs is not None:
        # NOTE this should already be removed by the selective load above.
        # .copy() makes the filtered result an independent frame so the
        # column assignments below don't hit SettingWithCopyWarning.
        dfi = dfi[dfi.pattern.isin(incl_motifs)].copy()
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k) for k in incl_motifs})
        dfi['pattern_name'] = dfi['pattern_short'].map(
            {v: k for k, v in motifs.items()})
    else:
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k) for k in dfi.pattern.unique()})

    # add some columns: absolute coordinates of each instance
    dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start']
    dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end']

    if dedup:
        # deduplicate
        dfi_dedup = dfi.drop_duplicates([
            'pattern', 'example_chrom', 'pattern_start_abs',
            'pattern_end_abs', 'strand'
        ])
        # number of removed duplicates
        d = len(dfi) - len(dfi_dedup)
        print("number of de-duplicated instances:", d,
              f"({d / len(dfi) * 100}%)")
        # use de-duplicated instances from now on
        dfi = dfi_dedup
    return dfi
def short_name(self):
    """Return the shortened form of this pattern's name."""
    full_name = self.name
    return shorten_pattern(full_name)