def append_logo_cluster(pattern_table,
                        patterns,
                        cluster_order,
                        cluster,
                        align_track='contrib/mean',
                        logo_len=30,
                        **kwargs):
    """Append HTML logo columns (`logo_imp`, `logo_seq`) to the pattern table.

    Within each cluster, every pattern is aligned to the cluster's major
    pattern (the one with the most seqlets), resized to `logo_len` and
    rendered to an inline-HTML logo.

    Args:
      pattern_table: pandas.DataFrame with a 'pattern' column and an
        'n seqlets' column
      patterns: list of pattern objects (each with `.name`, `.align`,
        `.resize`, `.vdom_plot`)
      cluster_order: ordering of the patterns (stored as a column)
      cluster: cluster id per pattern (stored as a column)
      align_track: track used for the pairwise alignment
      logo_len: width (positions) to resize each logo to
      **kwargs: forwarded to `vdom_plot`

    Returns:
      pandas.DataFrame with added columns: cluster, cluster_order,
      logo_imp, logo_seq
    """
    # setup patterns
    pattern_names = np.array([shorten_pattern(p.name) for p in patterns])
    patterns_nte_dict = {shorten_pattern(p.name): p
                         for p in patterns}  # organize as a dict

    pattern_table = pattern_table.set_index('pattern')
    pattern_table_nte = pattern_table.loc[pattern_names]
    pattern_table = pattern_table.reset_index()
    pattern_table_nte['cluster'] = cluster
    pattern_table_nte['cluster_order'] = cluster_order

    out = []
    for cluster_id in tqdm(pattern_table_nte.cluster.unique()):
        # .copy() so the logo column assignments below don't hit
        # SettingWithCopyWarning on a view of pattern_table_nte
        dfg = pattern_table_nte[pattern_table_nte.cluster == cluster_id].copy()

        # identify the major pattern
        # BUG FIX: `argmax()` returns a *positional* integer in modern pandas,
        # which is not a valid key of patterns_nte_dict; `idxmax()` returns
        # the index label (the short pattern name) the dict is keyed by.
        max_seqlets = dfg['n seqlets'].idxmax()
        major_pattern = patterns_nte_dict[max_seqlets]

        def _logo(name, track_kind):
            """Align pattern `name` to the major pattern and render one logo."""
            return (patterns_nte_dict[name]
                    .align(major_pattern, track=align_track)
                    .resize(logo_len)
                    .vdom_plot(track_kind, as_html=True, **kwargs))

        # align w.r.t. the thing used for clustering
        dfg['logo_imp'] = [_logo(p, 'contrib') for p in dfg.index]
        dfg['logo_seq'] = [_logo(p, 'seq') for p in dfg.index]
        out.append(dfg)

    return pd.concat(out, axis=0).reset_index()
def dfi_row2seqlet(row, short_name=False):
    """Build a Seqlet from one row of a pattern-instance DataFrame.

    Args:
      row: namedtuple/Series with example_idx, pattern_start, pattern_end,
        pattern and strand fields
      short_name: if True, store the shortened pattern name on the Seqlet
    """
    seqlet_name = shorten_pattern(row.pattern) if short_name else row.pattern
    return Seqlet(row.example_idx,
                  row.pattern_start,
                  row.pattern_end,
                  name=seqlet_name,
                  strand=row.strand)
# Beispiel #3
# 0
def footprint_df(footprints, dfl=None, width=120, **kwargs):
    """Draw footprints sparklines into a pandas.DataFrame

    Args:
      footprints: footprint dict with `<pattern>/<task>` nested structure
        each node contains an array of shape (seq_len, 2)
      dfl: optional pandas.DataFrame of labels. Contains columns:
        pattern <task>/l
      width: width of the final plot
      **kwargs: additional kwargs to pass to vdom_footprint

    Returns:
      pandas.DataFrame with one row per pattern: an HTML sparkline column
      per task plus a 'pattern' column (shortened name)
    """
    from tqdm import tqdm
    from basepair.modisco.utils import shorten_pattern

    def map_label(label):
        """Label -> short single-character name ('/' when missing)."""
        # TODO - get rid of this function
        if label is None:
            return "/"
        else:
            return label[0].upper()

    # task names, taken from the first pattern's entry
    tasks = list(next(iter(footprints.values())).keys())
    # per-task scaling reference: median (across patterns) of the max signal
    profile_max_median = {
        task: np.median([np.max(v[task]) for v in footprints.values()])
        for task in tasks
    }
    # fallback labels when no label table / no matching row exists
    empty_labels = {t + "/l": None for t in tasks}
    out = []

    for p, arr_d in tqdm(footprints.items()):
        # BUG FIX: the former broad `except Exception` silently swallowed
        # *any* error (including real bugs) just to cover dfl being None or
        # having no matching row; handle those two cases explicitly instead.
        if dfl is None:
            labels = empty_labels
        else:
            matches = dfl[dfl.pattern == shorten_pattern(p)]
            labels = matches.iloc[0].to_dict() if len(matches) else empty_labels
        d = {
            task:
            vdom_footprint(arr_d[task],
                           r_height=profile_max_median[task],
                           text=map_label(labels[task + "/l"]),
                           **kwargs).to_html().replace("<img",
                                                       f"<img width={width}")
            for task in tasks
        }
        d['pattern'] = shorten_pattern(p)
        out.append(d)
    return pd.DataFrame(out)
# Beispiel #4
# 0
def pattern_features(pattern, data):
    """Assemble the summary-feature columns for a single pattern.

    Returns:
      OrderedDict of columns for pandas dataFrame
    """
    # fixed per-pattern columns
    base_cols = [
        ("pattern", shorten_pattern(pattern)),
        ("logo pwm", logo_pwm(pattern, data)),
        ("logo imp", logo_imp(pattern, data)),
        ("n seqlets", len(data.get_seqlets(pattern))),
        ("ic pwm mean", pwm_mean_ic(pattern, data)),
    ]
    # one group of columns per task
    task_cols = [
        col
        for task in data.get_tasks()
        for col in pattern_task_features(pattern, task, data)
    ]
    tail_cols = [("consensus", consensus(pattern, data))]
    return OrderedDict(base_cols + task_cols + tail_cols)
 def plot_pattern(self,
                  pattern_name,
                  kind='all',
                  rc=False,
                  trim_frac=None,
                  letter_width=0.2,
                  height=0.8,
                  rotate_y=0,
                  ylab=True):
     """Plot one pattern, trimmed by information content and annotated
     with its seqlet count.

     Args:
       pattern_name: '<metacluster>/<pattern>'-style name
       kind: which tracks to plot (passed through to `pattern.plot`)
       rc: if True, reverse-complement the pattern before plotting
       trim_frac: IC trimming fraction passed to `trim_seq_ic`
       letter_width, height, rotate_y, ylab: plot styling options
     """
     pat = self.get_pattern(pattern_name).trim_seq_ic(trim_frac)
     n_seqlets = self.n_seqlets(*pattern_name.split("/"))
     # title carries the shortened name plus the seqlet count
     pat.name = shorten_pattern(pattern_name) + f" ({n_seqlets})"
     if rc:
         pat = pat.rc()
     return pat.plot(kind,
                     letter_width=letter_width,
                     height=height,
                     rotate_y=rotate_y,
                     ylab=ylab)
# Beispiel #6
# 0
def shorten_te_pattern(s):
    """Shorten a '<tf>/<pattern>' name, leaving the tf prefix untouched.

    Splits on the first '/' only, so the pattern part may itself
    contain slashes.
    """
    tf, rest = s.split("/", 1)
    return f"{tf}/{shorten_pattern(rest)}"
def load_instances(parq_file, motifs=None, dedup=True):
    """Load pattern instances from the parquet file

    Args:
      parq_file: parquet file of motif instances, or an already-loaded
        pandas.DataFrame
      motifs: dictionary of motifs of interest.
        key=custom motif name, value=short pattern name (e.g. 'm0_p3')
      dedup: if True, drop duplicate instances (same pattern, chromosome,
        absolute coordinates and strand)

    Returns:
      pandas.DataFrame with added columns: pattern_short,
      pattern_name (only when `motifs` is given),
      pattern_start_abs, pattern_end_abs
    """
    if motifs is not None:
        incl_motifs = {longer_pattern(m) for m in motifs.values()}
    else:
        incl_motifs = None

    if isinstance(parq_file, pd.DataFrame):
        dfi = parq_file
    else:
        if motifs is not None:
            from fastparquet import ParquetFile
            # Selectively load only the relevant patterns
            pf = ParquetFile(str(parq_file))
            if 'dir0' in pf.cats:
                # hive-partitioned file: reconstruct the full pattern name
                # from the dir0/dir1 partition columns
                metaclusters = list(
                    {'pattern=' + x.split("/")[0]
                     for x in incl_motifs})
                patterns = list({x.split("/")[1] for x in incl_motifs})
                dfi = pf.to_pandas(
                    filters=[("dir0", "in", metaclusters),
                             ("dir1", "in", patterns)])
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)
                del dfi['dir0']
                del dfi['dir1']
            else:
                dfi = pf.to_pandas(filters=[('pattern', 'in',
                                             list(incl_motifs))])

        else:
            dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
            if 'pattern' not in dfi:
                # assumes a hive-stored file
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)

    # filter
    if motifs is not None:
        # .copy() so the column assignments below operate on a real frame,
        # not a view (avoids SettingWithCopyWarning / silent no-ops)
        dfi = dfi[dfi.pattern.isin(
            incl_motifs)].copy()  # NOTE this should already be removed
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k)
             for k in incl_motifs})
        dfi['pattern_name'] = dfi['pattern_short'].map(
            {v: k
             for k, v in motifs.items()})
    else:
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k)
             for k in dfi.pattern.unique()})

    # add absolute genomic coordinates
    dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start']
    dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end']

    if dedup:
        # deduplicate
        dfi_dedup = dfi.drop_duplicates([
            'pattern', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs',
            'strand'
        ])

        # number of removed duplicates
        # BUG FIX: guard against an empty table (former code raised
        # ZeroDivisionError) and format the percentage readably instead of
        # printing a raw float
        d = len(dfi) - len(dfi_dedup)
        pct = d / len(dfi) * 100 if len(dfi) else 0.0
        print("number of de-duplicated instances:", d, f"({pct:.2f}%)")

        # use de-duplicated instances from now on
        dfi = dfi_dedup
    return dfi
# Beispiel #8
# 0
 def short_name(self):
     """Return the shortened form of this pattern's name."""
     full_name = self.name
     return shorten_pattern(full_name)