Example #1
import numpy as np
import pandas as pd
from tqdm import tqdm

from bpnet.modisco.utils import shorten_pattern


def append_logo_cluster(pattern_table, patterns, cluster_order, cluster,
                        align_track='contrib/mean', logo_len=30, **kwargs):
    # setup patterns
    pattern_names = np.array([shorten_pattern(p.name) for p in patterns])
    patterns_nte_dict = {shorten_pattern(p.name): p for p in patterns}  # organize as a dict

    # subset the summary table to the provided patterns (indexed by pattern name)
    pattern_table_nte = pattern_table.set_index('pattern').loc[pattern_names].copy()
    pattern_table_nte['cluster'] = cluster
    pattern_table_nte['cluster_order'] = cluster_order

    # pattern_table_nte = pattern_table_nte.iloc[cluster_order]  # sort the whole table
    out = []
    for cluster_id in tqdm(pattern_table_nte.cluster.unique()):
        dfg = pattern_table_nte[pattern_table_nte.cluster == cluster_id].copy()

        # identify the major pattern (the one with the most seqlets)
        major_pattern_name = dfg['n seqlets'].idxmax()
        major_pattern = patterns_nte_dict[major_pattern_name]

        # align each pattern w.r.t. the major pattern on the track used for clustering
        def aligned_logo(p, kind):
            return (patterns_nte_dict[p]
                    .align(major_pattern, track=align_track)
                    .resize(logo_len)
                    .vdom_plot(kind, as_html=True, **kwargs))

        logo_contrib = [aligned_logo(p, 'contrib') for p in dfg.index]
        logo_seq = [aligned_logo(p, 'seq') for p in dfg.index]

        dfg['logo_contrib'] = logo_contrib
        dfg['logo_seq'] = logo_seq
        out.append(dfg)

    return pd.concat(out, axis=0).reset_index()
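
A hedged usage sketch for the function above: `patterns` is assumed to be a list of modisco pattern objects, `pattern_table` a per-pattern summary table with 'pattern' and 'n seqlets' columns, and the cluster assignment is illustrated with scipy hierarchical clustering on a hypothetical condensed distance matrix `dist`.

# Illustrative only: `dist`, `pattern_table` and `patterns` are assumed to exist already
import scipy.cluster.hierarchy as hc

lm = hc.linkage(dist, method='ward')
cluster = hc.fcluster(lm, t=10, criterion='maxclust')  # cluster id per pattern
cluster_order = hc.leaves_list(lm)                     # dendrogram leaf order

df_logos = append_logo_cluster(pattern_table, patterns,
                               cluster_order=cluster_order,
                               cluster=cluster,
                               align_track='contrib/mean',
                               logo_len=30)
# -> DataFrame with HTML 'logo_contrib' and 'logo_seq' columns, grouped by cluster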
Example #2
def dfi_row2seqlet(row, short_name=False):
    """Convert a single row of the motif-instance DataFrame (dfi) into a Seqlet."""
    return Seqlet(
        row.example_idx,
        row.pattern_start,
        row.pattern_end,
        name=shorten_pattern(row.pattern) if short_name else row.pattern,
        strand=row.strand)
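
A minimal sketch of applying the converter above across a motif-instance table `dfi` (e.g. the output of `load_instances` in Example #5):

# turn every instance row into a Seqlet object (with shortened pattern names)
seqlets = [dfi_row2seqlet(row, short_name=True)
           for _, row in dfi.iterrows()]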
Example #3
def footprint_df(footprints, dfl=None, width=120, **kwargs):
    """Draw footprints sparklines into a pandas.DataFrame

    Args:
      footprints: footprint dict with `<pattern>/<task>` nested structure
        each node contains an array of shape (seq_len, 2)
      dfl: optional pandas.DataFrame of labels with columns:
        `pattern` and one `<task>/l` label column per task
      width: pixel width of the rendered sparkline images
      **kwargs: additional kwargs to pass to vdom_footprint

    Returns:
      pd.DataFrame with one row per pattern, containing a `pattern` column
      and one HTML sparkline column per task
    """
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    from bpnet.modisco.utils import shorten_pattern

    def map_label(l):
        """Label -> short-name
        """
        # TODO - get rid of this function
        if l is None:
            return "/"
        else:
            return l[0].upper()

    tasks = list(footprints[list(footprints)[0]].keys())
    profile_max_median = {
        task: np.median([np.max(v[task]) for v in footprints.values()])
        for task in tasks
    }
    out = []

    for p, arr_d in tqdm(footprints.items()):
        try:
            labels = dfl[dfl.pattern == shorten_pattern(p)].iloc[0].to_dict()
        except Exception:
            # no labels available (dfl is None or the pattern has no label row)
            labels = {t + "/l": None for t in tasks}
        d = {
            task:
            vdom_footprint(arr_d[task],
                           r_height=profile_max_median[task],
                           text=map_label(labels[task + "/l"]),
                           **kwargs).to_html().replace("<img",
                                                       f"<img width={width}")
            for task in tasks
        }
        d['pattern'] = shorten_pattern(p)
        out.append(d)
    return pd.DataFrame(out)
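
A usage sketch, assuming `footprints` is the nested `{pattern: {task: array}}` dict described in the docstring; because the cells contain raw `<img>` HTML, displaying the table in a notebook needs `escape=False`:

from IPython.display import HTML

df_fp = footprint_df(footprints, dfl=None, width=120)
# escape=False keeps the embedded <img> sparklines intact
HTML(df_fp.to_html(escape=False))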
Example #4
def plot_pattern(self,
                 pattern_name,
                 kind='all',
                 rc=False,
                 trim_frac=None,
                 letter_width=0.2,
                 height=0.8,
                 rotate_y=0,
                 ylab=True):
    pattern = self.get_pattern(pattern_name)
    pattern = pattern.trim_seq_ic(trim_frac)
    ns = self.n_seqlets(pattern_name)
    pattern.name = shorten_pattern(pattern_name) + f" ({ns})"
    if rc:
        pattern = pattern.rc()
    return pattern.plot(kind,
                        letter_width=letter_width,
                        height=height,
                        rotate_y=rotate_y,
                        ylab=ylab)
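
A hedged call sketch: since the method uses `self.get_pattern` and `self.n_seqlets`, it is assumed to live on a `ModiscoFile`-like object (see Example #6); the path and pattern name below are placeholders.

mf = ModiscoFile("modisco/modisco.h5")      # placeholder path
mf.plot_pattern("metacluster_0/pattern_0",  # long-format pattern name (illustrative)
                kind='all',
                rc=False,
                trim_frac=0.08)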
Example #5
import pandas as pd

from bpnet.modisco.utils import longer_pattern, shorten_pattern


def load_instances(parq_file, motifs=None, dedup=True, verbose=True):
    """Load pattern instances from the parquet file

    Args:
      parq_file: parquet file of motif instances (or an already loaded pd.DataFrame)
      motifs: dictionary of motifs of interest.
        key=custom motif name, value=short pattern name (e.g. {'Nanog': 'm0_p3'})
      dedup: if True, drop duplicated instances (same pattern, chromosome,
        absolute start/end and strand)
      verbose: if True, report how many duplicated instances were removed
    """
    if motifs is not None:
        incl_motifs = {longer_pattern(m) for m in motifs.values()}
    else:
        incl_motifs = None

    if isinstance(parq_file, pd.DataFrame):
        dfi = parq_file
    else:
        if motifs is not None:
            from fastparquet import ParquetFile

            # Selectively load only the relevant patterns
            pf = ParquetFile(str(parq_file))
            patterns = [shorten_pattern(pn) for pn in incl_motifs]
            dfi = pf.to_pandas(filters=[("pattern_short", "in", patterns)])
        else:
            dfi = pd.read_parquet(str(parq_file), engine='fastparquet')
            if 'pattern' not in dfi:
                # assumes a hive-stored file
                dfi['pattern'] = dfi['dir0'].str.replace(
                    "pattern=", "").astype(str) + "/" + dfi['dir1'].astype(str)

    # filter
    if motifs is not None:
        # NOTE: this filter should already be redundant (the patterns were selected at load time)
        dfi = dfi[dfi.pattern.isin(incl_motifs)]
        if 'pattern_short' not in dfi:
            dfi['pattern_short'] = dfi['pattern'].map(
                {k: shorten_pattern(k)
                 for k in incl_motifs})
        dfi['pattern_name'] = dfi['pattern_short'].map(
            {v: k
             for k, v in motifs.items()})
    else:
        dfi['pattern_short'] = dfi['pattern'].map(
            {k: shorten_pattern(k)
             for k in dfi.pattern.unique()})

    # add some columns if they don't yet exist
    if 'pattern_start_abs' not in dfi:
        dfi['pattern_start_abs'] = dfi['example_start'] + dfi['pattern_start']
    if 'pattern_end_abs' not in dfi:
        dfi['pattern_end_abs'] = dfi['example_start'] + dfi['pattern_end']

    if dedup:
        # deduplicate
        dfi_dedup = dfi.drop_duplicates([
            'pattern', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs',
            'strand'
        ])

        # number of removed duplicates
        d = len(dfi) - len(dfi_dedup)
        if verbose:
            print("number of removed duplicate instances:", d,
                  f"({d / len(dfi) * 100:.2f}%)")

        # use de-duplicated instances from now on
        dfi = dfi_dedup
    return dfi
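
A sketch of loading instances for a motif of interest, following the `{'Nanog': 'm0_p3'}` convention from the docstring (the file path is a placeholder):

motifs = {'Nanog': 'm0_p3'}  # custom motif name -> short pattern name
dfi = load_instances("motif-instances.parq", motifs=motifs,
                     dedup=True, verbose=True)
# dfi now carries pattern_name, pattern_short and absolute coordinate columns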
Example #6
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning.
    """
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not any(output_file.endswith(suffix) for suffix in valid_suffixes):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}"
        )

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file),
                                    'cwm-scan.kwargs.json')
    write_json(
        dict(modisco_dir=os.path.abspath(str(modisco_dir)),
             output_file=str(output_file),
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out which contribution score type was used for the modisco run
    modisco_kwargs = read_json(
        os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        # cache the contribution scores since they are re-used by `cwm_scan_seqlets` below
        cf.cache()
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    scan_patterns = patterns.split(",") if patterns != 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip patterns that were not selected for scanning
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            contribution,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resuling pd.DataFrame of shape {dfp.shape} to {output_file}"
    )

    # put the BED-style columns first (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write to a parquet file
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file,
                       partition_on=['pattern_short'],
                       engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        logger.warning("File suffix not recognized. Falling back to a gzip-compressed csv")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")