Beispiel #1
0
def cwm_scan_seqlets(modisco_dir,
                     output_file,
                     trim_frac=0.08,
                     num_workers=1,
                     contribsf=None,
                     verbose=False):
    """Compute the cwm scanning scores of the original modisco seqlets
    """
    from bpnet.modisco.table import ModiscoData
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    add_file_logging(os.path.dirname(output_file), logger, 'cwm_scan_seqlets')

    # figure out contrib_wildcard
    mf = ModiscoFile(modisco_dir / "modisco.h5")

    if contribsf is None:
        contrib = ContribFile.from_modisco_dir(modisco_dir)
    else:
        contrib = contribsf

    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    dfi_list = []

    for pattern_name in tqdm(mf.pattern_names()):
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        seqlets = mf._get_seqlets(pattern_name, trim_frac=trim_frac)

        # scan only the existing locations of the seqlets instead of the full sequences
        # to obtain the distribution
        stacked_seqlets = contrib.extract(seqlets)

        match, contribution = pattern.scan_contribution(
            stacked_seqlets.contrib,
            hyp_contrib=None,
            tasks=tasks,
            n_jobs=num_workers,
            verbose=False,
            pad_mode=None)
        seq_match = pattern.scan_seq(stacked_seqlets.seq,
                                     n_jobs=num_workers,
                                     verbose=False,
                                     pad_mode=None)

        dfm = pattern.get_instances(tasks,
                                    match,
                                    contribution,
                                    seq_match,
                                    fdr=1,
                                    verbose=verbose,
                                    plot=verbose)
        dfm = dfm[dfm.seq_match > 0]

        dfi_list.append(dfm)
    df = pd.concat(dfi_list)
    df.to_csv(output_file)
Beispiel #2
0
def modisco_export_seqlets(modisco_dir, output_dir, trim_frac=0.08):
    from pybedtools import Interval
    from bpnet.modisco.files import ModiscoFile
    add_file_logging(output_dir, logger, 'modisco_export_seqlets')
    ranges = load_ranges(modisco_dir)
    example_intervals = [
        Interval(row.chrom, row.start, row.end)
        for i, row in ranges.iterrows()
    ]

    r = ModiscoFile(os.path.join(modisco_dir, "modisco.h5"))
    r.export_seqlets_bed(output_dir,
                         example_intervals=example_intervals,
                         position='absolute',
                         trim_frac=trim_frac)
    r.close()
Beispiel #3
0
def modisco_export_patterns(modisco_dir, output_file, contribsf=None):
    """Export patterns to a pkl file. Don't cluster them

    Adds `stacked_seqlet_contrib` and `n_seqlets` to pattern `attrs`

    Args:
      modisco_dir: modisco directory containing
      output_file: output file path for patterns.pkl
    """
    from bpnet.cli.contrib import ContribFile

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)

    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    patterns = [mf.get_pattern(pname) for pname in mf.pattern_names()]

    if contribsf is None:
        contrib_file = ContribFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ContribFile into memory")
        contrib_file.cache()
    else:
        logger.info("Using the provided ContribFile")
        contrib_file = contribsf

    logger.info("Extracting profile and contribution scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()

        # get seqlets
        valid_seqlets = mf._get_seqlets(p.name)

        # extract the contribution scores
        sti = contrib_file.extract(valid_seqlets, profile_width=None)
        sti.dfi = mf.get_seqlet_intervals(p.name, as_df=True)
        p.attrs['stacked_seqlet_contrib'] = sti
        p.attrs['n_seqlets'] = mf.n_seqlets(p.name)
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
parser.add_argument(
    '--task',
    dest='task',
    help=
    'name of the BPNET task whose predictions are being analyzed; typically the name of a TF'
)
args = parser.parse_args()
model_dir = args.model_dir
task = [args.task]

model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'

## MultipleModiscoResult is a convenience wrapper around ModiscoResult
mf = ModiscoFileGroup(
    {t: ModiscoFile(modisco_dir / t / 'modisco.h5')
     for t in task})

## create directories for plots: each dir corresponds to a different task/metacluster/
task_metacluster_dirs = []
for i in mf.pattern_names():
    dir_name_items = i.split('/')
    dir_name = '/'.join(dir_name_items[:len(dir_name_items) - 1])
    task_metacluster_dirs.append(dir_name)
task_metacluster_dirs = list(set(task_metacluster_dirs))
for t in task_metacluster_dirs:
    Path(f"{modisco_dir}/modisco_plots/{t}").mkdir(parents=True, exist_ok=True)

## Plot all the patterns
fig_names = []
for p in mf.patterns():
    '--task',
    dest='task',
    help=
    'name of the BPNET task whose predictions are being analyzed; typically the name of a TF'
)
args = parser.parse_args()
model_dir = args.model_dir
tasks = [args.task]

model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'

for task in tasks:
    tomtom_dir = modisco_dir / task / 'tomtom'
    tomtom_dir.mkdir(parents=True, exist_ok=True)
    modisco_file = modisco_dir / task / 'modisco.h5'
    mf = ModiscoFile(modisco_file)
    for pattern_name in mf.pattern_names():
        pattern = mf.get_pattern(pattern_name)
        matches = pattern.fetch_tomtom_matches(
            motifs_db='meme_db/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme')
        matches_df = pd.DataFrame(
            columns=['Target ID', 'p-value', 'E-value', 'q-value'])
        i = 0
        for match in matches:
            new_row = pd.DataFrame(match, index=[i])
            matches_df = pd.concat([matches_df, new_row])
            i = i + 1
        pattern_name = pattern_name.replace('/', '_')
        matches_df.to_csv(tomtom_dir / f'{pattern_name}.tsv', sep='\t')
from bpnet.modisco.files import ModiscoFile
import pandas as pd
from pathlib import Path
import sys

model_dir = sys.argv[1]
model_dir_path = Path(model_dir)
modisco_dir = model_dir_path / 'modisco'

tasks = sys.argv[2:]

mf = ModiscoFile(modisco_file)

# export pssm for all tasks
for task in tasks:
    modisco_task_dir = modisco_dir / task
    modisco_file = modisco_task_dir / 'modisco.h5'
    mf = ModiscoFile(modisco_file)
    for i in range(0, len(mf.pattern_names())):
        pattern_name = mf.pattern_names()[i]
        pssm = mf.get_pssm(pattern_name)
        pssm = pd.DataFrame(pssm)
        pssm.to_csv(f'{modisco_task_dir}/{pattern_name}.tsv', sep='\t', index=False, header=False)
Beispiel #7
0
def pattern(modisco_dir):
    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    return mf.get_pattern("metacluster_0/pattern_0")
Beispiel #8
0
def mf(modisco_dir):
    """ModiscoFile
    """
    from bpnet.modisco.files import ModiscoFile
    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    return mf
Beispiel #9
0
def chip_nexus_analysis(modisco_dir,
                        trim_frac=0.08,
                        num_workers=20,
                        run_cwm_scan=False,
                        force=False,
                        footprint_width=200):
    """Compute all the results for modisco specific for ChIP-nexus/exo data. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_export_patterns
    - cwm_scan
    - modisco_export_seqlets

    Note:
      All the sub-commands are only executed if they have not been ran before. Use --force override this.
      Whether the commands have been run before is deterimined by checking if the following file exists:
        `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from bpnet.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the contribution scores used
    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    contrib_scores = kwargs["contrib_file"]

    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    all_patterns = mf.pattern_names()
    mf.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch modisco-chip.html for snakemake
        open(modisco_dir / 'modisco-chip.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ContribFile and pass it to all the functions
        logger.info("Loading ContribFile")
        contribsf = ContribFile.from_modisco_dir(modisco_dir)
        contribsf.cache()
    else:
        contribsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     heatmap_width=footprint_width,
                     figsize=(10, 10),
                     contribsf=contribsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("modisco-chip.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir,
                      contrib_scores,
                      modisco_dir,
                      report_url=None,
                      contribsf=contribsf,
                      footprint_width=footprint_width)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_export_patterns').done():
        modisco_export_patterns(modisco_dir,
                                output_file=modisco_dir / 'patterns.pkl',
                                contribsf=contribsf)
        cr.write()
    sync.append("patterns.pkl")

    # --------------------------------------------
    # Finding new instances
    if run_cwm_scan:
        if not cr.set_cmd('cwm_scan').done():
            cwm_scan(modisco_dir,
                     modisco_dir / 'instances.bed.gz',
                     trim_frac=trim_frac,
                     contrib_file=None,
                     num_workers=num_workers)
            cr.write()

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco_export_seqlets').done():
        modisco_export_seqlets(str(modisco_dir),
                               str(modisco_dir / 'seqlets'),
                               trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
Beispiel #10
0
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning.
    """
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not any([output_file.endswith(suffix) for suffix in valid_suffixes]):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}"
        )

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file),
                                    'cwm-scan.kwargs.json')
    write_json(
        dict(modisco_dir=os.path.abspath(str(contrib_file)),
             output_file=str(output_file),
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(
        os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        cf.cache(
        )  # cache it since it can be re-used in `modisco_centroid_seqlet_matches`
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    scan_patterns = patterns.split(
        ",") if patterns is not 'all' else mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip scanning that patterns
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            contribution,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resuling pd.DataFrame of shape {dfp.shape} to {output_file}"
    )

    # set the first 7 columns to comply to bed6 format (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write to a parquet file
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file,
                       partition_on=['pattern_short'],
                       engine='fastparquet')
    elif output_file.endswith(".csv.gz") or output_file.endswith(".csv"):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith(".tsv.gz") or output_file.endswith(".tsv"):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith(".bed.gz") or output_file.endswith(".bed"):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        logger.warn("File suffix not recognized. Using .csv.gz file format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
Beispiel #11
0
def modisco_plot(
        modisco_dir,
        output_dir,
        # filter_npy=None,
        # ignore_dist_filter=False,
        heatmap_width=200,
        figsize=(10, 10),
        contribsf=None):
    """Plot the results of a modisco run

    Args:
      modisco_dir: modisco directory
      output_dir: Output directory for writing the results
      figsize: Output figure size
      contribsf: [optional] modisco contribution score file (ContribFile)
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from bpnet.plot.vdom import write_heatmap_pngs
    from bpnet.plot.profiles import plot_profiles
    from bpnet.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")

    if contribsf is not None:
        d = contribsf
    else:
        d = ContribFile.from_modisco_dir(modisco_dir)
        logger.info("Loading the contribution scores")
        d.cache()  # load all

    thr_one_hot = d.get_seq()
    # thr_hypothetical_contribs
    tracks = d.get_profiles()
    thr_hypothetical_contribs = dict()
    thr_contrib_scores = dict()
    # TODO - generalize this
    thr_hypothetical_contribs['profile'] = d.get_hyp_contrib()
    thr_contrib_scores['profile'] = d.get_contrib()

    tasks = d.get_tasks()

    # Count contribution (if it exists)
    if d.contains_contrib_score("counts/pre-act"):
        count_contrib_score = "counts/pre-act"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
            contrib_score=count_contrib_score)
        thr_contrib_scores['count'] = d.get_contrib(
            contrib_score=count_contrib_score)
    elif d.contains_contrib_score("count"):
        count_contrib_score = "count"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
            contrib_score=count_contrib_score)
        thr_contrib_scores['count'] = d.get_contrib(
            contrib_score=count_contrib_score)
    else:
        # Don't do anything
        pass

    thr_hypothetical_contribs = OrderedDict(
        flatten(thr_hypothetical_contribs, separator='/'))
    thr_contrib_scores = OrderedDict(flatten(thr_contrib_scores,
                                             separator='/'))
    # -------------------------------------------------

    all_seqlets = mf.seqlets()
    all_patterns = mf.pattern_names()
    if len(all_patterns) == 0:
        print("No patterns found")
        return

    # 1. Plots with tracks and contrib scores
    print("Writing results for contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks=tracks,
                  contribution_scores=thr_contrib_scores,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=.5,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir /
                                     "{pattern}/agg_profile_contribcores"),
                  mkdir=True,
                  figsize=figsize)

    # 2. Plots only with hypothetical contrib scores
    print("Writing results for hypothetical contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks={},
                  contribution_scores=thr_hypothetical_contribs,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=1,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir /
                                     "{pattern}/agg_profile_hypcontribscores"),
                  figsize=figsize)

    print("Plotting heatmaps")
    for pattern in tqdm(all_patterns):
        write_heatmap_pngs(all_seqlets[pattern],
                           d,
                           tasks,
                           pattern,
                           output_dir=str(output_dir / pattern),
                           resize_width=heatmap_width)

    mf.close()