Ejemplo n.º 1
0
def modisco_export_seqlets(modisco_dir, output_dir, trim_frac=0.08):
    """Export the seqlets of a modisco run via `ModiscoFile.export_seqlets_bed`.

    Args:
      modisco_dir: modisco directory; `modisco.h5` is read from it and the
        example ranges are loaded from it with `load_ranges`
      output_dir: output directory passed to `export_seqlets_bed` (and to
        `add_file_logging`)
      trim_frac: trimming fraction forwarded to `export_seqlets_bed`
    """
    from pybedtools import Interval
    from bpnet.modisco.files import ModiscoFile
    add_file_logging(output_dir, logger, 'modisco_export_seqlets')
    ranges = load_ranges(modisco_dir)
    # One genomic interval per row of the ranges table
    example_intervals = [
        Interval(row.chrom, row.start, row.end)
        for _, row in ranges.iterrows()
    ]

    r = ModiscoFile(os.path.join(modisco_dir, "modisco.h5"))
    try:
        r.export_seqlets_bed(output_dir,
                             example_intervals=example_intervals,
                             position='absolute',
                             trim_frac=trim_frac)
    finally:
        # Close the modisco file even if the export fails
        r.close()
Ejemplo n.º 2
0
def chip_nexus_analysis(modisco_dir,
                        trim_frac=0.08,
                        num_workers=20,
                        run_cwm_scan=False,
                        force=False,
                        footprint_width=200):
    """Compute all the results for modisco specific for ChIP-nexus/exo data. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_export_patterns
    - cwm_scan (only if `run_cwm_scan` is True)
    - modisco_export_seqlets

    Args:
      modisco_dir: modisco directory (must contain `modisco.h5` and
        `modisco-run.kwargs.json`)
      trim_frac: trimming fraction forwarded to `cwm_scan` and
        `modisco_export_seqlets`
      num_workers: number of workers forwarded to `cwm_scan`
      run_cwm_scan: if True, also run `cwm_scan`
      force: forwarded to `ConditionalRun`; see the note below
      footprint_width: forwarded as `heatmap_width` to `modisco_plot` and as
        `footprint_width` to `modisco_table`

    Note:
      All the sub-commands are only executed if they have not been run before. Use --force to override this.
      Whether the commands have been run before is determined by checking if the following file exists:
        `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from bpnet.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the contribution scores used
    kwargs = read_json(modisco_dir / "modisco-run.kwargs.json")
    contrib_scores = kwargs["contrib_file"]

    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    try:
        all_patterns = mf.pattern_names()
    finally:
        # Close the modisco file even if reading the pattern names fails
        mf.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch the expected output files for snakemake.
        open(modisco_dir / 'modisco-chip.html', 'a').close()
        # The seqlets directory may not exist yet (the export step has not
        # run at this point), so create it before touching the file in it.
        (modisco_dir / 'seqlets').mkdir(parents=True, exist_ok=True)
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    # Paths (relative to modisco_dir) listed in the rsync hint printed at the end
    sync = []
    # --------------------------------------------
    # NOTE(review): no step in this function writes a done-marker for
    # 'modisco_enrich_patterns'; unless something else does, this condition
    # is always true and the ContribFile is always loaded - verify.
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ContribFile and pass it to all the functions
        logger.info("Loading ContribFile")
        contribsf = ContribFile.from_modisco_dir(modisco_dir)
        contribsf.cache()
    else:
        contribsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     heatmap_width=footprint_width,
                     figsize=(10, 10),
                     contribsf=contribsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("modisco-chip.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir,
                      contrib_scores,
                      modisco_dir,
                      report_url=None,
                      contribsf=contribsf,
                      footprint_width=footprint_width)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_export_patterns').done():
        modisco_export_patterns(modisco_dir,
                                output_file=modisco_dir / 'patterns.pkl',
                                contribsf=contribsf)
        cr.write()
    sync.append("patterns.pkl")

    # --------------------------------------------
    # Finding new instances
    if run_cwm_scan:
        if not cr.set_cmd('cwm_scan').done():
            cwm_scan(modisco_dir,
                     modisco_dir / 'instances.bed.gz',
                     trim_frac=trim_frac,
                     contrib_file=None,
                     num_workers=num_workers)
            cr.write()

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco_export_seqlets').done():
        modisco_export_seqlets(str(modisco_dir),
                               str(modisco_dir / 'seqlets'),
                               trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
Ejemplo n.º 3
0
def modisco_plot(
        modisco_dir,
        output_dir,
        heatmap_width=200,
        figsize=(10, 10),
        contribsf=None):
    """Plot the results of a modisco run

    Args:
      modisco_dir: modisco directory
      output_dir: Output directory for writing the results
      heatmap_width: passed as `resize_width` to `write_heatmap_pngs`
      figsize: Output figure size
      contribsf: [optional] modisco contribution score file (ContribFile).
        If None, it is loaded from `modisco_dir` and cached.
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from bpnet.plot.vdom import write_heatmap_pngs
    from bpnet.plot.profiles import plot_profiles
    from bpnet.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mf = ModiscoFile(f"{modisco_dir}/modisco.h5")
    try:
        # Bail out before loading any contribution scores when there is
        # nothing to plot.
        all_patterns = mf.pattern_names()
        if len(all_patterns) == 0:
            print("No patterns found")
            return
        all_seqlets = mf.seqlets()

        if contribsf is not None:
            d = contribsf
        else:
            d = ContribFile.from_modisco_dir(modisco_dir)
            logger.info("Loading the contribution scores")
            d.cache()  # load all

        thr_one_hot = d.get_seq()
        tracks = d.get_profiles()
        tasks = d.get_tasks()

        # TODO - generalize this
        thr_hypothetical_contribs = {'profile': d.get_hyp_contrib()}
        thr_contrib_scores = {'profile': d.get_contrib()}

        # Count contribution (if it exists): use the first available name
        for count_contrib_score in ("counts/pre-act", "count"):
            if d.contains_contrib_score(count_contrib_score):
                thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
                    contrib_score=count_contrib_score)
                thr_contrib_scores['count'] = d.get_contrib(
                    contrib_score=count_contrib_score)
                break

        thr_hypothetical_contribs = OrderedDict(
            flatten(thr_hypothetical_contribs, separator='/'))
        thr_contrib_scores = OrderedDict(
            flatten(thr_contrib_scores, separator='/'))
        # -------------------------------------------------

        # 1. Plots with tracks and contrib scores
        print("Writing results for contribution scores")
        plot_profiles(all_seqlets,
                      thr_one_hot,
                      tracks=tracks,
                      contribution_scores=thr_contrib_scores,
                      legend=False,
                      flip_neg=True,
                      rotate_y=0,
                      seq_height=.5,
                      patterns=all_patterns,
                      n_bootstrap=100,
                      fpath_template=str(output_dir /
                                         "{pattern}/agg_profile_contribcores"),
                      mkdir=True,
                      figsize=figsize)

        # 2. Plots only with hypothetical contrib scores
        print("Writing results for hypothetical contribution scores")
        plot_profiles(all_seqlets,
                      thr_one_hot,
                      tracks={},
                      contribution_scores=thr_hypothetical_contribs,
                      legend=False,
                      flip_neg=True,
                      rotate_y=0,
                      seq_height=1,
                      patterns=all_patterns,
                      n_bootstrap=100,
                      fpath_template=str(output_dir /
                                         "{pattern}/agg_profile_hypcontribscores"),
                      figsize=figsize)

        print("Plotting heatmaps")
        for pattern in tqdm(all_patterns):
            write_heatmap_pngs(all_seqlets[pattern],
                               d,
                               tasks,
                               pattern,
                               output_dir=str(output_dir / pattern),
                               resize_width=heatmap_width)
    finally:
        # Close the modisco file on every exit path, including the early
        # return above and any plotting error.
        mf.close()