Example 1
    def load(cls, modisco_dir, imp_scores_h5, impsf=None):
        """Instantiate ModiscoData from tf-modisco run folder
        """
        del imp_scores_h5  # Unused
        from basepair.cli.imp_score import ImpScoreFile
        modisco_dir = Path(modisco_dir)

        # Load the importance scores and the data
        # d = HDF5Reader.load(imp_scores_h5)

        # load modisco
        mr = ModiscoResult(modisco_dir / "modisco.h5")
        mr.open()

        if impsf is not None:
            # Use the provided (already loaded) importance score file
            d = impsf
        else:
            d = ImpScoreFile.from_modisco_dir(modisco_dir)
            d.cache()  # cache the scores in memory
        # load included samples
        # included_samples = load_included_samples(modisco_dir)
        included_samples = None

        tasks = d.get_tasks()  # list(d['targets']['profile'].keys())
        return cls(mr, d, included_samples, tasks)
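A minimal usage sketch of the classmethod above; the class name ModiscoData is taken from the docstring and the run directory path is hypothetical:

data = ModiscoData.load("output/modisco", imp_scores_h5=None)  # imp_scores_h5 is unused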
Example 2
    def load(cls, modisco_dir):
        """Instantiate ModiscoData from tf-modisco run folder
        """
        kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
        d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
        included_samples = np.load(kwargs["filter_npy"])
        # load modisco
        mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
        mr.open()
        tasks = list(d['grads'].keys())

        return cls(mr, d, included_samples, tasks)
Example 3
def modisco_export_patterns(modisco_dir, output_file, impsf=None):
    """Export patterns to a pkl file. Don't cluster them

    Adds `stacked_seqlet_imp` and `n_seqlets` to pattern `attrs`

    Args:
      modisco_dir: modisco directory containing modisco.h5
      output_file: output file path for patterns.pkl
      impsf: [optional] pre-loaded ImpScoreFile; if None, it is loaded from
        `modisco_dir` and cached in memory
    """
    from basepair.utils import read_pkl, write_pkl
    from basepair.cli.imp_score import ImpScoreFile
    from basepair.modisco.core import StackedSeqletImp

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)

    mr = ModiscoResult(modisco_dir / 'modisco.h5')
    mr.open()
    patterns = [mr.get_pattern(pname) for pname in mr.patterns()]

    if impsf is None:
        imp_file = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ImpScoreFile into memory")
        imp_file.cache()
    else:
        logger.info("Using the provided ImpScoreFile")
        imp_file = impsf

    logger.info("Extracting profile and importance scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()

        # get the shifted seqlets
        valid_seqlets = mr._get_seqlets(p.name)

        # extract the importance scores
        sti = imp_file.extract(valid_seqlets, profile_width=None)
        sti.dfi = mr.get_seqlet_intervals(p.name, as_df=True)
        p.attrs['stacked_seqlet_imp'] = sti
        p.attrs['n_seqlets'] = mr.n_seqlets(*p.name.split("/"))
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
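A minimal invocation sketch for the helper above (hypothetical paths; assumes a finished tf-modisco run directory containing modisco.h5):

modisco_export_patterns("output/modisco",
                        output_file="output/modisco/patterns.pkl")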
Example 4
def modisco_enrich_patterns(patterns_pkl_file,
                            modisco_dir,
                            output_file,
                            impsf=None):
    """Add stacked_seqlet_imp to pattern `attrs`

    Args:
      patterns_pkl_file: patterns.pkl file path
      modisco_dir: modisco directory containing modisco.h5
      output_file: output file path for the enriched patterns.pkl
      impsf: [optional] pre-loaded ImpScoreFile; if None, it is loaded from
        `modisco_dir` and cached in memory
    """
    from basepair.utils import read_pkl, write_pkl
    from basepair.cli.imp_score import ImpScoreFile
    from basepair.modisco.core import StackedSeqletImp

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)
    patterns = read_pkl(patterns_pkl_file)

    mr = ModiscoResult(modisco_dir / 'modisco.h5')
    mr.open()

    if impsf is None:
        imp_file = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ImpScoreFile into memory")
        imp_file.cache()
    else:
        logger.info("Using the provided ImpScoreFile")
        imp_file = impsf

    logger.info("Extracting profile and importance scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()
        profile_width = p.len_profile()
        # get the shifted seqlets
        seqlets = [
            s.pattern_align(**p.attrs['align'])
            for s in mr._get_seqlets(p.name)
        ]

        # keep only valid seqlets
        valid_seqlets = [
            s for s in seqlets if s.valid_resize(profile_width,
                                                 imp_file.get_seqlen() + 1)
        ]
        # extract the importance scores
        p.attrs['stacked_seqlet_imp'] = imp_file.extract(
            valid_seqlets, profile_width=profile_width)

        p.attrs['n_seqlets'] = mr.n_seqlets(*p.name.split("/"))
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
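A usage sketch mirroring how modisco_report_all (below) calls this helper, enriching patterns.pkl in place; paths are hypothetical:

modisco_enrich_patterns("output/modisco/patterns.pkl",
                        "output/modisco",
                        "output/modisco/patterns.pkl")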
Example 5
def dont_test_parse_hdf4():
    from basepair.modisco.results import ModiscoResult
    mr = ModiscoResult("/s/project/avsec/basepair/modisco/modisco.h5")
    mr.open()

    mr.f.ls()
    metacluster = "metacluster_0"
    pattern = "pattern_0"
    pattern_grp = mr.get_pattern_grp(metacluster, pattern)
    p = Pattern.from_hdf5_grp(pattern_grp, "m0_p0")
    pt = p.trim_seq_ic(0.08)
    assert len(pt) == len(pt.contrib['Klf4'])
    p = mr.get_pattern("metacluster_0/pattern_0")
    import matplotlib.pyplot as plt
    p.plot(kind='all')
    p.plot(kind=['seq', 'contrib/Klf4'])
    p.plot(kind='seq')

    plt.show()
    mr.plot_pattern("metacluster_0/pattern_0", kind=['seq', 'contrib/Klf4'])
Example 6
def modisco2bed(modisco_dir, output_dir, trim_frac=0.08):
    from pybedtools import Interval
    from basepair.modisco.results import ModiscoResult
    add_file_logging(output_dir, logger, 'modisco2bed')
    ranges = load_ranges(modisco_dir)
    example_intervals = [
        Interval(row.chrom, row.start, row.end)
        for i, row in ranges.iterrows()
    ]

    r = ModiscoResult(os.path.join(modisco_dir, "modisco.h5"))
    r.export_seqlets_bed(output_dir,
                         example_intervals=example_intervals,
                         position='absolute',
                         trim_frac=trim_frac)
    r.close()
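A usage sketch matching the call made by modisco_report_all below; paths are hypothetical:

modisco2bed("output/modisco", "output/modisco/seqlets", trim_frac=0.08)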
Example 7
def modisco_instances_to_bed(modisco_h5,
                             instances_parq,
                             imp_score_h5,
                             output_dir,
                             trim_frac=0.08):
    from basepair.modisco.pattern_instances import load_instances

    add_file_logging(output_dir, logger, 'modisco-instances-to-bed')
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mr = ModiscoResult(modisco_h5)
    mr.open()

    print("load task_id")
    d = HDF5Reader(imp_score_h5)
    d.open()
    if 'hyp_imp' not in d.f.keys():
        # backwards compatibility: older files store the scores under 'grads'
        d['hyp_imp'] = d['grads']

    id_hash = pd.DataFrame({
        "peak_id":
        d.f['/metadata/interval_from_task'][:],
        "example_idx":
        np.arange(d.f['/metadata/interval_from_task'].shape[0])
    })

    # load the instances data frame
    print("load all instances")
    df = load_instances(instances_parq, motifs=None, dedup=True)
    # import pdb
    # pdb.set_trace()
    df = df.merge(id_hash, on="example_idx")  # append peak_id

    patterns = df.pattern.unique().tolist()
    pattern_pssms = {
        pattern: mr.get_pssm(*pattern.split("/"))
        for pattern in patterns
    }
    append_pattern_loc(df, pattern_pssms, trim_frac=trim_frac)

    # write out the results
    example_cols = [
        'example_chr', 'example_start', 'example_end', 'example_id', 'peak_id'
    ]
    df_examples = df[example_cols].drop_duplicates().sort_values(
        ["example_chr", "example_start"])
    df_examples.to_csv(output_dir / "scored_regions.bed",
                       sep='\t',
                       header=False,
                       index=False)

    df["pattern_start_rel"] = df.pattern_start + df.example_start
    df["pattern_end_rel"] = df.pattern_end + df.example_start
    df["strand"] = df.revcomp.astype(bool).map({True: "-", False: "+"})

    # TODO - update this - ?
    pattern_cols = [
        'example_chr', 'pattern_start_rel', 'pattern_end_rel', 'example_id',
        'percnormed_score', 'strand', 'peak_id', 'seqlet_score'
    ]

    (output_dir /
     "README").write_text("score_regions.bed columns: " +
                          ", ".join(example_cols) + "\n" +
                          "metacluster_<>/pattern_<>.bed columns: " +
                          ", ".join(pattern_cols))
    df_pattern = df[pattern_cols]
    for pattern in df.pattern.unique():
        out_path = output_dir / (pattern + ".bed.gz")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        dfp = df_pattern[df.pattern == pattern].drop_duplicates().sort_values(
            ["example_chr", "pattern_start_rel"])
        dfp.to_csv(out_path,
                   compression='gzip',
                   sep='\t',
                   header=False,
                   index=False)
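A hypothetical invocation, mirroring the (commented-out) call in modisco_report_all below; all paths are placeholders:

modisco_instances_to_bed("output/modisco/modisco.h5",
                         instances_parq="output/modisco/instances.parq",
                         imp_score_h5="output/imp_scores.h5",
                         output_dir="output/modisco/instances_bed/")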
Example 8
def modisco_score2(modisco_dir,
                   output_file,
                   trim_frac=0.08,
                   imp_scores=None,
                   importance=None,
                   ignore_filter=False,
                   n_jobs=20):
    """Modisco score instances

    Args:
      modisco_dir: modisco directory - used to obtain centroid_seqlet_matches.csv and modisco.h5
      output_file: output file path for the resulting parquet file
      trim_frac: how much to trim the pattern when scanning
      imp_scores: hdf5 file of importance scores (contains the `importance` score);
        if None, the default importance scores from the modisco run are used
      importance: which importance scores to use
      ignore_filter: if True, don't apply the included-samples filter when
        loading the importance scores
      n_jobs: number of parallel jobs to use

    Writes a parquet file partitioned by `pattern`
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score2')
    modisco_dir = Path(modisco_dir)
    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    if importance is None:
        importance = modisco_kwargs['grad_type']

    # Centroid matches
    cm_path = modisco_dir / 'centroid_seqlet_matches.csv'
    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        modisco_centroid_seqlet_matches(modisco_dir,
                                        imp_scores,
                                        modisco_dir,
                                        trim_frac=trim_frac,
                                        n_jobs=n_jobs)
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    mr = ModiscoResult(modisco_dir / "modisco.h5")
    mr.open()
    tasks = mr.tasks()

    # HACK - strip the importance-score suffix from the task names (in case it's present)
    tasks = [t.replace(f"/{importance}", "") for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if imp_scores is not None:
        logger.info(f"Loading the importance scores from: {imp_scores}")
        imp = ImpScoreFile(imp_scores, default_imp_score=importance)
    else:
        imp = ImpScoreFile.from_modisco_dir(
            modisco_dir, ignore_include_samples=ignore_filter)

    seq, contrib, hyp_contrib, profile, ranges = imp.get_all()

    logger.info("Scanning for patterns")
    dfl = []
    for pattern_name in tqdm(mr.patterns()):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib,
                                                    hyp_contrib,
                                                    tasks,
                                                    n_jobs=n_jobs,
                                                    verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            importance,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resulting pd.DataFrame of shape {dfp.shape} to {output_file}"
    )
    # write to a parquet file
    dfp.to_parquet(output_file, partition_on=['pattern'], engine='fastparquet')
    logger.info("Done!")
Example 9
def modisco_plot(
        modisco_dir,
        output_dir,
        # filter_npy=None,
        # ignore_dist_filter=False,
        figsize=(10, 10),
        impsf=None):
    """Plot the results of a modisco run

    Args:
      modisco_dir: modisco directory
      output_dir: Output directory for writing the results
      figsize: Output figure size
      impsf: [optional] modisco importance score file (ImpScoreFile)
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from basepair.plot.vdom import write_heatmap_pngs
    from basepair.plot.profiles import plot_profiles
    from basepair.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")

    if impsf is not None:
        d = impsf
    else:
        d = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading the importance scores")
        d.cache()  # load all

    thr_one_hot = d.get_seq()
    # thr_hypothetical_contribs
    tracks = d.get_profiles()
    thr_hypothetical_contribs = dict()
    thr_contrib_scores = dict()
    # TODO - generalize this
    thr_hypothetical_contribs['weighted'] = d.get_hyp_contrib()
    thr_contrib_scores['weighted'] = d.get_contrib()

    tasks = d.get_tasks()

    # Count importance (if it exists)
    if d.contains_imp_score("counts/pre-act"):
        count_imp_score = "counts/pre-act"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
            imp_score=count_imp_score)
        thr_contrib_scores['count'] = d.get_contrib(imp_score=count_imp_score)
    elif d.contains_imp_score("count"):
        count_imp_score = "count"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
            imp_score=count_imp_score)
        thr_contrib_scores['count'] = d.get_contrib(imp_score=count_imp_score)
    else:
        # Don't do anything
        pass

    thr_hypothetical_contribs = OrderedDict(
        flatten(thr_hypothetical_contribs, separator='/'))
    thr_contrib_scores = OrderedDict(flatten(thr_contrib_scores,
                                             separator='/'))

    #     # load importance scores
    #     modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    #     d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    #     if 'hyp_imp' not in d:
    #         # backcompatibility
    #         d['hyp_imp'] = d['grads']
    #     tasks = list(d['targets']['profile'])

    #     if isinstance(d['inputs'], dict):
    #         one_hot = d['inputs']['seq']
    #     else:
    #         one_hot = d['inputs']

    #     # load used strand distance filter

    #     included_samples = load_included_samples(modisco_dir)

    #     grad_type = "count,weighted"  # always plot both importance scores

    #     thr_hypothetical_contribs = OrderedDict([(f"{gt}/{task}", mean(d['hyp_imp'][task][gt])[included_samples])
    #                                              for task in tasks
    #                                              for gt in grad_type.split(",")])
    #     thr_one_hot = one_hot[included_samples]
    #     thr_contrib_scores = OrderedDict([(f"{gt}/{task}", thr_hypothetical_contribs[f"{gt}/{task}"] * thr_one_hot)
    #                                       for task in tasks
    #                                       for gt in grad_type.split(",")])
    #     tracks = OrderedDict([(task, d['targets']['profile'][task][included_samples]) for task in tasks])
    # -------------------------------------------------

    all_seqlets = mr.seqlets()
    all_patterns = mr.patterns()
    if len(all_patterns) == 0:
        print("No patterns found")
        return

    # 1. Plots with tracks and contrib scores
    print("Writing results for contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks=tracks,
                  importance_scores=thr_contrib_scores,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=.5,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir /
                                     "{pattern}/agg_profile_contribcores"),
                  mkdir=True,
                  figsize=figsize)

    # 2. Plots only with hypothetical contrib scores
    print("Writing results for hypothetical contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks={},
                  importance_scores=thr_hypothetical_contribs,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=1,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir /
                                     "{pattern}/agg_profile_hypcontribscores"),
                  figsize=figsize)

    print("Plotting heatmaps")
    for pattern in tqdm(all_patterns):
        write_heatmap_pngs(all_seqlets[pattern],
                           d,
                           tasks,
                           pattern,
                           output_dir=str(output_dir / pattern))

    mr.close()
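A usage sketch matching the call in modisco_report_all below; paths are hypothetical:

modisco_plot("output/modisco", "output/modisco/plots", figsize=(10, 10))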
Example 10
def modisco_report_all(modisco_dir,
                       trim_frac=0.08,
                       n_jobs=20,
                       scan_instances=False,
                       force=False):
    """Compute all the results for modisco. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_centroid_seqlet_matches
    - modisco_score2
    - modisco2bed
    - modisco_instances_to_bed

    Args:
      modisco_dir: directory path (the `output_dir` of `basepair.cli.modisco.modisco_run`);
        contains: modisco.h5, strand_distances.h5, kwargs.json
      trim_frac: how much to trim the pattern
      n_jobs: number of parallel jobs to use
      scan_instances: if True, also scan for new pattern instances
        (modisco_centroid_seqlet_matches and modisco_score2)
      force: if True, commands will be re-run regardless of whether they have already
        been computed

    Note:
      All the sub-commands are only executed if they have not been run before. Use --force to override this.
      Whether a command has been run before is determined by checking if the following file exists:
        `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from basepair.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the importance scores used
    kwargs = read_json(modisco_dir / "kwargs.json")
    imp_scores = kwargs["imp_scores"]

    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
    mr.open()
    all_patterns = mr.patterns()
    mr.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch results.html for snakemake
        open(modisco_dir / 'results.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_cluster_patterns').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ImpScoreFile and pass it to all the functions
        logger.info("Loading ImpScoreFile")
        impsf = ImpScoreFile.from_modisco_dir(modisco_dir)
        impsf.cache()
    else:
        impsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     figsize=(10, 10),
                     impsf=impsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("results.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir, modisco_dir, report_url=None, impsf=impsf)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_cluster_patterns').done():
        modisco_cluster_patterns(modisco_dir, modisco_dir)
        cr.write()
    sync.append("patterns.pkl")
    sync.append("cluster-patterns.*")
    sync.append("motif_clustering")

    if not cr.set_cmd('modisco_enrich_patterns').done():
        modisco_enrich_patterns(modisco_dir / 'patterns.pkl',
                                modisco_dir,
                                modisco_dir / 'patterns.pkl',
                                impsf=impsf)
        cr.write()
    # sync.append("patterns.pkl")

    # TODO - run modisco align
    # - [ ] add the motif clustering step (as ipynb) and export the aligned tables
    #   - save the final table as a result to CSV (ready to be imported in excel)
    # --------------------------------------------
    # Finding new instances
    if scan_instances:
        if not cr.set_cmd('modisco_centroid_seqlet_matches').done():
            modisco_centroid_seqlet_matches(modisco_dir,
                                            imp_scores,
                                            modisco_dir,
                                            trim_frac=trim_frac,
                                            n_jobs=n_jobs,
                                            impsf=impsf)
            cr.write()

        # TODO - this would not work with the per-TF importance score file....
        if not cr.set_cmd('modisco_score2').done():
            modisco_score2(
                modisco_dir,
                modisco_dir / 'instances.parq',
                trim_frac=trim_frac,
                imp_scores=None,  # Use the default one
                importance=None,  # Use the default one
                n_jobs=n_jobs)
            cr.write()
    # TODO - update the pattern table -> compute the fraction of other motifs etc
    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco2bed').done():
        modisco2bed(str(modisco_dir),
                    str(modisco_dir / 'seqlets'),
                    trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # Scanned instances
    # if not cr.set_cmd('modisco_instances_to_bed').done():
    #     modisco_instances_to_bed(str(modisco_dir / 'modisco.h5'),
    #                              instances_parq=str(modisco_dir / 'instances.parq'),
    #                              imp_score_h5=imp_scores,
    #                              output_dir=str(modisco_dir / 'instances_bed/'),
    #                              )
    #     cr.write()
    # sync.append("instances_bed")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
Example 11
def modisco_score2_single_binary(modisco_dir,
                                 output_file,
                                 imp_scores=None,
                                 trim_frac=0.08,
                                 n_jobs=20):
    """
    Equivalent of modisco_score2
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    cm_path = os.path.join(modisco_dir, 'centroid_seqlet_matches.csv')
    dfm_norm = pd.read_csv(cm_path)
    mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
    mr.open()
    tasks = mr.tasks()

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])
    else:
        # no filter was used - keep all examples
        included_samples = np.arange(len(one_hot))

    hyp_contrib = {
        f"{task}":
        d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    seq = one_hot[included_samples]
    ranges = pd.DataFrame({
        "chrom":
        d['metadata']['range']['chr'][:][included_samples],
        "start":
        d['metadata']['range']['start'][:][included_samples],
        "end":
        d['metadata']['range']['end'][:][included_samples],
        "strand":
        d['metadata']['range']['strand'][:][included_samples],
        "idx":
        np.arange(len(included_samples)),
        "interval_from_task":
        d['metadata']['interval_from_task'][:][included_samples],
    })

    print("Scanning for patterns")
    dfl = []
    mr_patterns = mr.patterns()  # [:2]
    for pattern_name in tqdm(mr_patterns):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib,
                                                    hyp_contrib,
                                                    tasks,
                                                    n_jobs=n_jobs,
                                                    verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            importance,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        dfl.append(dfm)

    print("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)
    print("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')
    dfp.info()
    dfp.to_parquet(output_file)

    return None
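A hypothetical invocation sketch (paths are placeholders; the modisco directory is assumed to contain centroid_seqlet_matches.csv, results.hdf5 and kwargs.json):

modisco_score2_single_binary("output/modisco",
                             "output/modisco/instances.parq",
                             trim_frac=0.08,
                             n_jobs=20)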