Example #1
def load_included_samples(modisco_dir):
    modisco_dir = Path(modisco_dir)

    kwargs = read_json(modisco_dir / "kwargs.json")

    d = ImpScoreFile(kwargs["imp_scores"])
    interval_from_task = d.get_ranges().interval_from_task
    n = len(d)
    d.close()

    included_samples = np.ones((n, ), dtype=bool)
    if not kwargs.get("skip_dist_filter", False) and (
            modisco_dir / "strand_distances.h5").exists():
        included_samples = HDF5Reader.load(
            modisco_dir /
            "strand_distances.h5")['included_samples'] & included_samples

    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"]) & included_samples

    if kwargs.get("subset_tasks", None) is not None and kwargs.get(
            "filter_subset_tasks", False):
        included_samples = interval_from_task.isin(
            kwargs['subset_tasks']).values & included_samples

    return included_samples
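A minimal usage sketch (the directory path below is hypothetical): the returned boolean mask has one entry per example in the importance-score file and can be combined with further per-example filters.

# Hedged sketch: the modisco output directory is illustrative.
included = load_included_samples("output/modisco/Oct4")
print(f"keeping {included.sum()} / {len(included)} examples")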
Example #2
def load_ranges(modisco_dir):
    modisco_dir = Path(modisco_dir)
    included_samples = load_included_samples(modisco_dir)

    kwargs = read_json(modisco_dir / "kwargs.json")
    d = ImpScoreFile(kwargs["imp_scores"], included_samples)
    df = d.get_ranges()
    d.close()
    return df
Example #3
def load_modisco_results(modisco_dir):
    """Load modisco_result - return

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json

    Returns:
      (mr, tasks, grad_type) tuple, where mr is a TfModiscoResults object
        built with the original track_set
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow
    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    grad_type = modisco_kwargs['grad_type']

    # load used strand distance filter
    included_samples = HDF5Reader.load(
        f"{modisco_dir}/strand_distances.h5")['included_samples']

    # load importance scores
    d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    tasks = list(d['targets']['profile'])
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    thr_hypothetical_contribs = {
        f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    thr_one_hot = one_hot[included_samples]
    thr_contrib_scores = {
        f"{task}/{gt}": thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
        for task in tasks for gt in grad_type.split(",")
    }

    track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=thr_contrib_scores,
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot)

    with h5py.File(os.path.join(modisco_dir, "modisco.h5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
    return mr, tasks, grad_type
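A hedged usage sketch: note that the function returns a tuple rather than just the TfModiscoResults object (the directory path is illustrative).

mr, tasks, grad_type = load_modisco_results("output/modisco/Oct4")  # hypothetical path
print(f"tasks: {tasks}, importance score(s): {grad_type}")
# mr is a tfmodisco workflow.TfModiscoResults object built with the original track_set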
Example #4
def get_branchpoint_pwm_list(cache=True):
    l = []
    if os.path.isfile(BR_PWM) and cache:
        l.append(PWM.from_config(read_json(BR_PWM)))
    else:
        dt = pd.read_csv(
            DATA_ROOT +
            "/Splice_branchpoints/processed/branchpointer/train/filteredDescr.csv"
        )
        # keep only the high-confidence ("HC") branchpoints
        dt = dt[dt["set"] == "HC"]
        # select the per-position sequence columns (seq_*)
        dtseq = dt[dt.columns[dt.columns.str.match("^seq_")]] - 1
        # column means per position give the PWM probabilities
        pwm = np.array(dtseq.mean()).reshape((-1, 4))
        assert np.allclose(pwm.sum(1), 1)
        p = PWM(pwm, name="U2_branchpoint")
        write_json(p.get_config(), BR_PWM)
        l.append(p)
    l.append(
        PWM(0.05 + np.loadtxt(BR_SPLICE_RACK_PATH + "/GT_AG_U12.txt"),
            "GT_AG_U12_branchpoint"))
    return l
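A short usage sketch, assuming `BR_PWM`, `BR_SPLICE_RACK_PATH` and `DATA_ROOT` are configured as in the surrounding module; the `.pwm` attribute name is an assumption about the PWM class.

pwms = get_branchpoint_pwm_list(cache=True)
for p in pwms:
    # expected: the U2 branchpoint PWM plus the GT_AG_U12 SpliceRack PWM
    print(p.name, p.pwm.shape)  # .pwm assumed to hold the (L, 4) probability matrix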
Example #5
def modisco_score_single_binary(modisco_dir,
                                output_tsv,
                                output_seqlets_pkl=None,
                                seqlet_len=25,
                                n_cores=1,
                                method="rank",
                                trim_pattern=False):
    """
    Equivalent of modisco_score
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdf5 file
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])

    hypothetical_contribs = {
        f"{task}": d['grads'][task][gt]['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }

    print(tasks)
    track_set = workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=contrib_scores,
        hypothetical_contribs=hypothetical_contribs,
        one_hot=one_hot[included_samples])

    with h5py.File(os.path.join(modisco_dir, "results.hdf5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot[included_samples],
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    dfm['example_id'] = d['metadata']['interval_from_task']

    df = df.merge(dfm,
                  left_on="example_idx",
                  how='left',
                  right_on="example_id")

    df.to_csv(output_tsv, sep='\t')

    return seqlets, df
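An illustrative invocation (paths are hypothetical); the function writes the TSV and also returns the seqlets and the merged DataFrame.

seqlets, df = modisco_score_single_binary(
    "output/modisco/Oct4",                     # hypothetical modisco directory
    output_tsv="output/modisco/Oct4/instances.tsv",
    output_seqlets_pkl="output/modisco/Oct4/seqlets.pkl",
    seqlet_len=25,
    n_cores=4)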
Example #6
def modisco_score2(modisco_dir,
                   output_file,
                   trim_frac=0.08,
                   imp_scores=None,
                   importance=None,
                   ignore_filter=False,
                   n_jobs=20):
    """Modisco score instances

    Args:
      modisco_dir: modisco directory - used to obtain centroid_seqlet_matches.csv and modisco.h5
      output_file: output file path for the tsv file. If the suffix is
        tsv.gz, then also gzip the file
      trim_frac: how much to trim the pattern when scanning
      imp_scores: hdf5 file of importance scores (contains `importance` score)
        if None, then load the default importance scores from modisco
      importance: which importance scores to use
      n_jobs: number of parallel jobs to use

    Writes a gzipped tsv file(tsv.gz)
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score2')
    modisco_dir = Path(modisco_dir)
    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    if importance is None:
        importance = modisco_kwargs['grad_type']

    # Centroid matches
    cm_path = modisco_dir / 'centroid_seqlet_matches.csv'
    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        modisco_centroid_seqlet_matches(modisco_dir,
                                        imp_scores,
                                        modisco_dir,
                                        trim_frac=trim_frac,
                                        n_jobs=n_jobs)
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    mr = ModiscoResult(modisco_dir / "modisco.h5")
    mr.open()
    tasks = mr.tasks()

    # HACK prune the tasks of importance (in case it's present)
    tasks = [t.replace(f"/{importance}", "") for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if imp_scores is not None:
        logger.info(f"Loading the importance scores from: {imp_scores}")
        imp = ImpScoreFile(imp_scores, default_imp_score=importance)
    else:
        imp = ImpScoreFile.from_modisco_dir(
            modisco_dir, ignore_include_samples=ignore_filter)

    seq, contrib, hyp_contrib, profile, ranges = imp.get_all()

    logger.info("Scanning for patterns")
    dfl = []
    for pattern_name in tqdm(mr.patterns()):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib,
                                                    hyp_contrib,
                                                    tasks,
                                                    n_jobs=n_jobs,
                                                    verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            importance,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resulting pd.DataFrame of shape {dfp.shape} to {output_file}"
    )
    # write to a parquet file
    dfp.to_parquet(output_file, partition_on=['pattern'], engine='fastparquet')
    logger.info("Done!")
Example #7
def modisco_report_all(modisco_dir,
                       trim_frac=0.08,
                       n_jobs=20,
                       scan_instances=False,
                       force=False):
    """Compute all the results for modisco. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_centroid_seqlet_matches
    - modisco_score2
    - modisco2bed
    - modisco_instances_to_bed

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json
      trim_frac: how much to trim the pattern
      n_jobs: number of parallel jobs to use
      scan_instances: if True, also scan for new motif instances
        (modisco_centroid_seqlet_matches + modisco_score2)
      force: if True, commands will be re-run regardless of whether they have already
        been computed

    Note:
      All the sub-commands are only executed if they have not been run before. Use --force to override this.
      Whether the commands have been run before is determined by checking if the following file exists:
        `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from basepair.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the importance scores used
    kwargs = read_json(modisco_dir / "kwargs.json")
    imp_scores = kwargs["imp_scores"]

    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
    mr.open()
    all_patterns = mr.patterns()
    mr.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch the expected outputs so that downstream snakemake rules don't fail
        open(modisco_dir / 'results.html', 'a').close()
        (modisco_dir / 'seqlets').mkdir(parents=True, exist_ok=True)
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)

    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_cluster_patterns').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ImpScoreFile and pass it to all the functions
        logger.info("Loading ImpScoreFile")
        impsf = ImpScoreFile.from_modisco_dir(modisco_dir)
        impsf.cache()
    else:
        impsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     figsize=(10, 10),
                     impsf=impsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("results.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir, modisco_dir, report_url=None, impsf=impsf)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_cluster_patterns').done():
        modisco_cluster_patterns(modisco_dir, modisco_dir)
        cr.write()
    sync.append("patterns.pkl")
    sync.append("cluster-patterns.*")
    sync.append("motif_clustering")

    if not cr.set_cmd('modisco_enrich_patterns').done():
        modisco_enrich_patterns(modisco_dir / 'patterns.pkl',
                                modisco_dir,
                                modisco_dir / 'patterns.pkl',
                                impsf=impsf)
        cr.write()
    # sync.append("patterns.pkl")

    # TODO - run modisco align
    # - [ ] add the motif clustering step (as ipynb) and export the aligned tables
    #   - save the final table as a result to CSV (ready to be imported in excel)
    # --------------------------------------------
    # Finding new instances
    if scan_instances:
        if not cr.set_cmd('modisco_centroid_seqlet_matches').done():
            modisco_centroid_seqlet_matches(modisco_dir,
                                            imp_scores,
                                            modisco_dir,
                                            trim_frac=trim_frac,
                                            n_jobs=n_jobs,
                                            impsf=impsf)
            cr.write()

        # TODO - this would not work with the per-TF importance score file....
        if not cr.set_cmd('modisco_score2').done():
            modisco_score2(
                modisco_dir,
                modisco_dir / 'instances.parq',
                trim_frac=trim_frac,
                imp_scores=None,  # Use the default one
                importance=None,  # Use the default one
                n_jobs=n_jobs)
            cr.write()
    # TODO - update the pattern table -> compute the fraction of other motifs etc
    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco2bed').done():
        modisco2bed(str(modisco_dir),
                    str(modisco_dir / 'seqlets'),
                    trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # Scanned instances
    # if not cr.set_cmd('modisco_instances_to_bed').done():
    #     modisco_instances_to_bed(str(modisco_dir / 'modisco.h5'),
    #                              instances_parq=str(modisco_dir / 'instances.parq'),
    #                              imp_score_h5=imp_scores,
    #                              output_dir=str(modisco_dir / 'instances_bed/'),
    #                              )
    #     cr.write()
    # sync.append("instances_bed")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")
Example #8
def modisco_score2_single_binary(modisco_dir,
                                 output_file,
                                 imp_scores=None,
                                 trim_frac=0.08,
                                 n_jobs=20):
    """
    Equivalent of modisco_score2
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    cm_path = os.path.join(modisco_dir, 'centroid_seqlet_matches.csv')
    dfm_norm = pd.read_csv(cm_path)
    mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
    mr.open()
    tasks = mr.tasks()

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdf5 file
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])

    hyp_contrib = {
        f"{task}": d['grads'][task][gt]['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    seq = one_hot[included_samples]
    ranges = pd.DataFrame({
        "chrom": d['metadata']['range']['chr'][:][included_samples],
        "start": d['metadata']['range']['start'][:][included_samples],
        "end": d['metadata']['range']['end'][:][included_samples],
        "strand": d['metadata']['range']['strand'][:][included_samples],
        # index within the filtered (included) examples; matches the `example_idx`
        # values returned by pattern.get_instances below
        "idx": np.arange(included_samples.sum()),
        "interval_from_task": d['metadata']['interval_from_task'][:][included_samples],
    })

    print("Scanning for patterns")
    dfl = []
    mr_patterns = mr.patterns()
    for pattern_name in tqdm(mr_patterns):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib,
                                                    hyp_contrib,
                                                    tasks,
                                                    n_jobs=n_jobs,
                                                    verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            importance,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        dfl.append(dfm)

    print("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)
    print("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')
    dfp.info()
    dfp.to_parquet(output_file)

    return None
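An illustrative invocation of the binary variant; unlike `modisco_score2`, it reads `results.hdf5` and the raw deeplift HDF5 file directly (paths are hypothetical).

modisco_score2_single_binary("output/modisco/Oct4",
                             output_file="output/modisco/Oct4/instances.parq",
                             trim_frac=0.08,
                             n_jobs=8)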