Example #1
import os
import logging
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from bpnet.cli.contrib import ContribFile
# ModiscoFile (the modisco.h5 reader) and add_file_logging are bpnet
# module-level helpers; their exact import paths are omitted here.

logger = logging.getLogger(__name__)


def cwm_scan_seqlets(modisco_dir,
                     output_file,
                     trim_frac=0.08,
                     num_workers=1,
                     contribsf=None,
                     verbose=False):
    """Compute CWM scanning scores for the original modisco seqlets.

    Args:
      modisco_dir: modisco output directory containing modisco.h5
      output_file: path of the CSV file to write the per-seqlet scores to
      trim_frac: information-content fraction used to trim the pattern flanks
      num_workers: number of parallel workers used for scanning
      contribsf: optional ContribFile; if None, it is loaded from modisco_dir
      verbose: print and plot additional information
    """
    modisco_dir = Path(modisco_dir)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    add_file_logging(os.path.dirname(output_file), logger, 'cwm_scan_seqlets')

    # open the modisco results file
    mf = ModiscoFile(modisco_dir / "modisco.h5")

    if contribsf is None:
        contrib = ContribFile.from_modisco_dir(modisco_dir)
    else:
        contrib = contribsf

    tasks = mf.tasks()
    # HACK: keep only the task name, dropping any contribution-score suffix (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    dfi_list = []  # collects one instance table per pattern

    for pattern_name in tqdm(mf.pattern_names()):
        # trim the pattern by information content and fetch its seqlets
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        seqlets = mf._get_seqlets(pattern_name, trim_frac=trim_frac)

        # scan only the existing seqlet locations instead of the full sequences
        # to obtain the distribution of match scores
        stacked_seqlets = contrib.extract(seqlets)

        match, contribution = pattern.scan_contribution(
            stacked_seqlets.contrib,
            hyp_contrib=None,
            tasks=tasks,
            n_jobs=num_workers,
            verbose=False,
            pad_mode=None)
        seq_match = pattern.scan_seq(stacked_seqlets.seq,
                                     n_jobs=num_workers,
                                     verbose=False,
                                     pad_mode=None)

        dfm = pattern.get_instances(tasks,
                                    match,
                                    contribution,
                                    seq_match,
                                    fdr=1,
                                    verbose=verbose,
                                    plot=verbose)
        # keep only instances with a positive sequence-match score
        dfm = dfm[dfm.seq_match > 0]

        dfi_list.append(dfm)

    # combine the per-pattern tables and write them to a single CSV
    df = pd.concat(dfi_list)
    df.to_csv(output_file)
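A minimal usage sketch for the function above. The task name and paths are hypothetical; the only assumption is a finished bpnet TF-MoDISco run whose directory contains modisco.h5 together with the contribution file that `ContribFile.from_modisco_dir` expects:

from pathlib import Path

# hypothetical output directory of a modisco run for a task called "Oct4"
modisco_dir = Path("output/modisco/Oct4")

cwm_scan_seqlets(modisco_dir,
                 output_file=str(modisco_dir / "seqlet_cwm_scan.csv"),
                 trim_frac=0.08,   # same pattern trimming as the default above
                 num_workers=4,    # parallel workers for the scanning steps
                 verbose=False)

# the CSV contains one row per seqlet with its CWM-scanning scores
# (e.g. seq_match), concatenated over all patterns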
Example #2
import logging
from pathlib import Path

from tqdm import tqdm
# ModiscoFile and write_pkl are bpnet module-level helpers; their exact
# import paths are omitted here.

logger = logging.getLogger(__name__)


def modisco_export_patterns(modisco_dir, output_file, contribsf=None):
    """Export patterns to a pkl file without clustering them.

    Adds `stacked_seqlet_contrib` and `n_seqlets` to each pattern's `attrs`.

    Args:
      modisco_dir: modisco output directory containing modisco.h5
      output_file: output file path for patterns.pkl
      contribsf: optional ContribFile; if None, it is loaded from modisco_dir
    """
    from bpnet.cli.contrib import ContribFile

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)

    mf = ModiscoFile(modisco_dir / 'modisco.h5')
    patterns = [mf.get_pattern(pname) for pname in mf.pattern_names()]

    if contribsf is None:
        contrib_file = ContribFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ContribFile into memory")
        contrib_file.cache()
    else:
        logger.info("Using the provided ContribFile")
        contrib_file = contribsf

    logger.info("Extracting profile and contribution scores")
    extended_patterns = []
    for p in tqdm(patterns):
        p = p.copy()

        # get seqlets
        valid_seqlets = mf._get_seqlets(p.name)

        # extract the contribution scores
        sti = contrib_file.extract(valid_seqlets, profile_width=None)
        # attach the seqlet interval table and store everything on the pattern
        sti.dfi = mf.get_seqlet_intervals(p.name, as_df=True)
        p.attrs['stacked_seqlet_contrib'] = sti
        p.attrs['n_seqlets'] = mf.n_seqlets(p.name)
        extended_patterns.append(p)

    write_pkl(extended_patterns, output_file)
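A minimal sketch of calling the exporter and loading the result. The paths are hypothetical, and reading the file with the standard pickle module assumes `write_pkl` produces a plain pickle; if bpnet ships a matching read helper, prefer that instead:

import pickle

# hypothetical modisco output directory and target pkl path
modisco_export_patterns("output/modisco/Oct4",
                        "output/modisco/Oct4/patterns.pkl")

# assumption: the exported file is a plain pickle of the extended pattern list
with open("output/modisco/Oct4/patterns.pkl", "rb") as f:
    patterns = pickle.load(f)

for p in patterns:
    print(p.name, p.attrs['n_seqlets'])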