def modisco_table(modisco_dir, output_dir, report_url=None, impsf=None):
    """Write the pattern table to as .html and .csv

    Args:
        modisco_dir: directory containing the modisco results
        output_dir: where to write `pattern_table`, `pattern_table.sorted`
            and `footprints.pkl`
        report_url: optional URL embedded into the written tables
        impsf: optional pre-loaded importance-score file passed through to
            `ModiscoData.load`
    """
    plt.switch_backend('agg')
    from basepair.modisco.table import ModiscoData, modisco_table, write_modisco_table
    from basepair.modisco.motif_clustering import hirearchically_reorder_table
    add_file_logging(output_dir, logger, 'modisco-table')

    print("Loading required data")
    data = ModiscoData.load(modisco_dir, imp_scores_h5=None, impsf=impsf)

    print("Generating the table")
    # NOTE: `modisco_table` here resolves to the imported helper, which
    # shadows this function's own name inside the body.
    pattern_df = modisco_table(data)

    print("Writing the results")
    write_modisco_table(pattern_df, output_dir, report_url, 'pattern_table')

    print("Writing clustered table")
    clustered_df = hirearchically_reorder_table(pattern_df, data.tasks)
    write_modisco_table(clustered_df, output_dir, report_url, 'pattern_table.sorted')

    print("Writing footprints")
    # per-pattern, per-task average profile (mean over seqlet instances)
    footprints = OrderedDict()
    for pattern in data.mr.patterns():
        footprints[pattern] = {
            task: data.get_profile_wide(pattern, task).mean(axis=0)
            for task in data.tasks
        }
    write_pkl(footprints, Path(output_dir) / 'footprints.pkl')
    print("Done!")
def modisco_score(modisco_dir, imp_scores, output_tsv, output_seqlets_pkl=None,
                  seqlet_len=25, n_cores=1, method="rank", trim_pattern=False):
    """Find seqlet instances using modisco

    Args:
        modisco_dir: directory with modisco results (read via `load_modisco_results`)
        imp_scores: HDF5 file of importance scores to scan
        output_tsv: path of the tab-separated seqlet table to write
        output_seqlets_pkl: optional path; when set, the raw seqlets are pickled there
        seqlet_len: length of the seqlets to search for
        n_cores: number of worker processes passed to `find_instances`
        method: scoring method passed to `find_instances`
        trim_pattern: whether to trim patterns before matching

    Returns:
        (seqlets, df) tuple; `([], None)` when no seqlets were found.
    """
    add_file_logging(os.path.dirname(output_tsv), logger, 'modisco-score')
    mr, tasks, grad_type = load_modisco_results(modisco_dir)
    # load importance scores we want to score
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility: older files stored hypothetical scores under 'grads'
        d['hyp_imp'] = d['grads']
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    # NOTE(review): `mean` is not defined in this view of the file — presumably
    # imported at the top of the module (e.g. numpy.mean); confirm.
    hypothetical_contribs = {f"{task}/{gt}": mean(d['hyp_imp'][task][gt])
                             for task in tasks
                             for gt in grad_type.split(",")}
    # actual contributions = hypothetical scores masked by the observed sequence
    contrib_scores = {f"{task}/{gt}": hypothetical_contribs[f"{task}/{gt}"] * one_hot
                      for task in tasks
                      for gt in grad_type.split(",")}
    seqlets = find_instances(mr, tasks, contrib_scores, hypothetical_contribs,
                             one_hot, seqlet_len=seqlet_len, n_cores=n_cores,
                             method=method, trim_pattern=trim_pattern)
    if len(seqlets) == 0:
        print("ERROR: no seqlets found!!")
        return [], None
    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)
    # attach per-example genomic range metadata, prefixed with "example_"
    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    # NOTE(review): this assumes the metadata has an 'id' column (renamed to
    # 'example_id' above) to join against 'example_idx' — verify; the sibling
    # `modisco_score_single_binary` builds 'example_id' explicitly.
    df = df.merge(dfm, left_on="example_idx", how='left', right_on="example_id")
    df.to_csv(output_tsv, sep='\t')
    return seqlets, df
def modisco_enrich_patterns(patterns_pkl_file, modisco_dir, output_file, impsf=None):
    """Add stacked_seqlet_imp to pattern `attrs`

    Args:
        patterns_pkl: patterns.pkl file path
        modisco_dir: modisco directory containing
        output_file: output file path for patterns.pkl
    """
    from basepair.utils import read_pkl, write_pkl
    from basepair.cli.imp_score import ImpScoreFile
    from basepair.modisco.core import StackedSeqletImp

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)
    patterns = read_pkl(patterns_pkl_file)

    mr = ModiscoResult(modisco_dir / 'modisco.h5')
    mr.open()

    if impsf is not None:
        logger.info("Using the provided ImpScoreFile")
        imp_file = impsf
    else:
        imp_file = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ImpScoreFile into memory")
        imp_file.cache()

    logger.info("Extracting profile and importance scores")
    extended_patterns = []
    for pattern in tqdm(patterns):
        pattern = pattern.copy()
        profile_width = pattern.len_profile()
        max_len = imp_file.get_seqlen() + 1
        # shift each seqlet to the pattern's alignment, then keep only those
        # that can be resized to the profile width within the sequence bounds
        aligned = (s.pattern_align(**pattern.attrs['align'])
                   for s in mr._get_seqlets(pattern.name))
        valid_seqlets = [s for s in aligned
                         if s.valid_resize(profile_width, max_len)]
        # extract the importance scores
        pattern.attrs['stacked_seqlet_imp'] = imp_file.extract(
            valid_seqlets, profile_width=profile_width)
        pattern.attrs['n_seqlets'] = mr.n_seqlets(*pattern.name.split("/"))
        extended_patterns.append(pattern)

    write_pkl(extended_patterns, output_file)
def modisco_export_patterns(modisco_dir, output_file, impsf=None):
    """Export patterns to a pkl file. Don't cluster them

    Adds `stacked_seqlet_imp` and `n_seqlets` to pattern `attrs`

    Args:
        patterns_pkl: patterns.pkl file path
        modisco_dir: modisco directory containing
        output_file: output file path for patterns.pkl
    """
    from basepair.utils import read_pkl, write_pkl
    from basepair.cli.imp_score import ImpScoreFile
    from basepair.modisco.core import StackedSeqletImp

    logger.info("Loading patterns")
    modisco_dir = Path(modisco_dir)

    mr = ModiscoResult(modisco_dir / 'modisco.h5')
    mr.open()
    patterns = [mr.get_pattern(pname) for pname in mr.patterns()]

    if impsf is not None:
        logger.info("Using the provided ImpScoreFile")
        imp_file = impsf
    else:
        imp_file = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading ImpScoreFile into memory")
        imp_file.cache()

    logger.info("Extracting profile and importance scores")
    extended_patterns = []
    for pattern in tqdm(patterns):
        pattern = pattern.copy()
        # all seqlets of this pattern (no alignment/resize filtering here)
        seqlets = mr._get_seqlets(pattern.name)
        # extract the importance scores and attach the interval table
        stacked = imp_file.extract(seqlets, profile_width=None)
        stacked.dfi = mr.get_seqlet_intervals(pattern.name, as_df=True)
        pattern.attrs['stacked_seqlet_imp'] = stacked
        pattern.attrs['n_seqlets'] = mr.n_seqlets(*pattern.name.split("/"))
        extended_patterns.append(pattern)

    write_pkl(extended_patterns, output_file)
def generate_motif_data(dfab, ref, single_mut, double_mut, pairs, output_dir,
                        tasks, profile_width=200, save=False,
                        pseudo_count_quantile=0.2, profile_slice=slice(82, 119)):
    """Compute per-motif-pair perturbation feature tables.

    For every motif pair, loads the perturbation dataset, computes the
    epistasis (ISM) and directionality feature tables and optionally pickles
    the raw per-pair output.

    Returns:
        (ism_features, directional_features) tuple of concatenated DataFrames.
    """
    import gc
    from basepair.utils import write_pkl
    from basepair.exp.chipnexus.spacing import remove_edge_instances
    from basepair.exp.chipnexus.perturb.scores import (
        ism_compute_features_tidy, compute_features_tidy, SCORES,
        max_profile_count)

    if save:
        c_output_dir = os.path.join(output_dir, 'motif_pair_lpdata')
        os.makedirs(c_output_dir, exist_ok=True)

    ism_frames = []
    directional_frames = []
    for motif_pair in pairs:
        motif_pair_name = "<>".join(motif_pair)
        # drop instances too close to the profile edge for this pair
        pair_df = remove_edge_instances(
            dfab[dfab.motif_pair == motif_pair_name],
            profile_width=profile_width)
        pdata = ParturbationDataset(pair_df, ref, single_mut, double_mut,
                                    profile_width=profile_width)
        output = pdata.load_all(num_workers=0)
        output['dfab'] = pair_df

        # Compute the directionality and epistasis scores
        keyed_output = {motif_pair_name: output}
        # Epistasis:
        ism_frames.append(ism_compute_features_tidy(keyed_output, tasks))
        # Directional:
        directional_frames.append(
            compute_features_tidy(keyed_output, tasks, SCORES,
                                  pseudo_count_quantile=pseudo_count_quantile,
                                  profile_slice=profile_slice))

        if save:
            write_pkl(output, os.path.join(c_output_dir, motif_pair_name + '.pkl'))

        # free the large per-pair objects before the next iteration
        del keyed_output
        del output
        del pdata
        del pair_df
        print("Garbage collect")
        gc.collect()

    return pd.concat(ism_frames, axis=0), pd.concat(directional_frames, axis=0)
if args.gpu is not None:
    create_tf_session(args.gpu)

# create the output path
cache_path = f"{models_dir}/{exp}/motif-simulation/spacing;correct={args.correct}.pkl"
os.makedirs(os.path.dirname(cache_path), exist_ok=True)

# load the model
logger.info("Loading model")
model_dir = models_dir / exp
bpnet = BPNetSeqModel.from_mdir(model_dir)

logger.info("Creating the output directory")
df_d = {}
res_dict_d = {}
for central_motif_name, central_motif in all_motif_seqs.items():
    # FIX: corrected the "Runnig" typo in the log message
    logger.info(f"Running script for {central_motif_name}")
    # simulate every motif as a side motif at positions 511..660 around the
    # fixed central motif
    res_dict = OrderedDict([(motif, generate_sim(bpnet, central_motif, side_motif,
                                                 list(range(511, 511 + 150, 1)),
                                                 center_coords=center_coords,
                                                 repeat=repeat,
                                                 correct=args.correct,
                                                 importance=[]))
                            # 'counts/pre-act', 'profile/wn']))
                            for motif, side_motif in all_motif_seqs.items()])
    # stack the dataframes, tagging each with its side-motif name
    df = pd.concat([v[0].assign(motif=k) for k, v in res_dict.items()])
    df_d[central_motif_name] = df
    res_dict_d[central_motif_name] = res_dict

# Store all the results
write_pkl((df_d, res_dict_d), cache_path)
def modisco_score_single_binary(modisco_dir, output_tsv, output_seqlets_pkl=None,
                                seqlet_len=25, n_cores=1, method="rank",
                                trim_pattern=False):
    """Equivalent of modisco_score

    Args:
        modisco_dir: directory with `kwargs.json` and `results.hdf5`
        output_tsv: path of the tab-separated seqlet table to write
        output_seqlets_pkl: optional path; when set, the raw seqlets are pickled there
        seqlet_len: length of the seqlets to search for
        n_cores: number of worker processes passed to `find_instances`
        method: scoring method passed to `find_instances`
        trim_pattern: whether to trim patterns before matching

    Returns:
        (seqlets, df) tuple.
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]

    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])
    else:
        # BUGFIX: `included_samples` was previously only assigned when
        # `filter_npy` was set, raising a NameError below otherwise.
        # `slice(None)` selects all samples.
        included_samples = slice(None)

    # NOTE: if grad_type ever contained several comma-separated types, each
    # `task` key would be overwritten and only the last type kept — behavior
    # preserved from the original implementation.
    # BUGFIX: read hyp_contrib_scores under `gt` (as contrib_scores does)
    # instead of the hard-coded 'deeplift' key, so both dicts use the same
    # gradient type.
    hypothetical_contribs = {f"{task}": d['grads'][task][gt]['hyp_contrib_scores'][included_samples]
                             for task in tasks
                             for gt in grad_type.split(",")}
    contrib_scores = {f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
                      for task in tasks
                      for gt in grad_type.split(",")}
    print(tasks)

    track_set = workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=contrib_scores,
        hypothetical_contribs=hypothetical_contribs,
        one_hot=one_hot[included_samples])

    with h5py.File(os.path.join(modisco_dir, "results.hdf5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)

    seqlets = find_instances(mr, tasks, contrib_scores, hypothetical_contribs,
                             one_hot[included_samples],
                             seqlet_len=seqlet_len, n_cores=n_cores,
                             method=method, trim_pattern=trim_pattern)

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    # attach per-example genomic range metadata, prefixed with "example_"
    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    dfm['example_id'] = d['metadata']['interval_from_task']

    df = df.merge(dfm, left_on="example_idx", how='left', right_on="example_id")
    df.to_csv(output_tsv, sep='\t')
    return seqlets, df
def save(self, file_path):
    """Serialize this model object to `file_path` as a pickle.

    Args:
        file_path: destination path for the pickled model.
    """
    from basepair.utils import write_pkl
    write_pkl(self, file_path)