def load_included_samples(modisco_dir):
    """Load the boolean mask of examples that passed all modisco filters."""
    modisco_dir = Path(modisco_dir)
    kwargs = read_json(modisco_dir / "kwargs.json")

    d = ImpScoreFile(kwargs["imp_scores"])
    interval_from_task = d.get_ranges().interval_from_task
    n = len(d)
    d.close()

    included_samples = np.ones((n, ), dtype=bool)

    # strand-distance filter
    if not kwargs.get("skip_dist_filter", False) and (modisco_dir / "strand_distances.h5").exists():
        included_samples = HDF5Reader.load(
            modisco_dir / "strand_distances.h5")['included_samples'] & included_samples

    # explicit filter mask
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"]) & included_samples

    # restrict to the subset of tasks
    if kwargs.get("subset_tasks", None) is not None and kwargs.get("filter_subset_tasks", False):
        included_samples = interval_from_task.isin(kwargs['subset_tasks']).values & included_samples

    return included_samples

def load_ranges(modisco_dir):
    """Load the genomic ranges of the examples used by modisco."""
    modisco_dir = Path(modisco_dir)
    included_samples = load_included_samples(modisco_dir)

    kwargs = read_json(modisco_dir / "kwargs.json")
    d = ImpScoreFile(kwargs["imp_scores"], included_samples)
    df = d.get_ranges()
    d.close()
    return df

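# Usage sketch for the two loaders above (the modisco output directory is a
# placeholder; any `output_dir` produced by `basepair.cli.modisco.modisco_run`
# would work):
#
#     mask = load_included_samples("output/modisco/Oct4")   # boolean np.ndarray
#     ranges = load_ranges("output/modisco/Oct4")           # pd.DataFrame of intervals
#     print(f"kept {mask.sum()} / {len(mask)} examples, ranges: {ranges.shape}")
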
def load_modisco_results(modisco_dir):
    """Load the TF-MoDISco results.

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json

    Returns:
      (mr, tasks, grad_type) tuple where `mr` is the TfModiscoResults object
      containing the original track_set
    """
    import modisco
    from modisco.tfmodisco_workflow import workflow

    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    grad_type = modisco_kwargs['grad_type']

    # load the used strand-distance filter
    included_samples = HDF5Reader.load(
        f"{modisco_dir}/strand_distances.h5")['included_samples']

    # load importance scores
    d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    tasks = list(d['targets']['profile'])
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    thr_hypothetical_contribs = {
        f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[included_samples]
        for task in tasks
        for gt in grad_type.split(",")
    }
    thr_one_hot = one_hot[included_samples]
    thr_contrib_scores = {
        f"{task}/{gt}": thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
        for task in tasks
        for gt in grad_type.split(",")
    }

    track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=thr_contrib_scores,
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot)

    with h5py.File(os.path.join(modisco_dir, "modisco.h5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
    return mr, tasks, grad_type

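# Example of loading the full TF-MoDISco results object (a sketch; the path is a
# placeholder and the metacluster attribute access follows the upstream modisco
# package):
#
#     mr, tasks, grad_type = load_modisco_results("output/modisco/Oct4")
#     print(tasks, grad_type)
#     print(mr.metacluster_idx_to_submetacluster_results.keys())
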
def get_branchpoint_pwm_list(cache=True):
    l = []
    if os.path.isfile(BR_PWM) and cache:
        l.append(PWM.from_config(read_json(BR_PWM)))
    else:
        dt = pd.read_csv(
            DATA_ROOT + "/Splice_branchpoints/processed/branchpointer/train/filteredDescr.csv")
        dt = dt[dt.set == "HC"]
        dtseq = dt[dt.columns[dt.columns.str.match("^seq_")]] - 1
        # column means -> position-wise nucleotide frequencies
        pwm = np.array(dtseq.mean()).reshape((-1, 4))
        assert np.allclose(pwm.sum(1), 1)
        p = PWM(pwm, name="U2_branchpoint")
        write_json(p.get_config(), BR_PWM)
        l.append(p)
    l.append(PWM(0.05 + np.loadtxt(BR_SPLICE_RACK_PATH + "/GT_AG_U12.txt"),
                 "GT_AG_U12_branchpoint"))
    return l

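# Quick sanity check of the returned PWMs (a sketch; only the `name` attribute
# set via the constructor above is assumed):
#
#     for p in get_branchpoint_pwm_list(cache=True):
#         print(p.name)
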
def modisco_score_single_binary(modisco_dir,
                                output_tsv,
                                output_seqlets_pkl=None,
                                seqlet_len=25,
                                n_cores=1,
                                method="rank",
                                trim_pattern=False):
    """Equivalent of `modisco_score` for the binary model."""
    import modisco
    from modisco.tfmodisco_workflow import workflow

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdf5 file
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])
    else:
        # no filter -> keep all examples
        included_samples = np.ones(len(one_hot), dtype=bool)

    hypothetical_contribs = {
        f"{task}": d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks
        for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks
        for gt in grad_type.split(",")
    }
    print(tasks)

    track_set = workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=contrib_scores,
        hypothetical_contribs=hypothetical_contribs,
        one_hot=one_hot[included_samples])

    with h5py.File(os.path.join(modisco_dir, "results.hdf5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot[included_samples],
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)
    df = labelled_seqlets2df(seqlets)

    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    dfm['example_id'] = d['metadata']['interval_from_task']

    df = df.merge(dfm, left_on="example_idx", how='left', right_on="example_id")
    df.to_csv(output_tsv, sep='\t')
    return seqlets, df

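# Hypothetical invocation of the binary-model scorer above (all paths are
# placeholders; the function writes the tsv and, optionally, a seqlet pickle):
#
#     seqlets, df = modisco_score_single_binary("output/modisco_binary/Oct4",
#                                               output_tsv="instances.tsv",
#                                               output_seqlets_pkl="seqlets.pkl",
#                                               seqlet_len=25,
#                                               n_cores=4)
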
def modisco_score2(modisco_dir,
                   output_file,
                   trim_frac=0.08,
                   imp_scores=None,
                   importance=None,
                   ignore_filter=False,
                   n_jobs=20):
    """Score modisco pattern instances across the importance-score tracks.

    Args:
      modisco_dir: modisco directory - used to obtain centroid_seqlet_matches.csv
        and modisco.h5
      output_file: output path for the scored instances (written as a parquet
        file partitioned by pattern)
      trim_frac: how much to trim the pattern when scanning
      imp_scores: hdf5 file of importance scores (contains the `importance` score).
        If None, load the default importance scores from modisco
      importance: which importance scores to use
      ignore_filter: if True, don't restrict the examples to the samples
        included by modisco
      n_jobs: number of parallel jobs to use
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score2')
    modisco_dir = Path(modisco_dir)
    modisco_kwargs = read_json(f"{modisco_dir}/kwargs.json")
    if importance is None:
        importance = modisco_kwargs['grad_type']

    # Centroid matches
    cm_path = modisco_dir / 'centroid_seqlet_matches.csv'
    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        modisco_centroid_seqlet_matches(modisco_dir,
                                        imp_scores,
                                        modisco_dir,
                                        trim_frac=trim_frac,
                                        n_jobs=n_jobs)
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    mr = ModiscoResult(modisco_dir / "modisco.h5")
    mr.open()
    tasks = mr.tasks()

    # HACK prune the tasks of importance (in case it's present)
    tasks = [t.replace(f"/{importance}", "") for t in tasks]
    logger.info(f"Using tasks: {tasks}")

    if imp_scores is not None:
        logger.info(f"Loading the importance scores from: {imp_scores}")
        imp = ImpScoreFile(imp_scores, default_imp_score=importance)
    else:
        imp = ImpScoreFile.from_modisco_dir(modisco_dir,
                                            ignore_include_samples=ignore_filter)

    seq, contrib, hyp_contrib, profile, ranges = imp.get_all()

    logger.info("Scanning for patterns")
    dfl = []
    for pattern_name in tqdm(mr.patterns()):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib, hyp_contrib, tasks,
                                                    n_jobs=n_jobs, verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(tasks, match, importance, seq_match,
                                    norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                    verbose=False, plot=False)
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    logger.info("Table info")
    dfp.info()
    logger.info(f"Writing the resulting pd.DataFrame of shape {dfp.shape} to {output_file}")
    # write to a parquet file
    dfp.to_parquet(output_file, partition_on=['pattern'], engine='fastparquet')
    logger.info("Done!")

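# The partitioned parquet file written by `modisco_score2` can be read back with
# pandas (a sketch; the path and pattern name are illustrative, the fastparquet
# engine mirrors the writer above):
#
#     import pandas as pd
#     dfp = pd.read_parquet("output/modisco/Oct4/instances.parq", engine='fastparquet')
#     dfp[dfp.pattern == "metacluster_0/pattern_0"].head()
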
def modisco_report_all(modisco_dir, trim_frac=0.08, n_jobs=20,
                       scan_instances=False, force=False):
    """Compute all the results for modisco. Runs:
    - modisco_plot
    - modisco_report
    - modisco_table
    - modisco_centroid_seqlet_matches
    - modisco_score2
    - modisco2bed
    - modisco_instances_to_bed

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json
      trim_frac: how much to trim the pattern
      n_jobs: number of parallel jobs to use
      scan_instances: if True, also scan for new pattern instances
        (modisco_centroid_seqlet_matches and modisco_score2)
      force: if True, commands will be re-run regardless of whether they have
        already been computed

    Note:
      All the sub-commands are only executed if they have not been run before.
      Use --force to override this. Whether a command has been run before is
      determined by checking if the following file exists:
      `{modisco_dir}/.modisco_report_all/{command}.done`.
    """
    plt.switch_backend('agg')
    from basepair.utils import ConditionalRun

    modisco_dir = Path(modisco_dir)
    # figure out the importance scores used
    kwargs = read_json(modisco_dir / "kwargs.json")
    imp_scores = kwargs["imp_scores"]

    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
    mr.open()
    all_patterns = mr.patterns()
    mr.close()
    if len(all_patterns) == 0:
        print("No patterns found.")
        # Touch results.html for snakemake
        open(modisco_dir / 'results.html', 'a').close()
        open(modisco_dir / 'seqlets/scored_regions.bed', 'a').close()
        return

    # class determining whether to run the command or not (poor-man's snakemake)
    cr = ConditionalRun("modisco_report_all", None, modisco_dir, force=force)
    sync = []
    # --------------------------------------------
    if (not cr.set_cmd('modisco_plot').done()
            or not cr.set_cmd('modisco_cluster_patterns').done()
            or not cr.set_cmd('modisco_enrich_patterns').done()):
        # load ImpScoreFile and pass it to all the functions
        logger.info("Loading ImpScoreFile")
        impsf = ImpScoreFile.from_modisco_dir(modisco_dir)
        impsf.cache()
    else:
        impsf = None
    # --------------------------------------------
    # Basic reports
    if not cr.set_cmd('modisco_plot').done():
        modisco_plot(modisco_dir,
                     modisco_dir / 'plots',
                     figsize=(10, 10),
                     impsf=impsf)
        cr.write()
    sync.append("plots")

    if not cr.set_cmd('modisco_report').done():
        modisco_report(str(modisco_dir), str(modisco_dir))
        cr.write()
    sync.append("results.html")

    if not cr.set_cmd('modisco_table').done():
        modisco_table(modisco_dir, modisco_dir, report_url=None, impsf=impsf)
        cr.write()
    sync.append("footprints.pkl")
    sync.append("pattern_table.*")

    if not cr.set_cmd('modisco_cluster_patterns').done():
        modisco_cluster_patterns(modisco_dir, modisco_dir)
        cr.write()
    sync.append("patterns.pkl")
    sync.append("cluster-patterns.*")
    sync.append("motif_clustering")

    if not cr.set_cmd('modisco_enrich_patterns').done():
        modisco_enrich_patterns(modisco_dir / 'patterns.pkl',
                                modisco_dir,
                                modisco_dir / 'patterns.pkl',
                                impsf=impsf)
        cr.write()
    # sync.append("patterns.pkl")

    # TODO - run modisco align
    # - [ ] add the motif clustering step (as ipynb) and export the aligned tables
    #   - save the final table as a result to CSV (ready to be imported in excel)

    # --------------------------------------------
    # Finding new instances
    if scan_instances:
        if not cr.set_cmd('modisco_centroid_seqlet_matches').done():
            modisco_centroid_seqlet_matches(modisco_dir,
                                            imp_scores,
                                            modisco_dir,
                                            trim_frac=trim_frac,
                                            n_jobs=n_jobs,
                                            impsf=impsf)
            cr.write()

        # TODO - this would not work with the per-TF importance score file...
        if not cr.set_cmd('modisco_score2').done():
            modisco_score2(modisco_dir,
                           modisco_dir / 'instances.parq',
                           trim_frac=trim_frac,
                           imp_scores=None,  # Use the default one
                           importance=None,  # Use the default one
                           n_jobs=n_jobs)
            cr.write()
        # TODO - update the pattern table -> compute the fraction of other motifs etc

    # --------------------------------------------
    # Export bed-files and bigwigs

    # Seqlets
    if not cr.set_cmd('modisco2bed').done():
        modisco2bed(str(modisco_dir),
                    str(modisco_dir / 'seqlets'),
                    trim_frac=trim_frac)
        cr.write()
    sync.append("seqlets")

    # Scanned instances
    # if not cr.set_cmd('modisco_instances_to_bed').done():
    #     modisco_instances_to_bed(str(modisco_dir / 'modisco.h5'),
    #                              instances_parq=str(modisco_dir / 'instances.parq'),
    #                              imp_score_h5=imp_scores,
    #                              output_dir=str(modisco_dir / 'instances_bed/'),
    #                              )
    #     cr.write()
    # sync.append("instances_bed")

    # print the rsync command to run in order to sync the output
    # directories to the webserver
    logger.info("Run the following command to sync files to the webserver")
    dirs = " ".join(sync)
    print(f"rsync -av --progress {dirs} <output_dir>/")

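# Typical top-level call for a single modisco run directory (a sketch; the path
# is a placeholder corresponding to `output_dir` of `basepair.cli.modisco.modisco_run`):
#
#     modisco_report_all("output/modisco/Oct4", trim_frac=0.08, n_jobs=20,
#                        scan_instances=False, force=False)
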
def modisco_score2_single_binary(modisco_dir,
                                 output_file,
                                 imp_scores=None,
                                 trim_frac=0.08,
                                 n_jobs=20):
    """Equivalent of `modisco_score2` for the binary model."""
    import modisco
    from modisco.tfmodisco_workflow import workflow

    cm_path = os.path.join(modisco_dir, 'centroid_seqlet_matches.csv')
    dfm_norm = pd.read_csv(cm_path)

    mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
    mr.open()
    tasks = mr.tasks()

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdf5 file
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]
    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])
    else:
        # no filter -> keep all examples
        included_samples = np.ones(len(one_hot), dtype=bool)

    hyp_contrib = {
        f"{task}": d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks
        for gt in grad_type.split(",")
    }
    contrib = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks
        for gt in grad_type.split(",")
    }
    seq = one_hot[included_samples]

    ranges = pd.DataFrame({
        "chrom": d['metadata']['range']['chr'][:][included_samples],
        "start": d['metadata']['range']['start'][:][included_samples],
        "end": d['metadata']['range']['end'][:][included_samples],
        "strand": d['metadata']['range']['strand'][:][included_samples],
        "idx": np.arange(len(included_samples)),
        "interval_from_task": d['metadata']['interval_from_task'][:][included_samples],
    })

    print("Scanning for patterns")
    dfl = []
    mr_patterns = mr.patterns()  # [:2]
    for pattern_name in tqdm(mr_patterns):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib, hyp_contrib, tasks,
                                                    n_jobs=n_jobs, verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(tasks, match, importance, seq_match,
                                    norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
                                    verbose=False, plot=False)
        dfl.append(dfm)

    print("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    print("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    dfp.info()
    dfp.to_parquet(output_file)
    return None

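# Hypothetical usage of the binary-model variant above; it expects
# `centroid_seqlet_matches.csv` and `results.hdf5` inside `modisco_dir`
# (paths below are placeholders):
#
#     modisco_score2_single_binary("output/modisco_binary/Oct4",
#                                  output_file="output/modisco_binary/Oct4/instances.parq",
#                                  trim_frac=0.08, n_jobs=8)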