def evaluate(self, dataset, eval_metric=None, num_workers=8, batch_size=256):
    """Evaluate the model on a dataset.

    Args:
      dataset: dataset exposing `batch_train_iter`
      eval_metric: if provided, a single metric applied to all collected
        labels and predictions; otherwise each head's own metric is used
      num_workers: number of workers used for data loading
      batch_size: batch size
    """
    lpreds = []
    llabels = []
    for inputs, targets in tqdm(dataset.batch_train_iter(cycle=False,
                                                         num_workers=num_workers,
                                                         batch_size=batch_size),
                                total=len(dataset) // batch_size):
        assert isinstance(targets, dict)
        target_keys = list(targets)
        llabels.append(deepcopy(targets))
        # keep only the predictions for the target keys
        bpreds = {k: v for k, v in self.predict(inputs, batch_size=None).items()
                  if k in target_keys}
        lpreds.append(bpreds)
        del inputs
        del targets
    preds = numpy_collate_concat(lpreds)
    labels = numpy_collate_concat(llabels)
    del lpreds
    del llabels

    if eval_metric is not None:
        return eval_metric(labels, preds)

    task_avg_tape = defaultdict(list)
    out = {}
    for task, heads in self.all_heads.items():
        for head in heads:
            target_name = head.get_target(task)
            if target_name not in labels:
                print(f"Target {target_name} not found. Skipping evaluation")
                continue
            res = head.metric(labels[target_name], preds[target_name])
            out[target_name] = res
            metrics_dict = flatten(res, separator='/')
            for k, v in metrics_dict.items():
                task_avg_tape[head.target_name.replace("{task}", "avg")
                              + "/" + k].append(v)
    for k, v in task_avg_tape.items():
        # store the across-task average
        out[k] = mean(v)

    # flatten everything into a single-level dict
    out = flatten(out, separator='/')
    return out
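
# Usage sketch (hypothetical names): `model` stands for a trained instance of
# the class defining `evaluate` above, and `valid_set` for a dataset exposing
# `batch_train_iter` as used there.
def _example_evaluate(model, valid_set):
    metrics = model.evaluate(valid_set, batch_size=128, num_workers=4)
    # `metrics` is a flat dict with '/'-separated keys, containing per-task
    # values plus their across-task averages
    for name, value in sorted(metrics.items()):
        print(name, value)
    return metrics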
def get_scores(ref_pred, alt_pred, tasks, motif, seqlen, center_coords):
    """Compute similarity metrics between reference and alternative predictions."""
    d = {}
    cstart, cend = center_coords
    for task in tasks:
        # profile - use the filtered tracks
        d[task] = flatten({"profile": profile_sim_metrics(ref_pred['profile'][task][cstart:cend],
                                                          alt_pred['profile'][task][cstart:cend])},
                          "/")
        # importance scores - use the central motif region
        if 'imp' in ref_pred:
            for imp_score in ref_pred["imp"][task]:
                imp, imp_frac = imp_sim_metrics(ref_pred["imp"][task][imp_score],
                                                alt_pred["imp"][task][imp_score],
                                                motif, seqlen)
                d[task] = {f"imp/{imp_score}": imp,
                           f"imp/{imp_score}_frac": imp_frac,
                           **d[task]}
    return d
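
# Usage sketch: score how much a side motif changes the prediction at the
# central motif. `ref_pred`/`alt_pred` follow the nested layout produced by
# `unflatten(bpnet.sim_pred(...), "/")` in `generate_sim` below; the task and
# motif names here are hypothetical.
def _example_get_scores(ref_pred, alt_pred):
    scores = get_scores(ref_pred, alt_pred,
                        tasks=['Oct4'],           # hypothetical task
                        motif='TTTGCAT',          # central motif sequence
                        seqlen=1000,
                        center_coords=[450, 550])
    # flat dict of 'profile/...' (and, if present, 'imp/...') metrics
    return scores['Oct4']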
def plot(self, kind='all', rotate_y=90, letter_width=0.2, height=0.8,
         ylab=True, **kwargs):
    """Plot the selected tracks using `plot_tracks`."""
    if isinstance(kind, list):
        kind_list = kind
    elif kind == 'all':
        kind_list = self._track_list
    else:
        self._validate_kind(kind)
        kind_list = [kind]

    tracks = OrderedDict([(kind, self._get_track(kind)) for kind in kind_list])
    tracks = skip_nan_tracks(flatten(tracks, "/"))

    if 'seq' in tracks:
        # override the raw sequence track with its information content
        tracks['seq'] = self.get_seq_ic()
    if 'title' not in kwargs:
        kwargs['title'] = self.name
    return plot_tracks(tracks,
                       rotate_y=rotate_y,
                       fig_width=len(self) * letter_width,
                       fig_height_per_track=height,
                       ylab=ylab,
                       **kwargs)
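
# Usage sketch (hypothetical name): `track_container` stands for an instance
# of the class defining `plot` above; which track kinds are available depends
# on its `_track_list`.
def _example_plot(track_container):
    # plot every available track; the sequence track is shown as
    # information content
    track_container.plot(kind='all', letter_width=0.25, height=1.0)
    # plot only a selected subset of tracks
    track_container.plot(kind=['seq'], ylab=False)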
def flatten(self):
    # return a new instance of the same class with `self.data` flattened;
    # `flatten` here refers to the module-level helper, not this method
    # (the previous `return super().__init__(...)` always returned None)
    return type(self)(flatten(self.data), attrs=deepcopy(self.attrs))
def get_lens(self):
    return list(flatten(self.dapply(len)).values())
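
# Usage sketch (hypothetical name): `container` stands for an instance of the
# nested-dict data class defining `flatten` and `get_lens` above.
def _example_flatten_get_lens(container):
    flat = container.flatten()   # new instance with single-level `data`
    lens = container.get_lens()  # lengths of all leaf entries
    return flat, lens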
def generate_sim(bpnet, central_motif, side_motif, side_distances,
                 center_coords=[450, 550], repeat=128,
                 importance=['count', 'weighted'], correct=False):
    outl = []
    tasks = bpnet.tasks
    seqlen = bpnet.input_seqlen()
    ref_preds = unflatten(bpnet.sim_pred(central_motif, repeat=repeat,
                                         importance=importance), "/")
    none_preds = unflatten(bpnet.sim_pred('', '', [], repeat=repeat,
                                          importance=importance), "/")

    alt_profiles = []
    for dist in tqdm(side_distances):
        # note: bpnet.sim_pred already averages the predictions
        alt_preds = unflatten(bpnet.sim_pred(central_motif, side_motif, [dist],
                                             repeat=repeat, importance=importance), "/")
        if correct:
            # Correct for the 'shoulder' effect
            #
            # this performs: AB - (B - 0)
            # where:
            # - AB: contains both the central and the side motif
            # - B : contains only the side motif
            # - 0 : doesn't contain any motif
            edge_only_preds = unflatten(bpnet.sim_pred('', side_motif, [dist],
                                                       repeat=repeat,
                                                       importance=importance), "/")
            alt_preds_f = flatten(alt_preds, '/')
            none_preds_f = flatten(none_preds, "/")
            # subtract the side-motif-only predictions
            alt_preds = unflatten({k: alt_preds_f[k] - v + none_preds_f[k]
                                   for k, v in flatten(edge_only_preds, "/").items()},
                                  "/")
        alt_profiles.append((dist, alt_preds))

        # `get_scores` normalizes by `A`, finally yielding: (AB - B + 0) / A
        scores = get_scores(ref_preds, alt_preds,
                            tasks, central_motif, seqlen, center_coords)

        # compute the distance metrics
        for task in bpnet.tasks:
            d = scores[task]
            # book-keeping
            d['task'] = task
            d['central_motif'] = central_motif
            d['side_motif'] = side_motif
            d['position'] = dist
            d['distance'] = dist - seqlen // 2
            outl.append(d)

    return pd.DataFrame(outl), alt_profiles
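
# Usage sketch (hypothetical motifs): measure how a side motif modulates the
# model's output at a central motif as a function of distance. `bpnet` is
# assumed to be a model object exposing `sim_pred`, `tasks` and
# `input_seqlen` as used by `generate_sim` above.
def _example_generate_sim(bpnet):
    dfm, alt_profiles = generate_sim(bpnet,
                                     central_motif='TTTGCAT',  # hypothetical
                                     side_motif='GGGATTA',     # hypothetical
                                     side_distances=range(520, 700, 10),
                                     repeat=64,
                                     correct=True)
    # one DataFrame row per (task, position) with similarity metrics
    return dfm, alt_profiles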
def modisco_plot(modisco_dir,
                 output_dir,
                 figsize=(10, 10),
                 impsf=None):
    """Plot the results of a modisco run

    Args:
      modisco_dir: modisco directory
      output_dir: output directory for writing the results
      figsize: output figure size
      impsf: [optional] modisco importance score file (ImpScoreFile)
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from basepair.plot.vdom import write_heatmap_pngs
    from basepair.plot.profiles import plot_profiles
    from basepair.utils import flatten

    output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")

    if impsf is not None:
        d = impsf
    else:
        d = ImpScoreFile.from_modisco_dir(modisco_dir)
        logger.info("Loading the importance scores")
        d.cache()  # load all

    thr_one_hot = d.get_seq()
    tracks = d.get_profiles()
    thr_hypothetical_contribs = dict()
    thr_contrib_scores = dict()
    # TODO - generalize this
    thr_hypothetical_contribs['weighted'] = d.get_hyp_contrib()
    thr_contrib_scores['weighted'] = d.get_contrib()
    tasks = d.get_tasks()

    # count importance (if it exists)
    if d.contains_imp_score("counts/pre-act"):
        count_imp_score = "counts/pre-act"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(imp_score=count_imp_score)
        thr_contrib_scores['count'] = d.get_contrib(imp_score=count_imp_score)
    elif d.contains_imp_score("count"):
        count_imp_score = "count"
        thr_hypothetical_contribs['count'] = d.get_hyp_contrib(imp_score=count_imp_score)
        thr_contrib_scores['count'] = d.get_contrib(imp_score=count_imp_score)

    thr_hypothetical_contribs = OrderedDict(flatten(thr_hypothetical_contribs,
                                                    separator='/'))
    thr_contrib_scores = OrderedDict(flatten(thr_contrib_scores, separator='/'))

    all_seqlets = mr.seqlets()
    all_patterns = mr.patterns()
    if len(all_patterns) == 0:
        print("No patterns found")
        return

    # 1. plots with tracks and contribution scores
    print("Writing results for contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks=tracks,
                  importance_scores=thr_contrib_scores,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=.5,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir / "{pattern}/agg_profile_contribcores"),
                  mkdir=True,
                  figsize=figsize)

    # 2. plots only with hypothetical contribution scores
    print("Writing results for hypothetical contribution scores")
    plot_profiles(all_seqlets,
                  thr_one_hot,
                  tracks={},
                  importance_scores=thr_hypothetical_contribs,
                  legend=False,
                  flip_neg=True,
                  rotate_y=0,
                  seq_height=1,
                  patterns=all_patterns,
                  n_bootstrap=100,
                  fpath_template=str(output_dir / "{pattern}/agg_profile_hypcontribscores"),
                  figsize=figsize)

    print("Plotting heatmaps")
    for pattern in tqdm(all_patterns):
        write_heatmap_pngs(all_seqlets[pattern],
                           d,
                           tasks,
                           pattern,
                           output_dir=str(output_dir / pattern))

    mr.close()
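
# Usage sketch (hypothetical paths): render the per-pattern figures for a
# finished modisco run; `modisco_dir` must contain `modisco.h5`.
def _example_modisco_plot():
    modisco_plot(modisco_dir='output/Oct4/modisco',
                 output_dir='output/Oct4/modisco/plots',
                 figsize=(10, 10))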