def __init__(self, mr, d, included_samples, tasks):
    """Pre-extract per-pattern signal matrices from a modisco result.

    For every pattern this caches: the observed profile (narrow and
    footprint-wide), the profile/counts importance scores, and the
    one-hot sequence under each seqlet.

    Args:
      mr: modisco result wrapper; `mr.seqlets()` returns seqlets per pattern.
      d: underlying data handle (stored for later use).
      included_samples: deprecated — pass None.
      tasks: iterable of task names to extract signal for.
    """
    self.mr = mr
    self.d = d
    if included_samples is not None:
        warnings.warn(
            "included_samples deprecated. use included_samples=None")
    self.included_samples = included_samples  # TODO - remove this
    self.tasks = tasks
    self.seqlets_per_task = self.mr.seqlets()
    self.profile = {}
    self.profile_wide = {}
    self.grad_profile = {}
    self.grad_counts = {}
    self.seq = {}
    self.footprint_width = 200

    # Setup all the required matrices
    for pattern, seqlets in tqdm(self.seqlets_per_task.items()):
        # Keep only seqlets whose resized footprint stays fully inside
        # the region, then widen them to `footprint_width`.
        wide_seqlets = [
            s.resize(self.footprint_width) for s in seqlets
            if s.center() > self.footprint_width // 2
            and s.center() < self.get_seqlen(pattern) - self.footprint_width // 2
        ]
        # observed profile data (narrow seqlets and wide footprints)
        self.profile[pattern] = {
            task: extract_signal(self.get_region_profile(task), seqlets)
            for task in tasks
        }
        self.profile_wide[pattern] = {
            task: extract_signal(self.get_region_profile(task), wide_seqlets)
            for task in tasks
        }
        # importance scores for the profile and counts heads
        self.grad_profile[pattern] = {
            task: extract_signal(self.get_region_grad(task, 'profile'), seqlets)
            for task in tasks
        }
        self.grad_counts[pattern] = {
            task: extract_signal(self.get_region_grad(task, 'counts'), seqlets)
            for task in tasks
        }
        # one-hot sequence under each seqlet
        self.seq[pattern] = extract_signal(self.get_region_seq(), seqlets)
def profile_features(seqlets, ref_seqlets, profile,
                     profile_width=70, profile_ref=None):
    """Score each seqlet's profile against the reference average profile.

    Seqlets are resized to `profile_width`, their profile extracted, and
    similarity metrics computed against the mean reference profile. The
    `*_p` columns are quantile-normalized against the reference seqlets'
    own metric distribution.

    Args:
      seqlets: seqlets to score.
      ref_seqlets: seqlets defining the reference distribution.
      profile: array of observed profiles, indexed by the seqlets.
      profile_width: width to resize every seqlet to.
      profile_ref: optional separate profile array for the reference
        seqlets; defaults to `profile`.

    Returns:
      pd.DataFrame with one row per seqlet (same order as `seqlets`).
    """
    from basepair.exp.chipnexus.simulate import profile_sim_metrics

    # resize both seqlet sets to a common width
    seqlets = resize_seqlets(seqlets, profile_width, seqlen=profile.shape[1])
    seqlets_ref = resize_seqlets(ref_seqlets, profile_width,
                                 seqlen=profile.shape[1])
    if profile_ref is None:
        profile_ref = profile

    # extract the profile under each (resized) seqlet
    seqlet_profile = extract_signal(profile, seqlets)
    seqlet_profile_ref = extract_signal(profile_ref, seqlets_ref)

    # compute the average reference profile
    avg_profile = seqlet_profile_ref.mean(axis=0)

    # +1e-6 pseudo-count avoids log/division issues on empty positions
    metrics = pd.DataFrame([
        profile_sim_metrics(avg_profile + 1e-6, cp + 1e-6)
        for cp in seqlet_profile
    ])
    metrics_ref = pd.DataFrame([
        profile_sim_metrics(avg_profile + 1e-6, cp + 1e-6)
        for cp in seqlet_profile_ref
    ])
    assert len(metrics) == len(seqlets)  # needs to be the same length

    # quantile normalization is undefined when any KL is infinite
    if metrics.simmetric_kl.min() == np.inf or \
            metrics_ref.simmetric_kl.min() == np.inf:
        profile_match_p = None
    else:
        profile_match_p = quantile_norm(metrics.simmetric_kl,
                                        metrics_ref.simmetric_kl)
    return pd.DataFrame(
        OrderedDict([
            ("profile_match", metrics.simmetric_kl),
            ("profile_match_p", profile_match_p),
            ("profile_counts", metrics['counts']),
            ("profile_counts_p",
             quantile_norm(metrics['counts'], metrics_ref['counts'])),
            ("profile_max", metrics['max']),
            ("profile_max_p",
             quantile_norm(metrics['max'], metrics_ref['max'])),
            ("profile_counts_max_ref", metrics['counts_max_ref']),
            ("profile_counts_max_ref_p",
             quantile_norm(metrics['counts_max_ref'],
                           metrics_ref['counts_max_ref'])),
        ]))
def get_signal(seqlets, d: ImpScoreFile, tasks, resize_width=200):
    """Extract profile, contribution and sequence signal for a set of seqlets.

    Seqlets too close to the region edge to be resized are dropped.

    Args:
      seqlets: seqlets to extract signal for.
      d: importance-score file handle.
      tasks: task names to extract.
      resize_width: width to resize seqlets to; None keeps the width of
        the first seqlet.

    Returns:
      (ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx)
      where ex_contrib_counts is None when no counts importance score is
      stored, and sort_idx orders seqlets by descending total counts.
    """
    thr_one_hot = d.get_seq()

    if resize_width is None:
        # width = first seqlet's width
        resize_width = seqlets[0].end - seqlets[0].start

    # keep only seqlets whose resized span fits inside the sequence
    start_pad = np.ceil(resize_width / 2)
    end_pad = thr_one_hot.shape[1] - start_pad
    valid_seqlets = [
        s.resize(resize_width) for s in seqlets
        if (s.center() > start_pad) and (s.center() < end_pad)
    ]

    # observed profile per task
    ex_signal = {
        task: extract_signal(d.get_profiles()[task], valid_seqlets)
        for task in tasks
    }
    # profile-head contribution, summed over channels
    ex_contrib_profile = {
        task: extract_signal(d.get_contrib()[task], valid_seqlets).sum(axis=-1)
        for task in tasks
    }

    # counts-head contribution: the score name differs between file
    # versions, so probe both before giving up
    if d.contains_imp_score('count'):
        ex_contrib_counts = {
            task: extract_signal(d.get_contrib("count")[task],
                                 valid_seqlets).sum(axis=-1)
            for task in tasks
        }
    elif d.contains_imp_score('counts/pre-act'):
        ex_contrib_counts = {
            task: extract_signal(d.get_contrib("counts/pre-act")[task],
                                 valid_seqlets).sum(axis=-1)
            for task in tasks
        }
    else:
        ex_contrib_counts = None

    ex_seq = extract_signal(thr_one_hot, valid_seqlets)

    # NOTE: the original also called d.get_all() here and discarded every
    # returned value; that full-dataset load was removed as dead code.

    # order seqlets by total observed counts, descending
    total_counts = sum(
        [x.sum(axis=-1).sum(axis=-1) for x in ex_signal.values()])
    sort_idx = np.argsort(-total_counts)

    return ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx
def __init__(self, mr, d, included_samples, tasks):
    """Pre-extract importance-score and sequence matrices per pattern.

    Lighter variant: only the profile-head importance scores and the
    one-hot sequence are cached (observed profiles and counts-head
    scores are intentionally not loaded).

    Args:
      mr: modisco result wrapper; `mr.seqlets()` returns seqlets per pattern.
      d: underlying data handle (stored for later use).
      included_samples: stored as-is.
      tasks: iterable of task names to extract signal for.
    """
    self.mr = mr
    self.d = d
    self.included_samples = included_samples
    self.tasks = tasks
    self.seqlets_per_task = self.mr.seqlets()
    self.profile = {}
    self.profile_wide = {}
    self.grad_profile = {}
    self.grad_counts = {}
    self.seq = {}
    self.footprint_width = 200
    print('loaded seqlets')

    # Setup all the required matrices
    for pattern, seqlets in tqdm(self.seqlets_per_task.items()):
        # Keep only seqlets whose resized footprint stays fully inside
        # the region, then widen them to `footprint_width`.
        wide_seqlets = [
            s.resize(self.footprint_width) for s in seqlets
            if s.center() > self.footprint_width // 2
            and s.center() < self.get_seqlen(pattern) - self.footprint_width // 2
        ]
        # profile-head importance scores per task
        self.grad_profile[pattern] = {
            task: extract_signal(self.get_region_grad(task, 'profile'), seqlets)
            for task in tasks
        }
        # one-hot sequence under each seqlet
        self.seq[pattern] = extract_signal(self.get_region_seq(), seqlets)
def plot_power_spectrum(pattern, task, data):
    """Plot the aggregate importance-score profile and its power spectrum.

    Returns:
      (heatmap_fig, avg_fig, fft_fig): the per-seqlet importance heatmap,
      the average profile (original vs. smoothed, plus their difference),
      and the power-spectrum figure.
    """
    seqlets = data.seqlets_per_task[pattern]
    # keep only seqlets whose resized footprint fits inside the sequence
    wide_seqlets = [
        s.resize(data.footprint_width) for s in seqlets
        if s.center() > data.footprint_width // 2
        and s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
    ]
    p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)
    # aggregate |importance| over channels and seqlets, on a log scale
    agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))

    # heatmap of the first 500 seqlets' per-position importance;
    # grab the implicit current figure right after plotting
    heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500],
                                         pmin=50, pmax=99),
                               figsize=(10, 20))
    heatmap_fig = plt.gcf()
    # heatmap_importance_profile(np.abs(p*seq).sum(axis=-1)[:500], figsize=(10, 20))

    # z-score the aggregate profile
    agg_profile = agg_profile - agg_profile.mean()
    agg_profile = agg_profile / agg_profile.std()
    # frequencies for the right-hand segment (positions 102..end)
    freq = np.fft.fftfreq(agg_profile[102:].shape[-1])

    # split into a smooth trend and the oscillatory residual
    smooth_part = smooth(agg_profile, 10)
    oscilatory_part = agg_profile - smooth_part

    avg_fig, axes = plt.subplots(2, 1, figsize=(11, 4), sharex=True)
    axes[0].plot(agg_profile, label='original')
    axes[0].plot(smooth_part, label="smooth", alpha=0.5)
    axes[0].legend()
    axes[0].set_ylabel("Avg. importance")
    axes[0].set_title("Average importance score")
    # axes[0].set_xlabel("Position");
    axes[1].plot(oscilatory_part)
    axes[1].set_xlabel("Position")
    axes[1].set_ylabel("original - smooth")
    avg_fig.subplots_adjust(hspace=0)  # no space between plots
    # plt.savefig('nanog-agg-profile.png', dpi=300)
    # plt.savefig('nanog-agg-profile.pdf')

    # power spectrum: sum of the two halves' FFT power over the first
    # 49 bins, plotted against period (1/frequency) in bp.
    # NOTE(review): the split points 102 / :98 look tied to
    # footprint_width == 200 — confirm before changing the width.
    fft_fig = plt.figure(figsize=(11, 2))
    plt.plot(
        1 / freq[:49],
        np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 +
        np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2, "-o")
    plt.xlim([0, 50])
    plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(25, integer=True))
    plt.grid(alpha=0.3)
    plt.xlabel("1/Frequency [bp]")
    plt.ylabel("Power spectrum")
    plt.title("Power spectrum")
    plt.gcf().subplots_adjust(bottom=0.4)
    return heatmap_fig, avg_fig, fft_fig
def compute_power_spectrum(pattern, task, data):
    """Estimate the periodicity of the aggregate importance profile.

    Returns:
      (ps, t0, 1/t0): power spectrum, dominant period, dominant frequency,
      as produced by `periodicity_ft`.
    """
    half = data.footprint_width // 2
    upper = data.get_seqlen(pattern) - half
    # widen every seqlet whose footprint fits inside the sequence
    footprints = []
    for s in data.seqlets_per_task[pattern]:
        if half < s.center() < upper:
            footprints.append(s.resize(data.footprint_width))
    imp = extract_signal(data.get_region_grad(task, 'profile'), footprints)
    # aggregate |importance| over channels and seqlets; log-scale, z-score
    track = np.log(np.abs(imp).sum(axis=-1).sum(axis=0))
    track = (track - track.mean()) / track.std()
    # remove the smooth trend, keep the oscillatory residual
    residual = track - smooth(track, 10)
    t0, ps = periodicity_ft(residual)
    return ps, t0, 1 / t0
def get_reference_profile(mr, pattern, profiles, tasks, profile_width=70,
                          trim_frac=0.08, seqlen=1000):
    """Average the observed profile across a pattern's reference seqlets.

    Seqlets are trimmed (`trim_frac`) and resized to `profile_width`
    before extraction.

    Returns:
      dict mapping task -> mean profile over the reference seqlets.
    """
    from basepair.modisco.results import resize_seqlets
    from basepair.plot.profiles import extract_signal
    ref_seqlets = resize_seqlets(
        mr._get_seqlets(pattern, trim_frac=trim_frac),
        profile_width, seqlen=seqlen)
    return {task: extract_signal(profiles[task], ref_seqlets).mean(axis=0)
            for task in tasks}
def compute_power_spectrum(pattern, task, data):
    """Compute the power spectrum of the aggregate importance profile.

    Returns:
      (ps, t0, freq): summed two-segment FFT power, periods (1/freq) and
      frequencies for the first 49 FFT bins.
    """
    # NOTE(review): this redefines (and thus shadows) the earlier
    # `compute_power_spectrum` in this module, and its third return value
    # differs (`freq` here vs. `1 / t0` there) — confirm which variant
    # callers expect.
    seqlets = data.seqlets_per_task[pattern]
    # keep only seqlets whose resized footprint fits inside the sequence
    wide_seqlets = [
        s.resize(data.footprint_width) for s in seqlets
        if s.center() > data.footprint_width // 2
        and s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
    ]
    p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)
    # aggregate |importance| over channels and seqlets; log-scale, z-score
    agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))
    agg_profile = agg_profile - agg_profile.mean()
    agg_profile = agg_profile / agg_profile.std()
    # remove the smooth trend, keep the oscillatory residual
    smooth_part = smooth(agg_profile, 10)
    oscilatory_part = agg_profile - smooth_part
    # frequencies for the right-hand segment (positions 102..end)
    freq = np.fft.fftfreq(agg_profile[102:].shape[-1])
    freq = freq[:49]
    # NOTE(review): freq[0] == 0, so t0[0] is inf (numpy emits a
    # divide-by-zero RuntimeWarning here)
    t0 = 1 / freq
    # power summed over the two segments of the profile;
    # the 102 / :98 split presumably assumes footprint_width == 200 — verify
    ps = np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(
        np.fft.fft(oscilatory_part[:98])[:49])**2
    return ps, t0, freq