Beispiel #1
0
    def __init__(self, mr, d, included_samples, tasks):
        """Pre-extract per-pattern signal matrices for downstream analysis.

        For every pattern, extracts observed profiles (normal and footprint
        width), profile/count importance scores and one-hot sequence for
        each task.

        Args:
          mr: modisco-result-like object providing `.seqlets()`
              (mapping pattern -> list of seqlets).
          d: data handle stored on the instance; presumably backing
             `get_region_profile` / `get_region_grad` / `get_region_seq`
             defined elsewhere in the class — confirm there.
          included_samples: deprecated; pass None (any other value emits
              a DeprecationWarning-style message).
          tasks: iterable of task names to extract signal for.
        """
        self.mr = mr
        self.d = d
        if included_samples is not None:
            warnings.warn(
                "included_samples deprecated. use included_samples=None")
        self.included_samples = included_samples  # TODO - remove this
        self.tasks = tasks

        self.seqlets_per_task = self.mr.seqlets()
        # per-pattern dicts of task -> extracted signal matrices
        self.profile = {}
        self.profile_wide = {}
        self.grad_profile = {}
        self.grad_counts = {}
        self.seq = {}

        # width (bp) used for the wide ("footprint") seqlet view
        self.footprint_width = 200

        # Setup all the required matrices
        for pattern, seqlets in tqdm(self.seqlets_per_task.items()):
            # widen seqlets, keeping only those whose widened window still
            # fits entirely inside the sequence
            wide_seqlets = [
                s.resize(self.footprint_width) for s in seqlets
                if s.center() > self.footprint_width // 2 and s.center() <
                self.get_seqlen(pattern) - self.footprint_width // 2
            ]

            # observed profile data (normal and footprint width)
            self.profile[pattern] = {
                task: extract_signal(self.get_region_profile(task), seqlets)
                for task in tasks
            }

            self.profile_wide[pattern] = {
                task: extract_signal(self.get_region_profile(task),
                                     wide_seqlets)
                for task in tasks
            }
            # importance scores (profile head and counts head)
            self.grad_profile[pattern] = {
                task: extract_signal(self.get_region_grad(task, 'profile'),
                                     seqlets)
                for task in tasks
            }
            self.grad_counts[pattern] = {
                task: extract_signal(self.get_region_grad(task, 'counts'),
                                     seqlets)
                for task in tasks
            }
            # one-hot sequence
            self.seq[pattern] = extract_signal(self.get_region_seq(), seqlets)
def profile_features(seqlets,
                     ref_seqlets,
                     profile,
                     profile_width=70,
                     profile_ref=None):
    """Compute per-seqlet profile-similarity features vs. a reference set.

    Seqlets and reference seqlets are resized to `profile_width`, their
    profile signal is extracted, and each seqlet's profile is compared to
    the average reference profile using `profile_sim_metrics`. Empirical
    p-value-like columns are obtained by quantile-normalizing each metric
    against the reference-seqlet distribution.

    Args:
      seqlets: seqlets to score.
      ref_seqlets: seqlets defining the reference distribution.
      profile: array of shape (regions, seqlen, ...) to extract from.
      profile_width: width to which all seqlets are resized.
      profile_ref: optional profile array for the reference seqlets;
        defaults to `profile`.

    Returns:
      pd.DataFrame with one row per seqlet and columns
      profile_match(_p), profile_counts(_p), profile_max(_p),
      profile_counts_max_ref(_p).

    Raises:
      ValueError: if the number of metric rows doesn't match `seqlets`.
    """
    from basepair.exp.chipnexus.simulate import profile_sim_metrics

    # resize seqlets to a common width so extracted profiles are comparable
    seqlets = resize_seqlets(seqlets, profile_width, seqlen=profile.shape[1])
    seqlets_ref = resize_seqlets(ref_seqlets,
                                 profile_width,
                                 seqlen=profile.shape[1])

    if profile_ref is None:
        profile_ref = profile
    # extract the profile signal for each (resized) seqlet
    seqlet_profile = extract_signal(profile, seqlets)
    seqlet_profile_ref = extract_signal(profile_ref, seqlets_ref)

    # average reference profile used as the comparison target
    avg_profile = seqlet_profile_ref.mean(axis=0)

    # 1e-6 pseudo-count guards the similarity metrics against all-zero bins
    metrics = pd.DataFrame([
        profile_sim_metrics(avg_profile + 1e-6, cp + 1e-6)
        for cp in seqlet_profile
    ])
    metrics_ref = pd.DataFrame([
        profile_sim_metrics(avg_profile + 1e-6, cp + 1e-6)
        for cp in seqlet_profile_ref
    ])

    # explicit check (not `assert`) so it survives `python -O`
    if len(metrics) != len(seqlets):
        raise ValueError(
            f"metrics length {len(metrics)} != seqlets length {len(seqlets)}")

    # quantile_norm is undefined when the KL metric diverged (inf)
    if metrics.simmetric_kl.min() == np.inf or \
            metrics_ref.simmetric_kl.min() == np.inf:
        profile_match_p = None
    else:
        profile_match_p = quantile_norm(metrics.simmetric_kl,
                                        metrics_ref.simmetric_kl)
    return pd.DataFrame(
        OrderedDict([
            ("profile_match", metrics.simmetric_kl),
            ("profile_match_p", profile_match_p),
            ("profile_counts", metrics['counts']),
            ("profile_counts_p",
             quantile_norm(metrics['counts'], metrics_ref['counts'])),
            ("profile_max", metrics['max']),
            ("profile_max_p", quantile_norm(metrics['max'],
                                            metrics_ref['max'])),
            ("profile_counts_max_ref", metrics['counts_max_ref']),
            ("profile_counts_max_ref_p",
             quantile_norm(metrics['counts_max_ref'],
                           metrics_ref['counts_max_ref'])),
        ]))
Beispiel #3
0
def get_signal(seqlets, d: ImpScoreFile, tasks, resize_width=200):
    """Extract observed and contribution signal for a set of seqlets.

    Args:
      seqlets: list of seqlet objects with `.resize`, `.center`, `.start`,
        `.end`.
      d: importance-score file handle.
      tasks: iterable of task names.
      resize_width: width to resize seqlets to; if None, the width of the
        first seqlet is used.

    Returns:
      (ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx):
      per-task observed profiles, per-task profile contribution sums,
      per-task count contribution sums (or None if no count score is
      stored), one-hot sequence per seqlet, and indices sorting seqlets by
      decreasing total observed counts.
    """
    thr_one_hot = d.get_seq()

    if resize_width is None:
        # fall back to the width of the first seqlet
        resize_width = seqlets[0].end - seqlets[0].start

    # keep only seqlets whose resized window fits entirely in the sequence
    start_pad = np.ceil(resize_width / 2)
    end_pad = thr_one_hot.shape[1] - start_pad
    valid_seqlets = [
        s.resize(resize_width) for s in seqlets
        if (s.center() > start_pad) and (s.center() < end_pad)
    ]

    # observed profile data
    ex_signal = {
        task: extract_signal(d.get_profiles()[task], valid_seqlets)
        for task in tasks
    }

    # profile contribution scores, summed over channels
    ex_contrib_profile = {
        task: extract_signal(d.get_contrib()[task], valid_seqlets).sum(axis=-1)
        for task in tasks
    }

    # count contribution scores under either storage key, if present
    if d.contains_imp_score('count'):
        ex_contrib_counts = {
            task: extract_signal(d.get_contrib("count")[task],
                                 valid_seqlets).sum(axis=-1)
            for task in tasks
        }
    elif d.contains_imp_score('counts/pre-act'):
        ex_contrib_counts = {
            task: extract_signal(
                d.get_contrib("counts/pre-act")[task],
                valid_seqlets).sum(axis=-1)
            for task in tasks
        }
    else:
        ex_contrib_counts = None

    ex_seq = extract_signal(thr_one_hot, valid_seqlets)

    # NOTE(review): the original called `d.get_all()` here and discarded all
    # five returned values; removed as dead (and expensive) code — confirm
    # get_all() has no required side effects.

    # order seqlets by decreasing total observed counts across tasks
    total_counts = sum(
        [x.sum(axis=-1).sum(axis=-1) for x in ex_signal.values()])
    sort_idx = np.argsort(-total_counts)
    return ex_signal, ex_contrib_profile, ex_contrib_counts, ex_seq, sort_idx
Beispiel #4
0
    def __init__(self, mr, d, included_samples, tasks):
        """Load per-pattern importance-score and sequence matrices.

        Lighter-weight variant: only the profile-head importance scores and
        the one-hot sequence are extracted — observed-profile and
        count-score extraction is deliberately disabled (commented out
        below).

        Args:
          mr: modisco-result-like object providing `.seqlets()`
              (mapping pattern -> list of seqlets).
          d: data handle stored on the instance; presumably backing the
             `get_region_*` getters defined elsewhere in the class.
          included_samples: stored as-is.
          tasks: iterable of task names to extract signal for.
        """
        self.mr = mr
        self.d = d
        self.included_samples = included_samples
        self.tasks = tasks

        self.seqlets_per_task = self.mr.seqlets()
        # per-pattern dicts of task -> extracted signal matrices
        # (profile, profile_wide and grad_counts stay empty in this variant)
        self.profile = {}
        self.profile_wide = {}
        self.grad_profile = {}
        self.grad_counts = {}
        self.seq = {}

        # width (bp) used for the wide ("footprint") seqlet view
        self.footprint_width = 200
        print('loaded seqlets')
        # Setup all the required matrices
        for pattern, seqlets in tqdm(self.seqlets_per_task.items()):
            # widen seqlets, keeping only those whose widened window still
            # fits entirely inside the sequence
            wide_seqlets = [
                s.resize(self.footprint_width) for s in seqlets
                if s.center() > self.footprint_width // 2 and s.center() <
                self.get_seqlen(pattern) - self.footprint_width // 2
            ]

            # profile data: observed (disabled in this variant)
            #self.profile[pattern] = {task: extract_signal(self.get_region_profile(task), seqlets)
            #                         for task in tasks}

            #self.profile_wide[pattern] = {task: extract_signal(self.get_region_profile(task), wide_seqlets)
            #                              for task in tasks}
            # importance scores (profile head only)
            self.grad_profile[pattern] = {
                task: extract_signal(self.get_region_grad(task, 'profile'),
                                     seqlets)
                for task in tasks
            }
            # count-head scores (disabled in this variant)
            #self.grad_counts[pattern] = {task: extract_signal(self.get_region_grad(task, 'counts'), seqlets)
            #                             for task in tasks}
            # one-hot sequence
            self.seq[pattern] = extract_signal(self.get_region_seq(), seqlets)
Beispiel #5
0
def plot_power_spectrum(pattern, task, data):
    """Plot the positional power spectrum of importance scores for a pattern.

    Produces three matplotlib figures:
      1. a heatmap of per-seqlet absolute importance profiles,
      2. the average importance profile with its smoothed component,
      3. the power spectrum of the residual (original - smooth) signal.

    Args:
      pattern: pattern identifier; key into `data.seqlets_per_task`.
      task: task name passed to `data.get_region_grad`.
      data: object exposing `seqlets_per_task`, `footprint_width`,
        `get_seqlen` and `get_region_grad`.

    Returns:
      (heatmap_fig, avg_fig, fft_fig) tuple of matplotlib figures.
    """
    seqlets = data.seqlets_per_task[pattern]
    # widen seqlets, keeping only those whose widened window fits inside
    # the sequence
    wide_seqlets = [
        s.resize(data.footprint_width) for s in seqlets
        if s.center() > data.footprint_width //
        2 and s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
    ]
    p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)

    # aggregate |importance| over channels and seqlets, on a log scale
    agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))
    # heatmap of (up to) the first 500 seqlets, percentile-normalized
    heatmap_importance_profile(normalize(np.abs(p).sum(axis=-1)[:500],
                                         pmin=50,
                                         pmax=99),
                               figsize=(10, 20))
    heatmap_fig = plt.gcf()
    # heatmap_importance_profile(np.abs(p*seq).sum(axis=-1)[:500], figsize=(10, 20))

    # z-score the aggregate profile
    agg_profile = agg_profile - agg_profile.mean()
    agg_profile = agg_profile / agg_profile.std()
    # NOTE(review): the offsets 102 / 98 / 49 below look hard-coded for
    # footprint_width == 200 (two ~98-position halves of the profile) —
    # confirm before reusing with a different width
    freq = np.fft.fftfreq(agg_profile[102:].shape[-1])

    # split into a slowly-varying (smooth) and an oscillatory component
    smooth_part = smooth(agg_profile, 10)
    oscilatory_part = agg_profile - smooth_part

    avg_fig, axes = plt.subplots(2, 1, figsize=(11, 4), sharex=True)
    axes[0].plot(agg_profile, label='original')
    axes[0].plot(smooth_part, label="smooth", alpha=0.5)
    axes[0].legend()
    axes[0].set_ylabel("Avg. importance")
    axes[0].set_title("Average importance score")
    # axes[0].set_xlabel("Position");
    axes[1].plot(oscilatory_part)
    axes[1].set_xlabel("Position")
    axes[1].set_ylabel("original - smooth")
    avg_fig.subplots_adjust(hspace=0)  # no space between plots
    # plt.savefig('nanog-agg-profile.png', dpi=300)
    # plt.savefig('nanog-agg-profile.pdf')

    # power spectrum: sum of the squared FFT magnitudes of the two halves,
    # plotted against period (1/frequency)
    fft_fig = plt.figure(figsize=(11, 2))
    plt.plot(
        1 / freq[:49],
        np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 +
        np.abs(np.fft.fft(oscilatory_part[:98])[:49])**2, "-o")
    plt.xlim([0, 50])
    plt.gca().xaxis.set_major_locator(ticker.MaxNLocator(25, integer=True))
    plt.grid(alpha=0.3)
    plt.xlabel("1/Frequency [bp]")
    plt.ylabel("Power spectrum")
    plt.title("Power spectrum")
    plt.gcf().subplots_adjust(bottom=0.4)
    return heatmap_fig, avg_fig, fft_fig
Beispiel #6
0
def compute_power_spectrum(pattern, task, data):
    """Compute the periodicity of the aggregate importance profile.

    Aggregates absolute profile-head importance scores over channels and
    seqlets, z-scores the result, removes the smooth trend and runs
    `periodicity_ft` on the oscillatory residual.

    Args:
      pattern: pattern identifier; key into `data.seqlets_per_task`.
      task: task name passed to `data.get_region_grad`.
      data: object exposing `seqlets_per_task`, `footprint_width`,
        `get_seqlen` and `get_region_grad`.

    Returns:
      (ps, t0, 1 / t0) as produced by `periodicity_ft`.
    """
    half_width = data.footprint_width // 2
    seqlen = data.get_seqlen(pattern)
    # widen seqlets, dropping those whose widened window would fall
    # outside the sequence
    wide_seqlets = [
        seqlet.resize(data.footprint_width)
        for seqlet in data.seqlets_per_task[pattern]
        if half_width < seqlet.center() < seqlen - half_width
    ]
    scores = extract_signal(data.get_region_grad(task, 'profile'),
                            wide_seqlets)

    # aggregate |importance| over channels and seqlets (log scale),
    # then z-score
    agg_profile = np.log(np.abs(scores).sum(axis=-1).sum(axis=0))
    agg_profile = agg_profile - agg_profile.mean()
    agg_profile = agg_profile / agg_profile.std()

    # oscillatory residual after removing the slowly-varying trend
    oscilatory_part = agg_profile - smooth(agg_profile, 10)

    t0, ps = periodicity_ft(oscilatory_part)
    return ps, t0, 1 / t0
Beispiel #7
0
def get_reference_profile(mr,
                          pattern,
                          profiles,
                          tasks,
                          profile_width=70,
                          trim_frac=0.08,
                          seqlen=1000):
    """Generate the reference profile for a pattern.

    Fetches the pattern's seqlets (trimmed by `trim_frac`), resizes them
    to `profile_width`, and averages the extracted profile signal across
    seqlets for every task.

    Args:
      mr: modisco-result-like object providing `_get_seqlets`.
      pattern: pattern identifier.
      profiles: mapping task -> profile array to extract from.
      tasks: iterable of task names.
      profile_width: width to which seqlets are resized.
      trim_frac: trimming fraction forwarded to `_get_seqlets`.
      seqlen: sequence length used when resizing seqlets.

    Returns:
      dict mapping task -> average profile over the pattern's seqlets.
    """
    from basepair.modisco.results import resize_seqlets
    from basepair.plot.profiles import extract_signal

    ref_seqlets = mr._get_seqlets(pattern, trim_frac=trim_frac)
    ref_seqlets = resize_seqlets(ref_seqlets, profile_width, seqlen=seqlen)

    # per-task average of the extracted seqlet profiles
    return {
        task: extract_signal(profiles[task], ref_seqlets).mean(axis=0)
        for task in tasks
    }
Beispiel #8
0
def compute_power_spectrum(pattern, task, data):
    """Compute the power spectrum of the aggregate importance profile.

    NOTE(review): same name as the `periodicity_ft`-based variant elsewhere
    in this collection — if both live in one module, the later definition
    shadows the earlier one.

    Args:
      pattern: pattern identifier; key into `data.seqlets_per_task`.
      task: task name passed to `data.get_region_grad`.
      data: object exposing `seqlets_per_task`, `footprint_width`,
        `get_seqlen` and `get_region_grad`.

    Returns:
      (ps, t0, freq): summed power spectrum of the two profile halves,
      periods t0 = 1/freq (the DC bin freq[0] == 0 makes t0[0] infinite,
      with a numpy divide warning), and the frequencies themselves.
    """
    seqlets = data.seqlets_per_task[pattern]
    # widen seqlets, keeping only those whose widened window fits inside
    # the sequence
    wide_seqlets = [
        s.resize(data.footprint_width) for s in seqlets
        if s.center() > data.footprint_width //
        2 and s.center() < data.get_seqlen(pattern) - data.footprint_width // 2
    ]
    p = extract_signal(data.get_region_grad(task, 'profile'), wide_seqlets)

    # aggregate |importance| over channels and seqlets (log scale)
    agg_profile = np.log(np.abs(p).sum(axis=-1).sum(axis=0))

    # z-score
    agg_profile = agg_profile - agg_profile.mean()
    agg_profile = agg_profile / agg_profile.std()

    # oscillatory residual after removing the slowly-varying trend
    smooth_part = smooth(agg_profile, 10)
    oscilatory_part = agg_profile - smooth_part

    # NOTE(review): 102 / 98 / 49 look hard-coded for footprint_width == 200
    # (two ~98-position halves) — confirm before reusing elsewhere
    freq = np.fft.fftfreq(agg_profile[102:].shape[-1])

    freq = freq[:49]
    t0 = 1 / freq
    # power = sum of squared FFT magnitudes of the two profile halves
    ps = np.abs(np.fft.fft(oscilatory_part[102:])[:49])**2 + np.abs(
        np.fft.fft(oscilatory_part[:98])[:49])**2
    return ps, t0, freq