Example #1
0
    def evaluate(self,
                 dataset,
                 eval_metric=None,
                 num_workers=8,
                 batch_size=256):
        """Run predictions over `dataset` and compute evaluation metrics.

        Args:
          dataset: object providing `batch_train_iter(cycle, num_workers,
            batch_size)` yielding `(inputs, targets)` pairs and supporting
            `len()`. Every `targets` batch has to be a dict.
          eval_metric: optional callable invoked as
            `eval_metric(labels, preds)`. When given, its return value is
            returned as is.
          num_workers: forwarded to `dataset.batch_train_iter`.
          batch_size: forwarded to `dataset.batch_train_iter`.

        Returns:
          The result of `eval_metric` if it was provided. Otherwise a dict
          flattened with '/' as separator holding the output of each
          head's `metric` under its target name, plus the across-task
          mean of every metric stored under the head's `target_name`
          with "{task}" replaced by "avg".
        """
        batch_iter = dataset.batch_train_iter(cycle=False,
                                              num_workers=num_workers,
                                              batch_size=batch_size)
        pred_batches = []
        label_batches = []
        # NOTE: the progress-bar total uses floor division of the dataset
        # length by the batch size
        for inputs, targets in tqdm(batch_iter,
                                    total=len(dataset) // batch_size):
            assert isinstance(targets, dict)
            label_batches.append(deepcopy(targets))
            batch_preds = self.predict(inputs, batch_size=None)
            # keep only the target key predictions
            pred_batches.append({name: value
                                 for name, value in batch_preds.items()
                                 if name in targets})
            del inputs, targets

        preds = numpy_collate_concat(pred_batches)
        labels = numpy_collate_concat(label_batches)
        del pred_batches, label_batches

        # a user-supplied metric short-circuits the per-head evaluation
        if eval_metric is not None:
            return eval_metric(labels, preds)

        out = {}
        avg_tape = defaultdict(list)
        for task, heads in self.all_heads.items():
            for head in heads:
                target_name = head.get_target(task)
                if target_name not in labels:
                    print(
                        f"Target {target_name} not found. Skipping evaluation"
                    )
                    continue
                res = head.metric(labels[target_name], preds[target_name])
                out[target_name] = res
                # collect each metric value under the task-agnostic "avg" key
                for metric_name, value in flatten(res, separator='/').items():
                    avg_key = (head.target_name.replace("{task}", "avg") +
                               "/" + metric_name)
                    avg_tape[avg_key].append(value)

        # get the average
        for avg_key, values in avg_tape.items():
            out[avg_key] = mean(values)

        # flatten everything
        return flatten(out, separator='/')
Example #2
0
def get_scores(ref_pred, alt_pred, tasks, motif, seqlen, center_coords):
    """Compare reference and alternative predictions for every task.

    Args:
      ref_pred: nested dict with a 'profile' entry (indexed by task) and
        optionally an 'imp' entry (indexed by task, then importance score).
      alt_pred: predictions with the same layout as `ref_pred`.
      tasks: iterable of task names.
      motif: forwarded to `imp_sim_metrics`.
      seqlen: forwarded to `imp_sim_metrics`.
      center_coords: `(start, end)` pair used to slice the profile tracks.

    Returns:
      Dict mapping each task to a flat dict of similarity metrics:
      "profile/..." keys from `profile_sim_metrics` and, when 'imp' is
      present in `ref_pred`, "imp/<score>" and "imp/<score>_frac" keys.
    """
    cstart, cend = center_coords
    scores = {}
    for task in tasks:
        # profile - use the filtered tracks
        profile_metrics = profile_sim_metrics(
            ref_pred['profile'][task][cstart:cend],
            alt_pred['profile'][task][cstart:cend])
        task_scores = flatten({"profile": profile_metrics}, "/")

        # importance scores - use the central motif region
        if 'imp' in ref_pred:
            for imp_score in ref_pred["imp"][task]:
                imp, imp_frac = imp_sim_metrics(
                    ref_pred["imp"][task][imp_score],
                    alt_pred["imp"][task][imp_score], motif, seqlen)
                # new keys go first; previously stored entries come last
                # and therefore win on a key collision
                task_scores = {
                    f"imp/{imp_score}": imp,
                    f"imp/{imp_score}_frac": imp_frac,
                    **task_scores
                }
        scores[task] = task_scores
    return scores
Example #3
0
    def plot(self, kind='all', rotate_y=90, letter_width=0.2,
             height=0.8, ylab=True, **kwargs):
        """Plot the selected tracks using `plot_tracks`.

        Args:
          kind: 'all' (use `self._track_list`), a single track kind
            (checked with `self._validate_kind`) or a list of track kinds
            (used as given, without validation).
          rotate_y: forwarded to `plot_tracks`.
          letter_width: figure width per position; the total figure width
            is `len(self) * letter_width`.
          height: forwarded to `plot_tracks` as `fig_height_per_track`.
          ylab: forwarded to `plot_tracks`.
          **kwargs: extra keyword arguments for `plot_tracks`. `title`
            defaults to `self.name` when not given.

        Returns:
          Whatever `plot_tracks` returns.
        """
        if isinstance(kind, list):
            kind_list = kind
        elif kind == 'all':
            kind_list = self._track_list
        else:
            self._validate_kind(kind)
            kind_list = [kind]

        raw_tracks = OrderedDict(
            (track_kind, self._get_track(track_kind))
            for track_kind in kind_list)
        tracks = skip_nan_tracks(flatten(raw_tracks, "/"))

        # override the sequence with information content
        if 'seq' in tracks:
            tracks['seq'] = self.get_seq_ic()

        if 'title' not in kwargs:
            kwargs['title'] = self.name

        fig_width = len(self) * letter_width
        return plot_tracks(tracks,
                           rotate_y=rotate_y,
                           fig_width=fig_width,
                           fig_height_per_track=height,
                           ylab=ylab,
                           **kwargs)
Example #4
0
 def flatten(self):
     """Return a new instance of this class holding the flattened data.

     The data is flattened with the module-level `flatten` function and
     the attributes are deep-copied so that the returned object does not
     share them with `self`.

     The previous implementation returned `super().__init__(...)`, which
     is always `None` because `__init__` cannot return a value.

     NOTE(review): assumes the class can be constructed as
     `cls(data, attrs=...)`, mirroring the arguments the original
     `super().__init__` call used - confirm against the class definition.
     """
     return type(self)(flatten(self.data), attrs=deepcopy(self.attrs))
Example #5
0
 def get_lens(self):
     """Return the flattened results of applying `len` as a list.

     `len` is applied through `self.dapply`, the result is flattened with
     the module-level `flatten` function and its values are returned in
     the iteration order of the flattened mapping.
     """
     flat_lens = flatten(self.dapply(len))
     return [*flat_lens.values()]
Example #6
0
def generate_sim(bpnet,
                 central_motif,
                 side_motif,
                 side_distances,
                 center_coords=[450, 550],
                 repeat=128,
                 importance=['count', 'weighted'],
                 correct=False):
    """Simulate a central motif with a side motif placed at several positions.

    For every entry of `side_distances` the model predictions for the
    sequence containing both motifs are compared (via `get_scores`) to the
    reference predictions obtained with the central motif alone.

    Args:
      bpnet: model object providing `tasks`, `input_seqlen()` and
        `sim_pred(...)`.
      central_motif: central motif, passed as first argument to
        `bpnet.sim_pred`.
      side_motif: side motif, passed as second argument to
        `bpnet.sim_pred`.
      side_distances: iterable of side-motif positions. Each one is handed
        to `bpnet.sim_pred` as the single-element list `[dist]`.
      center_coords: `(start, end)` pair forwarded to `get_scores`.
        The list default is never mutated in this function.
      repeat: forwarded to `bpnet.sim_pred`.
      importance: forwarded to `bpnet.sim_pred`. The list default is never
        mutated in this function.
      correct: if True, correct for the 'shoulder' effect by computing
        `AB - (B - 0)` (see the comment in the loop).

    Returns:
      Tuple `(df, alt_profiles)`:
        - df: `pd.DataFrame` with one row per (position, task) holding the
          scores from `get_scores` plus the columns 'task',
          'central_motif', 'side_motif', 'position' and 'distance'
          (`position - seqlen // 2`).
        - alt_profiles: list of `(position, alt_preds)` tuples.
    """

    def predict(*motif_args):
        # all simulations share the same repeat / importance settings
        return unflatten(
            bpnet.sim_pred(*motif_args, repeat=repeat, importance=importance),
            "/")

    tasks = bpnet.tasks
    seqlen = bpnet.input_seqlen()

    ref_preds = predict(central_motif)
    # The motif-free prediction is always requested, exactly as before,
    # so the sequence of `sim_pred` calls is the same regardless of `correct`.
    none_preds = predict('', '', [])
    # Loop-invariant: flatten once instead of once per distance.
    none_preds_f = flatten(none_preds, "/") if correct else None

    outl = []
    alt_profiles = []
    for dist in tqdm(side_distances):
        # Note: bpnet.sim_pred already averages the predictions
        alt_preds = predict(central_motif, side_motif, [dist])
        if correct:
            # Correct for the 'shoulder' effect
            #
            # this performs: AB - (B - 0)
            # Where:
            # - AB: contains both, central and side_motif
            # - B : contains only side_motif
            # - 0 : doesn't contain any motif
            edge_only_preds_f = flatten(predict('', side_motif, [dist]), "/")
            alt_preds_f = flatten(alt_preds, "/")
            # subtract the side-motif-only signal and add back the baseline
            alt_preds = unflatten(
                {
                    k: alt_preds_f[k] - v + none_preds_f[k]
                    for k, v in edge_only_preds_f.items()
                }, "/")
        alt_profiles.append((dist, alt_preds))

        # This normalizes the score by `A` finally yielding:
        # (AB - B + 0) / A
        scores = get_scores(ref_preds, alt_preds, tasks, central_motif, seqlen,
                            center_coords)

        # compute the distance metrics
        for task in tasks:
            row = scores[task]

            # book-keeping
            row['task'] = task
            row['central_motif'] = central_motif
            row['side_motif'] = side_motif
            row['position'] = dist
            row['distance'] = dist - seqlen // 2

            outl.append(row)

    return pd.DataFrame(outl), alt_profiles
Example #7
0
def modisco_plot(
        modisco_dir,
        output_dir,
        figsize=(10, 10),
        impsf=None):
    """Plot the results of a modisco run

    For all patterns found in `{modisco_dir}/modisco.h5` this writes:
      1. aggregated profile plots with tracks and contribution scores
         (`{pattern}/agg_profile_contribcores`),
      2. aggregated plots of the hypothetical contribution scores
         (`{pattern}/agg_profile_hypcontribscores`),
      3. heatmaps via `write_heatmap_pngs` into `{output_dir}/{pattern}`.

    If no patterns are found, a message is printed and nothing is plotted.
    The modisco result is closed on every exit path, including the early
    return and exceptions.

    Args:
      modisco_dir: modisco directory
      output_dir: Output directory for writing the results
      figsize: Output figure size
      impsf: [optional] modisco importance score file (ImpScoreFile).
        When omitted, it is loaded from `modisco_dir` and cached.
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-plot')
    from basepair.plot.vdom import write_heatmap_pngs
    from basepair.plot.profiles import plot_profiles
    from basepair.utils import flatten

    output_dir = Path(output_dir)
    # Only the parent directory is created here; the first `plot_profiles`
    # call below is invoked with mkdir=True.
    output_dir.parent.mkdir(parents=True, exist_ok=True)

    # load modisco
    mr = ModiscoResult(f"{modisco_dir}/modisco.h5")
    try:
        if impsf is not None:
            d = impsf
        else:
            d = ImpScoreFile.from_modisco_dir(modisco_dir)
            logger.info("Loading the importance scores")
            d.cache()  # load all

        thr_one_hot = d.get_seq()
        tracks = d.get_profiles()
        thr_hypothetical_contribs = dict()
        thr_contrib_scores = dict()
        # TODO - generalize this
        thr_hypothetical_contribs['weighted'] = d.get_hyp_contrib()
        thr_contrib_scores['weighted'] = d.get_contrib()

        tasks = d.get_tasks()

        # Count importance (if it exists). The first available name wins;
        # if neither is present, only the 'weighted' scores are plotted.
        for count_imp_score in ("counts/pre-act", "count"):
            if d.contains_imp_score(count_imp_score):
                thr_hypothetical_contribs['count'] = d.get_hyp_contrib(
                    imp_score=count_imp_score)
                thr_contrib_scores['count'] = d.get_contrib(
                    imp_score=count_imp_score)
                break

        thr_hypothetical_contribs = OrderedDict(
            flatten(thr_hypothetical_contribs, separator='/'))
        thr_contrib_scores = OrderedDict(flatten(thr_contrib_scores,
                                                 separator='/'))

        all_seqlets = mr.seqlets()
        all_patterns = mr.patterns()
        if len(all_patterns) == 0:
            print("No patterns found")
            return

        # 1. Plots with tracks and contrib scores
        print("Writing results for contribution scores")
        plot_profiles(all_seqlets,
                      thr_one_hot,
                      tracks=tracks,
                      importance_scores=thr_contrib_scores,
                      legend=False,
                      flip_neg=True,
                      rotate_y=0,
                      seq_height=.5,
                      patterns=all_patterns,
                      n_bootstrap=100,
                      fpath_template=str(output_dir /
                                         "{pattern}/agg_profile_contribcores"),
                      mkdir=True,
                      figsize=figsize)

        # 2. Plots only with hypothetical contrib scores
        print("Writing results for hypothetical contribution scores")
        plot_profiles(all_seqlets,
                      thr_one_hot,
                      tracks={},
                      importance_scores=thr_hypothetical_contribs,
                      legend=False,
                      flip_neg=True,
                      rotate_y=0,
                      seq_height=1,
                      patterns=all_patterns,
                      n_bootstrap=100,
                      fpath_template=str(
                          output_dir /
                          "{pattern}/agg_profile_hypcontribscores"),
                      figsize=figsize)

        print("Plotting heatmaps")
        for pattern in tqdm(all_patterns):
            write_heatmap_pngs(all_seqlets[pattern],
                               d,
                               tasks,
                               pattern,
                               output_dir=str(output_dir / pattern))
    finally:
        # release the modisco result on every exit path
        mr.close()