Example 1
    def predict(self, seq, batch_size=512):
        """Make model prediction

        Args:
          seq: numpy array of one-hot-encoded array of sequences
          batch_size: batch size

        Returns:
          dictionary key=task and value=prediction for the task
        """
        if self.bias_model is not None:
            # first pass the input through the bias model; its output replaces the raw sequence
            seq, = self.bias_model.predict((seq, ), batch_size)

        preds = self.model.predict(seq, batch_size=batch_size)

        if len(self.model.output) == 2 * len(self.tasks):
            # extract the profile and count predictions
            profile_preds = {task: softmax(preds[task_i]) for task_i, task in enumerate(self.tasks)}
            count_preds = {task: preds[len(self.tasks) + task_i] for task_i, task in enumerate(self.tasks)}
            # compute the scaling factor
            if self.preproc is None:
                scales = {task: np.exp(count_preds[task]) - 1
                          for task in self.tasks}
            else:
                scales = {task: np.exp(self.preproc.objects[f'profile/{task}'].steps[1][1].inverse_transform(count_preds[task])) - 1
                          for task in self.tasks}

            # scaled profile (counts per base)
            return {task: profile_preds[task] * scales[task][:, np.newaxis] for task in self.tasks}
        else:
            return {task: preds[task_i] for task_i, task in enumerate(self.tasks)}
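For intuition, here is a minimal standalone sketch of the scaling step above, assuming a single task, a profile head emitting logits of shape (batch, seqlen, strands), a count head predicting log(1 + total counts), and the profile softmax taken along the sequence axis; scipy's softmax stands in for basepair.functions.softmax and all shapes are illustrative only.

import numpy as np
from scipy.special import softmax  # stand-in for basepair.functions.softmax (assumption)

profile_logits = np.random.randn(2, 1000, 2)          # (batch, seqlen, strands), illustrative
log_counts = np.random.randn(2, 2)                    # (batch, strands), assumed log(1 + counts)

profile_probs = softmax(profile_logits, axis=1)       # probabilities along the sequence axis
scale = np.exp(log_counts) - 1                        # invert the log(1 + x) count transform
per_base_counts = profile_probs * scale[:, np.newaxis]  # (2, 1000, 2) counts per base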
Example 2
def postproc(preds, tasks):
    """Turn raw model outputs into per-base count predictions (softmaxed profile * count scale)."""
    ntasks = len(tasks)
    # softmax the profile heads (the first `ntasks` outputs)
    preds[:ntasks] = [softmax(p) for p in preds[:ntasks]]
    preds_dict = dict(profile=preds[:ntasks], counts=preds[ntasks:])
    # per-task scaling factors derived from the count heads
    scales = pred2scale_strands(preds_dict, tasks)
    return {
        task: preds_dict['profile'][i] * scales[task][:, np.newaxis]
        for i, task in enumerate(tasks)
    }
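A hedged usage sketch of postproc, assuming preds is the flat list returned by the Keras model, ordered as all profile heads followed by all count heads; the task names and array shapes below are purely illustrative, and softmax / pred2scale_strands come from the surrounding module.

import numpy as np

tasks = ['Oct4', 'Sox2']                                   # hypothetical task names
# fake raw outputs: one profile array per task, then one count array per task
preds = ([np.random.randn(8, 1000, 2) for _ in tasks] +    # profile logits
         [np.random.randn(8, 2) for _ in tasks])           # count predictions
scaled = postproc(preds, tasks)
print(scaled['Oct4'].shape)                                # per-base counts for the 'Oct4' task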
Example 3
        def get_ism_score(onehot_data):
            from collections import OrderedDict
            from basepair.functions import softmax

            # create every single-base mutation of every input sequence
            onehot_data = np.array(onehot_data)[0]
            mutated_seqs = []
            for sample in onehot_data:
                for pos in range(len(sample)):
                    for base in range(4):
                        mutated = sample.copy()
                        mutated[pos, :] = 0      # erase the original base ...
                        mutated[pos, base] = 1   # ... and substitute `base`
                        mutated_seqs.append(mutated)
            # shape: (n_samples * seq_len * 4, seq_len, 4)
            mutated_seqs = np.array(mutated_seqs)

            # get predictions
            raw_predictions = keras_model.predict(mutated_seqs, batch_size=32)

            # get scores
            attribs = []
            for sample_idx, sample in enumerate(onehot_data):
                temp_attribs = []
                for pos in range(len(sample)):
                    temp_attribs.append([])
                    for base in range(4):
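                        # (sample_idx * len(sample) * 4) + (pos * 4) + base indexes
                        # this mutation in the flattened `mutated_seqs` batch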
                        if pred_summary == 'count':
                            relevant_output = raw_predictions[
                                len(self.tasks) +
                                task_id][(sample_idx * len(sample) * 4) +
                                         (pos * 4) + base]
                            temp_attribs[pos].append(
                                relevant_output[strand_id])
                        else:
                            relevant_output = raw_predictions[task_id][
                                (sample_idx * len(sample) * 4) + (pos * 4) +
                                base]
                            temp_attribs[pos].append(
                                np.sum(
                                    softmax([relevant_output[:, strand_id]]) *
                                    [relevant_output[:, strand_id]]))
                temp_attribs = np.array(temp_attribs)
                avg_scores = np.mean(temp_attribs, axis=1,
                                     keepdims=True)  # this is ACGT axis
                temp_attribs -= avg_scores   # centre each position's scores across the four bases
                attribs.append(temp_attribs)
            # shape: (1, n_samples, seq_len, 4)
            attribs = np.array([attribs])
            return attribs
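The mutation-generation step above can be hard to follow inline; here is a minimal standalone sketch of the same idea on a toy 3 bp one-hot sequence (names and sizes are illustrative only).

import numpy as np

seq = np.eye(4)[[0, 2, 1]]            # toy one-hot sequence A, G, C -> shape (3, 4)
mutants = []
for pos in range(seq.shape[0]):
    for base in range(4):
        m = seq.copy()
        m[pos, :] = 0                 # erase the original base at this position
        m[pos, base] = 1              # substitute `base`
        mutants.append(m)
mutants = np.array(mutants)           # (3 * 4, 3, 4): every single-base substitution

Scoring then amounts to running the model on the mutant batch and, for each position, comparing the per-base outputs against their mean over the four bases, as the loop above does.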
Example 4
    def __call__(self, y_true, preds):
        profile_preds = {
            task: softmax(preds[task_i])
            for task_i, task in enumerate(self.tasks)
        }
        count_preds = {
            task: preds[len(self.tasks) + task_i].sum(axis=-1)
            for task_i, task in enumerate(self.tasks)
        }
        profile_true = {task: y_true[f'profile/{task}'] for task in self.tasks}
        counts_true = {
            task: y_true[f'counts/{task}'].sum(axis=-1)
            for task in self.tasks
        }
        return ({
            "profile": profile_true,
            "counts": counts_true
        }, {
            "profile": profile_preds,
            "counts": count_preds
        })
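This returns a (labels, predictions) pair of nested dictionaries keyed by "profile" and "counts", each holding one entry per task; the count entries are summed over the last (strand) axis, so downstream metrics compare total counts per example.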
Example 5
def evaluate(model_dir,
             output_dir=None,
             gpu=0,
             exclude_metrics=False,
             splits=['train', 'valid'],
             model_path=None,
             data=None,
             hparams=None,
             dataspec=None,
             preprocessor=None):
    """
    Args:
      model_dir: path to the model directory
      splits: For which data splits to compute the evaluation metrics
      model_metrics: if True, metrics computed using mode.evaluate(..)
    """
    if gpu is not None:
        create_tf_session(gpu)
    if dataspec is not None:
        ds = DataSpec.load(dataspec)
    else:
        ds = DataSpec.load(os.path.join(model_dir, "dataspec.yaml"))
    if hparams is not None:
        hp = HParams.load(hparams)
    else:
        hp = HParams.load(os.path.join(model_dir, "hparams.yaml"))
    if model_path is not None:
        model = load_model(model_path)
    else:
        model = load_model(os.path.join(model_dir, "model.h5"))
    if output_dir is None:
        output_dir = os.path.join(model_dir, "eval")
    train, valid, test = load_data(model_dir,
                                   dataspec=dataspec,
                                   hparams=hparams,
                                   data=data,
                                   preprocessor=preprocessor)
    data = dict(train=train, valid=valid, test=test)

    metrics = {}
    profile_metrics = []
    os.makedirs(os.path.join(output_dir, "plots"),
                exist_ok=True)
    for split in tqdm(splits):
        y_pred = model.predict(data[split][0])
        y_true = data[split][1]
        if not exclude_metrics:
            eval_metrics_values = model.evaluate(data[split][0],
                                                 data[split][1])
            eval_metrics = dict(zip(_listify(model.metrics_names),
                                    _listify(eval_metrics_values)))
            eval_metrics = {split + "/" + k.replace("_", "/"): v
                            for k, v in eval_metrics.items()}
            metrics = {**eval_metrics, **metrics}
        for task in ds.task_specs:
            # Counts

            yp = y_pred[ds.task2idx(task, "counts")].sum(axis=-1)
            yt = y_true["counts/" + task].sum(axis=-1)
            # compute the correlation
            rp = pearsonr(yt, yp)[0]
            rs = spearmanr(yt, yp)[0]
            metrics = {**metrics,
                       split + f"/counts/{task}/pearsonr": rp,
                       split + f"/counts/{task}/spearmanr": rs,
                       }

            fig = plt.figure(figsize=(5, 5))
            plt.scatter(yp, yt, alpha=0.5)
            plt.xlabel("Predicted")
            plt.ylabel("Observed")
            plt.title(f"R_pearson={rp:.2f}, R_spearman={rs:.2f}")
            plt.savefig(os.path.join(output_dir, f"plots/counts.{split}.{task}.png"))
            plt.close(fig)  # avoid accumulating open figures across tasks and splits

            # Profile
            yp = softmax(y_pred[ds.task2idx(task, "profile")])
            yt = y_true["profile/" + task]
            df = eval_profile(yt, yp,
                              pos_min_threshold=hp.evaluate.pos_min_threshold,
                              neg_max_threshold=hp.evaluate.neg_max_threshold,
                              required_min_pos_counts=hp.evaluate.required_min_pos_counts,
                              binsizes=hp.evaluate.binsizes)
            df['task'] = task
            df['split'] = split
            # Evaluate for the smallest binsize
            auprc_min = df[df.binsize == min(hp.evaluate.binsizes)].iloc[0].auprc
            metrics[split + f'/profile/{task}/auprc'] = auprc_min
            profile_metrics.append(df)

    # Write the count metrics
    write_json(metrics, os.path.join(output_dir, "metrics.json"))

    # write the profile metrics
    dfm = pd.concat(profile_metrics)
    dfm.to_csv(os.path.join(output_dir, "profile_metrics.tsv"),
               sep='\t', index=False)
    return dfm, metrics
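For the count metrics specifically, the computation reduces to Pearson/Spearman correlations between total observed and predicted counts per region; a tiny standalone illustration with synthetic data (values are arbitrary):

import numpy as np
from scipy.stats import pearsonr, spearmanr

yt = np.random.poisson(100, size=500).astype(float)   # "observed" total counts per region
yp = yt + 10 * np.random.randn(500)                   # noisy synthetic "predictions"
rp, rs = pearsonr(yt, yp)[0], spearmanr(yt, yp)[0]
print(f"R_pearson={rp:.2f}, R_spearman={rs:.2f}")      # same summary used in the scatter title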