Example no. 1
0
    def predict(self, dataset, transformers=None, batch_size=None):
        """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: Dataset
      Dataset holding the features to predict on.
    transformers: list, optional
      Transformers previously applied to the targets. Predictions are passed
      through ``undo_transforms`` to map them back to the original scale.
      Defaults to no transformers.
    batch_size: int, optional
      Batch size forwarded to ``dataset.iterbatches``.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
        # Avoid a mutable default argument; None is the "no transformers"
        # sentinel and is backward compatible with the old `[]` default.
        if transformers is None:
            transformers = []
        y_preds = []

        for X_batch, _, _, _ in dataset.iterbatches(batch_size,
                                                    deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # iterbatches pads the last batch with extra examples;
            # discard any padded predictions.
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)

        y_pred = np.concatenate(y_preds)
        return y_pred
Example no. 2
0
def get_pair_values_and_fold_ind(all_dataset,
                                 K,
                                 transformers,
                                 create_mapping=False,
                                 smiles_to_some_id=None,
                                 drug_id_and_smiles_to_ind=None,
                                 prot_name_and_seq_to_ind=None,
                                 dt_pair_to_fold=None,
                                 dt_pair_to_value=None,
                                 drug_mol_to_ind=None,
                                 prot_to_ind=None):
    """Record the fold index and target value for every (drug, protein) pair.

    Iterates the validation split of each of the ``K`` folds in
    ``all_dataset`` and fills ``dt_pair_to_fold`` / ``dt_pair_to_value``
    in place. With ``create_mapping`` the pair key is the raw
    (drug_mol, protein) objects and fresh 1-based indices are assigned to
    ``drug_mol_to_ind`` / ``prot_to_ind`` on first sight; otherwise the
    pair key is looked up through the provided id/name mapping dicts.
    Each pair must occur exactly once across all folds.
    """
    for fold_idx in range(K):
        valid_set = all_dataset[fold_idx][1]
        for X_b, y_b, w_b, _ in valid_set.itersamples():
            assert w_b[0] == 1.0
            mol, prot = X_b[0], X_b[1]
            y_b = undo_transforms(y_b, transformers)
            if create_mapping:
                # Indices are 1-based; assign on first encounter.
                if mol not in drug_mol_to_ind:
                    drug_mol_to_ind[mol] = len(drug_mol_to_ind) + 1
                if prot not in prot_to_ind:
                    prot_to_ind[prot] = len(prot_to_ind) + 1
                pair = (mol, prot)
            else:
                # Resolve precomputed integer indices from (id, smiles) and
                # (name, sequence) keys.
                smiles = mol.smiles
                drug_key = (smiles_to_some_id[smiles], smiles)
                prot_key = (prot.get_name()[1], prot.get_sequence()[1])
                pair = (drug_id_and_smiles_to_ind[drug_key],
                        prot_name_and_seq_to_ind[prot_key])

            # A pair may only appear in a single fold, once.
            assert pair not in dt_pair_to_fold
            # Fold numbering also starts from 1.
            dt_pair_to_fold[pair] = fold_idx + 1
            assert pair not in dt_pair_to_value
            dt_pair_to_value[pair] = y_b[0]
Example no. 3
0
    def compute_model_performance(self,
                                  metrics,
                                  csv_out=None,
                                  stats_out=None,
                                  per_task_metrics=False,
                                  no_concordance_index=False,
                                  plot=False,
                                  no_r2=False):
        """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    no_concordance_index: bool, optional
      If true, skip the "concordance_index" metric on the training set;
      skipped metrics are recorded as None.
    plot: bool, optional
      If true, ask the metric to plot. Plotting happens at most once per
      call (tracked via `plot_finished` below).
    no_r2: bool, optional
      If true, skip the "r2_score" and "pearson_r2_score" metrics entirely.

    Returns
    -------
    multitask_scores: dict
      Maps metric name to its computed score (or None when skipped).
    all_task_scores: dict
      Only returned when per_task_metrics is true; maps metric name to the
      per-task computed metrics.
    """
        # Ground-truth labels, mapped back to the original (untransformed) scale.
        y = self.dataset.y
        y = undo_transforms(y, self.output_transformers)
        w = self.dataset.w

        if not len(metrics):
            return {}
        else:
            mode = metrics[0].mode
        y_pred = self.model.predict(self.dataset, self.output_transformers)
        if mode == "classification":
            # For CSV output, report the predicted class rather than per-class scores.
            y_pred_print = np.argmax(y_pred, -1)
        else:
            y_pred_print = y_pred
        multitask_scores = {}
        all_task_scores = {}

        if csv_out is not None:
            log("Saving predictions to %s" % csv_out, self.verbose)
            self.output_predictions(y_pred_print, csv_out)

        # Plot at most once per invocation, regardless of how many metrics run.
        plot_finished = False
        # Compute multitask metrics
        for i, metric in enumerate(metrics):
            mtc_name = metric.metric.__name__
            if no_r2 and (mtc_name == 'r2_score'
                          or mtc_name == 'pearson_r2_score'):
                continue
            if per_task_metrics:
                if self.is_training_set:
                    # On the training set, optionally skip the (expensive)
                    # concordance index; record None so the key still exists.
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        all_task_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        # If this dataset is the training data set, don't calculate CI if no_concordance_index.
                        multitask_scores[
                            metric.
                            name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                plot=True,
                                all_metrics=metrics,
                                is_training_set=self.is_training_set,
                                no_concordance_index=no_concordance_index,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics
                        plot_finished = True
                    else:
                        # No longer need to plot. Could be wasting time calculating metrics again, but they
                        # are super fast so it is no big deal.
                        multitask_scores[
                            metric.
                            name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                plot=False,
                                is_training_set=self.is_training_set,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics

                # Now deal with validation or test sets.
                # Only plot on the last metric (or when hitting concordance_index),
                # and only if no plot has been produced yet.
                elif plot and (i == len(metrics) - 1 or metric.metric.__name__
                               == "concordance_index") and (not plot_finished):
                    multitask_scores[
                        metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics
                    plot_finished = True
                else:  # Otherwise don't need to plot.
                    multitask_scores[
                        metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics

            else:
                # Same branch structure as above, without per-task breakdowns.
                if self.is_training_set:
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            no_concordance_index=no_concordance_index,
                            tasks=self.tasks,
                            model_name=self.model_name)
                        plot_finished = True
                    else:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)

                elif plot and (i == len(metrics) - 1 or metric.metric.__name__
                               == "concordance_index") and (not plot_finished):
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        plot=True,
                        all_metrics=metrics,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)
                    plot_finished = True
                else:
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        plot=False,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)

        if stats_out is not None:
            log("Saving stats to %s" % stats_out, self.verbose)
            self.output_statistics(multitask_scores, stats_out)

        if not per_task_metrics:
            return multitask_scores
        else:
            return multitask_scores, all_task_scores
Example no. 4
0
    def compute_model_performance(self,
                                  metrics,
                                  csv_out=None,
                                  stats_out=None,
                                  per_task_metrics=False,
                                  no_r2=False,
                                  no_concordance_index=False,
                                  plot=False):
        """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    no_r2: bool, optional
      If true, skip the "r2_score" and "pearson_r2_score" metrics entirely.
    no_concordance_index: bool, optional
      If true, skip the "concordance_index" metric on the training set;
      skipped metrics are recorded as None.
    plot: bool, optional
      If true, ask the metric to plot. Plotting happens at most once per
      call (tracked via `plot_finished` below).
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Accepted for interface compatibility; NOTE(review): unlike the sister
      implementation, this version never writes stats to it -- confirm
      whether that is intentional.

    Returns
    -------
    multitask_scores: dict
      Maps metric name to its computed score (or None when skipped).
    all_task_scores: dict
      Only returned when per_task_metrics is true; maps metric name to the
      per-task computed metrics.
    """
        self.model.build()
        y = []
        w = []

        # Side-effecting wrapper: while the model consumes batches, it also
        # collects the true labels (and weights, if any) from each feed_dict.
        def generator_closure():
            for feed_dict in self.generator:
                y.append(feed_dict[self.label_keys[0]])
                if len(self.weights) > 0:
                    w.append(feed_dict[self.weights[0]])
                yield feed_dict

        if not len(metrics):
            return {}
        else:
            mode = metrics[0].mode
        if mode == "classification":
            y_pred = self.model.predict_proba_on_generator(generator_closure())
            y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
            y = np.reshape(y, newshape=(-1, self.n_tasks, self.n_classes))
            y = from_one_hot(y, axis=-1)
        else:
            y_pred = self.model.predict_proba_on_generator(generator_closure())
            y = np.transpose(np.array(y), axes=[0, 2, 1, 3])
            y = np.reshape(y, newshape=(-1, self.n_tasks))
            y_pred = np.reshape(y_pred, newshape=(-1, self.n_tasks))
        # NOTE(review): this overwrites the y_pred computed (and, in the
        # regression branch, reshaped) just above, and it iterates
        # generator_closure() a second time -- if self.generator is
        # re-iterable the labels get appended to y twice; if it is an
        # exhausted generator this pass yields nothing. Confirm the intended
        # behavior against self.generator's semantics.
        y_pred = self.model.predict_on_generator(generator_closure())
        # NOTE(review): y was already reshaped to a single ndarray above, so
        # this concatenates its rows along axis 0 -- verify the resulting
        # shape is what the metrics expect.
        y = np.concatenate(y, axis=0)
        multitask_scores = {}
        all_task_scores = {}

        # Map labels and predictions back to the original (untransformed) scale.
        y = undo_transforms(y, self.output_transformers)
        y_pred = undo_transforms(y_pred, self.output_transformers)
        if len(w) != 0:
            w = np.array(w)
            w = np.reshape(w, newshape=y.shape)

        if csv_out is not None:
            log("Saving predictions to %s" % csv_out, self.verbose)
            self.output_predictions(y_pred, csv_out)

        # Plot at most once per invocation, regardless of how many metrics run.
        plot_finished = False
        # Compute multitask metrics
        for i, metric in enumerate(metrics):
            mtc_name = metric.metric.__name__
            if no_r2 and (mtc_name == 'r2_score'
                          or mtc_name == 'pearson_r2_score'):
                continue
            if per_task_metrics:
                if self.is_training_set:
                    # On the training set, optionally skip the (expensive)
                    # concordance index; record None so the key still exists.
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        all_task_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        multitask_scores[
                            metric.
                            name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                n_classes=self.n_classes,
                                plot=True,
                                all_metrics=metrics,
                                is_training_set=self.is_training_set,
                                no_concordance_index=no_concordance_index,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics
                        plot_finished = True
                    else:
                        multitask_scores[
                            metric.
                            name], computed_metrics = metric.compute_metric(
                                y,
                                y_pred,
                                w,
                                per_task_metrics=True,
                                n_classes=self.n_classes,
                                plot=False,
                                is_training_set=self.is_training_set,
                                tasks=self.tasks,
                                model_name=self.model_name)
                        all_task_scores[metric.name] = computed_metrics

                # Validation/test sets: only plot on the last metric (or when
                # hitting concordance_index), and only if not plotted yet.
                elif plot and (i == len(metrics) - 1 or metric.metric.__name__
                               == "concordance_index") and (not plot_finished):
                    multitask_scores[
                        metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            n_classes=self.n_classes,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics
                    plot_finished = True

                else:  #Otherwise don't need to plot.
                    multitask_scores[
                        metric.name], computed_metrics = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=True,
                            n_classes=self.n_classes,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)
                    all_task_scores[metric.name] = computed_metrics

            else:
                # Same branch structure as above, without per-task breakdowns.
                if self.is_training_set:
                    if no_concordance_index and metric.metric.__name__ == "concordance_index":
                        multitask_scores[metric.name] = None
                        continue
                    if plot and not plot_finished:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            n_classes=self.n_classes,
                            plot=True,
                            all_metrics=metrics,
                            is_training_set=self.is_training_set,
                            no_concordance_index=no_concordance_index,
                            tasks=self.tasks,
                            model_name=self.model_name)
                        plot_finished = True
                    else:
                        multitask_scores[metric.name] = metric.compute_metric(
                            y,
                            y_pred,
                            w,
                            per_task_metrics=False,
                            n_classes=self.n_classes,
                            plot=False,
                            is_training_set=self.is_training_set,
                            tasks=self.tasks,
                            model_name=self.model_name)

                elif plot and (i == len(metrics) - 1 or metric.metric.__name__
                               == "concordance_index") and (not plot_finished):
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        n_classes=self.n_classes,
                        plot=True,
                        all_metrics=metrics,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)
                    plot_finished = True

                else:  #Otherwise don't need to plot.
                    multitask_scores[metric.name] = metric.compute_metric(
                        y,
                        y_pred,
                        w,
                        per_task_metrics=False,
                        n_classes=self.n_classes,
                        plot=False,
                        is_training_set=self.is_training_set,
                        tasks=self.tasks,
                        model_name=self.model_name)

        if not per_task_metrics:
            return multitask_scores
        else:
            return multitask_scores, all_task_scores