def setUp(self):
        """
        Set up the default test fixtures used by all tests.
        """
        self.constraint_object = PhotonBaseConstraint(
            strategy="first", metric="mean_squared_error", margin=0.1)

        metrics_list = ["f1_score", "mean_squared_error"]
        self.dummy_config_item = MDBConfig()
        self.dummy_config_item.inner_folds = []
        for i in range(5):
            inner_fold = MDBInnerFold()
            inner_fold.validation = MDBScoreInformation()
            for metric in metrics_list:
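                # note: np.random.randint(0, 1) always returns 0, so each metric here is a constant 0.0001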
                inner_fold.validation.metrics[metric] = (
                    np.random.randint(0, 1) / 2 + 0.0001)
            self.dummy_config_item.inner_folds.append(inner_fold)

        self.dummy_linear_config_item = MDBConfig()
        self.dummy_linear_config_item.inner_folds = []
        for i in range(5):
            inner_fold = MDBInnerFold()
            inner_fold.validation = MDBScoreInformation()
            for metric in metrics_list:
                inner_fold.validation.metrics[metric] = i / 4
            self.dummy_linear_config_item.inner_folds.append(inner_fold)
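For orientation, the constraint above is evaluated against per-fold validation metrics like the ones built here. A minimal sketch of how such a metric could be averaged over a config item's inner folds; the helper name is illustrative and not part of PHOTON's API:

import numpy as np

def mean_validation_metric(config_item, metric_name):
    # hypothetical helper: average one validation metric over all inner folds
    values = [fold.validation.metrics[metric_name] for fold in config_item.inner_folds]
    return float(np.mean(values))

# for the linear dummy config above this yields (0 + 0.25 + 0.5 + 0.75 + 1.0) / 5 = 0.5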
Example #2
    def test_finalize_optimization(self):
        # this is hard to test directly, so we fake it: fit once, reset the results, then regenerate them
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all result information
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now regenerate the result information
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values
        self.assertEqual(len(self.hyperpipe.results.dummy_estimator.train),
                         expected_num_of_metrics)
        self.assertEqual(len(self.hyperpipe.results.dummy_estimator.test),
                         expected_num_of_metrics)
        # overall average values
        self.assertEqual(len(self.hyperpipe.results.metrics_train),
                         2 * expected_num_of_metrics)
        self.assertEqual(len(self.hyperpipe.results.metrics_test),
                         2 * expected_num_of_metrics)
        # find best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(self.hyperpipe.best_config,
                         self.hyperpipe.results.best_config.config_dict)
        # set optimum pipe and params (TODO: test add preprocessing)
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"])
        # save optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.hyperpipe.output_settings.results_folder,
                             'photon_best_model.photon')))

        # backmapping
        # because the PCA is disabled in this test, we expect as many importances as input features
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1])
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            'optimum_pipe_feature_importances_backmapped.csv')
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(open(backmapped_feature_importances, 'rb'),
                                  delimiter=",")
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])
Example #3
 def _copy_inner_fold_means(metric_dict):
     # copy all mean values from the inner-fold validation metrics
     # into the best config's training score information
     train_item_metrics = {}
     for m in metric_dict:
         if m.operation == str(self.fold_operation):
             train_item_metrics[m.metric_name] = m.value
     train_item = MDBScoreInformation()
     train_item.metrics_copied_from_inner = True
     train_item.metrics = train_item_metrics
     return train_item
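A hypothetical illustration of what the helper above consumes and produces, assuming the per-fold metric entries expose operation, metric_name, and value attributes (FoldMetric here is a stand-in, not PHOTON's class):

from dataclasses import dataclass

@dataclass
class FoldMetric:  # illustrative stand-in for the real metric entries
    operation: str
    metric_name: str
    value: float

fold_operation = "FoldOperations.MEAN"  # assumed string form of self.fold_operation

metric_list = [
    FoldMetric("FoldOperations.MEAN", "balanced_accuracy", 0.87),
    FoldMetric("FoldOperations.STD", "balanced_accuracy", 0.05),
]

# only entries matching the fold operation are copied
train_metrics = {m.metric_name: m.value for m in metric_list
                 if m.operation == fold_operation}
# -> {"balanced_accuracy": 0.87}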
Example #4
    def test_get_optimum_config_outer_folds(self):
        my_pipe_optimizer = Hyperpipe.Optimization(
            "grid_search", {}, [], "balanced_accuracy", None
        )

        outer_fold_list = list()
        for i in range(10):
            outer_fold = MDBOuterFold()
            outer_fold.best_config = MDBConfig()
            outer_fold.best_config.best_config_score = MDBInnerFold()
            outer_fold.best_config.best_config_score.validation = MDBScoreInformation()
            # fold 5 should win (highest balanced_accuracy)
            if i == 5:
                outer_fold.best_config.best_config_score.validation.metrics = {
                    "balanced_accuracy": 0.99
                }
            else:
                outer_fold.best_config.best_config_score.validation.metrics = {
                    "balanced_accuracy": 0.5
                }
            outer_fold_list.append(outer_fold)

        best_config_outer_folds = my_pipe_optimizer.get_optimum_config_outer_folds(
            outer_fold_list
        )
        self.assertEqual(
            best_config_outer_folds.best_config_score.validation.metrics[
                "balanced_accuracy"
            ],
            0.99,
        )
        self.assertIs(best_config_outer_folds, outer_fold_list[5].best_config)
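The selection this test exercises amounts to an argmax over the outer folds' best-config validation scores. A minimal sketch of that logic, assuming the metric is to be maximized; the function name is illustrative, not PHOTON's implementation:

def pick_best_config_over_outer_folds(outer_fold_list, metric="balanced_accuracy"):
    # return the best_config of the outer fold with the highest validation metric
    return max(
        (fold.best_config for fold in outer_fold_list),
        key=lambda config: config.best_config_score.validation.metrics[metric],
    )

# with the folds built above, this returns outer_fold_list[5].best_config (0.99 > 0.5)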
Example #5
    def score(estimator,
              X,
              y_true,
              metrics,
              indices=[],
              calculate_metrics: bool = True,
              training: bool = False,
              **kwargs):
        """
        Uses the pipeline to predict the given data, compares the predictions to the ground truth and calculates the requested metrics.

        :param estimator: the pipeline or pipeline element for prediction
        :param X: the data for prediction
        :param y_true: the truth values for the data
        :param metrics: the metrics to be calculated
        :param indices: the indices of the given data and targets that are logged into the result tree
        :param training: if True, all training_only pipeline elements are executed, if False they are skipped
        :param calculate_metrics: if True, calculates metrics for given data
        :return: MDBScoreInformation object
        """

        scoring_time_start = time.time()

        output_metrics = {}
        non_default_score_metrics = list(metrics)
        # handle the estimator's default 'score' metric separately from the named metrics;
        # an exact match on 'score' is checked via a set intersection
        checklist = ["score"]
        matches = set(checklist).intersection(set(non_default_score_metrics))
        if len(matches) > 0:
            # TODO: calling estimator.score here can be a performance bottleneck
            default_score = estimator.score(X, y_true)
            output_metrics["score"] = default_score
            non_default_score_metrics.remove("score")

        if not training:
            y_pred = estimator.predict(X, **kwargs)
        else:
            X, y_true_new, kwargs_new = estimator.transform(
                X, y_true, **kwargs)
            if y_true_new is not None:
                y_true = y_true_new
            if kwargs_new is not None and len(kwargs_new) > 0:
                kwargs = kwargs_new
            y_pred = estimator.predict(X, training=True, **kwargs)

        # Nice to have
        # InnerFoldManager.plot_some_data(y_true, y_pred)

        if calculate_metrics:
            score_metrics = Scorer.calculate_metrics(
                y_true, y_pred, non_default_score_metrics)

            # add default metric
            if output_metrics:
                output_metrics = {**output_metrics, **score_metrics}
            else:
                output_metrics = score_metrics
        else:
            output_metrics = {}

        final_scoring_time = time.time() - scoring_time_start

        probabilities = []
        if hasattr(estimator, "_final_estimator"):
            if hasattr(estimator._final_estimator.base_element,
                       "predict_proba"):
                probabilities = estimator.predict_proba(X,
                                                        training=training,
                                                        **kwargs)

                try:
                    if probabilities is not None and len(probabilities) > 0:
                        probabilities = probabilities.tolist()
                except Exception:
                    warnings.warn("No probabilities available.")

        if not isinstance(y_pred, list):
            y_pred = np.asarray(y_pred).tolist()
        if not isinstance(y_true, list):
            y_true = np.asarray(y_true).tolist()

        score_result_object = MDBScoreInformation(
            metrics=output_metrics,
            score_duration=final_scoring_time,
            y_pred=y_pred,
            y_true=y_true,
            indices=np.asarray(indices).tolist(),
            probabilities=probabilities,
        )

        return score_result_object
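A hedged usage sketch: this score routine is shown without its surrounding class, so it is called as a plain function here; fitted_pipe, X_val, y_val, and val_indices are placeholder names for a fitted PHOTON pipeline and a validation split:

# hypothetical call; the metric names must be known to PHOTON's Scorer
score_info = score(
    fitted_pipe,
    X_val,
    y_val,
    metrics=["mean_squared_error", "score"],
    indices=list(val_indices),
)
print(score_info.metrics["mean_squared_error"], score_info.score_duration)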
Example #6
    def score(estimator,
              X,
              y_true,
              metrics,
              indices=[],
              calculate_metrics: bool = True,
              training: bool = False,
              **kwargs):
        """
        Uses the pipeline to predict the given data, compares the predictions to the ground truth and calculates the requested metrics.

        :param estimator: the pipeline or pipeline element for prediction
        :param X: the data for prediction
        :param y_true: the truth values for the data
        :param metrics: the metrics to be calculated
        :param indices: the indices of the given data and targets that are logged into the result tree
        :param training: if True, all training_only pipeline elements are executed, if False they are skipped
        :param calculate_metrics: if True, calculates metrics for given data
        :return: MDBScoreInformation object
        """

        scoring_time_start = time.time()

        output_metrics = {}
        # ensure y_pred_names is defined even when calculate_metrics is False
        y_pred_names = []

        if not training:
            y_pred = estimator.predict(X, **kwargs)
        else:
            X, y_true_new, kwargs_new = estimator.transform(
                X, y_true, **kwargs)
            if y_true_new is not None:
                y_true = y_true_new
            if kwargs_new is not None and len(kwargs_new) > 0:
                kwargs = kwargs_new
            y_pred = estimator.predict(X, training=True, **kwargs)

        # Nice to have
        # InnerFoldManager.plot_some_data(y_true, y_pred)

        if calculate_metrics:
            if isinstance(y_pred, np.ndarray) and y_pred.dtype.names:
                y_pred_names = [y_pred.dtype.names]
                if "y_pred" not in y_pred_names[0]:
                    msg = "If scorer object does not return 1d array or list, PHOTON expected name 'y_pred' in nd array."
                    logger.error(msg)
                    raise KeyError(msg)
                score_metrics = Scorer.calculate_metrics(
                    y_true, y_pred["y_pred"], metrics)
            else:
                y_pred_names = []
                score_metrics = Scorer.calculate_metrics(
                    y_true, y_pred, metrics)

            # add default metric
            if output_metrics:
                output_metrics = {**output_metrics, **score_metrics}
            else:
                output_metrics = score_metrics
        else:
            output_metrics = {}

        final_scoring_time = time.time() - scoring_time_start

        probabilities = []
        if hasattr(estimator, '_final_estimator'):
            if hasattr(estimator._final_estimator.base_element,
                       'predict_proba'):
                probabilities = estimator.predict_proba(X,
                                                        training=training,
                                                        **kwargs)

                try:
                    if probabilities is not None and len(probabilities) > 0:
                        probabilities = probabilities.tolist()
                except Exception:
                    warnings.warn('No probabilities available.')

        if not isinstance(y_pred, list):
            y_pred = y_pred_names + np.asarray(y_pred).tolist()

        if not isinstance(y_true, list):
            y_true = np.asarray(y_true).tolist()

        score_result_object = MDBScoreInformation(
            metrics=output_metrics,
            score_duration=final_scoring_time,
            y_pred=y_pred,
            y_true=y_true,
            indices=np.asarray(indices).tolist(),
            probabilities=probabilities)

        return score_result_object
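A notable difference from the previous variant is the handling of structured prediction arrays: when the estimator returns an ndarray with named fields, a field called 'y_pred' must be present. A small illustration of such an array (values are made up):

import numpy as np

# a structured array with a named 'y_pred' field, as this variant expects
y_pred = np.array([(0.2,), (0.8,), (0.5,)], dtype=[("y_pred", "f8")])

assert y_pred.dtype.names == ("y_pred",)
plain_predictions = y_pred["y_pred"]  # the metrics are computed on this 1d view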