    def setUp(self):
        """
        Set up the default constraint object and dummy configurations for all tests.
        """
        self.constraint_object = PhotonBaseConstraint(
            strategy="first", metric="mean_squared_error", margin=0.1)

        metrics_list = ["f1_score", "mean_squared_error"]
        self.dummy_config_item = MDBConfig()
        self.dummy_config_item.inner_folds = []
        for i in range(5):
            inner_fold = MDBInnerFold()
            inner_fold.validation = MDBScoreInformation()
            for metric in metrics_list:
                # np.random.randint(0, 1) always returns 0 (the upper bound is
                # exclusive), so every metric value here is the constant 0.0001
                inner_fold.validation.metrics[metric] = (
                    np.random.randint(0, 1) / 2 + 0.0001)
            self.dummy_config_item.inner_folds.append(inner_fold)

        self.dummy_linear_config_item = MDBConfig()
        self.dummy_linear_config_item.inner_folds = []
        for i in range(5):
            inner_fold = MDBInnerFold()
            inner_fold.validation = MDBScoreInformation()
            for metric in metrics_list:
                inner_fold.validation.metrics[metric] = i / 4
            self.dummy_linear_config_item.inner_folds.append(inner_fold)
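The fixtures above exist to drive PhotonBaseConstraint.shall_continue, the same check that InnerFoldManager.fit (Example 5 below) applies after each inner fold. A minimal sketch of how a test might exercise it; the expected outcomes assume the usual reading of a "first" strategy with a 0.1 margin on mean_squared_error (later folds may not degrade beyond the first fold by more than the margin), so treat the asserted values as illustrative rather than definitive:

    def test_shall_continue_sketch(self):
        # flat error of ~0.0001 in every fold: no degradation, so keep going
        self.assertTrue(
            self.constraint_object.shall_continue(self.dummy_config_item))
        # linearly growing error (0, 0.25, ..., 1.0): degrades well past the margin
        self.assertFalse(
            self.constraint_object.shall_continue(self.dummy_linear_config_item))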
Example 2
    def test_get_optimum_config(self):
        my_pipe_optimizer = Hyperpipe.Optimization(
            "grid_search", {}, [], "balanced_accuracy", None
        )
        list_of_tested_configs = list()
        metric_default = MDBFoldMetric(
            metric_name="balanced_accuracy", operation=FoldOperations.MEAN, value=0.5
        )
        metric_best = MDBFoldMetric(
            metric_name="balanced_accuracy", operation=FoldOperations.MEAN, value=0.99
        )
        # we add losing configs, one good config, and one good config that failed,
        # and check that the good, non-failing config is chosen
        for i in range(10):
            config = MDBConfig()
            # configs 5 and 8 get the best score; 8 is marked as failed below,
            # so config 5 should win
            if i == 5 or i == 8:
                config.metrics_test = [metric_best]
            else:
                config.metrics_test = [metric_default]
            if i == 8:
                config.config_failed = True
            list_of_tested_configs.append(config)

        winner_config = my_pipe_optimizer.get_optimum_config(list_of_tested_configs)
        self.assertIs(winner_config, list_of_tested_configs[5])
        self.assertEqual(winner_config.metrics_test[0].value, 0.99)
Example 3
    def test_get_optimum_config_outer_folds(self):
        my_pipe_optimizer = Hyperpipe.Optimization(
            "grid_search", {}, [], "balanced_accuracy", None
        )

        outer_fold_list = list()
        for i in range(10):
            outer_fold = MDBOuterFold()
            outer_fold.best_config = MDBConfig()
            outer_fold.best_config.best_config_score = MDBInnerFold()
            outer_fold.best_config.best_config_score.validation = MDBScoreInformation()
            # again fold 5 wins
            if i == 5:
                outer_fold.best_config.best_config_score.validation.metrics = {
                    "balanced_accuracy": 0.99
                }
            else:
                outer_fold.best_config.best_config_score.validation.metrics = {
                    "balanced_accuracy": 0.5
                }
            outer_fold_list.append(outer_fold)

        best_config_outer_folds = my_pipe_optimizer.get_optimum_config_outer_folds(
            outer_fold_list
        )
        self.assertEqual(
            best_config_outer_folds.best_config_score.validation.metrics[
                "balanced_accuracy"
            ],
            0.99,
        )
        self.assertIs(best_config_outer_folds, outer_fold_list[5].best_config)
Example 4
    def test_finalize_optimization(self):
        # this is difficult to test directly, so we fit once, reset the results,
        # and let _finalize_optimization regenerate them
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all result information
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now regenerate the result information
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values: one entry per metric
        self.assertEqual(len(self.hyperpipe.results.dummy_estimator.train),
                         expected_num_of_metrics)
        self.assertEqual(len(self.hyperpipe.results.dummy_estimator.test),
                         expected_num_of_metrics)
        # overall aggregate values: two entries per metric
        self.assertEqual(len(self.hyperpipe.results.metrics_train),
                         2 * expected_num_of_metrics)
        self.assertEqual(len(self.hyperpipe.results.metrics_test),
                         2 * expected_num_of_metrics)
        # find best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(self.hyperpipe.best_config,
                         self.hyperpipe.results.best_config.config_dict)
        # set optimum pipe and params (todo: test with added preprocessing)
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"])
        # save optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.hyperpipe.output_settings.results_folder,
                             'photon_best_model.photon')))

        # backmapping
        # because the PCA is disabled in this test, we expect one importance value
        # per input feature
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1])
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            'optimum_pipe_feature_importances_backmapped.csv')
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(open(backmapped_feature_importances, 'rb'),
                                  delimiter=",")
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])
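The test above only verifies that 'photon_best_model.photon' was written to the results folder. As a follow-up, a hedged sketch of reloading the persisted model; Hyperpipe.load_optimum_pipe is assumed to be the matching loader here, so check the name against the installed PHOTON version before relying on it:

    def test_load_best_model_sketch(self):
        # assumption: Hyperpipe.load_optimum_pipe restores the pipeline saved above
        model_path = os.path.join(self.hyperpipe.output_settings.results_folder,
                                  'photon_best_model.photon')
        loaded_pipe = Hyperpipe.load_optimum_pipe(model_path)
        # the reloaded pipeline should predict on the original data without errors
        self.assertEqual(len(loaded_pipe.predict(self.__X)), len(self.__y))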
Example 5
    def fit(self, X, y, **kwargs):
        """
        Iterates over cross-validation folds and trains the pipeline, then uses it for predictions.
        Calculates metrics per fold and averages them across folds.
        :param X: Training and test data
        :param y: Training and test targets
        :returns: configuration class for result tree that monitors training and test performance
        """

        # needed for testing Timeboxed Random Grid Search
        # time.sleep(35)

        config_item = MDBConfig()
        config_item.config_dict = self.params
        config_item.inner_folds = []
        config_item.metrics_test = []
        config_item.metrics_train = []
        config_item.computation_start_time = datetime.datetime.now()

        try:
            # do inner cv
            for idx, (inner_fold_id, inner_fold) in enumerate(
                    self.cross_validation_infos.inner_folds[
                        self.outer_fold_id].items()):

                train, test = inner_fold.train_indices, inner_fold.test_indices

                # split kwargs according to cross validation
                train_X, train_y, kwargs_cv_train = PhotonDataHelper.split_data(
                    X, y, kwargs, indices=train)
                test_X, test_y, kwargs_cv_test = PhotonDataHelper.split_data(
                    X, y, kwargs, indices=test)

                new_pipe = self.pipe()
                if self.cache_folder is not None and self.cache_updater is not None:
                    self.cache_updater(new_pipe, self.cache_folder,
                                       inner_fold_id)

                if not config_item.human_readable_config:
                    config_item.human_readable_config = PhotonPrintHelper.config_to_human_readable_dict(
                        new_pipe, self.params)
                    logger.clean_info(
                        json.dumps(config_item.human_readable_config,
                                   indent=4,
                                   sort_keys=True))

                job_data = InnerFoldManager.InnerCVJob(
                    pipe=new_pipe,
                    config=dict(self.params),
                    metrics=self.optimization_infos.metrics,
                    callbacks=self.optimization_constraints,
                    train_data=InnerFoldManager.JobData(
                        train_X, train_y, train, kwargs_cv_train),
                    test_data=InnerFoldManager.JobData(test_X, test_y, test,
                                                       kwargs_cv_test),
                )

                # only for non-parallel processing
                # inform children in which inner fold we are
                # self.pipe.distribute_cv_info_to_hyperpipe_children(inner_fold_counter=fold_cnt)
                # self.mother_inner_fold_handle(fold_cnt)

                # --> write that output in InnerFoldManager!
                # logger.debug(config_item.human_readable_config)
                fold_nr = idx + 1
                logger.debug("calculating inner fold " + str(fold_nr) + "...")

                curr_test_fold, curr_train_fold = InnerFoldManager.fit_and_score(
                    job_data)
                logger.debug("Performance inner fold " + str(fold_nr))
                print_double_metrics(
                    curr_train_fold.metrics,
                    curr_test_fold.metrics,
                    photon_system_log=False,
                )

                durations = job_data.pipe.time_monitor

                self.update_config_item_with_inner_fold(
                    config_item=config_item,
                    fold_cnt=fold_nr,
                    curr_train_fold=curr_train_fold,
                    curr_test_fold=curr_test_fold,
                    time_monitor=durations,
                    feature_importances=new_pipe.feature_importances_,
                )

                if isinstance(self.optimization_constraints, list):
                    break_cv = 0
                    for cf in self.optimization_constraints:
                        if not cf.shall_continue(config_item):
                            logger.info(
                                "Skipped further cross validation after fold "
                                + str(fold_nr) +
                                " due to performance constraints in " +
                                cf.metric)
                            break_cv += 1
                            break
                    if break_cv > 0:
                        break
                elif self.optimization_constraints is not None:
                    if not self.optimization_constraints.shall_continue(
                            config_item):
                        logger.info(
                            "Skipped further cross validation after fold " +
                            str(fold_nr) +
                            " due to performance constraints in " + cf.metric)
                        break

            InnerFoldManager.process_fit_results(
                config_item,
                self.cross_validation_infos.calculate_metrics_across_folds,
                self.cross_validation_infos.calculate_metrics_per_fold,
                self.optimization_infos.metrics,
            )

        except Exception as e:
            if self.raise_error:
                raise e
            logger.error(e)
            logger.error(traceback.format_exc())
            traceback.print_exc()
            if not isinstance(e, Warning):
                config_item.config_failed = True
            config_item.config_error = str(e)
            warnings.warn("One test iteration of pipeline failed with error")

        logger.debug("...done with")
        logger.debug(
            json.dumps(config_item.human_readable_config,
                       indent=4,
                       sort_keys=True))

        config_item.computation_end_time = datetime.datetime.now()
        return config_item
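
A minimal sketch of how a caller might consume the MDBConfig returned by fit; `fold_manager`, `X`, and `y` are placeholders for an already constructed InnerFoldManager and its data, assumed here for illustration:

# hypothetical usage: fold_manager is an InnerFoldManager configured elsewhere
config_item = fold_manager.fit(X, y)
if config_item.config_failed:
    logger.error("config failed: " + str(config_item.config_error))
else:
    duration = config_item.computation_end_time - config_item.computation_start_time
    logger.info("evaluated config in " + str(duration))
    for fold in config_item.inner_folds:
        # each inner fold holds a validation MDBScoreInformation with per-metric values
        logger.info(str(fold.validation.metrics))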