Example #1
    def check_current_best_config_equality(outer_manager, fold_operation):
        # nested helper inside a test method, so `self` refers to the
        # enclosing unittest.TestCase
        # we know that the first of the two configs is better:
        # the value in outer_manager.current_best_config should be the mean
        # value of the best config, first for train ...
        self.assertEqual(
            str(
                MDBHelper.get_metric(
                    outer_manager.result_object.best_config,
                    fold_operation,
                    self.optimization_info.best_config_metric)),
            str(
                MDBHelper.get_metric(
                    outer_manager.current_best_config, fold_operation,
                    self.optimization_info.best_config_metric)))
        # ... and then for test
        self.assertEqual(
            str(
                MDBHelper.get_metric(
                    outer_manager.result_object.best_config,
                    fold_operation,
                    self.optimization_info.best_config_metric, False)),
            str(
                MDBHelper.get_metric(
                    outer_manager.current_best_config, fold_operation,
                    self.optimization_info.best_config_metric, False)))
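
The helper above depends on PHOTON's MDBHelper and the surrounding result tree. As a minimal, self-contained sketch of the same assertion pattern, assuming hypothetical stand-in objects (a plain get_metric lookup and dict-based configs) rather than PHOTON's API:

import unittest


def get_metric(config, operation, metric_name, train=True):
    # stand-in for MDBHelper.get_metric: look up an aggregated metric value
    split = "train" if train else "test"
    return config["metrics"][split][(operation, metric_name)]


class CurrentBestConfigEqualityExample(unittest.TestCase):
    def test_best_config_is_tracked(self):
        # two tested configs; config_a has the lower (better) error
        config_a = {"metrics": {"train": {("MEAN", "mae"): 1.0},
                                "test": {("MEAN", "mae"): 1.2}}}
        config_b = {"metrics": {"train": {("MEAN", "mae"): 2.0},
                                "test": {("MEAN", "mae"): 2.5}}}
        outer_manager = {"result_object": {"best_config": config_a},
                         "current_best_config": config_a,
                         "tested": [config_a, config_b]}

        # the tracked current best must equal the best config in the
        # result object, both for train and for test
        for train in (True, False):
            self.assertEqual(
                get_metric(outer_manager["result_object"]["best_config"],
                           "MEAN", "mae", train),
                get_metric(outer_manager["current_best_config"],
                           "MEAN", "mae", train))


if __name__ == "__main__":
    unittest.main()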
Example #2
def load_pipe(storage, name):
    pipe = None
    error = "Could not load pipeline"
    if storage == "m":
        try:
            pipe = load_pipe_from_db(name)
        except ValueError:
            # presumably no MongoDB connection registered yet: connect and retry
            connect(application.config['mongo_db_url'], alias='photon_core')
            pipe = load_pipe_from_db(name)

    elif storage == "w":
        pipe = load_pipe_from_wizard(name)

    elif storage == "a":
        try:
            pipe = application.config['pipe_objects'][name]
        except KeyError as ke:
            # Todo pretty error handling
            error = ke
    elif storage == "f":
        try:
            pipe_path = application.config['pipe_files'][name]
            pipe = MDBHelper.load_results(pipe_path)
        except KeyError as ke:
            # Todo File not Found
            error = ke
        except Exception as e:
            # Todo: handle file that does not exist
            error = e

    if not pipe:
        session["error_msg"] = "Could not load result object: {}".format(error)
        abort(500)
    return pipe
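
Because load_pipe relies on Flask globals (application, session, abort) and PHOTON's MDBHelper, here is a framework-free sketch of the same storage-dispatch idea; load_pipe_sketch, the loader callables, and the sample data are hypothetical stand-ins:

def load_pipe_sketch(storage, name, loaders):
    """loaders maps a storage key ('m', 'w', 'a', 'f') to a callable(name) -> pipe."""
    loader = loaders.get(storage)
    if loader is None:
        raise ValueError("Unknown storage type: {}".format(storage))
    try:
        return loader(name)
    except (KeyError, FileNotFoundError) as exc:
        # surface the underlying problem instead of swallowing it
        raise RuntimeError("Could not load result object '{}': {}".format(name, exc))


# usage with an in-memory stand-in for application.config['pipe_objects']
pipes_in_memory = {"my_pipe": object()}
loaders = {"a": pipes_in_memory.__getitem__}
pipe = load_pipe_sketch("a", "my_pipe", loaders)
print(pipe is pipes_in_memory["my_pipe"])  # True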
Example #3
    def process_fit_results(config_item, calculate_metrics_across_folds,
                            calculate_metrics_per_fold, metrics):

        overall_y_pred_test = []
        overall_y_true_test = []
        overall_y_pred_train = []
        overall_y_true_train = []

        for fold in config_item.inner_folds:
            curr_test_fold = fold.validation
            curr_train_fold = fold.training

            if calculate_metrics_across_folds:
                # if we have one hot encoded values -> concat horizontally
                if isinstance(curr_test_fold.y_pred, np.ndarray):
                    if len(curr_test_fold.y_pred.shape) > 1:
                        axis = 1
                    else:
                        axis = 0
                else:
                    # if we have lists concat
                    axis = 0
                overall_y_true_test = np.concatenate(
                    (overall_y_true_test, curr_test_fold.y_true), axis=axis)
                overall_y_pred_test = np.concatenate(
                    (overall_y_pred_test, curr_test_fold.y_pred), axis=axis)

                # we assume y_pred from the training set comes in the same shape as y_pred from the test set
                overall_y_true_train = np.concatenate(
                    (overall_y_true_train, curr_train_fold.y_true), axis=axis)
                overall_y_pred_train = np.concatenate(
                    (overall_y_pred_train, curr_train_fold.y_pred), axis=axis)

        # after all inner folds have been collected, compute the metrics once
        if calculate_metrics_across_folds:
            # metrics across folds
            metrics_to_calculate = list(metrics)
            if "score" in metrics_to_calculate:
                metrics_to_calculate.remove("score")
            metrics_train = Scorer.calculate_metrics(
                overall_y_true_train, overall_y_pred_train,
                metrics_to_calculate)
            metrics_test = Scorer.calculate_metrics(
                overall_y_true_test, overall_y_pred_test,
                metrics_to_calculate)

            def metric_to_db_class(metric_list):
                db_metrics = []
                for metric_name, metric_value in metric_list.items():
                    new_metric = MDBFoldMetric(
                        operation=FoldOperations.RAW,
                        metric_name=metric_name,
                        value=metric_value,
                    )
                    db_metrics.append(new_metric)
                return db_metrics

            db_metrics_train = metric_to_db_class(metrics_train)
            db_metrics_test = metric_to_db_class(metrics_test)

            # if we want to have metrics for each fold as well, calculate mean and std.
            if calculate_metrics_per_fold:
                db_metrics_fold_train, db_metrics_fold_test = MDBHelper.aggregate_metrics_for_inner_folds(
                    config_item.inner_folds, metrics)
                config_item.metrics_train = db_metrics_train + db_metrics_fold_train
                config_item.metrics_test = db_metrics_test + db_metrics_fold_test
            else:
                config_item.metrics_train = db_metrics_train
                config_item.metrics_test = db_metrics_test

        elif calculate_metrics_per_fold:
            # calculate mean and std over all fold metrics
            config_item.metrics_train, config_item.metrics_test = MDBHelper.aggregate_metrics_for_inner_folds(
                config_item.inner_folds, metrics)
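
For orientation, the two aggregation modes handled above can be contrasted in a short numpy-only sketch; the mae helper and the fold data are illustrative, not PHOTON's Scorer API:

import numpy as np


def mae(y_true, y_pred):
    return float(np.mean(np.abs(np.asarray(y_true) - np.asarray(y_pred))))


folds = [
    {"y_true": np.array([1.0, 2.0]), "y_pred": np.array([1.5, 2.0])},
    {"y_true": np.array([3.0, 4.0]), "y_pred": np.array([2.0, 4.5])},
]

# metrics across folds: pool all predictions, then score once (RAW)
pooled_true = np.concatenate([f["y_true"] for f in folds])
pooled_pred = np.concatenate([f["y_pred"] for f in folds])
mae_across = mae(pooled_true, pooled_pred)

# metrics per fold: score each fold, then report mean and std
per_fold = [mae(f["y_true"], f["y_pred"]) for f in folds]
mae_mean, mae_std = float(np.mean(per_fold)), float(np.std(per_fold))

# the means coincide here only because both folds have the same size
print(mae_across, mae_mean, mae_std)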
Example #4
    def objective_function(self, current_config):
        if current_config is None:
            return
        logger.clean_info(
            '---------------------------------------------------------------------------------------------------------------'
        )
        self.tested_config_counter += 1

        if hasattr(self.optimizer, 'ask_for_pipe'):
            pipe_ctor = self.optimizer.ask_for_pipe()
        else:
            pipe_ctor = self.copy_pipe_fnc

        # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

        hp = InnerFoldManager(pipe_ctor,
                              current_config,
                              self.optimization_info,
                              self.cross_validaton_info,
                              self.outer_fold_id,
                              self.constraint_objects,
                              cache_folder=self.cache_folder,
                              cache_updater=self.cache_updater)

        # Test the configuration cross validated by inner_cv object
        current_config_mdb = hp.fit(self._validation_X, self._validation_y,
                                    **self._validation_kwargs)
        current_config_mdb.config_nr = self.tested_config_counter

        if not current_config_mdb.config_failed:
            metric_train = MDBHelper.get_metric(
                current_config_mdb, self.fold_operation,
                self.optimization_info.best_config_metric)
            metric_test = MDBHelper.get_metric(
                current_config_mdb,
                self.fold_operation,
                self.optimization_info.best_config_metric,
                train=False)

            if metric_train is None or metric_test is None:
                raise Exception(
                    "Config did not fail, but did not get any metrics either....!!?"
                )
            config_performance = (metric_train, metric_test)
            if self.best_metric_yet is None:
                self.best_metric_yet = config_performance
                self.current_best_config = current_config_mdb
            else:
                # check if we have the next superstar around that exceeds any old performance
                if self.optimization_info.maximize_metric:
                    if metric_test > self.best_metric_yet[1]:
                        self.best_metric_yet = config_performance
                        self.current_best_config.save_memory()
                        self.current_best_config = current_config_mdb
                    else:
                        current_config_mdb.save_memory()
                else:
                    if metric_test < self.best_metric_yet[1]:
                        self.best_metric_yet = config_performance
                        self.current_best_config.save_memory()
                        self.current_best_config = current_config_mdb
                    else:
                        current_config_mdb.save_memory()

            # Print Result for config
            computation_duration = current_config_mdb.computation_end_time - current_config_mdb.computation_start_time
            logger.info('Computed configuration ' +
                        str(self.tested_config_counter) + "/" +
                        str(self.max_nr_of_configs) + " in " +
                        str(computation_duration))
            logger.info("Performance:             " +
                        self.optimization_info.best_config_metric +
                        " - Train: " + "%.4f" % config_performance[0] +
                        ", Validation: " + "%.4f" % config_performance[1])
            logger.info("Best Performance So Far: " +
                        self.optimization_info.best_config_metric +
                        " - Train: " + "%.4f" % self.best_metric_yet[0] +
                        ", Validation: " + "%.4f" % self.best_metric_yet[1])
        else:
            config_performance = (-1, -1)
            # Print Result for config
            logger.debug('...failed:')
            logger.error(current_config_mdb.config_error)

        # add config to result tree
        self.result_object.tested_config_list.append(current_config_mdb)

        # 3. inform optimizer about performance
        logger.debug(
            "Telling hyperparameter optimizer about recent performance.")
        if isinstance(self.optimizer, PhotonSlaveOptimizer):
            self.optimizer.tell(current_config, config_performance)
        logger.debug("Asking hyperparameter optimizer for new config.")

        # the returned objective is minimized, so invert a maximized metric
        if self.optimization_info.maximize_metric:
            return 1 - config_performance[1]
        else:
            return config_performance[1]
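
The ask/tell protocol this objective function assumes can be sketched with a toy optimizer. RandomSearch and evaluate below are hypothetical stand-ins, not PHOTON's optimizer interface, but they show why a maximized metric is handed back as 1 - metric when the caller minimizes:

import random


class RandomSearch:
    """Toy optimizer exposing the ask/tell protocol used above."""

    def __init__(self, space, n_configurations=5):
        self.space = space
        self.n_configurations = n_configurations
        self.history = []

    @property
    def ask(self):
        # yields one random configuration at a time
        for _ in range(self.n_configurations):
            yield {key: random.choice(values) for key, values in self.space.items()}

    def tell(self, config, performance):
        self.history.append((config, performance))


def evaluate(config):
    # stand-in for the cross-validated InnerFoldManager.fit: fake an accuracy
    return 0.5 + 0.1 * config["C"]


optimizer = RandomSearch({"C": [0.1, 1.0, 2.0]})
best = None
for config in optimizer.ask:
    accuracy = evaluate(config)
    optimizer.tell(config, (accuracy, accuracy))
    if best is None or accuracy > best[1]:
        best = (config, accuracy)

# a minimizing optimizer would receive 1 - accuracy, mirroring the return value above
print("best config:", best[0], "objective for a minimizer:", 1 - best[1])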
Example #5
    def metric_assertions(self):
        def check_metrics(metric_name, expected_metric_list, mean_metrics):
            for metric in mean_metrics:
                if metric.metric_name == metric_name:
                    if metric.operation == "FoldOperations.MEAN":
                        expected_val_mean = np.mean(expected_metric_list)
                        self.assertEqual(expected_val_mean, metric.value)
                    elif metric.operation == "FoldOperations.STD":
                        expected_val_std = np.std(expected_metric_list)
                        self.assertAlmostEqual(expected_val_std, metric.value)
            return expected_val_mean, expected_val_std

        outer_collection = {"train": list(), "test": list()}
        for i, (_, outer_fold) in enumerate(
                self.hyperpipe.cross_validation.outer_folds.items()):
            outer_fold_results = self.hyperpipe.results.outer_folds[i]
            config = outer_fold_results.tested_config_list[0]
            inner_fold_results = config.inner_folds

            inner_fold_metrics = {"train": list(), "test": list()}
            for _, inner_fold in self.hyperpipe.cross_validation.inner_folds[
                    outer_fold.fold_id].items():
                tree_result = inner_fold_results[inner_fold.fold_nr - 1]

                global_test_indices = outer_fold.train_indices[
                    inner_fold.test_indices]
                expected_test_mae = mean_absolute_error(
                    XPredictor.adapt_X(global_test_indices),
                    global_test_indices)
                inner_fold_metrics["test"].append(expected_test_mae)
                self.assertEqual(
                    expected_test_mae,
                    tree_result.validation.metrics["mean_absolute_error"],
                )
                self.assertTrue(
                    np.array_equal(tree_result.validation.indices,
                                   inner_fold.test_indices))
                self.assertEqual(len(global_test_indices),
                                 len(tree_result.validation.y_true))
                self.assertEqual(len(global_test_indices),
                                 len(tree_result.validation.y_pred))

                global_train_indices = outer_fold.train_indices[
                    inner_fold.train_indices]
                expected_train_mae = mean_absolute_error(
                    XPredictor.adapt_X(global_train_indices),
                    global_train_indices)
                inner_fold_metrics["train"].append(expected_train_mae)
                self.assertEqual(
                    expected_train_mae,
                    tree_result.training.metrics["mean_absolute_error"],
                )
                # check that indices are as expected and the right number of y_pred and y_true exist in the tree
                self.assertTrue(
                    np.array_equal(tree_result.training.indices,
                                   inner_fold.train_indices))
                self.assertEqual(len(global_train_indices),
                                 len(tree_result.training.y_true))
                self.assertEqual(len(global_train_indices),
                                 len(tree_result.training.y_pred))

            # get expected train and test mean and std respectively, and check the aggregated metrics
            check_metrics("mean_absolute_error", inner_fold_metrics["train"],
                          config.metrics_train)
            check_metrics("mean_absolute_error", inner_fold_metrics["test"],
                          config.metrics_test)

            # calculate metrics across folds
            if self.hyperpipe.cross_validation.calculate_metrics_across_folds:
                expected_mean_absolute_error_across_folds = mean_absolute_error(
                    XPredictor.adapt_X(outer_fold.train_indices),
                    outer_fold.train_indices,
                )
                actual_mean_absolute_error_across_folds = MDBHelper.get_metric(
                    config, FoldOperations.RAW, "mean_absolute_error")
                self.assertEqual(
                    expected_mean_absolute_error_across_folds,
                    actual_mean_absolute_error_across_folds,
                )

            if self.hyperpipe.cross_validation.eval_final_performance:
                expected_outer_test_mae = mean_absolute_error(
                    XPredictor.adapt_X(outer_fold.test_indices),
                    outer_fold.test_indices)

                self.assertTrue(
                    np.array_equal(
                        outer_fold_results.best_config.best_config_score.
                        validation.indices,
                        outer_fold.test_indices,
                    ))
                self.assertEqual(
                    len(outer_fold.test_indices),
                    len(outer_fold_results.best_config.best_config_score.
                        validation.y_true),
                )
                self.assertEqual(
                    len(outer_fold.test_indices),
                    len(outer_fold_results.best_config.best_config_score.
                        validation.y_pred),
                )

                # check that indices are as expected and the right number of y_pred and y_true exist in the tree
                self.assertTrue(
                    np.array_equal(
                        outer_fold_results.best_config.best_config_score.
                        training.indices,
                        outer_fold.train_indices,
                    ))
                self.assertEqual(
                    len(outer_fold.train_indices),
                    len(outer_fold_results.best_config.best_config_score.
                        training.y_true),
                )
                self.assertEqual(
                    len(outer_fold.train_indices),
                    len(outer_fold_results.best_config.best_config_score.
                        training.y_pred),
                )
            else:
                # if we don't use the test set, we want the values from the inner_cv to be copied
                expected_outer_test_mae = [
                    m.value
                    for m in outer_fold_results.best_config.metrics_test
                    if m.metric_name == "mean_absolute_error"
                    and m.operation == "FoldOperations.MEAN"
                ]
                if len(expected_outer_test_mae) > 0:
                    expected_outer_test_mae = expected_outer_test_mae[0]

                self.assertTrue(
                    outer_fold_results.best_config.best_config_score.
                    validation.metrics_copied_from_inner)
                self.assertTrue(
                    outer_fold_results.best_config.best_config_score.training.
                    metrics_copied_from_inner)

            outer_collection["test"].append(expected_outer_test_mae)
            self.assertEqual(
                outer_fold_results.best_config.best_config_score.validation.
                metrics["mean_absolute_error"],
                expected_outer_test_mae,
            )

            expected_outer_train_mae = mean_absolute_error(
                XPredictor.adapt_X(outer_fold.train_indices),
                outer_fold.train_indices)
            outer_collection["train"].append(expected_outer_train_mae)
            self.assertAlmostEqual(
                outer_fold_results.best_config.best_config_score.training.
                metrics["mean_absolute_error"],
                expected_outer_train_mae,
            )

        # check again in overall best config attribute
        check_metrics(
            "mean_absolute_error",
            outer_collection["train"],
            self.hyperpipe.results.metrics_train,
        )

        check_metrics(
            "mean_absolute_error",
            outer_collection["test"],
            self.hyperpipe.results.metrics_test,
        )

        # check if those agree with helper function output
        outer_fold_performances = (
            self.hyperpipe.results_handler.get_performance_outer_folds())
        self.assertListEqual(outer_fold_performances["mean_absolute_error"],
                             outer_collection["test"])
Example #6
    def fit(self, X, y=None, **kwargs):
        logger.photon_system_log("")
        logger.photon_system_log(
            "********************************************************"
        ) 
        logger.photon_system_log(
            "Outer Cross validation Fold {}".format(
                self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr
            )
        )
        logger.photon_system_log(
            "********************************************************"
        )

        self._prepare_data(X, y, **kwargs)
        self._fit_dummy()
        self._generate_inner_folds()
        self._prepare_optimization()

        outer_fold_fit_start_time = datetime.datetime.now()
        best_metric_yet = None
        tested_config_counter = 0

        # distribute number of folds to encapsulated child hyperpipes
        # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
        #                                                 outer_fold_counter=outer_fold_counter)

        if self.cross_validaton_info.calculate_metrics_per_fold:
            fold_operation = FoldOperations.MEAN
        else:
            fold_operation = FoldOperations.RAW

        max_nr_of_configs = ""
        if hasattr(self.optimizer, "n_configurations"):
            max_nr_of_configs = str(self.optimizer.n_configurations)

        # do the optimizing
        for current_config in self.optimizer.ask:
            if current_config is None:
                continue
            logger.clean_info(
                "---------------------------------------------------------------------------------------------------------------"
            )
            tested_config_counter += 1

            if hasattr(self.optimizer, "ask_for_pipe"):
                pipe_ctor = self.optimizer.ask_for_pipe()
            else:
                pipe_ctor = self.copy_pipe_fnc

            # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

            hp = InnerFoldManager(
                pipe_ctor,
                current_config,
                self.optimization_info,
                self.cross_validaton_info,
                self.outer_fold_id,
                self.constraint_objects,
                cache_folder=self.cache_folder,
                cache_updater=self.cache_updater,
            )

            # Test the configuration cross validated by inner_cv object
            current_config_mdb = hp.fit(
                self._validation_X, self._validation_y, **self._validation_kwargs
            )
            current_config_mdb.config_nr = tested_config_counter

            if not current_config_mdb.config_failed:
                metric_train = MDBHelper.get_metric(
                    current_config_mdb,
                    fold_operation,
                    self.optimization_info.best_config_metric,
                )
                metric_test = MDBHelper.get_metric(
                    current_config_mdb,
                    fold_operation,
                    self.optimization_info.best_config_metric,
                    train=False,
                )

                if metric_train is None or metric_test is None:
                    raise Exception(
                        "Config did not fail, but did not get any metrics either....!!?"
                    )
                config_performance = (metric_train, metric_test)
                if best_metric_yet is None:
                    best_metric_yet = config_performance
                    self.current_best_config = current_config_mdb
                else:
                    # check if we have the next superstar around that exceeds any old performance
                    if self.optimization_info.maximize_metric:
                        if metric_test > best_metric_yet[1]:
                            best_metric_yet = config_performance
                            self.current_best_config.save_memory()
                            self.current_best_config = current_config_mdb
                        else:
                            current_config_mdb.save_memory()
                    else:
                        if metric_test < best_metric_yet[1]:
                            best_metric_yet = config_performance
                            self.current_best_config.save_memory()
                            self.current_best_config = current_config_mdb
                        else:
                            current_config_mdb.save_memory()

                # Print Result for config
                computation_duration = (
                    current_config_mdb.computation_end_time
                    - current_config_mdb.computation_start_time
                )
                logger.info(
                    "Computed configuration "
                    + str(tested_config_counter)
                    + "/"
                    + max_nr_of_configs
                    + " in "
                    + str(computation_duration)
                )
                logger.info(
                    "Performance:             "
                    + self.optimization_info.best_config_metric
                    + " - Train: "
                    + "%.4f" % config_performance[0]
                    + ", Validation: "
                    + "%.4f" % config_performance[1]
                )
                logger.info(
                    "Best Performance So Far: "
                    + self.optimization_info.best_config_metric
                    + " - Train: "
                    + "%.4f" % best_metric_yet[0]
                    + ", Validation: "
                    + "%.4f" % best_metric_yet[1]
                )
            else:
                config_performance = (-1, -1)
                # Print Result for config
                logger.debug("...failed:")
                logger.error(current_config_mdb.config_error)

            # add config to result tree
            self.result_object.tested_config_list.append(current_config_mdb)

            # 3. inform optimizer about performance
            logger.debug("Telling hyperparameter optimizer about recent performance.")
            self.optimizer.tell(current_config, config_performance)
            logger.debug("Asking hyperparameter optimizer for new config.")
        logger.clean_info(
            "---------------------------------------------------------------------------------------------------------------"
        )
        logger.info(
            "Hyperparameter Optimization finished. Now finding best configuration .... "
        )
        # now go on with the best config found
        if tested_config_counter > 0:
            best_config_outer_fold = self.optimization_info.get_optimum_config(
                self.result_object.tested_config_list, fold_operation
            )

            if not best_config_outer_fold:
                raise Exception("No best config was found!")

            # ... and create optimal pipeline
            optimum_pipe = self.copy_pipe_fnc()
            if self.cache_updater is not None:
                self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
            optimum_pipe.caching = False
            # set self to best config
            optimum_pipe.set_params(**best_config_outer_fold.config_dict)

            # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
            # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
            #     if child_config:
            #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
            #         splitted_name = child_name.split('__')
            #         if len(splitted_name) > 1:
            #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
            #             pipe_element = stacking_element.elements[splitted_name[1]]
            #         else:
            #             pipe_element = self.optimum_pipe.named_steps[child_name]
            #         pipe_element.set_params(**child_config)
            #         pipe_element.is_final_fit = True

            # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

            logger.debug("Fitting model with best configuration of outer fold...")
            optimum_pipe.fit(
                self._validation_X, self._validation_y, **self._validation_kwargs
            )

            self.result_object.best_config = best_config_outer_fold

            # save test performance
            best_config_performance_mdb = MDBInnerFold()
            best_config_performance_mdb.fold_nr = -99
            best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
            best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
            best_config_performance_mdb.feature_importances = (
                optimum_pipe.feature_importances_
            )

            if self.cross_validaton_info.eval_final_performance:
                # Todo: generate mean and std over outer folds as well. Move these items to the top
                logger.info("Calculating best model performance on test set...")

                logger.debug("...scoring test data")
                test_score_mdb = InnerFoldManager.score(
                    optimum_pipe,
                    self._test_X,
                    self._test_y,
                    indices=self.cross_validaton_info.outer_folds[
                        self.outer_fold_id
                    ].test_indices,
                    metrics=self.optimization_info.metrics,
                    **self._test_kwargs
                )

                logger.debug("... scoring training data")

                train_score_mdb = InnerFoldManager.score(
                    optimum_pipe,
                    self._validation_X,
                    self._validation_y,
                    indices=self.cross_validaton_info.outer_folds[
                        self.outer_fold_id
                    ].train_indices,
                    metrics=self.optimization_info.metrics,
                    training=True,
                    **self._validation_kwargs
                )

                best_config_performance_mdb.training = train_score_mdb
                best_config_performance_mdb.validation = test_score_mdb

                print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
            else:

                def _copy_inner_fold_means(metric_dict):
                    # We copy all mean values from validation to the best config
                    # training
                    train_item_metrics = {}
                    for m in metric_dict:
                        if m.operation == str(fold_operation):
                            train_item_metrics[m.metric_name] = m.value
                    train_item = MDBScoreInformation()
                    train_item.metrics_copied_from_inner = True
                    train_item.metrics = train_item_metrics
                    return train_item

                # training
                best_config_performance_mdb.training = _copy_inner_fold_means(
                    best_config_outer_fold.metrics_train
                )
                # validation
                best_config_performance_mdb.validation = _copy_inner_fold_means(
                    best_config_outer_fold.metrics_test
                )

            # write best config performance to best config item
            self.result_object.best_config.best_config_score = (
                best_config_performance_mdb
            )

        logger.info(
            "Computations in outer fold {} took {} minutes.".format(
                self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
                (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds()
                / 60,
            )
        )
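
As a compact reference for the overall flow (inner cross-validation selects a configuration, the best configuration is refit on the outer training split and scored on the outer test split), a plain scikit-learn sketch is shown below; it assumes scikit-learn is installed and is not a substitute for PHOTON's InnerFoldManager or result objects:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
outer_cv, inner_cv = KFold(n_splits=3), KFold(n_splits=3)

for train_idx, test_idx in outer_cv.split(X):
    X_train, y_train = X[train_idx], y[train_idx]

    # "ask" each configuration and keep the one with the best inner-CV score
    best_alpha, best_score = None, -np.inf
    for alpha in (0.1, 1.0, 10.0):
        score = cross_val_score(Ridge(alpha=alpha), X_train, y_train,
                                cv=inner_cv,
                                scoring="neg_mean_absolute_error").mean()
        if score > best_score:
            best_alpha, best_score = alpha, score

    # refit the best configuration on the outer training split ...
    optimum = Ridge(alpha=best_alpha).fit(X_train, y_train)
    # ... and evaluate it once on the outer test split
    test_mae = np.mean(np.abs(optimum.predict(X[test_idx]) - y[test_idx]))
    print("outer-fold MAE with alpha={}: {:.3f}".format(best_alpha, test_mae))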