def test_class_with_data_preproc(self):
        """
        Test for simple pipeline with data.
        """

        X, y = load_breast_cancer(return_X_y=True)

        # DESIGN YOUR PIPELINE
        my_pipe = Hyperpipe(
            'basic_svm_pipe',
            optimizer='grid_search',
            metrics=['accuracy', 'precision', 'recall', 'balanced_accuracy'],
            best_config_metric='accuracy',
            eval_final_performance=False,
            outer_cv=KFold(n_splits=2),
            inner_cv=KFold(n_splits=3),
            verbosity=1,
            random_seed=42)

        preprocessing = Preprocessing()
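        # Preprocessing is applied once to the complete dataset before the cross-validation splits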
        preprocessing += PipelineElement("LabelEncoder")
        my_pipe += preprocessing

        # ADD ELEMENTS TO YOUR PIPELINE
        # first normalize all features
        my_pipe.add(PipelineElement('StandardScaler'))

        # then do feature selection using a PCA,
        my_pipe += PipelineElement(
            'PCA',
            hyperparameters={'n_components': IntegerRange(10, 12)},
            test_disabled=True)

        # engage and optimize the good old SVM for Classification
        my_pipe += PipelineElement(
            'SVC',
            hyperparameters={'kernel': Categorical(['rbf', 'linear'])},
            C=2,
            gamma='scale')

        # NOW TRAIN YOUR PIPELINE
        my_pipe.fit(X, y)

        json_transformer = JsonTransformer()
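        # serialize the pipeline definition to JSON, rebuild it, and check that the round trip is lossless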

        pipe_json = json_transformer.create_json(my_pipe)
        my_pipe_reload = json_transformer.from_json(pipe_json)
        pipe_json_reload = json_transformer.create_json(my_pipe_reload)

        self.assertEqual(pipe_json, pipe_json_reload)
        my_pipe_reload.fit(X, y)

        self.assertDictEqual(my_pipe.best_config, my_pipe_reload.best_config)

        self.assertDictEqual(elements_to_dict(my_pipe.copy_me()),
                             elements_to_dict(my_pipe_reload.copy_me()))

    def test_class_with_data_01(self):
        """
        Test for simple pipeline with data.
        """

        X, y = load_breast_cancer(return_X_y=True)

        # DESIGN YOUR PIPELINE
        my_pipe = Hyperpipe(
            "basic_svm_pipe",
            optimizer="grid_search",
            metrics=["accuracy", "precision", "recall", "balanced_accuracy"],
            best_config_metric="accuracy",
            eval_final_performance=False,
            outer_cv=KFold(n_splits=2),
            inner_cv=KFold(n_splits=3),
            verbosity=1,
            random_seed=42,
        )

        preprocessing = Preprocessing()
        preprocessing += PipelineElement("LabelEncoder")
        my_pipe += preprocessing

        # ADD ELEMENTS TO YOUR PIPELINE
        # first normalize all features
        my_pipe.add(PipelineElement("StandardScaler"))

        # then do feature selection using a PCA,
        my_pipe += PipelineElement(
            "PCA",
            hyperparameters={"n_components": IntegerRange(10, 12)},
            test_disabled=True,
        )

        # engage and optimize the good old SVM for Classification
        my_pipe += PipelineElement(
            "SVC",
            hyperparameters={"kernel": Categorical(["rbf", "linear"])},
            C=2,
            gamma="scale",
        )

        # NOW TRAIN YOUR PIPELINE
        my_pipe.fit(X, y)

        json_transformer = JsonTransformer()
        pipe_json = json_transformer.create_json(my_pipe)
        my_pipe_reload = json_transformer.from_json(pipe_json)
        pipe_json_reload = json_transformer.create_json(my_pipe_reload)

        self.assertEqual(pipe_json, pipe_json_reload)

        my_pipe_reload.fit(X, y)
        self.assertDictEqual(my_pipe.best_config, my_pipe_reload.best_config)
Example #3
# WE USE THE BREAST CANCER SET FROM SKLEARN
X, y = load_breast_cancer(return_X_y=True)

# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe(
    "basic_svm_pipe",
    optimizer="sk_opt",
    optimizer_params={"n_configurations": 10},
    metrics=["accuracy", "precision", "recall", "balanced_accuracy"],
    best_config_metric="accuracy",
    outer_cv=KFold(n_splits=3),
    inner_cv=KFold(n_splits=3),
    verbosity=1,
    output_settings=OutputSettings(project_folder="./tmp/"),
)

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe.add(PipelineElement("StandardScaler"))

my_pipe += PipelineElement(
    "PhotonMLPClassifier",
    hyperparameters={
        "layer_1": IntegerRange(0, 5),
        "layer_2": IntegerRange(0, 5),
        "layer_3": IntegerRange(0, 5),
    },
)

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
Example #4
from photonai.base import Hyperpipe, PipelineElement, OutputSettings
from photonai.optimization import IntegerRange

# WE USE THE BREAST CANCER SET FROM SKLEARN
X, y = load_breast_cancer(return_X_y=True)

# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe(
    'basic_svm_pipe',
    optimizer='sk_opt',
    optimizer_params={'n_configurations': 10},
    metrics=['accuracy', 'precision', 'recall', 'balanced_accuracy'],
    best_config_metric='accuracy',
    outer_cv=KFold(n_splits=3),
    inner_cv=KFold(n_splits=3),
    verbosity=1,
    output_settings=OutputSettings(project_folder='./tmp/'))

# ADD ELEMENTS TO YOUR PIPELINE
my_pipe.add(PipelineElement('StandardScaler'))

my_pipe += PipelineElement('PhotonMLPClassifier',
                           hyperparameters={
                               'layer_1': IntegerRange(1, 5),
                               'layer_2': IntegerRange(0, 5),
                               'layer_3': IntegerRange(0, 5)
                           })

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
Example #5
class Smac3IntegrationTest(unittest.TestCase):
    def setUp(self):
        self.time_limit = 60 * 2

        settings = OutputSettings(project_folder="./tmp/")

        self.smac_helper = {"data": None, "initial_runs": None}
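        # the optimizer will store its internal SMAC object here so the test can inspect the run history later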

        # DESIGN YOUR PIPELINE
        self.pipe = Hyperpipe(
            "basic_svm_pipe",  # the name of your pipeline
            optimizer="smac",  # which optimizer PHOTON shall use
            optimizer_params={
                "wallclock_limit": self.time_limit,
                "smac_helper": self.smac_helper,
                "run_limit": 20,
            },
            metrics=["accuracy"],
            # the performance metrics of your interest
            best_config_metric="accuracy",
            inner_cv=KFold(n_splits=3),  # test each configuration with 3-fold cross-validation
            verbosity=0,
            output_settings=settings,
        )

    def simple_classification(self):
        dataset = fetch_olivetti_faces(download_if_missing=True)
        X = dataset["data"]
        y = dataset["target"]
        # self.X, self.y = load_digits(n_class=2, return_X_y=True)
        return X, y

    def test_against_smac(self):
        # PHOTON implementation
        self.pipe.add(PipelineElement("StandardScaler"))
        # then do feature selection using a PCA, specify which values to try in the hyperparameter search
        self.pipe += PipelineElement(
            "PCA", hyperparameters={"n_components": IntegerRange(5, 30)}
        )
        # engage and optimize the good old SVM for Classification
        self.pipe += PipelineElement(
            "SVC",
            hyperparameters={
                "kernel": Categorical(["linear", "rbf", "poly", "sigmoid"]),
                "C": FloatRange(0.5, 200),
            },
            gamma="auto",
        )

        self.X, self.y = self.simple_classification()
        self.pipe.fit(self.X, self.y)

        # AUTO ML direct
        # Build Configuration Space which defines all parameters and their ranges
        cs = ConfigurationSpace()

        # We define a few possible types of SVM-kernels and add them as "kernel" to our cs
        n_components = UniformIntegerHyperparameter(
            "PCA__n_components", 5, 30
        )  # , default_value=5)
        cs.add_hyperparameter(n_components)

        kernel = CategoricalHyperparameter(
            "SVC__kernel", ["linear", "rbf", "poly", "sigmoid"]
        )  # , default_value="linear")
        cs.add_hyperparameter(kernel)

        c = UniformFloatHyperparameter("SVC__C", 0.5, 200)  # , default_value=1)
        cs.add_hyperparameter(c)

        # Scenario object
        scenario = Scenario(
            {
                "run_obj": "quality",  # we optimize quality (alternatively runtime)
                "runcount-limit": 800,  # maximum function evaluations
                "cs": cs,  # configuration space
                "deterministic": "true",
                "shared_model": "false",  # !!!!
                "wallclock_limit": self.time_limit,
            }
        )

        # Optimize, using a SMAC-object
        print("Optimizing! Depending on your machine, this might take a few minutes.")
        smac = SMAC4BO(
            scenario=scenario,
            rng=np.random.RandomState(42),
            tae_runner=self.objective_function,
        )

        self.traurig = smac

        incumbent = smac.optimize()

        inc_value = self.objective_function(incumbent)

        print(incumbent)
        print(inc_value)

        runhistory_photon = self.smac_helper["data"].solver.runhistory
        runhistory_original = smac.solver.runhistory

        x_ax = range(
            1,
            min(
                len(runhistory_original.cost_per_config.keys()),
                len(runhistory_photon.cost_per_config.keys()),
            )
            + 1,
        )
        y_ax_original = [runhistory_original.cost_per_config[tmp] for tmp in x_ax]
        y_ax_photon = [runhistory_photon.cost_per_config[tmp] for tmp in x_ax]

        y_ax_original_inc = [min(y_ax_original[: tmp + 1]) for tmp in x_ax]
        y_ax_photon_inc = [min(y_ax_photon[: tmp + 1]) for tmp in x_ax]
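        # running minimum per step, i.e. the cost of the incumbent (best-so-far) configuration after each evaluation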

        plt.figure(figsize=(10, 7))
        plt.plot(x_ax, y_ax_original, "g", label="Original")
        plt.plot(x_ax, y_ax_photon, "b", label="PHOTON")
        plt.plot(x_ax, y_ax_photon_inc, "r", label="PHOTON Incumbent")
        plt.plot(x_ax, y_ax_original_inc, "k", label="Original Incumbent")
        plt.title("Photon Prove")
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.legend(loc="best")
        plt.show()

        def neighbours(items, fill=None):
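            # sliding window of up to three neighbouring values, e.g.
            # neighbours([1, 2, 3]) yields [1, 2], [1, 2, 3], [2, 3]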
            before = itertools.chain([fill], items)
            after = itertools.chain(
                items, [fill]
            )  # You could use itertools.zip_longest() later instead.
            next(after)
            for a, b, c in zip(before, items, after):
                yield [value for value in (a, b, c) if value is not fill]

        print("---------------")
        original_pairing = [
            sum(values) / len(values) for values in neighbours(y_ax_original)
        ]
        bias_term = np.mean(
            [
                abs(y_ax_original_inc[t] - y_ax_photon_inc[t])
                for t in range(len(y_ax_photon_inc))
            ]
        )
        photon_pairing = [
            sum(values) / len(values) - bias_term for values in neighbours(y_ax_photon)
        ]
        counter = 0
        for i, x in enumerate(x_ax):
            if abs(original_pairing[i] - photon_pairing[i]) > 0.05:
                counter += 1
        self.assertLessEqual(counter / len(x_ax), 0.15)

    def objective_function(self, cfg):
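        # rebuild the PHOTON pipeline by hand, apply the SMAC configuration (keeping only truthy entries)
        # and score it with 3-fold cross-validation; SMAC minimizes, so return 1 - mean accuracy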
        cfg = {k: cfg[k] for k in cfg if cfg[k]}
        sc = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {}, random_state=3)
        svc = PipelineElement("SVC", {}, random_state=3, gamma="auto")
        my_pipe = PhotonPipeline([("StandardScaler", sc), ("PCA", pca), ("SVC", svc)])
        my_pipe.set_params(**cfg)

        metric = cross_val_score(
            my_pipe,
            self.X,
            self.y,
            cv=3,
            scoring=make_scorer(accuracy_score, greater_is_better=True),
        )  # , scoring=my_pipe.predict)
        print("run")
        return 1 - np.mean(metric)
Example #6
class HyperpipeTests(PhotonBaseTest):
    def setup_hyperpipe(self, output_settings=None):
        if output_settings is None:
            output_settings = OutputSettings(project_folder=self.tmp_folder_path)
        self.hyperpipe = Hyperpipe(
            "god",
            inner_cv=self.inner_cv_object,
            metrics=self.metrics,
            best_config_metric=self.best_config_metric,
            output_settings=output_settings,
        )
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    def setUp(self):

        super(HyperpipeTests, self).setUp()
        self.ss_pipe_element = PipelineElement("StandardScaler")
        self.pca_pipe_element = PipelineElement(
            "PCA", {"n_components": [1, 2]}, random_state=42, test_disabled=True
        )
        self.svc_pipe_element = PipelineElement(
            "SVC",
            {"C": [0.1, 1], "kernel": ["linear"]},  # 'rbf', 'sigmoid']
            random_state=42,
        )

        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ["accuracy", "recall", "precision"]
        self.best_config_metric = "accuracy"
        self.setup_hyperpipe()

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

    def test_init(self):
        # test that all init parameters can be retrieved via the cleaned up subclasses
        self.assertEqual(self.hyperpipe.name, "god")

        # if no information is given, check for the default parameters; otherwise check the values given in setUp
        # Cross Validation
        self.assertIsNotNone(self.hyperpipe.cross_validation)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv, self.inner_cv_object)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_cv)
        self.assertTrue(self.hyperpipe.cross_validation.eval_final_performance)
        self.assertTrue(self.hyperpipe.cross_validation.calculate_metrics_per_fold)
        self.assertFalse(self.hyperpipe.cross_validation.calculate_metrics_across_folds)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_folds)
        self.assertDictEqual(self.hyperpipe.cross_validation.inner_folds, {})

        # Optimization
        self.assertIsNotNone(self.hyperpipe.optimization)
        self.assertListEqual(self.hyperpipe.optimization.metrics, self.metrics)
        self.assertEqual(
            self.hyperpipe.optimization.best_config_metric, self.best_config_metric
        )
        self.assertEqual(self.hyperpipe.optimization.optimizer_input_str, "grid_search")
        self.assertTrue(self.hyperpipe.optimization.maximize_metric)
        self.assertIsNone(self.hyperpipe.optimization.performance_constraints)
        self.assertDictEqual(self.hyperpipe.optimization.optimizer_params, {})

    def test_add(self):
        # assure the pipeline has three elements: first the scaler, then the pca, then the svc
        self.assertEqual(len(self.hyperpipe.elements), 3)
        self.assertIs(self.hyperpipe.elements[0], self.ss_pipe_element)
        self.assertIs(self.hyperpipe.elements[1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe.elements[2], self.svc_pipe_element)
        # todo : assure that no two elements can be added with the same name

        # test add method special cases
        with self.assertRaises(TypeError):
            self.hyperpipe.add(object())

        # assure that a Preprocessing element is identified and stored in the dedicated attribute (only one preprocessing item is allowed)
        my_preproc = Preprocessing()
        self.hyperpipe.add(my_preproc)
        self.assertEqual(my_preproc, self.hyperpipe.preprocessing)
        # make sure the element does not end up in the main pipeline
        self.assertTrue(all(item is not my_preproc for item in self.hyperpipe.elements))

        def my_func(X, y, **kwargs):
            return True

        # test adding callback item
        my_call_back_item = CallbackElement("test_element", my_func, "predict")
        self.hyperpipe.add(my_call_back_item)
        self.assertIs(self.hyperpipe.elements[-1], my_call_back_item)

    def test_no_metrics(self):
        # make sure that defining no metrics raises an error
        with self.assertRaises(ValueError):
            hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object)

        # make sure that if no best config metric is given, PHOTON raises a warning
        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe(
                "hp_name",
                inner_cv=self.inner_cv_object,
                metrics=["accuracy", "f1_score"],
            )

    def test_preprocessing(self):

        prepro_pipe = Preprocessing()
        prepro_pipe += PipelineElement.create(
            "dummy", DummyYAndCovariatesTransformer(), {}
        )

        self.hyperpipe += prepro_pipe
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue(np.array_equal(self.__y + 1, self.hyperpipe.data.y))

    def test_estimation_type(self):
        def callback(X, y=None, **kwargs):
            pass

        pipe = Hyperpipe(
            "name", inner_cv=KFold(n_splits=2), best_config_metric="mean_squared_error"
        )

        with self.assertRaises(NotImplementedError):
            pipe += PipelineElement("PCA")
            est_type = pipe.estimation_type

        pipe += PipelineElement("SVC")
        self.assertEqual(pipe.estimation_type, "classifier")

        pipe.elements[-1] = PipelineElement("SVR")
        self.assertEqual(pipe.estimation_type, "regressor")

        with self.assertRaises(NotImplementedError):
            pipe.elements[-1] = CallbackElement("MyCallback", callback)
            est_type = pipe.estimation_type

    def test_copy_me(self):
        self.maxDiff = None
        copy = self.hyperpipe.copy_me()
        copy2 = self.hyperpipe.copy_me()
        self.assertDictEqual(elements_to_dict(copy), elements_to_dict(self.hyperpipe))

        copy_after_fit = self.hyperpipe.fit(self.__X, self.__y).copy_me()

        copy_after_fit = elements_to_dict(copy_after_fit)
        # the current_configs of the elements are not None after calling fit() on a hyperpipe
        # when copying the respective PipelineElement, these current_configs are copied, too
        # this is why we need to delete _pipe and elements before asserting for equality
        copy_after_fit["_pipe"] = None
        copy_after_fit["elements"] = None
        copy = elements_to_dict(copy)
        copy["_pipe"] = None
        copy["elements"] = None
        self.assertDictEqual(copy, copy_after_fit)

        # check if deepcopy worked
        copy2.cross_validation.inner_cv.n_splits = 10
        self.assertEqual(copy2.cross_validation.inner_cv.n_splits, 10)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv.n_splits, 3)

    def test_save_optimum_pipe(self):
        # todo: test .save() of custom model
        tmp_path = os.path.join(self.tmp_folder_path, "optimum_pipypipe")
        settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)

        my_pipe = Hyperpipe(
            "hyperpipe",
            optimizer="random_grid_search",
            optimizer_params={"n_configurations": 3},
            metrics=["accuracy", "precision", "recall"],
            best_config_metric="f1_score",
            outer_cv=KFold(n_splits=2),
            inner_cv=KFold(n_splits=2),
            verbosity=1,
            output_settings=settings,
        )

        preproc = Preprocessing()
        preproc += PipelineElement("StandardScaler")

        # BRANCH WITH QUANTILETRANSFORMER AND DECISIONTREECLASSIFIER
        tree_qua_branch = Branch("tree_branch")
        tree_qua_branch += PipelineElement("QuantileTransformer")
        tree_qua_branch += PipelineElement(
            "DecisionTreeClassifier",
            {"min_samples_split": IntegerRange(2, 4)},
            criterion="gini",
        )

        # BRANCH WITH MinMaxScaler AND SVC
        svm_mima_branch = Branch("svm_branch")
        svm_mima_branch += PipelineElement("MinMaxScaler")
        svm_mima_branch += PipelineElement(
            "SVC", {"kernel": Categorical(["rbf", "linear"]), "C": 2.0}, gamma="auto"
        )

        # BRANCH WITH DummyTransformer AND KNeighborsClassifier
        knn_sta_branch = Branch("neighbour_branch")
        knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(), {})
        knn_sta_branch += PipelineElement("KNeighborsClassifier")

        my_pipe += preproc
        # stack the output of all three branches and feed it to the final estimator
        my_pipe += Stack(
            "final_stack", [tree_qua_branch, svm_mima_branch, knn_sta_branch]
        )

        my_pipe += PipelineElement("LogisticRegression", solver="lbfgs")

        my_pipe.fit(self.__X, self.__y)
        model_path = os.path.join(
            my_pipe.output_settings.results_folder, "photon_best_model.photon"
        )
        self.assertTrue(os.path.exists(model_path))

        # now move optimum pipe to new folder
        test_folder = os.path.join(
            my_pipe.output_settings.results_folder, "new_test_folder"
        )
        new_model_path = os.path.join(test_folder, "photon_best_model.photon")
        os.makedirs(test_folder)
        shutil.copyfile(model_path, new_model_path)

        # check if load_optimum_pipe also works
        # check if we have the meta information recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information["photon_version"])

        # check if predictions stay reliably the same
        y_pred_loaded = loaded_optimum_pipe.predict(self.__X)
        y_pred = my_pipe.optimum_pipe.predict(self.__X)
        np.testing.assert_array_equal(y_pred_loaded, y_pred)

    def test_overwrite_result_folder(self):
        """
        Test for right handling of parameter output_settings.overwrite.
        """

        def get_summary_file():
            return os.path.join(
                self.hyperpipe.output_settings.results_folder, "photon_summary.txt"
            )

        # Case 1: default
        output_settings1 = OutputSettings(
            project_folder=self.tmp_folder_path,
            save_output=True,
            overwrite_results=False,
        )
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()

        time.sleep(2)

        # again with same settings
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()

        # we expect a new output folder each time with timestamp
        self.assertNotEqual(tmp_path, tmp_path2)

        # Case 2 overwrite results: all in the same folder
        output_settings2 = OutputSettings(
            project_folder=self.tmp_folder_path,
            save_output=True,
            overwrite_results=True,
        )
        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()
        tmp_date = os.path.getmtime(tmp_path)

        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()
        tmp_date2 = os.path.getmtime(tmp_path2)

        # same folder but summary file is overwritten through the new analysis
        self.assertEqual(tmp_path, tmp_path2)
        self.assertNotEqual(tmp_date, tmp_date2)

        # Case 3: we have a cache folder
        self.hyperpipe.cache_folder = self.cache_folder_path
        shutil.rmtree(self.cache_folder_path, ignore_errors=True)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertTrue(os.path.exists(self.cache_folder_path))

    def test_random_state(self):
        self.hyperpipe.random_state = 4567
        self.hyperpipe.fit(self.__X, self.__y)
        # assure that the random state is propagated to all nested elements
        self.assertEqual(self.hyperpipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe.optimum_pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.elements[-1][-1].random_state, 4567)
        self.assertEqual(
            self.hyperpipe._pipe.elements[-1][-1].base_element.random_state, 4567
        )

    def test_dummy_estimator_preparation(self):

        self.hyperpipe.results = MDBHyperpipe()
        self.hyperpipe.results.dummy_estimator = dummy_estimator = MDBDummyResults()

        # one time regressor, one time classifier, one time strange object
        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement("SVC"))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyClassifier))

        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement("SVR"))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyRegressor))

        with self.assertRaises(NotImplementedError):
            self.hyperpipe.elements = list()
            self.hyperpipe.add(PipelineElement("PCA"))
            dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
            self.assertIsNone(dummy_estimator)

    def setup_crazy_pipe(self):
        # erase all, we need a complex and crazy task
        self.hyperpipe.elements = list()

        nmb_list = list()
        for i in range(5):
            nmb = NeuroBranch(name=str(i), nr_of_processes=i + 3)
            nmb += PipelineElement("SmoothImages")
            nmb_list.append(nmb)

        my_switch = Switch("disabling_test_switch")
        my_switch += nmb_list[0]
        my_switch += nmb_list[1]

        my_stack = Stack("stack_of_branches")
        for i in range(3):
            my_branch = Branch("branch_" + str(i + 2))
            my_branch += PipelineElement("StandardScaler")
            my_branch += nmb_list[i + 2]
            my_stack += my_branch

        self.hyperpipe.add(my_stack)
        self.hyperpipe.add(PipelineElement("StandardScaler"))
        self.hyperpipe.add(my_switch)
        self.hyperpipe.add(PipelineElement("SVC"))
        return nmb_list

    def test_recursive_disabling(self):
        list_of_elements_to_detect = self.setup_crazy_pipe()
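        # the nested NeuroBranches were created with nr_of_processes > 1, so disabling must reach them recursively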
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(list_of_elements_to_detect)
        Hyperpipe.disable_multiprocessing_recursively(self.hyperpipe._pipe)
        self.assertTrue(all(i.nr_of_processes == 1 for i in list_of_elements_to_detect))

    def test_recursive_cache_folder_propagation(self):
        list_of_elements = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(self.hyperpipe.elements)
        self.hyperpipe.recursive_cache_folder_propagation(
            self.hyperpipe._pipe, self.cache_folder_path, "fold_id_123"
        )
        for i, nmbranch in enumerate(list_of_elements):
            if i > 1:
                start_folder = os.path.join(
                    self.cache_folder_path, "branch_" + nmbranch.name
                )
            else:
                start_folder = self.cache_folder_path
            expected_folder = os.path.join(start_folder, nmbranch.name)
            self.assertEqual(nmbranch.base_element.cache_folder, expected_folder)

    def test_prepare_result_logging(self):
        # test that results object is given and entails hyperpipe infos
        self.hyperpipe.data.X = self.__X
        self.hyperpipe.data.y = self.__y
        self.hyperpipe._prepare_result_logging(datetime.datetime.now())
        self.assertTrue(isinstance(self.hyperpipe.results, MDBHyperpipe))
        self.assertTrue(isinstance(self.hyperpipe.results_handler, ResultsHandler))
        self.assertTrue(len(self.hyperpipe.results.outer_folds) == 0)

    def test_finalize_optimization(self):
        # this is hard to test directly, so we fit once, reset all result infos and regenerate them
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all infos
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now generate infos again
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values
        self.assertEqual(
            len(self.hyperpipe.results.dummy_estimator.train), expected_num_of_metrics
        )
        self.assertEqual(
            len(self.hyperpipe.results.dummy_estimator.test), expected_num_of_metrics
        )
        # overall average values
        self.assertEqual(
            len(self.hyperpipe.results.metrics_train), 2 * expected_num_of_metrics
        )
        self.assertEqual(
            len(self.hyperpipe.results.metrics_test), 2 * expected_num_of_metrics
        )
        # find best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(
            self.hyperpipe.best_config, self.hyperpipe.results.best_config.config_dict
        )
        # set optimum pipe and params, # todo: test add preprocessing
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"],
        )
        # save optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(
                    self.hyperpipe.output_settings.results_folder,
                    "photon_best_model.photon",
                )
            )
        )

        # backmapping
        # because the pca is test_disabled, we expect the original number of input features
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1],
        )
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            "optimum_pipe_feature_importances_backmapped.csv",
        )
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(
            open(backmapped_feature_importances, "rb"), delimiter=","
        )
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])

    def test_optimum_pipe_predict_and_predict_proba_and_transform(self):
        # find best config and test against sklearn
        self.hyperpipe.elements[-1] = PipelineElement(
            "RandomForestClassifier",
            {"n_estimators": IntegerRange(4, 20, step=2)},
            random_state=42,
        )
        self.hyperpipe.fit(self.__X, self.__y)

        # rebuild the best config as a plain sklearn pipeline, with or without PCA depending on whether it was disabled

        best_config_copy = dict(self.hyperpipe.best_config)
        del best_config_copy["PCA__disabled"]
        if self.hyperpipe.best_config["PCA__disabled"]:
            sk_elements = [
                ("StandardScaler", StandardScaler()),
                ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
            ]
        else:
            sk_elements = [
                ("StandardScaler", StandardScaler()),
                ("PCA", PCA(random_state=42)),
                ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
            ]
        self.sklearn_pipe = SKLPipeline(sk_elements)
        self.sklearn_pipe.set_params(**best_config_copy)
        self.sklearn_pipe.fit(self.__X, self.__y)

        self.assertTrue(
            np.array_equal(
                self.sklearn_pipe.predict(self.__X), self.hyperpipe.predict(self.__X)
            )
        )
        self.assertTrue(
            np.array_equal(
                self.sklearn_pipe.predict_proba(self.__X),
                self.hyperpipe.predict_proba(self.__X),
            )
        )
        # fake transform on sklearn pipe
        step1 = self.sklearn_pipe.named_steps["StandardScaler"].transform(self.__X)
        if "PCA" in self.sklearn_pipe.named_steps:
            step2 = self.sklearn_pipe.named_steps["PCA"].transform(self.__X)
        else:
            step2 = step1
        self.assertTrue(np.array_equal(step2, self.hyperpipe.transform(self.__X)))
Example #7
class ResultsHandlerTest(PhotonBaseTest):
    def setUp(self):
        """
        Set default start settings for all tests.
        """
        super(ResultsHandlerTest, self).setUp()

        self.files = [
            'best_config_predictions.csv', 'time_monitor.csv',
            'time_monitor_pie.png', 'photon_result_file.p',
            'photon_summary.txt', 'photon_best_model.photon',
            'optimum_pipe_feature_importances_backmapped.npz',
            'photon_code.py', 'optimizer_history.png'
        ]

        self.output_settings = OutputSettings(
            project_folder=self.tmp_folder_path, save_output=True)

        self.ss_pipe_element = PipelineElement('StandardScaler')
        self.pca_pipe_element = PipelineElement('PCA',
                                                {'n_components': [1, 2]},
                                                random_state=42)
        self.svc_pipe_element = PipelineElement(
            'SVC',
            {
                'C': [0.1],
                'kernel': ['linear']
            },  # 'rbf', 'sigmoid']
            random_state=42)

        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ["accuracy", 'recall', 'precision']
        self.best_config_metric = "accuracy"
        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=self.metrics,
                                   best_config_metric=self.best_config_metric,
                                   outer_cv=KFold(n_splits=2),
                                   output_settings=self.output_settings,
                                   verbosity=1)
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

        self.hyperpipe.fit(self.__X, self.__y)

    def test_write_convenience_files(self):
        """
        Output creation testing. Only write if output_settings.save_output == True
        """
        for file in self.files:
            self.assertTrue(
                os.path.isfile(
                    os.path.join(self.output_settings.results_folder, file)))

        # correct rows
        with open(
                os.path.join(self.output_settings.results_folder,
                             'best_config_predictions.csv')) as f:
            self.assertEqual(
                sum([
                    outer_fold.number_samples_test
                    for outer_fold in self.hyperpipe.results.outer_folds
                ]),
                sum(1 for _ in f) - 1)

        shutil.rmtree(self.tmp_folder_path, ignore_errors=True)
        self.output_settings = OutputSettings(
            project_folder=self.tmp_folder_path, save_output=False)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertIsNone(self.output_settings.results_folder)

    def test_readable_time_monitor_csv(self):
        """
        Test for only readable time_moitor.csv (right count of columns and pandas import).
        """
        time_monitor_df = pd.read_csv(os.path.join(
            self.output_settings.results_folder, 'time_monitor.csv'),
                                      header=[0, 1])
        self.assertIsInstance(time_monitor_df, pd.DataFrame)
        self.assertEqual(len(time_monitor_df.columns), 10)

    def test_summary(self):
        """
        Check content of photon_summary.txt. Adjustment with hyperpipe.result.
        """
        with open(
                os.path.join(self.output_settings.results_folder,
                             'photon_summary.txt')) as file:
            data = file.read()

        areas = data.split(
            "-------------------------------------------------------------------"
        )

        # first areas
        self.assertEqual(areas[0], "\nPHOTON RESULT SUMMARY\n")

        result_dict = {
            "dummy_test": self.hyperpipe.results.dummy_estimator.test,
            "dummy_train": self.hyperpipe.results.dummy_estimator.train,
            "best_config_train": self.hyperpipe.results.metrics_train,
            "best_config_test": self.hyperpipe.results.metrics_test
        }

        outer_fold_traintest = {}

        key_areas_outer_fold = []
        # all outerfold areas
        for i in range(len(self.hyperpipe.results.outer_folds)):
            self.assertEqual(areas[4 + i * 2],
                             '\nOUTER FOLD ' + str(i + 1) + '\n')
            key_areas_outer_fold.append("outer_fold_" + str(i + 1))
            result_dict["outer_fold_"+str(i+1)+"_train"] = \
                self.hyperpipe.results.outer_folds[i].best_config.best_config_score.training
            outer_fold_traintest["outer_fold_" + str(i + 1) +
                                 "_train"] = "TrainValue"
            result_dict["outer_fold_" + str(i + 1) + "_test"] = \
                self.hyperpipe.results.outer_folds[i].best_config.best_config_score.validation
            outer_fold_traintest["outer_fold_" + str(i + 1) +
                                 "_test"] = "TestValue"

        # check performance / test-train of dummy and best_config
        key_areas = ["entracee", "name", "dummy", "best_config"]
        splitted_areas = {}

        for num in range(len(key_areas)):
            splitted_areas[key_areas[num]] = areas[num].split("\n")

        index_dict = {}
        for key in key_areas[2:]:
            if [perf for perf in splitted_areas[key] if perf == "TEST:"]:
                index_dict[key + "_test"] = splitted_areas[key].index("TEST:")
                index_dict[key +
                           "_train"] = splitted_areas[key].index("TRAINING:")
            else:
                self.assertTrue(False)
            for data_key in [k for k in list(result_dict.keys()) if key in k]:
                table_str = "\n".join([
                    splitted_areas[key][index_dict[data_key] + i]
                    for i in [2, 4, 5, 6]
                ])
                table = pd.read_csv(StringIO(table_str.replace(" ", "")),
                                    sep="|")[["MetricName", "MEAN",
                                              "STD"]].set_index("MetricName")
                for result_metric in result_dict[data_key]:
                    self.assertAlmostEqual(
                        result_metric.value,
                        table[result_metric.operation.split(".")[1]][
                            result_metric.metric_name], 4)

        splitted_areas = {}
        for num in range(len(key_areas_outer_fold)):
            splitted_areas[key_areas_outer_fold[num]] = areas[len(key_areas) +
                                                              1 + num *
                                                              2].split("\n")

        # check all outer_folds
        for key_area_outer_fold in key_areas_outer_fold:
            if [
                    perf for perf in splitted_areas[key_area_outer_fold]
                    if perf == "PERFORMANCE:"
            ]:
                index_dict[key_area_outer_fold + "_train"] = splitted_areas[
                    key_area_outer_fold].index("PERFORMANCE:")
                index_dict[key_area_outer_fold +
                           "_test"] = index_dict[key_area_outer_fold +
                                                 "_train"]
            else:
                self.assertTrue(False)
            for data_key in [
                    k for k in list(result_dict.keys())
                    if key_area_outer_fold in k
            ]:
                table_str = "\n".join([
                    splitted_areas[key_area_outer_fold][index_dict[data_key] +
                                                        i]
                    for i in [2, 4, 5, 6]
                ])
                table = pd.read_csv(StringIO(table_str.replace(" ", "")),
                                    sep="|")[[
                                        "MetricName", "TrainValue", "TestValue"
                                    ]].set_index("MetricName")

                for result_metric in result_dict[data_key].metrics.keys():
                    self.assertAlmostEqual(
                        result_dict[data_key].metrics[result_metric],
                        table[outer_fold_traintest[data_key]][result_metric],
                        4)

    def test_save_backmapping(self):
        """
        Check dimension of feature backmapping equals input dimensions.
        """
        npzfile = np.load(
            os.path.join(self.output_settings.results_folder,
                         'optimum_pipe_feature_importances_backmapped.npz'))

        self.assertEqual(len(npzfile.files), 1)
        result_data = []
        for file in npzfile.files:
            result_data.append(npzfile[file])

        self.assertEqual(np.shape(self.__X)[1], result_data[0].size)

    #  def test_save_backmapping_stack(self):
    #    self.hyperpipe = Hyperpipe('god', inner_cv=self.inner_cv_object,
    #                               metrics=self.metrics,
    #                               best_config_metric=self.best_config_metric,
    #                               outer_cv=KFold(n_splits=2),
    #                               output_settings=self.output_settings,
    #                               verbosity=1)
    #    self.hyperpipe += self.ss_pipe_element
    #    self.stack = Stack("myStack")
    #    self.stack += PipelineElement("MinMaxScaler")
    #    self.stack += self.pca_pipe_element
    #    self.hyperpipe += self.stack
    #    self.hyperpipe.add(self.svc_pipe_element)

    #    self.output_settings.save_output = True
    #    self.hyperpipe.fit(self.__X, self.__y)
    #    picklefile = pickle.load(open(
    #        os.path.join(self.output_settings.results_folder, 'optimum_pipe_feature_importances_backmapped.p'),"rb"))

    #    self.assertEqual(np.shape(self.__X)[1], len(picklefile[0]))

    def pass_through_plots(self):
        """
        Test for plot functions. Only passing test, no quality testing.
        """
        self.assertIsNone(self.hyperpipe.results.plot_optimizer_history())
        self.assertIsNone(self.hyperpipe.results.plot_true_pred())
        self.assertIsNone(self.hyperpipe.results.plot_confusion_matrix())
        self.assertIsNone(self.hyperpipe.results.plot_roc_curve())

    def test_load_from_file(self):
        X, y = load_breast_cancer(return_X_y=True)
        my_pipe = Hyperpipe(
            'load_results_file_test',
            metrics=['accuracy'],
            best_config_metric='accuracy',
            output_settings=OutputSettings(project_folder='./tmp'))
        my_pipe += PipelineElement("StandardScaler")
        my_pipe += PipelineElement("SVC")
        my_pipe.fit(X, y)

        results_file = os.path.join(my_pipe.output_settings.results_folder,
                                    "photon_result_file.p")
        my_result_handler = ResultsHandler()
        my_result_handler.load_from_file(results_file)
        self.assertIsInstance(my_result_handler.results, MDBHyperpipe)

    def test_get_performance_table(self):
        pass

    def test_load_from_mongodb(self):
        pass
Example #8
    class Smac3IntegrationTest(unittest.TestCase):
        def setUp(self):
            self.s_split = ShuffleSplit(n_splits=3,
                                        test_size=0.2,
                                        random_state=42)

            self.time_limit = 20

            settings = OutputSettings(project_folder='./tmp/')

            self.smac_helper = {"data": None, "initial_runs": None}

            # Scenario object
            scenario_dict = {
                "run_obj": "quality",
                "deterministic": "true",
                "wallclock_limit": self.time_limit
            }

            # DESIGN YOUR PIPELINE
            self.pipe = Hyperpipe('basic_svm_pipe',
                                  optimizer='smac',
                                  optimizer_params={
                                      'facade': SMAC4HPO,
                                      'scenario_dict': scenario_dict,
                                      'rng': 42,
                                      'smac_helper': self.smac_helper
                                  },
                                  metrics=['accuracy'],
                                  random_seed=42,
                                  best_config_metric='accuracy',
                                  inner_cv=self.s_split,
                                  verbosity=0,
                                  output_settings=settings)

        def simple_classification(self):
            dataset = fetch_olivetti_faces(download_if_missing=True)
            self.X = dataset["data"]
            self.y = dataset["target"]
            return self.X, self.y

        # integration test for simple pipeline without Switch
        def test_photon_implementation_simple(self):
            # PHOTON implementation
            self.pipe.add(PipelineElement('StandardScaler'))
            self.pipe += PipelineElement(
                'PCA', hyperparameters={'n_components': IntegerRange(5, 30)})
            self.pipe += PipelineElement('SVC',
                                         hyperparameters={
                                             'kernel':
                                             Categorical(["rbf", 'poly']),
                                             'C': FloatRange(0.5, 200)
                                         },
                                         gamma='auto')
            self.X, self.y = self.simple_classification()
            self.pipe.fit(self.X, self.y)

            # direct AUTO ML implementation
            # Build Configuration Space which defines all parameters and their ranges
            cs = ConfigurationSpace()
            n_components = UniformIntegerHyperparameter(
                "PCA__n_components", 5, 30)
            cs.add_hyperparameter(n_components)
            kernel = CategoricalHyperparameter("SVC__kernel", ["rbf", 'poly'])
            cs.add_hyperparameter(kernel)
            c = UniformFloatHyperparameter("SVC__C", 0.5, 200)
            cs.add_hyperparameter(c)

            # Scenario object
            scenario = Scenario({
                "run_obj": "quality",
                "cs": cs,
                "deterministic": "true",
                "wallclock_limit": self.time_limit,
                "limit_resources": False,
                'abort_on_first_run_crash': False
            })

            # Optimize, using a SMAC directly
            smac = SMAC4HPO(scenario=scenario,
                            rng=42,
                            tae_runner=self.objective_function_simple)
            _ = smac.optimize()

            runhistory_photon = self.smac_helper["data"].solver.runhistory
            runhistory_original = smac.solver.runhistory

            x_ax = range(
                1,
                min(len(runhistory_original._cost_per_config.keys()),
                    len(runhistory_photon._cost_per_config.keys())) + 1)
            y_ax_original = [
                runhistory_original._cost_per_config[tmp] for tmp in x_ax
            ]
            y_ax_photon = [
                runhistory_photon._cost_per_config[tmp] for tmp in x_ax
            ]

            y_ax_original_inc = [min(y_ax_original[:tmp + 1]) for tmp in x_ax]
            y_ax_photon_inc = [min(y_ax_photon[:tmp + 1]) for tmp in x_ax]

            plot = False
            if plot:
                plt.figure(figsize=(10, 7))
                plt.plot(x_ax, y_ax_original, 'g', label='Original')
                plt.plot(x_ax, y_ax_photon, 'b', label='PHOTON')
                plt.plot(x_ax, y_ax_photon_inc, 'r', label='PHOTON Incumbent')
                plt.plot(x_ax,
                         y_ax_original_inc,
                         'k',
                         label='Original Incumbent')
                plt.title('Photon Prove')
                plt.xlabel('X')
                plt.ylabel('Y')
                plt.legend(loc='best')
                plt.savefig("smac.png")

            min_len = min(len(y_ax_original), len(y_ax_photon))
            self.assertLessEqual(
                np.max(
                    np.abs(
                        np.array(y_ax_original[:min_len]) -
                        np.array(y_ax_photon[:min_len]))), 0.01)

        def objective_function_simple(self, cfg):
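            # evaluate the SMAC configuration on the same inner CV folds that PHOTON used,
            # so that both run histories are directly comparable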
            cfg = {k: cfg[k] for k in cfg if cfg[k]}
            values = []

            train_indices = list(self.pipe.cross_validation.outer_folds.values(
            ))[0].train_indices
            self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
                self.X, self.y, kwargs=None, indices=train_indices)

            for inner_fold in list(
                    list(self.pipe.cross_validation.inner_folds.values())
                [0].values()):
                sc = PipelineElement("StandardScaler", {})
                pca = PipelineElement("PCA", {}, random_state=42)
                svc = PipelineElement("SVC", {}, random_state=42, gamma='auto')
                my_pipe = PhotonPipeline([('StandardScaler', sc), ('PCA', pca),
                                          ('SVC', svc)])
                my_pipe.set_params(**cfg)
                my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                            self._validation_y[inner_fold.train_indices])
                values.append(
                    accuracy_score(
                        self._validation_y[inner_fold.test_indices],
                        my_pipe.predict(
                            self._validation_X[inner_fold.test_indices, :])))
            return 1 - np.mean(values)

        # integration test for pipeline with Switch
        def test_photon_implementation_switch(self):
            # PHOTON implementation
            self.pipe.add(PipelineElement('StandardScaler'))
            self.pipe += PipelineElement(
                'PCA', hyperparameters={'n_components': IntegerRange(5, 30)})
            estimator_switch = Switch("Estimator")
            estimator_switch += PipelineElement('SVC',
                                                hyperparameters={
                                                    'kernel':
                                                    Categorical(
                                                        ["rbf", 'poly']),
                                                    'C':
                                                    FloatRange(0.5, 200)
                                                },
                                                gamma='auto')
            estimator_switch += PipelineElement('RandomForestClassifier',
                                                hyperparameters={
                                                    'criterion':
                                                    Categorical(
                                                        ['gini', 'entropy']),
                                                    'min_samples_split':
                                                    IntegerRange(2, 4)
                                                })
            self.pipe += estimator_switch
            self.X, self.y = self.simple_classification()
            self.pipe.fit(self.X, self.y)

            # direct AUTO ML implementation

            # Build Configuration Space which defines all parameters and their ranges
            cs = ConfigurationSpace()
            n_components = UniformIntegerHyperparameter(
                "PCA__n_components", 5, 30)
            cs.add_hyperparameter(n_components)

            switch = CategoricalHyperparameter("Estimator_switch",
                                               ['svc', 'rf'])
            cs.add_hyperparameter(switch)

            kernel = CategoricalHyperparameter("SVC__kernel", ["rbf", 'poly'])
            cs.add_hyperparameter(kernel)
            c = UniformFloatHyperparameter("SVC__C", 0.5, 200)
            cs.add_hyperparameter(c)
            use_svc_kernel = InCondition(child=kernel,
                                         parent=switch,
                                         values=["svc"])
            use_svc_c = InCondition(child=c,
                                    parent=switch,
                                    values=["svc"])

            criterion = CategoricalHyperparameter(
                "RandomForestClassifier__criterion", ['gini', 'entropy'])
            cs.add_hyperparameter(criterion)
            minsplit = UniformIntegerHyperparameter(
                "RandomForestClassifier__min_samples_split", 2, 4)
            cs.add_hyperparameter(minsplit)

            use_rf_crit = InCondition(child=criterion,
                                      parent=switch,
                                      values=["rf"])
            use_rf_minsplit = InCondition(child=minsplit,
                                          parent=switch,
                                          values=["rf"])

            cs.add_conditions(
                [use_svc_c, use_svc_kernel, use_rf_crit, use_rf_minsplit])

            # Scenario object
            scenario = Scenario({
                "run_obj": "quality",
                "cs": cs,
                "deterministic": "true",
                "wallclock_limit": self.time_limit,
                "limit_resources": False,
                'abort_on_first_run_crash': False
            })

            # Optimize, using a SMAC directly
            smac = SMAC4HPO(scenario=scenario,
                            rng=42,
                            tae_runner=self.objective_function_switch)
            _ = smac.optimize()

            runhistory_photon = self.smac_helper["data"].solver.runhistory
            runhistory_original = smac.solver.runhistory

            x_ax = range(
                1,
                min(len(runhistory_original._cost_per_config.keys()),
                    len(runhistory_photon._cost_per_config.keys())) + 1)
            y_ax_original = [
                runhistory_original._cost_per_config[tmp] for tmp in x_ax
            ]
            y_ax_photon = [
                runhistory_photon._cost_per_config[tmp] for tmp in x_ax
            ]

            min_len = min(len(y_ax_original), len(y_ax_photon))
            self.assertLessEqual(
                np.max(
                    np.abs(
                        np.array(y_ax_original[:min_len]) -
                        np.array(y_ax_photon[:min_len]))), 0.01)

        def objective_function_switch(self, cfg):
            cfg = {k: cfg[k] for k in cfg if cfg[k]}
            values = []

            train_indices = list(self.pipe.cross_validation.outer_folds.values(
            ))[0].train_indices
            self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
                self.X, self.y, kwargs=None, indices=train_indices)

            switch = cfg["Estimator_switch"]
            del cfg["Estimator_switch"]
            for inner_fold in list(
                    list(self.pipe.cross_validation.inner_folds.values())
                [0].values()):
                sc = PipelineElement("StandardScaler", {})
                pca = PipelineElement("PCA", {}, random_state=42)
                if switch == 'svc':
                    est = PipelineElement("SVC", {},
                                          random_state=42,
                                          gamma='auto')
                    name = 'SVC'
                else:
                    est = PipelineElement("RandomForestClassifier", {},
                                          random_state=42)
                    name = "RandomForestClassifier"
                my_pipe = PhotonPipeline([('StandardScaler', sc), ('PCA', pca),
                                          (name, est)])
                my_pipe.set_params(**cfg)
                my_pipe.fit(self._validation_X[inner_fold.train_indices, :],
                            self._validation_y[inner_fold.train_indices])
                values.append(
                    accuracy_score(
                        self._validation_y[inner_fold.test_indices],
                        my_pipe.predict(
                            self._validation_X[inner_fold.test_indices, :])))
            return 1 - np.mean(values)

        def test_facade(self):
            config_space = ConfigurationSpace()
            n_components = UniformIntegerHyperparameter(
                "PCA__n_components", 5, 30)
            config_space.add_hyperparameter(n_components)
            scenario_dict = {
                "run_obj": "quality",
                "deterministic": "true",
                "cs": config_space,
                "wallclock_limit": 60
            }

            with self.assertRaises(ValueError):
                SMACOptimizer(facade="SMAC4BOO", scenario_dict=scenario_dict)

            with self.assertRaises(ValueError):
                facade = SMAC4BO(scenario=Scenario(scenario_dict))
                SMACOptimizer(facade=facade, scenario_dict=scenario_dict)

            facades = [
                "SMAC4BO", SMAC4BO, "SMAC4AC", SMAC4AC, "SMAC4HPO", SMAC4HPO,
                "BOHB4HPO", BOHB4HPO
            ]
            for facade in facades:
                SMACOptimizer(facade=facade, scenario_dict=scenario_dict)
class HyperpipeTests(PhotonBaseTest):
    def setup_hyperpipe(self, output_settings=None):
        if output_settings is None:
            output_settings = OutputSettings(
                project_folder=self.tmp_folder_path)
        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=self.metrics,
                                   best_config_metric=self.best_config_metric,
                                   output_settings=output_settings,
                                   verbosity=2)
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    @classmethod
    def setUpClass(cls) -> None:
        cls.file = __file__
        super(HyperpipeTests, cls).setUpClass()

    def setUp(self):

        super(HyperpipeTests, self).setUp()
        self.ss_pipe_element = PipelineElement('StandardScaler')
        self.pca_pipe_element = PipelineElement('PCA',
                                                {'n_components': [1, 2]},
                                                random_state=42,
                                                test_disabled=True)
        self.svc_pipe_element = PipelineElement(
            'SVC',
            {
                'C': [0.1, 1],
                'kernel': ['linear']
            },  # 'rbf', 'sigmoid']
            random_state=42)

        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ["accuracy", 'recall', 'precision']
        self.best_config_metric = "accuracy"
        self.setup_hyperpipe()

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

    def test_init(self):
        # test that all init parameters can be retrieved via the cleaned up subclasses
        self.assertEqual(self.hyperpipe.name, 'god')

        # if no information is given, check for the default parameters; otherwise check for the values given in setUp
        # Cross Validation
        self.assertIsNotNone(self.hyperpipe.cross_validation)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv,
                         self.inner_cv_object)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_cv)
        self.assertTrue(self.hyperpipe.cross_validation.eval_final_performance)
        self.assertTrue(
            self.hyperpipe.cross_validation.calculate_metrics_per_fold)
        self.assertFalse(
            self.hyperpipe.cross_validation.calculate_metrics_across_folds)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_folds)
        self.assertDictEqual(self.hyperpipe.cross_validation.inner_folds, {})

        # Optimization
        self.assertIsNotNone(self.hyperpipe.optimization)
        self.assertListEqual(self.hyperpipe.optimization.metrics, self.metrics)
        self.assertEqual(self.hyperpipe.optimization.best_config_metric,
                         self.best_config_metric)
        self.assertEqual(self.hyperpipe.optimization.optimizer_input_str,
                         "grid_search")
        self.assertTrue(self.hyperpipe.optimization.maximize_metric)
        self.assertIsNone(self.hyperpipe.optimization.performance_constraints)
        self.assertDictEqual(self.hyperpipe.optimization.optimizer_params, {})

    def test_add(self):
        # assure pipeline has three elements: first the scaler, then the pca, then the svc
        self.assertEqual(len(self.hyperpipe.elements), 3)
        self.assertIs(self.hyperpipe.elements[0], self.ss_pipe_element)
        self.assertIs(self.hyperpipe.elements[1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe.elements[2], self.svc_pipe_element)
        # todo : assure that no two elements can be added with the same name

        # test add method special cases
        with self.assertRaises(TypeError):
            self.hyperpipe.add(object())

        # assure that preprocessing is identified and stored in the dedicated attribute; there can only be one preprocessing item
        my_preproc = Preprocessing()
        self.hyperpipe.add(my_preproc)
        self.assertEqual(my_preproc, self.hyperpipe.preprocessing)
        # make sure the element does not end up in the main pipeline
        self.assertTrue(
            all(item is not my_preproc for item in self.hyperpipe.elements))

        def my_func(X, y, **kwargs):
            return True

        # test adding callback item
        my_call_back_item = CallbackElement('test_element', my_func, 'predict')
        self.hyperpipe.add(my_call_back_item)
        self.assertIs(self.hyperpipe.elements[-1], my_call_back_item)

    def test_sanity(self):
        # make sure that passing no metrics raises an error
        with self.assertRaises(ValueError):
            hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object)

        # make sure that if no best config metric is given, PHOTON raises a warning
        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe("hp_name",
                                  inner_cv=self.inner_cv_object,
                                  metrics=["accuracy", "f1_score"])

        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe("hp_name",
                                  inner_cv=self.inner_cv_object,
                                  best_config_metric=["accuracy", "f1_score"])

        with self.assertRaises(NotImplementedError):
            hyperpipe = Hyperpipe("hp_name",
                                  inner_cv=self.inner_cv_object,
                                  best_config_metric='accuracy',
                                  metrics=["accuracy"],
                                  calculate_metrics_across_folds=False,
                                  calculate_metrics_per_fold=False)

        with self.assertRaises(AttributeError):
            hyperpipe = Hyperpipe("hp_name",
                                  best_config_metric='accuracy',
                                  metrics=["accuracy"])

        data = np.random.random((500, 50))

        with self.assertRaises(ValueError):
            targets = np.random.randint(0, 1, (500, 2))
            self.hyperpipe.fit(data, targets)

    def test_hyperpipe_with_custom_metric(self):
        def custom_metric(y_true, y_pred):
            return 99.9

        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=[('custom_metric', custom_metric),
                                            'accuracy'],
                                   best_config_metric=Accuracy,
                                   output_settings=OutputSettings(
                                       project_folder=self.tmp_folder_path))
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue('custom_metric' in self.hyperpipe.results.best_config.
                        best_config_score.validation.metrics)
        self.assertEqual(
            self.hyperpipe.results.best_config.best_config_score.validation.
            metrics['custom_metric'], 99.9)

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # one: accuracy, two: custom metric registered as "custom_metric", three: keras Metric registered as function
        self.assertEqual(expected_num_of_metrics, 3)

        # dummy average values
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.train),
                        expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.test),
                        expected_num_of_metrics)

        # overall average values
        self.assertTrue(len(self.hyperpipe.results.metrics_train),
                        2 * expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.metrics_test),
                        2 * expected_num_of_metrics)

    def test_preprocessing(self):

        prepro_pipe = Preprocessing()
        prepro_pipe += PipelineElement.create("dummy",
                                              DummyYAndCovariatesTransformer(),
                                              {})

        self.hyperpipe += prepro_pipe
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue(np.array_equal(self.__y + 1, self.hyperpipe.data.y))

    def test_estimation_type(self):
        def callback(X, y=None, **kwargs):
            pass

        pipe = Hyperpipe('name',
                         inner_cv=KFold(n_splits=2),
                         best_config_metric='mean_squared_error')
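        # estimation_type is derived from the pipeline's last element:
        # a transformer raises, a classifier yields 'classifier', a regressor yields 'regressor'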

        with self.assertRaises(NotImplementedError):
            pipe += PipelineElement('PCA')
            est_type = pipe.estimation_type

        pipe += PipelineElement('SVC')
        self.assertEqual(pipe.estimation_type, 'classifier')

        pipe.elements[-1] = PipelineElement('SVR')
        self.assertEqual(pipe.estimation_type, 'regressor')

        with self.assertRaises(NotImplementedError):
            pipe.elements[-1] = CallbackElement('MyCallback', callback)
            est_type = pipe.estimation_type

    def test_copy_me(self):
        self.maxDiff = None
        copy = self.hyperpipe.copy_me()
        copy2 = self.hyperpipe.copy_me()
        self.assertDictEqual(elements_to_dict(copy),
                             elements_to_dict(self.hyperpipe))

        copy_after_fit = self.hyperpipe.fit(self.__X, self.__y).copy_me()

        copy_after_fit = elements_to_dict(copy_after_fit)
        # the current_configs of the elements are not None after calling fit() on a hyperpipe
        # when copying the respective PipelineElement, these current_configs are copied, too
        # this is why we need to delete _pipe and elements before asserting for equality
        copy_after_fit['_pipe'] = None
        copy_after_fit['elements'] = None
        copy = elements_to_dict(copy)
        copy['_pipe'] = None
        copy['elements'] = None
        self.assertDictEqual(copy, copy_after_fit)

        # check if deepcopy worked
        copy2.cross_validation.inner_cv.n_splits = 10
        self.assertEqual(copy2.cross_validation.inner_cv.n_splits, 10)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv.n_splits, 3)

    def test_save_optimum_pipe(self):
        tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
        settings = OutputSettings(project_folder=tmp_path,
                                  overwrite_results=True)

        my_pipe = Hyperpipe('hyperpipe',
                            optimizer='random_grid_search',
                            optimizer_params={'n_configurations': 3},
                            metrics=['accuracy', 'precision', 'recall'],
                            best_config_metric='f1_score',
                            outer_cv=KFold(n_splits=2),
                            inner_cv=KFold(n_splits=2),
                            verbosity=1,
                            output_settings=settings)

        preproc = Preprocessing()
        preproc += PipelineElement('StandardScaler')

        # BRANCH WITH QUANTILETRANSFORMER AND DECISIONTREECLASSIFIER
        tree_qua_branch = Branch('tree_branch')
        tree_qua_branch += PipelineElement('QuantileTransformer')
        tree_qua_branch += PipelineElement(
            'DecisionTreeClassifier',
            {'min_samples_split': IntegerRange(2, 4)},
            criterion='gini')

        # BRANCH WITH MinMaxScaler AND SVC
        svm_mima_branch = Branch('svm_branch')
        svm_mima_branch += PipelineElement('MinMaxScaler')
        svm_mima_branch += PipelineElement(
            'SVC', {
                'kernel': Categorical(['rbf', 'linear']),
                'C': 2.0
            },
            gamma='auto')

        # BRANCH WITH DummyTransformer AND KNeighborsClassifier
        knn_sta_branch = Branch('neighbour_branch')
        knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(),
                                                 {})
        knn_sta_branch += PipelineElement('KNeighborsClassifier')

        my_pipe += preproc
        # stack the three branches so that each contributes its result to the final estimator
        my_pipe += Stack('final_stack',
                         [tree_qua_branch, svm_mima_branch, knn_sta_branch])

        my_pipe += PipelineElement('LogisticRegression', solver='lbfgs')

        my_pipe.fit(self.__X, self.__y)
        model_path = os.path.join(my_pipe.output_settings.results_folder,
                                  'photon_best_model.photon')
        self.assertTrue(os.path.exists(model_path))

        # now move optimum pipe to new folder
        test_folder = os.path.join(my_pipe.output_settings.results_folder,
                                   'new_test_folder')
        new_model_path = os.path.join(test_folder, 'photon_best_model.photon')
        os.makedirs(test_folder)
        shutil.copyfile(model_path, new_model_path)

        # check if load_optimum_pipe also works
        # check if we have the meta information recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)
        self.assertIsNotNone(
            loaded_optimum_pipe._meta_information['photon_version'])

        # check if predictions stay reliably the same
        y_pred_loaded = loaded_optimum_pipe.predict(self.__X)
        y_pred = my_pipe.optimum_pipe.predict(self.__X)
        np.testing.assert_array_equal(y_pred_loaded, y_pred)

    def test_save_optimum_pipe_custom_element(self):
        tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
        settings = OutputSettings(project_folder=tmp_path,
                                  overwrite_results=True)

        my_pipe = Hyperpipe('hyperpipe',
                            optimizer='random_grid_search',
                            optimizer_params={'n_configurations': 1},
                            metrics=['accuracy', 'precision', 'recall'],
                            best_config_metric='f1_score',
                            outer_cv=KFold(n_splits=2),
                            inner_cv=KFold(n_splits=2),
                            verbosity=1,
                            output_settings=settings)
        my_pipe += PipelineElement('KerasDnnClassifier', {},
                                   epochs=1,
                                   hidden_layer_sizes=[5])
        my_pipe.fit(self.__X, self.__y)
        model_path = os.path.join(my_pipe.output_settings.results_folder,
                                  'photon_best_model.photon')
        self.assertTrue(os.path.exists(model_path))

        # check if load_optimum_pipe also works
        # check if we have the meta information recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)

    def test_failure_to_save_optimum_pipe(self):
        tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
        settings = OutputSettings(project_folder=tmp_path,
                                  overwrite_results=True)

        my_pipe = Hyperpipe('hyperpipe',
                            optimizer='random_grid_search',
                            optimizer_params={'n_configurations': 1},
                            metrics=['accuracy', 'precision', 'recall'],
                            best_config_metric='f1_score',
                            outer_cv=KFold(n_splits=2),
                            inner_cv=KFold(n_splits=2),
                            verbosity=1,
                            output_settings=settings)
        my_pipe += PipelineElement('KNeighborsClassifier')
        my_pipe.fit(self.__X, self.__y)
        model_path = os.path.join(my_pipe.output_settings.results_folder,
                                  'photon_best_model_wrong_path.photon')
        with self.assertRaises(FileNotFoundError):
            Hyperpipe.load_optimum_pipe(model_path)

    def test_overwrite_result_folder(self):
        """
        Test for right handling of parameter output_settings.overwrite.
        """
        def get_summary_file():
            return os.path.join(self.hyperpipe.output_settings.results_folder,
                                'photon_summary.txt')

        # Case 1: default
        output_settings1 = OutputSettings(project_folder=self.tmp_folder_path,
                                          save_output=True,
                                          overwrite_results=False)
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()

        time.sleep(2)

        # again with same settings
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()

        # we expect a new output folder each time with timestamp
        self.assertNotEqual(tmp_path, tmp_path2)

        # Case 2 overwrite results: all in the same folder
        output_settings2 = OutputSettings(project_folder=self.tmp_folder_path,
                                          save_output=True,
                                          overwrite_results=True)
        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()
        tmp_date = os.path.getmtime(tmp_path)

        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()
        tmp_date2 = os.path.getmtime(tmp_path2)

        # same folder but summary file is overwritten through the new analysis
        self.assertEqual(tmp_path, tmp_path2)
        self.assertNotEqual(tmp_date, tmp_date2)

        # Case 3: we have a cache folder
        self.hyperpipe.cache_folder = self.cache_folder_path
        shutil.rmtree(self.cache_folder_path, ignore_errors=True)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertTrue(os.path.exists(self.cache_folder_path))

    def test_random_state(self):
        self.hyperpipe.random_state = 4567
        self.hyperpipe.fit(self.__X, self.__y)
        # assure the random state is propagated to every element
        self.assertEqual(self.hyperpipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe.optimum_pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.elements[-1][-1].random_state,
                         4567)
        self.assertEqual(
            self.hyperpipe._pipe.elements[-1][-1].base_element.random_state,
            4567)

    def test_dummy_estimator_preparation(self):

        self.hyperpipe.results = MDBHyperpipe()
        self.hyperpipe.results.dummy_estimator = dummy_estimator = MDBDummyResults()
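        # _prepare_dummy_estimator is expected to return a DummyClassifier or a
        # DummyRegressor, depending on the estimation type of the last element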

        # one time classifier, one time regressor, one time a non-estimator object
        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement('SVC'))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyClassifier))

        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement('SVR'))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyRegressor))

        with self.assertRaises(NotImplementedError):
            self.hyperpipe.elements = list()
            self.hyperpipe.add(PipelineElement('PCA'))
            dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
            self.assertIsNone(dummy_estimator)

    def setup_crazy_pipe(self):
        # erase all, we need a complex and crazy task
        self.hyperpipe.elements = list()

        nmb_list = list()
        for i in range(5):
            nmb = ParallelBranch(name=str(i), nr_of_processes=i + 3)
            sp = PipelineElement(
                'PCA', hyperparameters={'n_components': IntegerRange(1, 50)})
            nmb += sp
            nmb_list.append(nmb)
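        # the first two parallel branches go into a switch, the remaining three
        # are each wrapped (together with a StandardScaler) into a branch and stacked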

        my_switch = Switch('disabling_test_switch')
        my_switch += nmb_list[0]
        my_switch += nmb_list[1]

        my_stack = Stack('stack_of_branches')
        for i in range(3):
            my_branch = Branch('branch_' + str(i + 2))
            my_branch += PipelineElement('StandardScaler')
            my_branch += nmb_list[i + 2]
            my_stack += my_branch

        self.hyperpipe.add(my_stack)
        self.hyperpipe.add(PipelineElement('StandardScaler'))
        self.hyperpipe.add(my_switch)
        self.hyperpipe.add(PipelineElement('SVC'))
        return nmb_list

    def test_recursive_disabling(self):
        list_of_elements_to_detect = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(
            list_of_elements_to_detect)
        Hyperpipe.disable_multiprocessing_recursively(self.hyperpipe._pipe)
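        # after disabling, every parallel element should be limited to a single process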
        self.assertTrue(
            all(i.nr_of_processes == 1 for i in list_of_elements_to_detect))

    def test_recursive_cache_folder_propagation(self):
        list_of_elements = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(
            self.hyperpipe.elements)
        self.hyperpipe.recursive_cache_folder_propagation(
            self.hyperpipe._pipe, self.cache_folder_path, 'fold_id_123')
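        # parallel branches that sit inside a stacked Branch get a nested cache
        # folder (cache/branch_<name>/<name>); the ones inside the switch use
        # the cache root directly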
        for i, nmbranch in enumerate(list_of_elements):
            if i > 1:
                start_folder = os.path.join(self.cache_folder_path,
                                            'branch_' + nmbranch.name)
            else:
                start_folder = self.cache_folder_path
            expected_folder = os.path.join(start_folder, nmbranch.name)
            self.assertEqual(nmbranch.base_element.cache_folder,
                             expected_folder)

    def test_prepare_result_logging(self):
        # test that a results object is created and contains the hyperpipe infos
        self.hyperpipe.data.X = self.__X
        self.hyperpipe.data.y = self.__y
        self.hyperpipe._prepare_result_logging(datetime.datetime.now())
        self.assertTrue(isinstance(self.hyperpipe.results, MDBHyperpipe))
        self.assertTrue(
            isinstance(self.hyperpipe.results_handler, ResultsHandler))
        self.assertTrue(len(self.hyperpipe.results.outer_folds) == 0)

    def test_finalize_optimization(self):
        # this is difficult to test directly, so we fake it
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all infos
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now generate infos again
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.train),
                        expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.dummy_estimator.test),
                        expected_num_of_metrics)
        # overall average values
        self.assertTrue(len(self.hyperpipe.results.metrics_train),
                        2 * expected_num_of_metrics)
        self.assertTrue(len(self.hyperpipe.results.metrics_test),
                        2 * expected_num_of_metrics)
        # find best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(self.hyperpipe.best_config,
                         self.hyperpipe.results.best_config.config_dict)
        # set optimum pipe and params (todo: test add preprocessing)
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"])
        # save optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(self.hyperpipe.output_settings.results_folder,
                             'photon_best_model.photon')))

        # backmapping
        # because the PCA is test_disabled, the backmapped feature importances
        # are expected to cover all original input features
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1])
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            'optimum_pipe_feature_importances_backmapped.csv')
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(open(backmapped_feature_importances, 'rb'),
                                  delimiter=",")
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])

    def test_optimum_pipe_predict_and_predict_proba_and_transform(self):
        # find best config and test against sklearn
        self.hyperpipe.elements[-1] = PipelineElement(
            'RandomForestClassifier',
            {'n_estimators': IntegerRange(4, 20, step=2)},
            random_state=42)
        self.hyperpipe.fit(self.__X, self.__y)

        # rebuild the best config as an equivalent sklearn pipeline
        # (with or without PCA, depending on whether it was disabled)

        best_config_copy = dict(self.hyperpipe.best_config)
        del best_config_copy["PCA__disabled"]
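        # the 'disabled' flag is PHOTON-specific and cannot be passed to sklearn's set_params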
        if self.hyperpipe.best_config["PCA__disabled"]:
            sk_elements = [('StandardScaler', StandardScaler()),
                           ('RandomForestClassifier',
                            RandomForestClassifier(random_state=42))]
        else:
            sk_elements = [('StandardScaler', StandardScaler()),
                           ('PCA', PCA(random_state=42)),
                           ('RandomForestClassifier',
                            RandomForestClassifier(random_state=42))]
        self.sklearn_pipe = SKLPipeline(sk_elements)
        self.sklearn_pipe.set_params(**best_config_copy)
        self.sklearn_pipe.fit(self.__X, self.__y)

        self.assertTrue(
            np.array_equal(self.sklearn_pipe.predict(self.__X),
                           self.hyperpipe.predict(self.__X)))
        self.assertTrue(
            np.array_equal(self.sklearn_pipe.predict_proba(self.__X),
                           self.hyperpipe.predict_proba(self.__X)))
        # manually reproduce the transform steps of the sklearn pipe
        step1 = self.sklearn_pipe.named_steps["StandardScaler"].transform(
            self.__X)
        if "PCA" in self.sklearn_pipe.named_steps:
            step2 = self.sklearn_pipe.named_steps["PCA"].transform(self.__X)
        else:
            step2 = step1
        self.assertTrue(np.allclose(step2, self.hyperpipe.transform(self.__X)))