def test_save_optimum_pipe_custom_element(self):
    tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
    settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)
    my_pipe = Hyperpipe('hyperpipe',
                        optimizer='random_grid_search',
                        optimizer_params={'n_configurations': 1},
                        metrics=['accuracy', 'precision', 'recall'],
                        best_config_metric='f1_score',
                        outer_cv=KFold(n_splits=2),
                        inner_cv=KFold(n_splits=2),
                        verbosity=1,
                        output_settings=settings)
    my_pipe += PipelineElement('KerasDnnClassifier', {}, epochs=1, hidden_layer_sizes=[5])
    my_pipe.fit(self.__X, self.__y)

    model_path = os.path.join(my_pipe.output_settings.results_folder,
                              'photon_best_model.photon')
    self.assertTrue(os.path.exists(model_path))

    # check that load_optimum_pipe works and that the meta information is recovered
    loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(model_path)
    self.assertIsNotNone(loaded_optimum_pipe._meta_information)
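# For context, the save/load round trip exercised above as a standalone sketch.
# This is a minimal illustration, not part of the test suite: `model_file` and
# `X_new` are hypothetical placeholders; only calls already used in these tests
# (Hyperpipe.load_optimum_pipe, .predict) are assumed to exist.
def load_and_predict_sketch(model_file, X_new):
    loaded_pipe = Hyperpipe.load_optimum_pipe(model_file)  # restore the persisted optimum pipe
    return loaded_pipe.predict(X_new)                      # predict with the reloaded model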
def test_register_element(self):
    with self.assertRaises(ValueError):
        self.registry.register('MyCustomEstimator',
                               'custom_estimator.CustomEstimator',
                               'WrongType')

    self.registry.register('MyCustomEstimator',
                           'custom_estimator.CustomEstimator',
                           'Estimator')
    self.registry.activate()
    settings = OutputSettings(save_output=False, project_folder='./tmp/')

    # DESIGN YOUR PIPELINE
    pipe = Hyperpipe('custom_estimator_pipe',
                     optimizer='random_grid_search',
                     optimizer_params={'n_configurations': 2},
                     metrics=['accuracy', 'precision', 'recall', 'balanced_accuracy'],
                     best_config_metric='accuracy',
                     outer_cv=KFold(n_splits=2),
                     inner_cv=KFold(n_splits=2),
                     verbosity=1,
                     output_settings=settings)
    pipe += PipelineElement('MyCustomEstimator')
    pipe.fit(np.random.randn(30, 30), np.random.randint(0, 2, 30))

    self.registry.delete('MyCustomEstimator')
    os.remove(os.path.join(self.custom_folder, 'CustomElements.json'))
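# The registry lifecycle used above, condensed into one hedged sketch. The
# element name and 'module.ClassName' path are hypothetical; register() takes
# the element name, its import path, and the element type ('Estimator' here,
# while an unknown type string raises a ValueError, as the test asserts).
def registry_lifecycle_sketch(registry):
    registry.register('MyCustomEstimator', 'custom_estimator.CustomEstimator', 'Estimator')
    registry.activate()  # make the element resolvable as PipelineElement('MyCustomEstimator')
    # ... build and fit a Hyperpipe that uses the custom element here ...
    registry.delete('MyCustomEstimator')  # clean up the registration afterwards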
def test_huge_combinations(self):
    hp = Hyperpipe("huge_combinations",
                   metrics=["accuracy"],
                   best_config_metric="accuracy",
                   output_settings=OutputSettings(project_folder=self.tmp_folder_path))
    hp += PipelineElement("PCA", hyperparameters={"n_components": [5, 10]})
    stack = Stack("ensemble")
    for i in range(20):
        stack += PipelineElement("SVC",
                                 hyperparameters={"C": FloatRange(0.001, 5),
                                                  # "poly" is sklearn's name for the polynomial kernel
                                                  "kernel": ["linear", "rbf", "sigmoid", "poly"]})
    hp += stack
    hp += PipelineElement("SVC", hyperparameters={"kernel": ["linear", "rbf", "sigmoid"]})
    X, y = load_breast_cancer(return_X_y=True)
    # the sheer number of grid combinations should make PHOTON warn instead of fitting
    with self.assertRaises(Warning):
        hp.fit(X, y)
def test_one_hyperpipe(learning_curves, learning_curves_cut):
    if learning_curves and learning_curves_cut is None:
        learning_curves_cut = FloatRange(0, 1, 'range', 0.2)
    output_settings = OutputSettings(project_folder=self.tmp_folder_path,
                                     save_output=False)
    test_hyperpipe = Hyperpipe('test_pipe',
                               learning_curves=learning_curves,
                               learning_curves_cut=learning_curves_cut,
                               metrics=['accuracy', 'recall', 'specificity'],
                               best_config_metric='accuracy',
                               inner_cv=self.inner_cv,
                               output_settings=output_settings)

    self.assertEqual(test_hyperpipe.cross_validation.learning_curves, learning_curves)
    if learning_curves:
        self.assertEqual(test_hyperpipe.cross_validation.learning_curves_cut,
                         learning_curves_cut)
    else:
        self.assertIsNone(test_hyperpipe.cross_validation.learning_curves_cut)

    test_hyperpipe += PipelineElement('StandardScaler')
    test_hyperpipe += PipelineElement('PCA', {'n_components': [1, 2]}, random_state=42)
    test_hyperpipe += PipelineElement('SVC', {'C': [0.1], 'kernel': ['linear']},
                                      random_state=42)
    test_hyperpipe.fit(self.X, self.y)

    config_results = test_hyperpipe.results_handler.results.outer_folds[0].tested_config_list
    config_num = len(config_results)
    for config_nr in range(config_num):
        for inner_fold_nr in range(self.inner_cv.n_splits):
            curves = config_results[config_nr].inner_folds[inner_fold_nr].learning_curves
            if learning_curves:
                self.assertEqual(len(curves), len(learning_curves_cut.values))
                for learning_point_nr in range(len(learning_curves_cut.values)):
                    test_metrics = list(curves[learning_point_nr][1].keys())
                    train_metrics = list(curves[learning_point_nr][2].keys())
                    self.assertEqual(test_hyperpipe.optimization.metrics, test_metrics)
                    self.assertEqual(test_hyperpipe.optimization.metrics, train_metrics)
            else:
                self.assertEqual(curves, [])
def test_class_with_data_01(self):
    """Test for a simple pipeline with data."""
    X, y = load_breast_cancer(return_X_y=True)

    # DESIGN YOUR PIPELINE
    my_pipe = Hyperpipe(
        "basic_svm_pipe",
        optimizer="grid_search",
        metrics=["accuracy", "precision", "recall", "balanced_accuracy"],
        best_config_metric="accuracy",
        eval_final_performance=False,
        outer_cv=KFold(n_splits=2),
        inner_cv=KFold(n_splits=3),
        verbosity=1,
        random_seed=42,
    )

    preprocessing = Preprocessing()
    preprocessing += PipelineElement("LabelEncoder")
    my_pipe += preprocessing

    # ADD ELEMENTS TO YOUR PIPELINE
    # first normalize all features
    my_pipe.add(PipelineElement("StandardScaler"))
    # then do feature selection using a PCA
    my_pipe += PipelineElement(
        "PCA",
        hyperparameters={"n_components": IntegerRange(10, 12)},
        test_disabled=True,
    )
    # engage and optimize the good old SVM for classification
    my_pipe += PipelineElement(
        "SVC",
        hyperparameters={"kernel": Categorical(["rbf", "linear"])},
        C=2,
        gamma="scale",
    )

    # NOW TRAIN YOUR PIPELINE
    my_pipe.fit(X, y)

    json_transformer = JsonTransformer()
    pipe_json = json_transformer.create_json(my_pipe)
    my_pipe_reload = json_transformer.from_json(pipe_json)
    # serialize the reloaded pipe separately so the equality check is meaningful
    pipe_json_reload = json_transformer.create_json(my_pipe_reload)
    self.assertEqual(pipe_json, pipe_json_reload)

    my_pipe_reload.fit(X, y)
    self.assertDictEqual(my_pipe.best_config, my_pipe_reload.best_config)
def test_shall_continue(self):
    X, y = load_boston(return_X_y=True)
    inner_fold_length = 7

    # DESIGN YOUR PIPELINE
    my_pipe = Hyperpipe(name='performance_pipe',
                        optimizer='random_search',
                        optimizer_params={'limit_in_minutes': 2},
                        metrics=['mean_squared_error'],
                        best_config_metric='mean_squared_error',
                        inner_cv=KFold(n_splits=inner_fold_length),
                        eval_final_performance=True,
                        output_settings=OutputSettings(project_folder='./tmp'),
                        performance_constraints=[self.constraint_object])
    my_pipe += PipelineElement('StandardScaler')
    my_pipe += PipelineElement('RandomForestRegressor',
                               hyperparameters={'n_estimators': IntegerRange(5, 50)})

    # NOW TRAIN YOUR PIPELINE
    my_pipe.fit(X, y)

    # collect the per-fold validation errors of every tested config
    results = my_pipe.results.outer_folds[0].tested_config_list
    configs = []
    for i in range(len(results)):
        configs.append([x.validation.metrics['mean_squared_error']
                        for x in results[i].inner_folds])

    # the first ten configs establish the performance threshold
    threshold = np.inf
    for val in configs[:10]:
        challenger = np.mean(val)
        if threshold > challenger:
            threshold = challenger

    originals_for_std = configs[:10]
    for i, val in enumerate(configs[10:]):
        std = np.mean([np.std(x) for x in originals_for_std])
        for j, v in enumerate(val):
            if np.mean(val[:j + 1]) > threshold + std:
                # a config above the threshold must have been stopped early,
                # i.e. v is the last recorded fold value
                self.assertEqual(v, val[-1])
                continue
        if len(val) == inner_fold_length - 1 and np.mean(val) < threshold + std:
            threshold = np.mean(val)
        if len(val) > 1:
            originals_for_std.append(val)
def test_inverse_transform(self):
    settings = OutputSettings(project_folder=self.tmp_folder_path,
                              overwrite_results=True)

    # DESIGN YOUR PIPELINE
    pipe = Hyperpipe("Limbic_System",
                     optimizer="grid_search",
                     metrics=["mean_absolute_error"],
                     best_config_metric="mean_absolute_error",
                     outer_cv=ShuffleSplit(n_splits=1, test_size=0.2),
                     inner_cv=ShuffleSplit(n_splits=1, test_size=0.2),
                     verbosity=2,
                     cache_folder=self.cache_folder_path,
                     eval_final_performance=True,
                     output_settings=settings)

    # PICK AN ATLAS
    atlas = PipelineElement("BrainAtlas",
                            rois=["Hippocampus_L", "Amygdala_L"],
                            atlas_name="AAL",
                            extract_mode="vec",
                            batch_size=20)

    # EITHER ADD A NEURO BRANCH OR THE ATLAS ITSELF
    neuro_branch = NeuroBranch("NeuroBranch")
    neuro_branch += atlas
    pipe += neuro_branch

    pipe += PipelineElement("LinearSVR")
    pipe.fit(self.X, self.y)

    # GET IMPORTANCE SCORES
    handler = ResultsHandler(pipe.results)
    importance_scores_optimum_pipe = handler.results.best_config_feature_importances

    manual_img, _, _ = pipe.optimum_pipe.inverse_transform(
        importance_scores_optimum_pipe, None)
    img = image.load_img(os.path.join(
        self.tmp_folder_path,
        "Limbic_System_results/optimum_pipe_feature_importances_backmapped.nii.gz"))
    self.assertTrue(np.array_equal(manual_img.get_data(), img.get_data()))
def test_huge_combinations(self):
    hp = Hyperpipe('huge_combinations',
                   inner_cv=KFold(n_splits=3),
                   metrics=['accuracy'],
                   best_config_metric='accuracy',
                   output_settings=OutputSettings(project_folder=self.tmp_folder_path))
    hp += PipelineElement("PCA", hyperparameters={'n_components': [5, 10]})
    stack = Stack('ensemble')
    for i in range(20):
        stack += PipelineElement('SVC',
                                 hyperparameters={'C': FloatRange(0.001, 5),
                                                  'kernel': ["linear", "rbf", "sigmoid", "poly"]})
    hp += stack
    hp += PipelineElement("SVC", hyperparameters={'kernel': ["linear", "rbf", "sigmoid"]})
    X, y = load_breast_cancer(return_X_y=True)
    with self.assertRaises(Warning):
        hp.fit(X, y)
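# Rough intuition for the expected Warning above (an assumption about why
# PHOTON refuses the search space, not its exact safeguard formula): grid
# configurations multiply across the 20 stacked SVCs, so even a modest
# per-element grid explodes combinatorially.
n_grid_points_c = 10                       # assumed discretization of FloatRange(0.001, 5)
n_kernels = 4
n_configs_per_svc = n_grid_points_c * n_kernels
n_stack_configs = n_configs_per_svc ** 20  # ~10**32 combinations for the stack alone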
class CachedHyperpipeTests(PhotonBaseTest):

    @classmethod
    def setUpClass(cls) -> None:
        cls.file = __file__
        super(CachedHyperpipeTests, cls).setUpClass()

    def setUp(self) -> None:
        super(CachedHyperpipeTests, self).setUp()
        m = MakeSomeStupidNoiseMatrices()
        self.X = m(folder=self.tmp_folder_path)
        self.y = np.random.randn(len(self.X))
        self.nr_of_expected_pickles_per_config = len(self.X)

    def test_neuro_hyperpipe_parallelized_batched_caching(self):
        cache_path = self.cache_folder_path
        self.hyperpipe = Hyperpipe('complex_case',
                                   inner_cv=KFold(n_splits=5),
                                   outer_cv=KFold(n_splits=3),
                                   optimizer='grid_search',
                                   cache_folder=cache_path,
                                   metrics=['mean_squared_error'],
                                   best_config_metric='mean_squared_error',
                                   output_settings=OutputSettings(
                                       project_folder=self.tmp_folder_path))

        nb = ParallelBranch("SubjectCaching", nr_of_processes=1)
        nb += PipelineElement.create("ResampleImages", StupidAdditionTransformer(),
                                     {'voxel_size': [3, 5, 10]}, batch_size=4)
        self.hyperpipe += nb
        self.hyperpipe += PipelineElement("StandardScaler", {})
        self.hyperpipe += PipelineElement("PCA", {'n_components': [3, 4]})
        self.hyperpipe += PipelineElement("SVR", {'kernel': ['rbf', 'linear']})

        self.hyperpipe.fit(self.X, self.y)

        # assert the cache is empty again
        nr_of_p_files = len(glob.glob(os.path.join(self.hyperpipe.cache_folder, "*.p")))
        self.assertEqual(nr_of_p_files, 0)
def test_failure_to_save_optimum_pipe(self):
    tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
    settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)
    my_pipe = Hyperpipe('hyperpipe',
                        optimizer='random_grid_search',
                        optimizer_params={'n_configurations': 1},
                        metrics=['accuracy', 'precision', 'recall'],
                        best_config_metric='f1_score',
                        outer_cv=KFold(n_splits=2),
                        inner_cv=KFold(n_splits=2),
                        verbosity=1,
                        output_settings=settings)
    my_pipe += PipelineElement('KNeighborsClassifier')
    my_pipe.fit(self.__X, self.__y)

    model_path = os.path.join(my_pipe.output_settings.results_folder,
                              'photon_best_model_wrong_path.photon')
    with self.assertRaises(FileNotFoundError):
        Hyperpipe.load_optimum_pipe(model_path)
class GridSearchOptimizerTest(unittest.TestCase):

    def setUp(self):
        """Set up for GridSearchTest."""
        self.pipeline_elements = [PipelineElement("StandardScaler"),
                                  PipelineElement('PCA',
                                                  hyperparameters={'n_components': IntegerRange(5, 20)}),
                                  PipelineElement("SVC")]
        self.optimizer = GridSearchOptimizer()
        self.optimizer_name = 'grid_search'

    def create_hyperpipe(self):
        self.hyperpipe = Hyperpipe('optimizer_test',
                                   output_settings=OutputSettings(project_folder='./tmp'),
                                   metrics=['accuracy'],
                                   best_config_metric='accuracy',
                                   inner_cv=KFold(n_splits=3),
                                   outer_cv=ShuffleSplit(n_splits=2),
                                   optimizer=self.optimizer_name)

    def test_run(self):
        self.create_hyperpipe()
        for p in self.pipeline_elements:
            self.hyperpipe += p
        X, y = load_breast_cancer(return_X_y=True)
        self.hyperpipe.fit(X, y)

    def test_all_functions_available(self):
        """Test the existence of the optimizer interface: .prepare(), .tell(), .ask."""
        self.assertTrue(hasattr(self.optimizer, 'prepare'))
        self.assertListEqual(list(signature(self.optimizer.prepare).parameters.keys()),
                             ['pipeline_elements', 'maximize_metric'])
        self.assertTrue(hasattr(self.optimizer, 'tell'))
        self.assertListEqual(list(signature(self.optimizer.tell).parameters.keys()),
                             ['config', 'performance'])
        self.assertTrue(hasattr(self.optimizer, 'ask'))

    def test_all_attributes_available(self):
        """
        Test for the .ask and .param_grid attributes.
        .ask yields the next configuration that should be tested.
        """
        self.optimizer.prepare(pipeline_elements=self.pipeline_elements,
                               maximize_metric=True)
        self.assertIsInstance(self.optimizer.ask, types.GeneratorType)

    def test_ask(self):
        """Test general functionality of .ask()."""
        self.optimizer.prepare(pipeline_elements=self.pipeline_elements,
                               maximize_metric=True)
        ask_list = list(self.optimizer.ask)
        self.assertIsInstance(ask_list, list)
        self.assertSetEqual(set([str(type(a)) for a in ask_list]),
                            set(["<class 'dict'>"]))
        generated_elements = reduce(operator.concat, [list(a.keys()) for a in ask_list])
        self.assertIn("PCA__n_components", generated_elements)
        return generated_elements

    def test_ask_advanced(self):
        """Test advanced functionality of .ask()."""
        branch = Branch('branch')
        branch += PipelineElement('PCA')
        branch += PipelineElement('SVC', {'C': [0.1, 1], 'kernel': ['rbf', 'sigmoid']})
        pipe_switch = Switch('switch', [PipelineElement("StandardScaler"),
                                        PipelineElement("MaxAbsScaler")])
        self.pipeline_elements = [PipelineElement("StandardScaler"),
                                  PipelineElement('PCA',
                                                  hyperparameters={'n_components': IntegerRange(5, 20)},
                                                  test_disabled=True),
                                  pipe_switch,
                                  branch,
                                  Switch('Switch_in_switch', [branch, pipe_switch])]
        generated_elements = self.test_ask()
        self.assertIn("PCA__n_components", generated_elements)
        self.assertIn("Switch_in_switch__current_element", generated_elements)
        self.assertIn("branch__SVC__C", generated_elements)
        self.assertIn("branch__SVC__kernel", generated_elements)
        self.assertIn("switch__current_element", generated_elements)
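# A minimal sketch of the ask/tell protocol that the assertions above pin down:
# prepare(pipeline_elements, maximize_metric) initializes the optimizer, .ask is
# a generator of hyperparameter dicts, and tell(config, performance) reports the
# result back. `score_config` is a hypothetical stand-in for PHOTON's internal
# cross-validated evaluation.
def ask_tell_loop_sketch(optimizer, pipeline_elements, score_config):
    optimizer.prepare(pipeline_elements=pipeline_elements, maximize_metric=True)
    for config in optimizer.ask:            # e.g. {'PCA__n_components': 5, ...}
        performance = score_config(config)  # e.g. mean validation accuracy
        optimizer.tell(config, performance)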
        })  # (truncated above: closes a PipelineElement added to estimator_switch)
    estimator_switch += PipelineElement(
        "ExtraTreesClassifier",
        hyperparameters={'n_estimators': IntegerRange(5, 50)})
    estimator_switch += PipelineElement(
        "SGDClassifier",
        hyperparameters={'penalty': Categorical(['l2', 'l1', 'elasticnet'])})

    pipeline_elements.append(prepro_switch)
    pipeline_elements.append(estimator_switch)

    for pipeline_element in pipeline_elements:
        grid_pipe += pipeline_element
        smac_pipe += pipeline_element

    grid_pipe.fit(X, y)
    smac_pipe.fit(X, y)

    # per-config error curves (1 - accuracy), each taken from its own pipe
    y_smac = [1 - x.metrics_test[0].value
              for x in smac_pipe.results.outer_folds[0].tested_config_list]
    y_grid = [1 - x.metrics_test[0].value
              for x in grid_pipe.results.outer_folds[0].tested_config_list]

    x_smac = list(range(1, len(y_smac) + 1))
    x_grid = list(range(1, len(y_grid) + 1))

    y_smac_inc = [min(y_smac[:tmp + 1]) for tmp in x_smac]
class HyperpipeTests(PhotonBaseTest):

    def setup_hyperpipe(self, output_settings=None):
        if output_settings is None:
            output_settings = OutputSettings(project_folder=self.tmp_folder_path)
        self.hyperpipe = Hyperpipe(
            "god",
            inner_cv=self.inner_cv_object,
            metrics=self.metrics,
            best_config_metric=self.best_config_metric,
            output_settings=output_settings,
        )
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    def setUp(self):
        super(HyperpipeTests, self).setUp()
        self.ss_pipe_element = PipelineElement("StandardScaler")
        self.pca_pipe_element = PipelineElement(
            "PCA", {"n_components": [1, 2]}, random_state=42, test_disabled=True
        )
        self.svc_pipe_element = PipelineElement(
            "SVC",
            {"C": [0.1, 1], "kernel": ["linear"]},  # 'rbf', 'sigmoid'
            random_state=42,
        )
        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ["accuracy", "recall", "precision"]
        self.best_config_metric = "accuracy"
        self.setup_hyperpipe()

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target

    def test_init(self):
        # test that all init parameters can be retrieved via the cleaned-up subclasses;
        # where no information was given, check for the defaults, otherwise for the
        # values set in setUp
        self.assertEqual(self.hyperpipe.name, "god")

        # cross validation
        self.assertIsNotNone(self.hyperpipe.cross_validation)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv, self.inner_cv_object)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_cv)
        self.assertTrue(self.hyperpipe.cross_validation.eval_final_performance)
        self.assertTrue(self.hyperpipe.cross_validation.calculate_metrics_per_fold)
        self.assertFalse(self.hyperpipe.cross_validation.calculate_metrics_across_folds)
        self.assertIsNone(self.hyperpipe.cross_validation.outer_folds)
        self.assertDictEqual(self.hyperpipe.cross_validation.inner_folds, {})

        # optimization
        self.assertIsNotNone(self.hyperpipe.optimization)
        self.assertListEqual(self.hyperpipe.optimization.metrics, self.metrics)
        self.assertEqual(self.hyperpipe.optimization.best_config_metric,
                         self.best_config_metric)
        self.assertEqual(self.hyperpipe.optimization.optimizer_input_str, "grid_search")
        self.assertTrue(self.hyperpipe.optimization.maximize_metric)
        self.assertIsNone(self.hyperpipe.optimization.performance_constraints)
        self.assertDictEqual(self.hyperpipe.optimization.optimizer_params, {})

    def test_add(self):
        # assure the pipeline has three elements: first the scaler, then the pca,
        # then the svc
        self.assertEqual(len(self.hyperpipe.elements), 3)
        self.assertIs(self.hyperpipe.elements[0], self.ss_pipe_element)
        self.assertIs(self.hyperpipe.elements[1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe.elements[2], self.svc_pipe_element)
        # todo: assure that no two elements can be added with the same name

        # test add method special cases
        with self.assertRaises(TypeError):
            self.hyperpipe.add(object())

        # assure that preprocessing is identified and stored in its dedicated
        # attribute; there is only one preprocessing item
        my_preproc = Preprocessing()
        self.hyperpipe.add(my_preproc)
        self.assertEqual(my_preproc, self.hyperpipe.preprocessing)
        # make sure the element does not end up in the main pipeline
        self.assertTrue(all(item is not my_preproc for item in self.hyperpipe.elements))

        def my_func(X, y, **kwargs):
            return True

        # test adding a callback item
        my_call_back_item = CallbackElement("test_element", my_func, "predict")
        self.hyperpipe.add(my_call_back_item)
        self.assertIs(self.hyperpipe.elements[-1], my_call_back_item)
    def test_no_metrics(self):
        # make sure that no metrics means raising an error
        with self.assertRaises(ValueError):
            hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object)

        # make sure that if no best config metric is given, PHOTON raises a warning
        with self.assertRaises(Warning):
            hyperpipe = Hyperpipe(
                "hp_name",
                inner_cv=self.inner_cv_object,
                metrics=["accuracy", "f1_score"],
            )

    def test_preprocessing(self):
        prepro_pipe = Preprocessing()
        prepro_pipe += PipelineElement.create(
            "dummy", DummyYAndCovariatesTransformer(), {}
        )

        self.hyperpipe += prepro_pipe
        self.hyperpipe.fit(self.__X, self.__y)

        self.assertTrue(np.array_equal(self.__y + 1, self.hyperpipe.data.y))

    def test_estimation_type(self):
        def callback(X, y=None, **kwargs):
            pass

        pipe = Hyperpipe("name", inner_cv=KFold(n_splits=2),
                         best_config_metric="mean_squared_error")

        with self.assertRaises(NotImplementedError):
            pipe += PipelineElement("PCA")
            est_type = pipe.estimation_type

        pipe += PipelineElement("SVC")
        self.assertEqual(pipe.estimation_type, "classifier")

        pipe.elements[-1] = PipelineElement("SVR")
        self.assertEqual(pipe.estimation_type, "regressor")

        with self.assertRaises(NotImplementedError):
            pipe.elements[-1] = CallbackElement("MyCallback", callback)
            est_type = pipe.estimation_type

    def test_copy_me(self):
        self.maxDiff = None
        copy = self.hyperpipe.copy_me()
        copy2 = self.hyperpipe.copy_me()
        self.assertDictEqual(elements_to_dict(copy), elements_to_dict(self.hyperpipe))

        copy_after_fit = self.hyperpipe.fit(self.__X, self.__y).copy_me()
        copy_after_fit = elements_to_dict(copy_after_fit)
        # the current_configs of the elements are not None after calling fit() on a
        # hyperpipe; when copying the respective PipelineElement, these current_configs
        # are copied too, which is why we need to delete _pipe and elements before
        # asserting equality
        copy_after_fit["_pipe"] = None
        copy_after_fit["elements"] = None
        copy = elements_to_dict(copy)
        copy["_pipe"] = None
        copy["elements"] = None
        self.assertDictEqual(copy, copy_after_fit)

        # check that the deepcopy worked
        copy2.cross_validation.inner_cv.n_splits = 10
        self.assertEqual(copy2.cross_validation.inner_cv.n_splits, 10)
        self.assertEqual(self.hyperpipe.cross_validation.inner_cv.n_splits, 3)

    def test_save_optimum_pipe(self):
        # todo: test .save() of custom model
        tmp_path = os.path.join(self.tmp_folder_path, "optimum_pipypipe")
        settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)

        my_pipe = Hyperpipe(
            "hyperpipe",
            optimizer="random_grid_search",
            optimizer_params={"n_configurations": 3},
            metrics=["accuracy", "precision", "recall"],
            best_config_metric="f1_score",
            outer_cv=KFold(n_splits=2),
            inner_cv=KFold(n_splits=2),
            verbosity=1,
            output_settings=settings,
        )

        preproc = Preprocessing()
        preproc += PipelineElement("StandardScaler")

        # BRANCH WITH QuantileTransformer AND DecisionTreeClassifier
        tree_qua_branch = Branch("tree_branch")
        tree_qua_branch += PipelineElement("QuantileTransformer")
        tree_qua_branch += PipelineElement(
            "DecisionTreeClassifier",
            {"min_samples_split": IntegerRange(2, 4)},
            criterion="gini",
        )

        # BRANCH WITH MinMaxScaler AND SVC
        svm_mima_branch = Branch("svm_branch")
        svm_mima_branch += PipelineElement("MinMaxScaler")
        svm_mima_branch += PipelineElement(
            "SVC", {"kernel": Categorical(["rbf", "linear"]), "C": 2.0}, gamma="auto"
        )

        # BRANCH WITH a dummy transformer AND KNeighborsClassifier
        knn_sta_branch = Branch("neighbour_branch")
        knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(), {})
        knn_sta_branch += PipelineElement("KNeighborsClassifier")

        my_pipe += preproc
        # voting=True would take the mean of the results of every branch
        my_pipe += Stack(
            "final_stack", [tree_qua_branch, svm_mima_branch, knn_sta_branch]
        )
        my_pipe += PipelineElement("LogisticRegression", solver="lbfgs")

        my_pipe.fit(self.__X, self.__y)

        model_path = os.path.join(
            my_pipe.output_settings.results_folder, "photon_best_model.photon"
        )
        self.assertTrue(os.path.exists(model_path))

        # now move the optimum pipe to a new folder
        test_folder = os.path.join(
            my_pipe.output_settings.results_folder, "new_test_folder"
        )
        new_model_path = os.path.join(test_folder, "photon_best_model.photon")
        os.makedirs(test_folder)
        shutil.copyfile(model_path, new_model_path)

        # check that load_optimum_pipe works and that the meta information is recovered
        loaded_optimum_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information)
        self.assertIsNotNone(loaded_optimum_pipe._meta_information["photon_version"])

        # check that predictions stay reliably the same
        y_pred_loaded = loaded_optimum_pipe.predict(self.__X)
        y_pred = my_pipe.optimum_pipe.predict(self.__X)
        np.testing.assert_array_equal(y_pred_loaded, y_pred)

    def test_overwrite_result_folder(self):
        """Test for correct handling of the output_settings.overwrite_results parameter."""

        def get_summary_file():
            return os.path.join(
                self.hyperpipe.output_settings.results_folder, "photon_summary.txt"
            )

        # Case 1, default: a new timestamped output folder for every run
        output_settings1 = OutputSettings(
            project_folder=self.tmp_folder_path,
            save_output=True,
            overwrite_results=False,
        )
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()

        time.sleep(2)

        # again with the same settings
        self.setup_hyperpipe(output_settings1)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()

        # we expect a new output folder each time, with a timestamp
        self.assertNotEqual(tmp_path, tmp_path2)

        # Case 2, overwrite results: everything in the same folder
        output_settings2 = OutputSettings(
            project_folder=self.tmp_folder_path,
            save_output=True,
            overwrite_results=True,
        )
        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path = get_summary_file()
        tmp_date = os.path.getmtime(tmp_path)

        self.setup_hyperpipe(output_settings2)
        self.hyperpipe.fit(self.__X, self.__y)
        tmp_path2 = get_summary_file()
        tmp_date2 = os.path.getmtime(tmp_path2)

        # same folder, but the summary file is overwritten by the new analysis
        self.assertEqual(tmp_path, tmp_path2)
        self.assertNotEqual(tmp_date, tmp_date2)

        # Case 3: we have a cache folder
        self.hyperpipe.cache_folder = self.cache_folder_path
        shutil.rmtree(self.cache_folder_path, ignore_errors=True)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertTrue(os.path.exists(self.cache_folder_path))

    def test_random_state(self):
        self.hyperpipe.random_state = 4567
        self.hyperpipe.fit(self.__X, self.__y)
        # assure we spread the word.. !
        self.assertEqual(self.hyperpipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe.optimum_pipe.random_state, 4567)
        self.assertEqual(self.hyperpipe._pipe.elements[-1][-1].random_state, 4567)
        self.assertEqual(
            self.hyperpipe._pipe.elements[-1][-1].base_element.random_state, 4567
        )

    def test_dummy_estimator_preparation(self):
        self.hyperpipe.results = MDBHyperpipe()
        self.hyperpipe.results.dummy_estimator = MDBDummyResults()

        # one time regressor, one time classifier, one time strange object
        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement("SVC"))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyClassifier))

        self.hyperpipe.elements = list()
        self.hyperpipe.add(PipelineElement("SVR"))
        dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
        self.assertTrue(isinstance(dummy_estimator, DummyRegressor))

        with self.assertRaises(NotImplementedError):
            self.hyperpipe.elements = list()
            self.hyperpipe.add(PipelineElement("PCA"))
            dummy_estimator = self.hyperpipe._prepare_dummy_estimator()
            self.assertIsNone(dummy_estimator)

    def setup_crazy_pipe(self):
        # erase all elements, we need a complex and crazy task
        self.hyperpipe.elements = list()

        nmb_list = list()
        for i in range(5):
            nmb = NeuroBranch(name=str(i), nr_of_processes=i + 3)
            nmb += PipelineElement("SmoothImages")
            nmb_list.append(nmb)

        my_switch = Switch("disabling_test_switch")
        my_switch += nmb_list[0]
        my_switch += nmb_list[1]

        my_stack = Stack("stack_of_branches")
        for i in range(3):
            my_branch = Branch("branch_" + str(i + 2))
            my_branch += PipelineElement("StandardScaler")
            my_branch += nmb_list[i + 2]
            my_stack += my_branch

        self.hyperpipe.add(my_stack)
        self.hyperpipe.add(PipelineElement("StandardScaler"))
        self.hyperpipe.add(my_switch)
        self.hyperpipe.add(PipelineElement("SVC"))
        return nmb_list

    def test_recursive_disabling(self):
        list_of_elements_to_detect = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(list_of_elements_to_detect)
        Hyperpipe.disable_multiprocessing_recursively(self.hyperpipe._pipe)
        self.assertTrue(all(i.nr_of_processes == 1 for i in list_of_elements_to_detect))

    def test_recursive_cache_folder_propagation(self):
        list_of_elements = self.setup_crazy_pipe()
        self.hyperpipe._pipe = Branch.prepare_photon_pipe(self.hyperpipe.elements)
        self.hyperpipe.recursive_cache_folder_propagation(
            self.hyperpipe._pipe, self.cache_folder_path, "fold_id_123"
        )
        for i, nmbranch in enumerate(list_of_elements):
            if i > 1:
                start_folder = os.path.join(
                    self.cache_folder_path, "branch_" + nmbranch.name
                )
            else:
                start_folder = self.cache_folder_path
            expected_folder = os.path.join(start_folder, nmbranch.name)
            self.assertEqual(nmbranch.base_element.cache_folder, expected_folder)

    def test_prepare_result_logging(self):
        # test that a results object is created and carries the hyperpipe infos
        self.hyperpipe.data.X = self.__X
        self.hyperpipe.data.y = self.__y
        self.hyperpipe._prepare_result_logging(datetime.datetime.now())
        self.assertTrue(isinstance(self.hyperpipe.results, MDBHyperpipe))
        self.assertTrue(isinstance(self.hyperpipe.results_handler, ResultsHandler))
        self.assertTrue(len(self.hyperpipe.results.outer_folds) == 0)

    def test_finalize_optimization(self):
        # this is hard to test directly, which is why we fake it
        self.hyperpipe.fit(self.__X, self.__y)

        # reset all infos
        self.hyperpipe.results.dummy_estimator.train = MDBScoreInformation()
        self.hyperpipe.results.dummy_estimator.test = MDBScoreInformation()
        self.hyperpipe.results.metrics_train = {}
        self.hyperpipe.results.metrics_test = {}
        self.hyperpipe.best_config = None
        self.hyperpipe.results.best_config = MDBConfig()
        self.hyperpipe.optimum_pipe = None

        # now generate the infos again
        self.hyperpipe._finalize_optimization()

        expected_num_of_metrics = len(self.hyperpipe.optimization.metrics)
        # dummy average values
        self.assertTrue(
            len(self.hyperpipe.results.dummy_estimator.train), expected_num_of_metrics
        )
        self.assertTrue(
            len(self.hyperpipe.results.dummy_estimator.test), expected_num_of_metrics
        )
        # overall average values
        self.assertTrue(
            len(self.hyperpipe.results.metrics_train), 2 * expected_num_of_metrics
        )
        self.assertTrue(
            len(self.hyperpipe.results.metrics_test), 2 * expected_num_of_metrics
        )

        # find the best config
        self.assertIsNotNone(self.hyperpipe.best_config)
        self.assertIsNotNone(self.hyperpipe.results.best_config)
        self.assertEqual(
            self.hyperpipe.best_config, self.hyperpipe.results.best_config.config_dict
        )

        # set optimum pipe and params
        # todo: test add preprocessing
        self.assertIsNotNone(self.hyperpipe.optimum_pipe)
        self.assertEqual(
            self.hyperpipe.optimum_pipe.named_steps["SVC"].base_element.C,
            self.hyperpipe.best_config["SVC__C"],
        )

        # save the optimum model
        self.assertTrue(
            os.path.isfile(
                os.path.join(
                    self.hyperpipe.output_settings.results_folder,
                    "photon_best_model.photon",
                )
            )
        )

        # backmapping: because the pca is test_disabled, we expect the full number
        # of input features
        self.assertEqual(
            len(self.hyperpipe.results.best_config_feature_importances[0]),
            self.__X.shape[1],
        )
        backmapped_feature_importances = os.path.join(
            self.hyperpipe.output_settings.results_folder,
            "optimum_pipe_feature_importances_backmapped.csv",
        )
        self.assertTrue(os.path.isfile(backmapped_feature_importances))
        loaded_array = np.loadtxt(
            open(backmapped_feature_importances, "rb"), delimiter=","
        )
        self.assertEqual(loaded_array.shape[0], self.__X.shape[1])

    def test_optimum_pipe_predict_and_predict_proba_and_transform(self):
        # find the best config and test it against sklearn
        self.hyperpipe.elements[-1] = PipelineElement(
            "RandomForestClassifier",
            {"n_estimators": IntegerRange(4, 20, step=2)},
            random_state=42,
        )
        self.hyperpipe.fit(self.__X, self.__y)

        # rebuild the best config as a plain sklearn pipeline, with or without PCA
        # depending on whether the best config disabled it
        best_config_copy = dict(self.hyperpipe.best_config)
        del best_config_copy["PCA__disabled"]
        if self.hyperpipe.best_config["PCA__disabled"]:
            sk_elements = [
                ("StandardScaler", StandardScaler()),
                ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
            ]
        else:
            sk_elements = [
                ("StandardScaler", StandardScaler()),
                ("PCA", PCA(random_state=42)),
                ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
            ]
        self.sklearn_pipe = SKLPipeline(sk_elements)
        self.sklearn_pipe.set_params(**best_config_copy)
        self.sklearn_pipe.fit(self.__X, self.__y)

        self.assertTrue(
            np.array_equal(
                self.sklearn_pipe.predict(self.__X), self.hyperpipe.predict(self.__X)
            )
        )
        self.assertTrue(
            np.array_equal(
                self.sklearn_pipe.predict_proba(self.__X),
                self.hyperpipe.predict_proba(self.__X),
            )
        )

        # fake transform on the sklearn pipe
        step1 = self.sklearn_pipe.named_steps["StandardScaler"].transform(self.__X)
        if "PCA" in self.sklearn_pipe.named_steps:
            step2 = self.sklearn_pipe.named_steps["PCA"].transform(self.__X)
        else:
            step2 = step1
        self.assertTrue(np.array_equal(step2, self.hyperpipe.transform(self.__X)))
class ResultsHandlerTest(PhotonBaseTest):

    def setUp(self):
        """Set default start settings for all tests."""
        super(ResultsHandlerTest, self).setUp()

        self.files = ['best_config_predictions.csv',
                      'time_monitor.csv',
                      'time_monitor_pie.png',
                      'photon_result_file.p',
                      'photon_summary.txt',
                      'photon_best_model.photon',
                      'optimum_pipe_feature_importances_backmapped.npz',
                      'photon_code.py',
                      'optimizer_history.png']

        self.output_settings = OutputSettings(project_folder=self.tmp_folder_path,
                                              save_output=True)

        self.ss_pipe_element = PipelineElement('StandardScaler')
        self.pca_pipe_element = PipelineElement('PCA', {'n_components': [1, 2]},
                                                random_state=42)
        self.svc_pipe_element = PipelineElement('SVC',
                                                {'C': [0.1], 'kernel': ['linear']},  # 'rbf', 'sigmoid'
                                                random_state=42)
        self.inner_cv_object = KFold(n_splits=3)
        self.metrics = ["accuracy", 'recall', 'precision']
        self.best_config_metric = "accuracy"
        self.hyperpipe = Hyperpipe('god',
                                   inner_cv=self.inner_cv_object,
                                   metrics=self.metrics,
                                   best_config_metric=self.best_config_metric,
                                   outer_cv=KFold(n_splits=2),
                                   output_settings=self.output_settings,
                                   verbosity=1)
        self.hyperpipe += self.ss_pipe_element
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

        dataset = load_breast_cancer()
        self.__X = dataset.data
        self.__y = dataset.target
        self.hyperpipe.fit(self.__X, self.__y)

    def test_write_convenience_files(self):
        """
        Output creation testing. Files are only written if
        output_settings.save_output == True.
        """
        for file in self.files:
            self.assertTrue(os.path.isfile(
                os.path.join(self.output_settings.results_folder, file)))

        # correct number of rows (one prediction per test sample plus a header)
        with open(os.path.join(self.output_settings.results_folder,
                               'best_config_predictions.csv')) as f:
            self.assertEqual(
                sum([outer_fold.number_samples_test
                     for outer_fold in self.hyperpipe.results.outer_folds]),
                sum(1 for _ in f) - 1)

        shutil.rmtree(self.tmp_folder_path, ignore_errors=True)
        self.output_settings = OutputSettings(project_folder=self.tmp_folder_path,
                                              save_output=False)
        self.hyperpipe.fit(self.__X, self.__y)
        self.assertIsNone(self.output_settings.results_folder)

    def test_readable_time_monitor_csv(self):
        """
        Test that time_monitor.csv stays readable
        (right column count and pandas import).
        """
        time_monitor_df = pd.read_csv(
            os.path.join(self.output_settings.results_folder, 'time_monitor.csv'),
            header=[0, 1])
        self.assertIsInstance(time_monitor_df, pd.DataFrame)
        self.assertEqual(len(time_monitor_df.columns), 10)

    def test_summary(self):
        """
        Check the content of photon_summary.txt against hyperpipe.results.
        """
""" with open( os.path.join(self.output_settings.results_folder, 'photon_summary.txt')) as file: data = file.read() areas = data.split( "-------------------------------------------------------------------" ) # first areas self.assertEqual(areas[0], "\nPHOTON RESULT SUMMARY\n") result_dict = { "dummy_test": self.hyperpipe.results.dummy_estimator.test, "dummy_train": self.hyperpipe.results.dummy_estimator.train, "best_config_train": self.hyperpipe.results.metrics_train, "best_config_test": self.hyperpipe.results.metrics_test } outer_fold_traintest = {} key_areas_outer_fold = [] # all outerfold areas for i in range(len(self.hyperpipe.results.outer_folds)): self.assertEqual(areas[4 + i * 2], '\nOUTER FOLD ' + str(i + 1) + '\n') key_areas_outer_fold.append("outer_fold_" + str(i + 1)) result_dict["outer_fold_"+str(i+1)+"_train"] = \ self.hyperpipe.results.outer_folds[i].best_config.best_config_score.training outer_fold_traintest["outer_fold_" + str(i + 1) + "_train"] = "TrainValue" result_dict["outer_fold_" + str(i + 1) + "_test"] = \ self.hyperpipe.results.outer_folds[i].best_config.best_config_score.validation outer_fold_traintest["outer_fold_" + str(i + 1) + "_test"] = "TestValue" # check performance / test-train of dummy and best_config key_areas = ["entracee", "name", "dummy", "best_config"] splitted_areas = {} for num in range(len(key_areas)): splitted_areas[key_areas[num]] = areas[num].split("\n") index_dict = {} for key in key_areas[2:]: if [perf for perf in splitted_areas[key] if perf == "TEST:"]: index_dict[key + "_test"] = splitted_areas[key].index("TEST:") index_dict[key + "_train"] = splitted_areas[key].index("TRAINING:") else: self.assertTrue(False) for data_key in [k for k in list(result_dict.keys()) if key in k]: table_str = "\n".join([ splitted_areas[key][index_dict[data_key] + i] for i in [2, 4, 5, 6] ]) table = pd.read_csv(StringIO(table_str.replace(" ", "")), sep="|")[["MetricName", "MEAN", "STD"]].set_index("MetricName") for result_metric in result_dict[data_key]: self.assertAlmostEqual( result_metric.value, table[result_metric.operation.split(".")[1]][ result_metric.metric_name], 4) splitted_areas = {} for num in range(len(key_areas_outer_fold)): splitted_areas[key_areas_outer_fold[num]] = areas[len(key_areas) + 1 + num * 2].split("\n") # check all outer_folds for key_area_outer_fold in key_areas_outer_fold: if [ perf for perf in splitted_areas[key_area_outer_fold] if perf == "PERFORMANCE:" ]: index_dict[key_area_outer_fold + "_train"] = splitted_areas[ key_area_outer_fold].index("PERFORMANCE:") index_dict[key_area_outer_fold + "_test"] = index_dict[key_area_outer_fold + "_train"] else: self.assertTrue(False) for data_key in [ k for k in list(result_dict.keys()) if key_area_outer_fold in k ]: table_str = "\n".join([ splitted_areas[key_area_outer_fold][index_dict[data_key] + i] for i in [2, 4, 5, 6] ]) table = pd.read_csv(StringIO(table_str.replace(" ", "")), sep="|")[[ "MetricName", "TrainValue", "TestValue" ]].set_index("MetricName") for result_metric in result_dict[data_key].metrics.keys(): self.assertAlmostEqual( result_dict[data_key].metrics[result_metric], table[outer_fold_traintest[data_key]][result_metric], 4) def test_save_backmapping(self): """ Check dimension of feature backmapping equals input dimensions. 
""" npzfile = np.load( os.path.join(self.output_settings.results_folder, 'optimum_pipe_feature_importances_backmapped.npz')) self.assertEqual(len(npzfile.files), 1) result_data = [] for file in npzfile.files: result_data.append(npzfile[file]) self.assertEqual(np.shape(self.__X)[1], result_data[0].size) # def test_save_backmapping_stack(self): # self.hyperpipe = Hyperpipe('god', inner_cv=self.inner_cv_object, # metrics=self.metrics, # best_config_metric=self.best_config_metric, # outer_cv=KFold(n_splits=2), # output_settings=self.output_settings, # verbosity=1) # self.hyperpipe += self.ss_pipe_element # self.stack = Stack("myStack") # self.stack += PipelineElement("MinMaxScaler") # self.stack += self.pca_pipe_element # self.hyperpipe += self.stack # self.hyperpipe.add(self.svc_pipe_element) # self.output_settings.save_output = True # self.hyperpipe.fit(self.__X, self.__y) # picklefile = pickle.load(open( # os.path.join(self.output_settings.results_folder, 'optimum_pipe_feature_importances_backmapped.p'),"rb")) # self.assertEqual(np.shape(self.__X)[1], len(picklefile[0])) def pass_through_plots(self): """ Test for plot functions. Only passing test, no quality testing. """ self.assertIsNone(self.hyperpipe.results.plot_optimizer_history()) self.assertIsNone(self.hyperpipe.results.plot_true_pred()) self.assertIsNone(self.hyperpipe.results.plot_confusion_matrix()) self.assertIsNone(self.hyperpipe.results.plot_roc_curve()) def test_load_from_file(self): X, y = load_breast_cancer(True) my_pipe = Hyperpipe( 'load_results_file_test', metrics=['accuracy'], best_config_metric='accuracy', output_settings=OutputSettings(project_folder='./tmp')) my_pipe += PipelineElement("StandardScaler") my_pipe += PipelineElement("SVC") my_pipe.fit(X, y) results_file = os.path.join(my_pipe.output_settings.results_folder, "photon_result_file.p") my_result_handler = ResultsHandler() my_result_handler.load_from_file(results_file) self.assertIsInstance(my_result_handler.results, MDBHyperpipe) def test_get_performance_table(self): pass def test_load_from_mongodb(self): pass