def test_adjusted_delegate_call_transformer(self):
    """Check that transform() delegates X, y and kwargs correctly for each transformer flavour."""
    # standard transformer: only X should be transformed, y and kwargs pass through
    element = PipelineElement.create('Transformer',
                                     base_element=DummyTransformer(),
                                     hyperparameters={})
    xt, yt, kw = element.transform(self.X, self.y, **self.kwargs)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertTrue(np.array_equal(yt, self.y))
    self.assertDictEqual(kw, self.kwargs)

    # transformer that needs y: X and y are both transformed
    element = PipelineElement.create('NeedsYTransformer',
                                     base_element=DummyNeedsYTransformer(),
                                     hyperparameters={})
    xt, yt, kw = element.transform(self.X, self.y, **self.kwargs)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertTrue(np.array_equal(yt, self.yt))
    self.assertDictEqual(kw, self.kwargs)

    # same transformer, this time without any kwargs
    element = PipelineElement.create('NeedsYTransformer',
                                     base_element=DummyNeedsYTransformer(),
                                     hyperparameters={})
    xt, yt, kw = element.transform(self.X, self.y)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertTrue(np.array_equal(yt, self.yt))
    self.assertDictEqual(kw, {})

    # transformer that needs covariates: y stays None
    element = PipelineElement.create('NeedsCovariatesTransformer',
                                     base_element=DummyNeedsCovariatesTransformer(),
                                     hyperparameters={})
    xt, yt, kw = element.transform(self.X, **self.kwargs)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertTrue(np.array_equal(kw['covariates'], self.kwargst['covariates']))
    self.assertEqual(yt, None)

    # transformer that needs both covariates and y
    element = PipelineElement.create('NeedsCovariatesAndYTransformer',
                                     base_element=DummyNeedsCovariatesAndYTransformer(),
                                     hyperparameters={})
    xt, yt, kw = element.transform(self.X, self.y, **self.kwargs)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertTrue(np.array_equal(yt, self.yt))
    self.assertTrue(np.array_equal(kw['covariates'], self.kwargst['covariates']))
def test_no_y_transformers(self):
    """Adding an element that transforms y to a Stack must raise NotImplementedError."""
    forbidden_stack = Stack("forbidden_stack")
    y_transformer = PipelineElement.create(
        "dummy", DummyNeedsCovariatesAndYTransformer(), {})
    with self.assertRaises(NotImplementedError):
        forbidden_stack += y_transformer
def test_neuro_hyperpipe_parallelized_batched_caching(self):
    """Fit a cached, batched, parallel neuro pipeline and assert the cache is empty afterwards.

    Fix: removed a leftover debug print() and replaced assertTrue(a == b)
    with assertEqual so a failure reports the actual file count.
    """
    cache_path = self.cache_folder_path
    self.hyperpipe = Hyperpipe('complex_case',
                               inner_cv=KFold(n_splits=5),
                               outer_cv=KFold(n_splits=3),
                               optimizer='grid_search',
                               cache_folder=cache_path,
                               metrics=['mean_squared_error'],
                               best_config_metric='mean_squared_error',
                               output_settings=OutputSettings(
                                   project_folder=self.tmp_folder_path))

    nb = ParallelBranch("SubjectCaching", nr_of_processes=1)
    nb += PipelineElement.create("ResampleImages", StupidAdditionTransformer(),
                                 {'voxel_size': [3, 5, 10]}, batch_size=4)

    self.hyperpipe += nb
    self.hyperpipe += PipelineElement("StandardScaler", {})
    self.hyperpipe += PipelineElement("PCA", {'n_components': [3, 4]})
    self.hyperpipe += PipelineElement("SVR", {'kernel': ['rbf', 'linear']})

    self.hyperpipe.fit(self.X, self.y)

    # assert cache is empty again after fitting
    nr_of_p_files = len(
        glob.glob(os.path.join(self.hyperpipe.cache_folder, "*.p")))
    self.assertEqual(nr_of_p_files, 0)
def generate_hyperpipes(self):
    """Build one inner Hyperpipe per ROI and fuse them into a voting-free Stack.

    Fix: deleted a large block of commented-out dead code (an obsolete
    outer-pipe construction) and replaced the index-based loop with
    enumerate() for clarity. Behavior is unchanged.
    """
    # refresh the ROI list from runtime information, if available
    if self.atlas_info_object.roi_names_runtime:
        self.rois = self.atlas_info_object.roi_names_runtime

    inner_pipe_list = {}
    for roi_index, roi in enumerate(self.rois):
        tmp_inner_pipe = Hyperpipe(self.atlas_name + '_' + str(roi),
                                   optimizer='grid_search',
                                   inner_cv=ShuffleSplit(n_splits=1,
                                                         test_size=0.2,
                                                         random_state=3),
                                   eval_final_performance=False,
                                   verbose=logging.verbosity_level,
                                   best_config_metric=self.best_config_metric,
                                   metrics=self.metrics)

        # at first set a filter element that restricts the pipe to this ROI
        tmp_inner_pipe.filter_element = RoiFilterElement(roi_index)

        # secondly add all other items
        for pipe_item in self.hyperpipe_elements:
            tmp_inner_pipe += PipelineElement.create(pipe_item[0], pipe_item[1],
                                                     **pipe_item[2])
        inner_pipe_list[roi] = tmp_inner_pipe

    self.pipeline_fusion = Stack('multiple_source_pipes',
                                 inner_pipe_list.values(), voting=False)
def test_adjusted_delegate_call_estimator(self):
    """Check that predict() delegates correctly for both estimator flavours."""
    # standard estimator: DummyEstimator returns X as y predictions
    estimator = PipelineElement.create('Estimator',
                                       base_element=DummyEstimator(),
                                       hyperparameters={})
    predictions = estimator.predict(self.X)
    self.assertTrue(np.array_equal(predictions, self.Xt))

    # estimator that needs covariates — again returns X as y predictions
    estimator = PipelineElement.create('Estimator',
                                       base_element=DummyNeedsCovariatesEstimator(),
                                       hyperparameters={})
    predictions = estimator.predict(self.X, **self.kwargs)
    self.assertTrue(np.array_equal(predictions, self.Xt))
def test_copy_me(self):
    """copy_me() must produce an independent but equivalent PipelineElement."""
    svc = PipelineElement('SVC', {'C': [0.1, 1], 'kernel': ['rbf', 'sigmoid']})
    svc.set_params(**{'C': 0.1, 'kernel': 'sigmoid'})
    svc_copy = svc.copy_me()
    self.assertEqual(svc.random_state, svc_copy.random_state)
    self.assertNotEqual(svc_copy.base_element, svc.base_element)
    self.assertDictEqual(elements_to_dict(svc_copy), elements_to_dict(svc))
    self.assertEqual(svc_copy.base_element.C, svc.base_element.C)

    # copies must stay equal even when copying a fitted PipelineElement
    copy_after_fit = svc.fit(self.X, self.y).copy_me()
    self.assertDictEqual(elements_to_dict(svc_copy),
                         elements_to_dict(copy_after_fit))

    svc = PipelineElement('SVC', {'C': [0.1, 1], 'kernel': ['rbf', 'sigmoid']})
    svc_copy = svc.copy_me()
    self.assertDictEqual(svc_copy.hyperparameters,
                         {'SVC__C': [0.1, 1],
                          'SVC__kernel': ['rbf', 'sigmoid']})
    # mutating the copy must not leak into the original
    svc_copy.base_element.C = 3
    self.assertNotEqual(svc.base_element.C, svc_copy.base_element.C)

    # custom element: copy must be a deep, equal duplicate
    custom_element = PipelineElement.create(
        'CustomElement',
        base_element=DummyNeedsCovariatesEstimator(),
        hyperparameters={})
    custom_copy = custom_element.copy_me()
    self.assertDictEqual(elements_to_dict(custom_element),
                         elements_to_dict(custom_copy))

    # an element wrapping something that cannot be deep-copied must raise
    undeepcopyable = PipelineElement.create(
        'MyUnDeepcopyableObject',
        base_element=GridSearchOptimizer(),
        hyperparameters={})
    with self.assertRaises(Exception):
        undeepcopyable.copy_me()
def test_preprocessing(self):
    """A Preprocessing element must be applied to y before the hyperpipe fit."""
    prepro_pipe = Preprocessing()
    prepro_pipe += PipelineElement.create(
        "dummy", DummyYAndCovariatesTransformer(), {})

    self.hyperpipe += prepro_pipe
    self.hyperpipe.fit(self.__X, self.__y)

    # the dummy transformer increments y by one during preprocessing
    self.assertTrue(np.array_equal(self.__y + 1, self.hyperpipe.data.y))
def test_predict_when_no_transform(self):
    """transform() on an estimator element must fall back to predict()."""
    # standard estimator: DummyEstimator returns X as y predictions
    estimator = PipelineElement.create('Estimator',
                                       base_element=DummyEstimator(),
                                       hyperparameters={})
    xt, yt, kw = estimator.transform(self.X)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertEqual(yt, None)

    # estimator that needs covariates: kwargs pass through untouched, y stays None
    estimator = PipelineElement.create('Estimator',
                                       base_element=DummyNeedsCovariatesEstimator(),
                                       hyperparameters={})
    xt, yt, kw = estimator.transform(self.X, **self.kwargs)
    self.assertTrue(np.array_equal(xt, self.Xt))
    self.assertTrue(np.array_equal(kw['covariates'], self.kwargs['covariates']))
    self.assertEqual(yt, None)
def test_estimator_type(self):
    """_estimator_type must be classifier/regressor/None, and raise for invalid elements."""
    self.assertEqual(PipelineElement('SVC')._estimator_type, 'classifier')
    self.assertEqual(PipelineElement('SVR')._estimator_type, 'regressor')
    self.assertEqual(PipelineElement('PCA')._estimator_type, None)

    # elements with an inconsistent estimator interface must raise on access
    for invalid_base_element in (DummyEstimatorWrongType(),
                                 DummyTransformerWithPredict(),
                                 DummyEstimatorNoPredict()):
        estimator = PipelineElement.create('Dummy', invalid_base_element, {})
        with self.assertRaises(NotImplementedError):
            estimator._estimator_type
def setUp(self):
    """Create matching PHOTON and scikit-learn pipeline elements for the tests."""
    super(PipelineTests, self).setUp()
    self.X, self.y = load_breast_cancer(return_X_y=True)

    # PHOTON versions of the elements
    self.p_pca = PipelineElement("PCA", {}, random_state=3)
    self.p_svm = PipelineElement("SVC", {}, random_state=3)
    self.p_ss = PipelineElement("StandardScaler", {})
    self.p_dt = PipelineElement("DecisionTreeClassifier", random_state=3)

    self.dummy_photon_element = PipelineElement.create(
        "DummyTransformer", DummyYAndCovariatesTransformer(), {})

    # plain scikit-learn counterparts
    self.sk_pca = PCA(random_state=3)
    self.sk_svc = SVC(random_state=3)
    self.sk_ss = StandardScaler()
    self.sk_dt = DecisionTreeClassifier(random_state=3)
def test_single_subject_caching(self):
    """New configs must add cache files; repeating a known config must not.

    Fix: replaced assertTrue(a == b) with assertEqual so a failure
    reports both the actual and the expected file count.
    """
    nb = ParallelBranch("subject_caching_test")
    # increase complexity by adding batching
    nb += PipelineElement.create("ResampleImages", StupidAdditionTransformer(),
                                 {}, batch_size=4)

    cache_folder = self.cache_folder_path
    cache_folder = os.path.join(cache_folder, 'subject_caching_test')
    nb.base_element.cache_folder = cache_folder

    def transform_and_check_folder(config, expected_nr_of_files):
        # apply the config, run the transform, then count generated pickles
        nb.set_params(**config)
        nb.transform(self.X, self.y)
        nr_of_generated_cache_files = len(
            glob.glob(os.path.join(cache_folder, "*.p")))
        self.assertEqual(nr_of_generated_cache_files, expected_nr_of_files)

    # fit with first config: expect one cache file per input file
    transform_and_check_folder({'ResampleImages__voxel_size': 5},
                               self.nr_of_expected_pickles_per_config)

    # after fitting with a second config, we expect two times the number of
    # input files to be in the cache
    transform_and_check_folder({'ResampleImages__voxel_size': 10},
                               2 * self.nr_of_expected_pickles_per_config)

    # fit with the first config again: no new cache files, because they exist
    transform_and_check_folder({'ResampleImages__voxel_size': 5},
                               2 * self.nr_of_expected_pickles_per_config)

    # clean up afterwards
    CacheManager.clear_cache_files(cache_folder)
    CacheManager.clear_cache_files(self.tmp_folder_path, force_all=True)
def test_save_optimum_pipe(self):
    """Fit a nested pipeline, persist the best model, move it, and reload it."""
    # todo: test .save() of custom model
    tmp_path = os.path.join(self.tmp_folder_path, "optimum_pipypipe")
    settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)

    pipe = Hyperpipe(
        "hyperpipe",
        optimizer="random_grid_search",
        optimizer_params={"n_configurations": 3},
        metrics=["accuracy", "precision", "recall"],
        best_config_metric="f1_score",
        outer_cv=KFold(n_splits=2),
        inner_cv=KFold(n_splits=2),
        verbosity=1,
        output_settings=settings,
    )

    preproc = Preprocessing()
    preproc += PipelineElement("StandardScaler")

    # branch with QuantileTransformer and DecisionTreeClassifier
    tree_qua_branch = Branch("tree_branch")
    tree_qua_branch += PipelineElement("QuantileTransformer")
    tree_qua_branch += PipelineElement(
        "DecisionTreeClassifier",
        {"min_samples_split": IntegerRange(2, 4)},
        criterion="gini",
    )

    # branch with MinMaxScaler and SVC
    svm_mima_branch = Branch("svm_branch")
    svm_mima_branch += PipelineElement("MinMaxScaler")
    svm_mima_branch += PipelineElement(
        "SVC", {"kernel": Categorical(["rbf", "linear"]), "C": 2.0}, gamma="auto"
    )

    # branch with a custom dummy transformer and KNeighborsClassifier
    knn_sta_branch = Branch("neighbour_branch")
    knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(), {})
    knn_sta_branch += PipelineElement("KNeighborsClassifier")

    pipe += preproc
    # voting = True to mean the result of every branch
    pipe += Stack(
        "final_stack", [tree_qua_branch, svm_mima_branch, knn_sta_branch]
    )
    pipe += PipelineElement("LogisticRegression", solver="lbfgs")

    pipe.fit(self.__X, self.__y)
    model_path = os.path.join(
        pipe.output_settings.results_folder, "photon_best_model.photon"
    )
    self.assertTrue(os.path.exists(model_path))

    # now move the optimum pipe to a new folder and check if
    # load_optimum_pipe still works from there
    test_folder = os.path.join(
        pipe.output_settings.results_folder, "new_test_folder"
    )
    new_model_path = os.path.join(test_folder, "photon_best_model.photon")
    os.makedirs(test_folder)
    shutil.copyfile(model_path, new_model_path)

    # check if we have the meta information recovered
    reloaded_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
    self.assertIsNotNone(reloaded_pipe._meta_information)
    self.assertIsNotNone(reloaded_pipe._meta_information["photon_version"])

    # check that predictions stay reliably the same after reloading
    y_pred_loaded = reloaded_pipe.predict(self.__X)
    y_pred = pipe.optimum_pipe.predict(self.__X)
    np.testing.assert_array_equal(y_pred_loaded, y_pred)
def test_combi_from_single_and_group_caching(self):
    """Group-level and subject-level caches must grow only for unseen configs.

    Fix: replaced assertTrue(a == b) with assertEqual inside the helper
    so a failure reports actual vs expected file counts.
    """
    # 2. specify cache directories
    cache_folder_base = self.cache_folder_path
    cache_folder_neuro = os.path.join(cache_folder_base, 'subject_caching_test')
    CacheManager.clear_cache_files(cache_folder_base)
    CacheManager.clear_cache_files(cache_folder_neuro)

    # 3. set up Neuro Branch; increase complexity by adding batching
    nb = ParallelBranch("SubjectCaching", nr_of_processes=3)
    nb += PipelineElement.create("ResampleImages", StupidAdditionTransformer(),
                                 {}, batch_size=4)
    nb.base_element.cache_folder = cache_folder_neuro

    # 4. setup usual pipeline
    ss = PipelineElement("StandardScaler", {})
    pca = PipelineElement("PCA", {'n_components': [3, 10, 50]})
    svm = PipelineElement("SVR", {'kernel': ['rbf', 'linear']})

    pipe = PhotonPipeline([('NeuroBranch', nb),
                           ('StandardScaler', ss),
                           ('PCA', pca),
                           ('SVR', svm)])
    pipe.caching = True
    pipe.fold_id = "12345643463434"
    pipe.cache_folder = cache_folder_base

    def transform_and_check_folder(config, expected_nr_of_files_group,
                                   expected_nr_subject):
        # fit with the given config, then count pickles in both cache levels
        pipe.set_params(**config)
        pipe.fit(self.X, self.y)
        nr_of_generated_cache_files = len(
            glob.glob(os.path.join(cache_folder_base, "*.p")))
        self.assertEqual(nr_of_generated_cache_files,
                         expected_nr_of_files_group)
        nr_of_generated_cache_files_subject = len(
            glob.glob(os.path.join(cache_folder_neuro, "*.p")))
        self.assertEqual(nr_of_generated_cache_files_subject,
                         expected_nr_subject)

    config1 = {
        'NeuroBranch__ResampleImages__voxel_size': 5,
        'PCA__n_components': 7,
        'SVR__C': 2
    }
    config2 = {
        'NeuroBranch__ResampleImages__voxel_size': 3,
        'PCA__n_components': 4,
        'SVR__C': 5
    }

    # first config: one cached file each for the standard scaler and the pca,
    # and two files (one resampler, one brain mask) for each input datum
    transform_and_check_folder(config1, 2,
                               self.nr_of_expected_pickles_per_config)

    # second config: two cached files for the standard scaler (one for 5 voxel
    # input and one for 3 voxel input) and two for the first and second config
    # pcas; 2 * nr of input data for the resampler plus one time masker
    transform_and_check_folder(config2, 4,
                               2 * self.nr_of_expected_pickles_per_config)

    # transforming with the first config again must change nothing
    transform_and_check_folder(config1, 4,
                               2 * self.nr_of_expected_pickles_per_config)

    # an empty config would generate new pca / standard scaler entries plus a
    # new cache item per input datum in the neuro branch, but is rejected
    with self.assertRaises(ValueError):
        transform_and_check_folder({}, 6,
                                   4 * self.nr_of_expected_pickles_per_config)

    CacheManager.clear_cache_files(cache_folder_base)
    CacheManager.clear_cache_files(cache_folder_neuro)
# NOTE(review): predict/save read like methods of AdditionalDataWrapper —
# the enclosing class header is not visible in this chunk; confirm placement.
def predict(self, X, **kwargs):
    """Return the ground-truth labels passed in through kwargs as the 'predictions'."""
    expected = kwargs["true_predictions"]
    assert X.shape[0] == len(expected)
    return expected

def save(self):
    """Nothing to persist for this wrapper."""
    return None


# WE USE THE BREAST CANCER SET FROM SKLEARN
X, y = load_breast_cancer(return_X_y=True)

settings = OutputSettings(project_folder='./tmp/')

# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe('basic_svm_pipe',
                    # the performance metrics of your interest
                    metrics=['accuracy', 'precision', 'recall',
                             'balanced_accuracy'],
                    best_config_metric='accuracy',
                    outer_cv=KFold(n_splits=3),
                    inner_cv=KFold(n_splits=3),
                    verbosity=1,
                    output_settings=settings)

my_pipe.add(PipelineElement('StandardScaler'))
my_pipe += PipelineElement.create("CustomWrapper", AdditionalDataWrapper(),
                                  hyperparameters={})

# the wrapper echoes back the labels that are handed in as additional data
my_pipe.fit(X, y, true_predictions=np.array(y))
def test_save_optimum_pipe(self):
    """Fit a nested pipeline, persist the best model, move it, and reload it."""
    # todo: test .save() of custom model
    tmp_path = os.path.join(self.tmp_folder_path, 'optimum_pipypipe')
    settings = OutputSettings(project_folder=tmp_path, overwrite_results=True)

    pipe = Hyperpipe('hyperpipe',
                     optimizer='random_grid_search',
                     optimizer_params={'n_configurations': 3},
                     metrics=['accuracy', 'precision', 'recall'],
                     best_config_metric='f1_score',
                     outer_cv=KFold(n_splits=2),
                     inner_cv=KFold(n_splits=2),
                     verbosity=1,
                     output_settings=settings)

    preproc = Preprocessing()
    preproc += PipelineElement('StandardScaler')

    # branch with QuantileTransformer and DecisionTreeClassifier
    tree_qua_branch = Branch('tree_branch')
    tree_qua_branch += PipelineElement('QuantileTransformer')
    tree_qua_branch += PipelineElement(
        'DecisionTreeClassifier',
        {'min_samples_split': IntegerRange(2, 4)},
        criterion='gini')

    # branch with MinMaxScaler and SVC
    svm_mima_branch = Branch('svm_branch')
    svm_mima_branch += PipelineElement('MinMaxScaler')
    svm_mima_branch += PipelineElement(
        'SVC',
        {'kernel': Categorical(['rbf', 'linear']), 'C': 2.0},
        gamma='auto')

    # branch with a custom dummy transformer and KNeighborsClassifier
    knn_sta_branch = Branch('neighbour_branch')
    knn_sta_branch += PipelineElement.create("dummy", DummyTransformer(), {})
    knn_sta_branch += PipelineElement('KNeighborsClassifier')

    pipe += preproc
    # voting = True to mean the result of every branch
    pipe += Stack('final_stack',
                  [tree_qua_branch, svm_mima_branch, knn_sta_branch])
    pipe += PipelineElement('LogisticRegression', solver='lbfgs')

    pipe.fit(self.__X, self.__y)
    model_path = os.path.join(pipe.output_settings.results_folder,
                              'photon_best_model.photon')
    self.assertTrue(os.path.exists(model_path))

    # now move the optimum pipe to a new folder and check if
    # load_optimum_pipe still works from there
    test_folder = os.path.join(pipe.output_settings.results_folder,
                               'new_test_folder')
    new_model_path = os.path.join(test_folder, 'photon_best_model.photon')
    os.makedirs(test_folder)
    shutil.copyfile(model_path, new_model_path)

    # check if we have the meta information recovered
    reloaded_pipe = Hyperpipe.load_optimum_pipe(new_model_path)
    self.assertIsNotNone(reloaded_pipe._meta_information)
    self.assertIsNotNone(reloaded_pipe._meta_information['photon_version'])

    # check that predictions stay reliably the same after reloading
    y_pred_loaded = reloaded_pipe.predict(self.__X)
    y_pred = pipe.optimum_pipe.predict(self.__X)
    np.testing.assert_array_equal(y_pred_loaded, y_pred)