def setUp(self):
    """Create a caching-enabled scaler -> PCA -> SVC pipeline and two configs."""
    super(CachedPhotonPipelineTests, self).setUp()

    # Photon Version
    steps = [
        ('StandardScaler', PipelineElement("StandardScaler", {})),
        ('PCA', PipelineElement("PCA", {'n_components': [3, 10, 50]},
                                random_state=3)),
        ('SVC', PipelineElement("SVC", {'kernel': ['rbf', 'linear']},
                                random_state=3)),
    ]
    self.pipe = PhotonPipeline(steps)

    # Switch caching on and start from an empty cache folder.
    self.pipe.caching = True
    self.pipe.fold_id = "12345643463434"
    CacheManager.clear_cache_files(self.cache_folder_path)
    self.pipe.cache_folder = self.cache_folder_path

    # Two hyperparameter configurations used by the caching tests.
    self.config1 = {
        'PCA__n_components': 4,
        'SVC__C': 3,
        'SVC__kernel': 'rbf',
    }
    self.config2 = {
        'PCA__n_components': 7,
        'SVC__C': 1,
        'SVC__kernel': 'linear',
    }

    self.X, self.y = load_breast_cancer(return_X_y=True)
def setUp(self):
    """Create a caching-enabled scaler -> PCA -> SVC pipeline and two configs."""
    super(CachedPhotonPipelineTests, self).setUp()

    # Photon Version
    ss = PipelineElement("StandardScaler", {})
    pca = PipelineElement("PCA", {"n_components": [3, 10, 50]}, random_state=3)
    svm = PipelineElement("SVC", {"kernel": ["rbf", "linear"]}, random_state=3)

    self.pipe = PhotonPipeline([("StandardScaler", ss), ("PCA", pca), ("SVC", svm)])

    self.pipe.caching = True
    self.pipe.fold_id = "12345643463434"
    self.pipe.cache_folder = self.cache_folder_path

    self.config1 = {
        "PCA__n_components": 4,
        "SVC__C": 3,
        "SVC__kernel": "rbf",
    }
    self.config2 = {
        "PCA__n_components": 7,
        "SVC__C": 1,
        "SVC__kernel": "linear",
    }

    # return_X_y must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    self.X, self.y = load_breast_cancer(return_X_y=True)
def test_random_state(self):
    """After fitting, the elements report the pipeline's random_state."""
    pca_element = PipelineElement('PCA')
    pipeline = PhotonPipeline([
        ("SS", self.p_ss),
        ("PCA", pca_element),
        ("SVC", self.p_dt),
    ])
    pipeline.random_state = 666
    pipeline.fit(self.X, self.y)

    # The estimator handed in by reference carries the pipeline's seed ...
    self.assertEqual(self.p_dt.random_state, pipeline.random_state)
    # ... and so does the element stored at position 1 (the PCA step).
    self.assertEqual(pipeline.elements[1][-1].random_state,
                     pipeline.random_state)
    self.assertEqual(self.p_dt.random_state, 666)
def setUp(self):
    """Prepare pipeline, config, inner folds and optimization settings."""
    super(InnerFoldTests, self).setUp()

    self.pipe = PhotonPipeline([
        ("StandardScaler", PipelineElement("StandardScaler")),
        ("PCA", PipelineElement("PCA")),
        ("RidgeClassifier", PipelineElement("RidgeClassifier")),
    ])

    self.config = {
        "PCA__n_components": 5,
        "RidgeClassifier__solver": "svd",
        "RidgeClassifier__random_state": 42,
    }

    self.outer_fold_id = "TestID"
    self.inner_cv = KFold(n_splits=4)

    # return_X_y must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    self.X, self.y = load_breast_cancer(return_X_y=True)

    self.cross_validation = Hyperpipe.CrossValidation(
        self.inner_cv, None, True, 0.2, True, False)

    # Register one FoldInfo per inner split under the single outer fold id;
    # fold numbers are 1-based (index + 1).
    self.cross_validation.inner_folds = {
        self.outer_fold_id: {
            i: FoldInfo(i, i + 1, train, test)
            for i, (train, test) in enumerate(
                self.inner_cv.split(self.X, self.y))
        }
    }

    self.optimization = Hyperpipe.Optimization(
        "grid_search", {}, ["accuracy", "recall", "specificity"],
        "accuracy", None)
def setUp(self):
    """Prepare pipeline, config, inner folds and optimization settings."""
    super(InnerFoldTests, self).setUp()

    # Every step is a default-constructed element named after its estimator.
    step_names = ('StandardScaler', 'PCA', 'RidgeClassifier')
    self.pipe = PhotonPipeline(
        [(step, PipelineElement(step)) for step in step_names])

    self.config = {
        'PCA__n_components': 5,
        'RidgeClassifier__solver': 'svd',
        'RidgeClassifier__random_state': 42
    }

    self.outer_fold_id = 'TestID'
    self.inner_cv = KFold(n_splits=4)
    self.X, self.y = load_breast_cancer(return_X_y=True)

    self.cross_validation = Hyperpipe.CrossValidation(
        self.inner_cv, None, True, 0.2, True, False, False, None)

    # One FoldInfo per inner split, keyed by split index; fold numbers are
    # 1-based (index + 1).
    folds_by_index = {}
    for idx, (train, test) in enumerate(self.inner_cv.split(self.X, self.y)):
        folds_by_index[idx] = FoldInfo(idx, idx + 1, train, test)
    self.cross_validation.inner_folds = {self.outer_fold_id: folds_by_index}

    metrics = ['accuracy', 'recall', 'specificity']
    self.optimization = Hyperpipe.Optimization(
        'grid_search', {}, metrics, 'accuracy', None)
def test_copy_me(self):
    """copy_me() of a nested pipeline keeps structure, names and settings."""
    switch = Switch("my_copy_switch")
    switch += PipelineElement("StandardScaler")
    switch += PipelineElement("RobustScaler", test_disabled=True)

    stack = Stack("RandomStack")
    stack += PipelineElement("SVC")

    branch = Branch('Random_Branch')
    pca_hyperparameters = {'n_components': [5, 10]}
    branch += PipelineElement("PCA", hyperparameters=pca_hyperparameters)
    branch += PipelineElement("DecisionTreeClassifier")
    stack += branch

    photon_pipe = PhotonPipeline([
        ("SimpleImputer", PipelineElement("SimpleImputer")),
        ("my_copy_switch", switch),
        ('RandomStack', stack),
        ('Callback1', CallbackElement('tmp_callback', np.mean)),
        ("PhotonVotingClassifier", PipelineElement("PhotonVotingClassifier")),
    ])

    copy_of_the_pipe = photon_pipe.copy_me()

    # Dedicated assert methods report the offending values on failure.
    self.assertEqual(photon_pipe.random_state, copy_of_the_pipe.random_state)
    self.assertEqual(len(copy_of_the_pipe.elements), 5)
    self.assertEqual(copy_of_the_pipe.elements[2][1].name, "RandomStack")
    self.assertTrue(
        copy_of_the_pipe.named_steps["my_copy_switch"].elements[1].test_disabled)
    # Hyperparameters of the PCA inside the branch inside the stack.
    self.assertDictEqual(
        copy_of_the_pipe.elements[2][1].elements[1].elements[0].hyperparameters,
        {"PCA__n_components": [5, 10]})
    self.assertIsInstance(copy_of_the_pipe.elements[3][1], CallbackElement)
    # In the copy the callback is looked up by the element's own name
    # ('tmp_callback'), not by the step label used above ('Callback1').
    self.assertEqual(
        copy_of_the_pipe.named_steps["tmp_callback"].delegate_function,
        np.mean)
def test_y_and_covariates_transformation(self):
    """Check how X, y and kwargs travel through a transforming element."""
    X = np.ones((200, 50))
    y = np.ones((200, )) + 2
    kwargs = {"sample1": np.ones((200, 5))}

    photon_pipe = PhotonPipeline([("DummyTransformer",
                                   self.dummy_photon_element)])

    # if y is None, all y transformers should be ignored
    Xt2, yt2, kwargst2 = photon_pipe.transform(X, None, **kwargs)
    self.assertTrue(np.array_equal(Xt2, X))
    self.assertIsNone(yt2)
    # Compare the kwargs entry by entry: comparing the two dicts as a whole
    # only works while their arrays are the very same objects.
    self.assertEqual(set(kwargst2), set(kwargs))
    for key, value in kwargs.items():
        self.assertTrue(np.array_equal(kwargst2[key], value))

    # if y is given, all y transformers should be working
    Xt, yt, kwargst = photon_pipe.transform(X, y, **kwargs)

    # assure that data is delivered to element correctly
    base_element = self.dummy_photon_element.base_element
    self.assertTrue(np.array_equal(X, base_element.X))
    self.assertTrue(np.array_equal(y, base_element.y))
    self.assertTrue(
        np.array_equal(kwargs["sample1"], base_element.kwargs["sample1"]))

    # assure that data is transformed correctly
    self.assertTrue(np.array_equal(Xt, X - 1))
    self.assertTrue(np.array_equal(yt, y + 1))
    self.assertIn("sample1_edit", kwargst)
    self.assertTrue(
        np.array_equal(kwargst["sample1_edit"], kwargs["sample1"] + 5))
def test_add_preprocessing(self):
    """_add_preprocessing() puts the Preprocessing element in front."""
    my_preprocessing = Preprocessing()
    my_preprocessing += PipelineElement('LabelEncoder')

    photon_pipe = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
    photon_pipe._add_preprocessing(my_preprocessing)

    # PCA + SVC + the added preprocessing step
    self.assertEqual(len(photon_pipe.named_steps), 3)

    # assertEqual (instead of assertTrue(a == b)) shows both objects on failure
    first_element = photon_pipe.elements[0][1]
    self.assertEqual(first_element, my_preprocessing)
    self.assertEqual(photon_pipe.named_steps['Preprocessing'],
                     my_preprocessing)
def test_predict_proba(self):
    """predict_proba of the Photon pipeline matches the sklearn pipeline."""
    # Reference: plain sklearn pipeline.
    reference = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_dt)])
    reference.fit(self.X, self.y)
    expected_proba = reference.predict_proba(self.X)

    # Same steps wrapped as Photon elements.
    candidate = PhotonPipeline([("SS", self.p_ss), ("SVC", self.p_dt)])
    candidate.fit(self.X, self.y)
    actual_proba = candidate.predict_proba(self.X)

    self.assertTrue(np.array_equal(expected_proba, actual_proba))
def test_extract_feature_importances(self):
    """feature_importances_ is a list when available and None otherwise."""
    # one machine with coef_
    self.pipe.fit(self.X, self.y)
    f_importances_coef = self.pipe.feature_importances_
    self.assertIsNotNone(f_importances_coef)
    self.assertIsInstance(f_importances_coef, list)

    # one machine with feature_importances_
    f_imp_pipe = PhotonPipeline([
        ("StandardScaler", PipelineElement("StandardScaler")),
        ("PCA", PipelineElement("PCA")),
        ("DecisionTreeClassifier", PipelineElement("DecisionTreeClassifier")),
    ])
    f_imp_pipe.fit(self.X, self.y)
    f_importances = f_imp_pipe.feature_importances_
    self.assertIsNotNone(f_importances)
    self.assertIsInstance(f_importances, list)

    # one machine that has no feature importances
    no_f_imp_pipe = PhotonPipeline([
        ("StandardScaler", PipelineElement("StandardScaler")),
        ("PCA", PipelineElement("PCA")),
        ("SVC", PipelineElement("SVC", kernel="rbf")),
    ])
    no_f_imp_pipe.fit(self.X, self.y)
    no_f_imps = no_f_imp_pipe.feature_importances_
    self.assertIsNone(no_f_imps)
def objective_function(self, cfg):
    """Return 1 - mean 3-fold CV accuracy of a scaler -> PCA -> SVC pipeline.

    Entries of ``cfg`` whose value is falsy are dropped before the remaining
    ones are applied through ``set_params``.
    """
    active_params = {key: cfg[key] for key in cfg if cfg[key]}

    steps = [
        ("StandardScaler", PipelineElement("StandardScaler", {})),
        ("PCA", PipelineElement("PCA", {}, random_state=3)),
        ("SVC", PipelineElement("SVC", {}, random_state=3, gamma="auto")),
    ]
    pipeline = PhotonPipeline(steps)
    pipeline.set_params(**active_params)

    scorer = make_scorer(accuracy_score, greater_is_better=True)
    fold_scores = cross_val_score(pipeline, self.X, self.y, cv=3,
                                  scoring=scorer)
    print("run")

    # The result is a loss: lower is better.
    return 1 - np.mean(fold_scores)
def test_predict_with_training_flag(self):
    """Label editing inside the pipeline equals editing labels by hand."""
    # Reference: sklearn pipeline trained on manually shifted labels.
    reference = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_svc)])
    reference.fit(self.X, self.y + 1)
    expected_pred = reference.predict(self.X)

    # Photon pipeline: the dummy element edits the labels during the pipeline.
    candidate = PhotonPipeline([
        ("SS", self.p_ss),
        ("YT", self.dummy_photon_element),
        ("SVC", self.p_svm),
    ])
    candidate.fit(self.X, self.y)
    actual_pred = candidate.predict(self.X)

    # The y-transformer must have received the standardized data.
    expected_input = self.sk_ss.transform(self.X)
    received_input = self.dummy_photon_element.base_element.X
    self.assertTrue(np.array_equal(expected_input, received_input))

    self.assertTrue(np.array_equal(expected_pred, actual_pred))
def objective_function_simple(self, cfg):
    """Return 1 - mean accuracy of a scaler -> PCA -> SVC pipeline.

    The pipeline is evaluated on the inner folds of the first outer fold of
    ``self.pipe.cross_validation``. Entries of ``cfg`` with a falsy value are
    dropped before the rest is applied through ``set_params``.
    """
    cfg = {key: cfg[key] for key in cfg if cfg[key]}
    scores = []

    # Restrict the data to the training part of the first outer fold.
    first_outer_fold = list(
        self.pipe.cross_validation.outer_folds.values())[0]
    train_indices = first_outer_fold.train_indices
    self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
        self.X, self.y, kwargs=None, indices=train_indices)

    first_inner_folds = list(
        self.pipe.cross_validation.inner_folds.values())[0]
    for fold in first_inner_folds.values():
        # A fresh, seeded pipeline for every inner fold.
        pipeline = PhotonPipeline([
            ('StandardScaler', PipelineElement("StandardScaler", {})),
            ('PCA', PipelineElement("PCA", {}, random_state=42)),
            ('SVC', PipelineElement("SVC", {}, random_state=42,
                                    gamma='auto')),
        ])
        pipeline.set_params(**cfg)
        pipeline.fit(self._validation_X[fold.train_indices, :],
                     self._validation_y[fold.train_indices])

        y_true = self._validation_y[fold.test_indices]
        y_pred = pipeline.predict(self._validation_X[fold.test_indices, :])
        scores.append(accuracy_score(y_true, y_pred))

    # The result is a loss: lower is better.
    return 1 - np.mean(scores)
def objective_function(cfg):
    """Return the accuracy of a StandardScaler -> SVC pipeline set up by ``cfg``.

    Reads ``seed``, ``X``, ``y``, ``X_train`` and ``y_true`` from the
    enclosing scope.
    """
    steps = [('StandardScaler', StandardScaler()), ('SVC', SVC())]
    pipeline = PhotonPipeline(steps)
    pipeline.random_state = seed
    pipeline.set_params(**cfg)

    # NOTE(review): fitting uses X/y while predicting uses X_train -- confirm
    # against the surrounding code that this is intended.
    pipeline.fit(X, y)
    predictions = pipeline.predict(X_train)
    return accuracy_score(predictions, y_true)
def test_no_estimator(self):
    """Without a final estimator, predict() returns the transformed data."""
    pipeline = PhotonPipeline([("StandardScaler", self.p_ss),
                               ("PCA", self.p_pca)])
    pipeline.fit(self.X, self.y)
    transformed, _, _ = pipeline.transform(self.X)
    predicted = pipeline.predict(self.X)

    self.assertTrue(np.array_equal(predicted, transformed))

    # Rebuild the expected output step by step with the sklearn objects.
    self.sk_ss.fit(self.X)
    scaled = self.sk_ss.transform(self.X)
    self.sk_pca.fit(scaled)
    expected = self.sk_pca.transform(scaled)

    self.assertTrue(np.array_equal(transformed, expected))
    self.assertTrue(np.array_equal(predicted, expected))
def setUp(self):
    """Prepare CV info, optimization info and a regression pipeline."""
    super(OuterFoldTests, self).setUp()

    self.fold_nr_inner_cv = 5
    self.inner_cv = ShuffleSplit(n_splits=self.fold_nr_inner_cv,
                                 random_state=42)
    self.outer_cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    cv_settings = dict(
        inner_cv=self.inner_cv,
        outer_cv=self.outer_cv,
        eval_final_performance=True,
        test_size=0.2,
        calculate_metrics_per_fold=True,
        calculate_metrics_across_folds=False,
        learning_curves=False,
        learning_curves_cut=None)
    self.cv_info = Hyperpipe.CrossValidation(**cv_settings)

    # NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
    # in 1.2 -- this fixture needs another regression dataset on newer versions.
    self.X, self.y = load_boston(return_X_y=True)

    # Every split is stored under the same id; outer_cv is configured with
    # n_splits=1 above.
    self.outer_fold_id = "TestFoldOuter1"
    outer_folds = {}
    for train, test in self.outer_cv.split(self.X, self.y):
        outer_folds[self.outer_fold_id] = FoldInfo(0, 1, train, test)
    self.cv_info.outer_folds = outer_folds

    self.config_num = 2

    optimization_settings = dict(
        metrics=['mean_absolute_error', 'mean_squared_error'],
        best_config_metric='mean_absolute_error',
        optimizer_input='grid_search',
        optimizer_params={},
        performance_constraints=None)
    self.optimization_info = Hyperpipe.Optimization(**optimization_settings)

    self.elements = [
        PipelineElement('StandardScaler'),
        PipelineElement('PCA', {'n_components': [4, 7]}),
        PipelineElement('DecisionTreeRegressor', random_state=42)
    ]
    # Each step is labelled with its element's own name.
    named_steps = [(element.name, element) for element in self.elements]
    self.pipe = PhotonPipeline(named_steps)
def test_regular_use(self):
    """fit/transform/predict agree with the referenced elements and sklearn."""
    pipeline = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
    pipeline.fit(self.X, self.y)
    pipe_transformed, _, _ = pipeline.transform(self.X)
    pipe_predicted = pipeline.predict(self.X)

    # the element is given by reference, so it should be fitted right here
    ref_transformed, _, _ = self.p_pca.transform(self.X)
    ref_predicted = self.p_svm.predict(ref_transformed)

    self.assertTrue(np.array_equal(pipe_transformed, ref_transformed))
    self.assertTrue(np.array_equal(pipe_predicted, ref_predicted))

    # The same steps as a plain sklearn pipeline must predict the same labels.
    reference = SKPipeline([('PCA', self.sk_pca), ("SVC", self.sk_svc)])
    reference.fit(self.X, self.y)
    expected_predicted = reference.predict(self.X)

    self.assertTrue(np.array_equal(pipe_predicted, expected_predicted))
def objective_function_switch(self, cfg):
    """Return 1 - mean accuracy of scaler -> PCA -> (SVC | random forest).

    ``cfg["Estimator_switch"]`` selects the final estimator ('svc' picks the
    SVC, anything else the RandomForestClassifier) and is removed before the
    remaining entries are applied through ``set_params``. Entries with a
    falsy value are dropped first. Evaluation runs on the inner folds of the
    first outer fold of ``self.pipe.cross_validation``.
    """
    cfg = {key: cfg[key] for key in cfg if cfg[key]}
    scores = []

    # Restrict the data to the training part of the first outer fold.
    first_outer_fold = list(
        self.pipe.cross_validation.outer_folds.values())[0]
    train_indices = first_outer_fold.train_indices
    self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
        self.X, self.y, kwargs=None, indices=train_indices)

    # Take the selector out of the config; it is not a pipeline parameter.
    switch = cfg.pop("Estimator_switch")

    first_inner_folds = list(
        self.pipe.cross_validation.inner_folds.values())[0]
    for fold in first_inner_folds.values():
        scaler = PipelineElement("StandardScaler", {})
        reducer = PipelineElement("PCA", {}, random_state=42)
        if switch == 'svc':
            name = 'SVC'
            estimator = PipelineElement("SVC", {}, random_state=42,
                                        gamma='auto')
        else:
            name = "RandomForestClassifier"
            estimator = PipelineElement("RandomForestClassifier", {},
                                        random_state=42)

        pipeline = PhotonPipeline([('StandardScaler', scaler),
                                   ('PCA', reducer),
                                   (name, estimator)])
        pipeline.set_params(**cfg)
        pipeline.fit(self._validation_X[fold.train_indices, :],
                     self._validation_y[fold.train_indices])

        y_true = self._validation_y[fold.test_indices]
        y_pred = pipeline.predict(self._validation_X[fold.test_indices, :])
        scores.append(accuracy_score(y_true, y_pred))

    # The result is a loss: lower is better.
    return 1 - np.mean(scores)
def test_combi_from_single_and_group_caching(self):
    """Group-level and subject-level caches fill up as configs change.

    The cache folders are cleared before the test and, via ``finally``, also
    after it -- even when an assertion fails -- so that leftover files cannot
    influence other tests.
    """
    # 2. specify cache directories
    cache_folder_base = self.cache_folder_path
    cache_folder_neuro = os.path.join(cache_folder_base,
                                      'subject_caching_test')

    def clear_cache_folders():
        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

    clear_cache_folders()
    try:
        # 3. set up Neuro Branch
        nb = ParallelBranch("SubjectCaching", nr_of_processes=3)
        # increase complexity by adding batching
        nb += PipelineElement.create("ResampleImages",
                                     StupidAdditionTransformer(), {},
                                     batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # 4. setup usual pipeline
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {'n_components': [3, 10, 50]})
        svm = PipelineElement("SVR", {'kernel': ['rbf', 'linear']})

        pipe = PhotonPipeline([('NeuroBranch', nb), ('StandardScaler', ss),
                               ('PCA', pca), ('SVR', svm)])
        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            """Fit with ``config`` and check the number of '*.p' cache files."""
            pipe.set_params(**config)
            pipe.fit(self.X, self.y)

            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder_base, "*.p")))
            self.assertEqual(nr_of_generated_cache_files,
                             expected_nr_of_files_group)

            nr_of_generated_cache_files_subject = len(
                glob.glob(os.path.join(cache_folder_neuro, "*.p")))
            self.assertEqual(nr_of_generated_cache_files_subject,
                             expected_nr_subject)

        config1 = {
            'NeuroBranch__ResampleImages__voxel_size': 5,
            'PCA__n_components': 7,
            'SVR__C': 2
        }
        config2 = {
            'NeuroBranch__ResampleImages__voxel_size': 3,
            'PCA__n_components': 4,
            'SVR__C': 5
        }

        # first config: we expect a cached file for the standard scaler and
        # one for the pca, and we expect two files (one resampler, one brain
        # mask) for each input data
        transform_and_check_folder(config1, 2,
                                   self.nr_of_expected_pickles_per_config)

        # second config: we expect two cached files for the standard scaler
        # (one for the 5 voxel input and one for the 3 voxel input) and two
        # files for the pcas of the first and second config, and we expect
        # 2 * nr of input data for resampler plus one time masker
        transform_and_check_folder(config2, 4,
                                   2 * self.nr_of_expected_pickles_per_config)

        # when we transform with the first config again, nothing should happen
        transform_and_check_folder(config1, 4,
                                   2 * self.nr_of_expected_pickles_per_config)

        # transforming with an empty config is expected to raise a
        # ValueError, so the file counts passed here are never checked
        with self.assertRaises(ValueError):
            transform_and_check_folder(
                {}, 6, 4 * self.nr_of_expected_pickles_per_config)
    finally:
        clear_cache_folders()
def setUp(self):
    """Build the pipelines (with and without callbacks) used by the tests."""

    def callback(X, y=None, **kwargs):
        # The data reaching this callback must still have its full shape.
        self.assertEqual(X.shape, (569, 30))
        print("Shape of transformed data: {}".format(X.shape))

    def predict_callback(X, y=None, **kwargs):
        # Placed after the estimator: receives one value per sample.
        self.assertEqual(X.shape, (569, ))
        print('Shape of predictions: {}'.format(X.shape))

    def callback_test_equality(X, y=None, **kwargs):
        # The data must reach this callback unchanged.
        self.assertTrue(np.array_equal(self.X, X))
        if y is not None:
            self.assertListEqual(self.y.tolist(), y.tolist())

    # return_X_y must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument.
    self.X, self.y = load_breast_cancer(return_X_y=True)

    self.clean_pipeline = PhotonPipeline(elements=[
        ('PCA', PipelineElement('PCA')),
        ('LogisticRegression', PipelineElement('LogisticRegression')),
    ])

    self.callback_pipeline = PhotonPipeline(elements=[
        ('First', CallbackElement('First', callback)),
        ('PCA', PipelineElement('PCA')),
        ('Second', CallbackElement('Second', callback)),
        ('LogisticRegression', PipelineElement('LogisticRegression')),
    ])

    self.clean_branch_pipeline = PhotonPipeline(elements=[
        ('MyBranch', Branch('MyBranch', [PipelineElement('PCA')])),
        ('LogisticRegression', PipelineElement('LogisticRegression')),
    ])

    self.callback_branch_pipeline = PhotonPipeline(elements=[
        ('First', CallbackElement('First', callback)),
        ('MyBranch', Branch('MyBranch', [
            CallbackElement('Second', callback),
            PipelineElement('PCA'),
        ])),
        ('Fourth', CallbackElement('Fourth', callback)),
        ('LogisticRegression', PipelineElement('LogisticRegression')),
    ])

    self.callback_branch_pipeline_error = PhotonPipeline(elements=[
        ('First', CallbackElement('First', callback)),
        ('MyBranch', Branch('MyBranch', [
            CallbackElement('Second', callback),
            PipelineElement('PCA'),
            CallbackElement('Third', callback),
        ])),
        ('Fourth', CallbackElement('Fourth', callback)),
        ('LogisticRegression', PipelineElement('LogisticRegression')),
        ('Fifth', CallbackElement('Fifth', predict_callback)),
    ])

    # test that data is unaffected from pipeline
    # A misplaced parenthesis used to nest the SVR step inside the scaler's
    # tuple; the two are separate (name, element) steps. The step label
    # 'StandarcScaler' is kept as it was in case it is looked up by name.
    self.callback_after_callback_pipeline = PhotonPipeline([
        ('Callback1', CallbackElement('Callback1', callback)),
        ('Callback2', CallbackElement('Callback2', callback_test_equality)),
        ('StandarcScaler', PipelineElement('StandardScaler')),
        ('SVR', PipelineElement('SVR')),
    ])
class InnerFoldTests(PhotonBaseTest):
    """Tests for InnerFoldManager: fitting, constraints, metrics, curves."""

    @classmethod
    def setUpClass(cls) -> None:
        cls.file = __file__
        super(InnerFoldTests, cls).setUpClass()

    def setUp(self):
        """Prepare pipeline, config, inner folds and optimization settings."""
        super(InnerFoldTests, self).setUp()
        self.pipe = PhotonPipeline([
            ('StandardScaler', PipelineElement('StandardScaler')),
            ('PCA', PipelineElement('PCA')),
            ('RidgeClassifier', PipelineElement('RidgeClassifier'))
        ])
        self.config = {
            'PCA__n_components': 5,
            'RidgeClassifier__solver': 'svd',
            'RidgeClassifier__random_state': 42
        }
        self.outer_fold_id = 'TestID'
        self.inner_cv = KFold(n_splits=4)
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.cross_validation = Hyperpipe.CrossValidation(
            self.inner_cv, None, True, 0.2, True, False, False, None)
        # One FoldInfo per inner split under the single outer fold id;
        # fold numbers are 1-based (index + 1).
        self.cross_validation.inner_folds = {
            self.outer_fold_id: {
                i: FoldInfo(i, i + 1, train, test)
                for i, (train, test) in enumerate(
                    self.inner_cv.split(self.X, self.y))
            }
        }
        self.optimization = Hyperpipe.Optimization(
            'grid_search', {}, ['accuracy', 'recall', 'specificity'],
            'accuracy', None)

    def test_fit_against_sklearn(self):
        """Per-fold results must match a plain sklearn pipeline."""
        test_pipe = InnerFoldManager(self.pipe.copy_me, self.config,
                                     self.optimization, self.cross_validation,
                                     self.outer_fold_id)

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertIsNotNone(photon_results_config_item.computation_start_time)
        self.assertIsNotNone(photon_results_config_item.computation_end_time)

        # now sklearn.
        sklearn_pipe = Pipeline([('StandardScaler', StandardScaler()),
                                 ('PCA', PCA()),
                                 ('RidgeClassifier', RidgeClassifier())])
        sklearn_pipe.set_params(**self.config)

        inner_folds = self.cross_validation.inner_folds[self.outer_fold_id]
        for fold_obj in inner_folds.values():
            train_X = self.X[fold_obj.train_indices]
            test_X = self.X[fold_obj.test_indices]
            train_y = self.y[fold_obj.train_indices]
            test_y = self.y[fold_obj.test_indices]

            sklearn_pipe.fit(train_X, train_y)
            sklearn_predictions = sklearn_pipe.predict(test_X)
            sklearn_feature_importances = sklearn_pipe.named_steps[
                'RidgeClassifier'].coef_

            # fold_nr is 1-based, the result list is 0-based
            photon_fold = photon_results_config_item.inner_folds[
                fold_obj.fold_nr - 1]
            photon_test_results = photon_fold.validation

            self.assertTrue(
                np.array_equal(sklearn_predictions,
                               photon_test_results.y_pred))

            for fi, sklearn_feature_importance_score in enumerate(
                    sklearn_feature_importances[0]):
                self.assertAlmostEqual(
                    sklearn_feature_importance_score,
                    photon_fold.feature_importances[0][fi])

            accuracy = accuracy_score(test_y, sklearn_predictions)
            self.assertEqual(photon_test_results.metrics['accuracy'],
                             accuracy)

            recall = recall_score(test_y, sklearn_predictions)
            self.assertEqual(photon_test_results.metrics['recall'], recall)

    def test_performance_constraints(self):
        """Performance constraints stop the inner folds early when violated."""
        # test if the constraints are considered
        # A: for a single constraint
        test_pipe = InnerFoldManager(
            self.pipe.copy_me,
            self.config,
            self.optimization,
            self.cross_validation,
            self.outer_fold_id,
            optimization_constraints=MinimumPerformance(
                'accuracy', 0.95, 'first'))

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        # the first fold has an accuracy of 0.874 so we expect the test_pipe
        # to stop calculating after the first fold, which means there is
        # only one inner fold result
        self.assertEqual(len(photon_results_config_item.inner_folds), 1)

        # B: for a list of constraints, accuracy should pass (0.874 in first
        # fold > accuracy threshold) but specificity should stop the
        # computation (0.78 in first fold < specificity threshold)
        test_pipe = InnerFoldManager(
            self.pipe.copy_me,
            self.config,
            self.optimization,
            self.cross_validation,
            self.outer_fold_id,
            optimization_constraints=[
                MinimumPerformance('accuracy', 0.85, 'first'),
                MinimumPerformance('specificity', 0.8, 'first')
            ])

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertEqual(len(photon_results_config_item.inner_folds), 1)

        # C: for a list of constraints, all should pass
        test_pipe = InnerFoldManager(
            self.pipe.copy_me,
            self.config,
            self.optimization,
            self.cross_validation,
            self.outer_fold_id,
            optimization_constraints=[
                MinimumPerformance('accuracy', 0.75, 'all'),
                MinimumPerformance('specificity', 0.75, 'all')
            ])

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertEqual(len(photon_results_config_item.inner_folds), 4)

    def test_raise_error(self):
        """raise_error decides whether a failing fit propagates."""
        # case A: raise_error = False -> we expect continuation of the
        # computation
        test_pipe = InnerFoldManager(self.pipe.copy_me,
                                     self.config,
                                     self.optimization,
                                     self.cross_validation,
                                     self.outer_fold_id,
                                     raise_error=False)

        # computing with an unequal number of samples and targets should
        # result in an error, which must not propagate here
        test_pipe.fit(self.X, self.y[:10])

        # case B:
        test_pipe.raise_error = True
        with self.assertRaises(IndexError):
            test_pipe.fit(self.X, self.y[:10])

    def test_save_predictions(self):
        """Predictions are stored for every inner fold."""
        # assert that we have the predictions stored
        test_pipe = InnerFoldManager(self.pipe.copy_me, self.config,
                                     self.optimization, self.cross_validation,
                                     self.outer_fold_id)

        # in case we want to have metrics calculated across folds, we need
        # to temporarily store the predictions
        # NOTE(review): the flag is set on optimization_infos here, while
        # test_process_fit_results sets it on cross_validation_infos --
        # confirm which object is meant to carry it.
        test_pipe.optimization_infos.calculate_metrics_across_folds = True
        config_item = test_pipe.fit(self.X, self.y)

        for inner_fold in config_item.inner_folds:
            self.assertEqual(len(inner_fold.training.y_pred),
                             inner_fold.number_samples_training)
            self.assertEqual(len(inner_fold.validation.y_pred),
                             inner_fold.number_samples_validation)

    def test_save_feature_importances(self):
        """Feature importances have one entry per PCA component."""
        test_pipe = InnerFoldManager(self.pipe.copy_me, self.config,
                                     self.optimization, self.cross_validation,
                                     self.outer_fold_id)

        # we expect the feature importances to be of length 5 because the
        # input is reduced to 5 dimensions by the PCA
        output_config = test_pipe.fit(self.X, self.y)
        for inner_fold in output_config.inner_folds:
            self.assertEqual(len(inner_fold.feature_importances[0]), 5)

    def test_process_fit_results(self):
        """Aggregated metrics depend on the per-fold/across-folds flags."""
        test_pipe = InnerFoldManager(self.pipe.copy_me, self.config,
                                     self.optimization, self.cross_validation,
                                     self.outer_fold_id)

        test_pipe.cross_validation_infos.calculate_metrics_across_folds = True
        test_pipe.cross_validation_infos.calculate_metrics_per_fold = False
        across_folds_config_item = test_pipe.fit(self.X, self.y)

        test_pipe.cross_validation_infos.calculate_metrics_across_folds = False
        test_pipe.cross_validation_infos.calculate_metrics_per_fold = True
        per_fold_config_item = test_pipe.fit(self.X, self.y)

        test_pipe.cross_validation_infos.calculate_metrics_across_folds = True
        test_pipe.cross_validation_infos.calculate_metrics_per_fold = True
        across_and_per_folds_config_item = test_pipe.fit(self.X, self.y)

        def assert_fold_operations(expected_operations,
                                   returned_metric_list):
            # every metric must appear once per expected operation
            expected_returns = [
                metric + "__" + str(operation)
                for metric in self.optimization.metrics
                for operation in expected_operations
            ]
            returned_formatted_metric_list = [
                m.metric_name + "__" + str(m.operation)
                for m in returned_metric_list
            ]
            self.assertSetEqual(set(expected_returns),
                                set(returned_formatted_metric_list))

        # if we have both, then we have mean and std over the folds + three
        # raw across folds
        num_of_metrics = len(test_pipe.optimization_infos.metrics)
        self.assertEqual(len(across_and_per_folds_config_item.metrics_train),
                         2 * num_of_metrics + num_of_metrics)
        self.assertEqual(len(across_and_per_folds_config_item.metrics_test),
                         2 * num_of_metrics + num_of_metrics)
        assert_fold_operations(
            [FoldOperations.RAW, FoldOperations.MEAN, FoldOperations.STD],
            across_and_per_folds_config_item.metrics_train)
        assert_fold_operations(
            [FoldOperations.RAW, FoldOperations.MEAN, FoldOperations.STD],
            across_and_per_folds_config_item.metrics_test)

        # if we have across folds only, then it should be 3, one for each
        # metric
        self.assertEqual(len(across_folds_config_item.metrics_train),
                         num_of_metrics)
        self.assertEqual(len(across_folds_config_item.metrics_test),
                         num_of_metrics)
        assert_fold_operations([FoldOperations.RAW],
                               across_folds_config_item.metrics_train)
        assert_fold_operations([FoldOperations.RAW],
                               across_folds_config_item.metrics_test)

        # if we have per fold only, then it should be 6, one for mean and
        # std for each of the three metrics
        self.assertEqual(len(per_fold_config_item.metrics_train),
                         2 * num_of_metrics)
        self.assertEqual(len(per_fold_config_item.metrics_test),
                         2 * num_of_metrics)
        assert_fold_operations([FoldOperations.MEAN, FoldOperations.STD],
                               per_fold_config_item.metrics_train)
        assert_fold_operations([FoldOperations.MEAN, FoldOperations.STD],
                               per_fold_config_item.metrics_test)

    def test_extract_feature_importances(self):
        """feature_importances_ is a list when available and None otherwise."""
        # one machine with coef_
        self.pipe.fit(self.X, self.y)
        f_importances_coef = self.pipe.feature_importances_
        self.assertIsNotNone(f_importances_coef)
        self.assertIsInstance(f_importances_coef, list)

        # one machine with feature_importances_
        f_imp_pipe = PhotonPipeline([
            ('StandardScaler', PipelineElement('StandardScaler')),
            ('PCA', PipelineElement('PCA')),
            ('DecisionTreeClassifier',
             PipelineElement('DecisionTreeClassifier'))
        ])
        f_imp_pipe.fit(self.X, self.y)
        f_importances = f_imp_pipe.feature_importances_
        self.assertIsNotNone(f_importances)
        self.assertIsInstance(f_importances, list)

        # one machine that has no feature importances
        no_f_imp_pipe = PhotonPipeline([
            ('StandardScaler', PipelineElement('StandardScaler')),
            ('PCA', PipelineElement('PCA')),
            ('SVC', PipelineElement('SVC', kernel='rbf'))
        ])
        no_f_imp_pipe.fit(self.X, self.y)
        no_f_imps = no_f_imp_pipe.feature_importances_
        self.assertIsNone(no_f_imps)

    def test_learning_curves(self):
        """Learning curves are recorded only when requested."""

        def test_one_hyperpipe(learning_curves, learning_curves_cut):
            # mirror the default cut this test expects when none is given
            if learning_curves and learning_curves_cut is None:
                learning_curves_cut = FloatRange(0, 1, 'range', 0.2)
            output_settings = OutputSettings(
                project_folder=self.tmp_folder_path, save_output=False)
            test_hyperpipe = Hyperpipe(
                'test_pipe',
                learning_curves=learning_curves,
                learning_curves_cut=learning_curves_cut,
                metrics=['accuracy', 'recall', 'specificity'],
                best_config_metric='accuracy',
                inner_cv=self.inner_cv,
                output_settings=output_settings)

            self.assertEqual(test_hyperpipe.cross_validation.learning_curves,
                             learning_curves)
            if learning_curves:
                self.assertEqual(
                    test_hyperpipe.cross_validation.learning_curves_cut,
                    learning_curves_cut)
            else:
                self.assertIsNone(
                    test_hyperpipe.cross_validation.learning_curves_cut)

            test_hyperpipe += PipelineElement('StandardScaler')
            test_hyperpipe += PipelineElement('PCA',
                                              {'n_components': [1, 2]},
                                              random_state=42)
            test_hyperpipe += PipelineElement('SVC', {
                'C': [0.1],
                'kernel': ['linear']
            }, random_state=42)
            test_hyperpipe.fit(self.X, self.y)

            config_results = test_hyperpipe.results_handler.results \
                .outer_folds[0].tested_config_list
            config_num = len(config_results)
            for config_nr in range(config_num):
                for inner_fold_nr in range(self.inner_cv.n_splits):
                    curves = config_results[config_nr].inner_folds[
                        inner_fold_nr].learning_curves
                    if learning_curves:
                        self.assertEqual(len(curves),
                                         len(learning_curves_cut.values))
                        for learning_point_nr in range(
                                len(learning_curves_cut.values)):
                            test_metrics = list(
                                curves[learning_point_nr][1].keys())
                            train_metrics = list(
                                curves[learning_point_nr][2].keys())
                            self.assertEqual(
                                test_hyperpipe.optimization.metrics,
                                test_metrics)
                            self.assertEqual(
                                test_hyperpipe.optimization.metrics,
                                train_metrics)
                    else:
                        self.assertEqual(curves, [])

        # hyperpipe with properly set learning curves
        test_one_hyperpipe(learning_curves=True,
                           learning_curves_cut=FloatRange(0, 1, 'range', 0.5))
        # hyperpipe without cut (default cut should be used here)
        test_one_hyperpipe(learning_curves=True, learning_curves_cut=None)
        # hyperpipe with cut despite learning_curves being False
        test_one_hyperpipe(learning_curves=False,
                           learning_curves_cut=FloatRange(0, 1, 'range', 0.5))
class CachedPhotonPipelineTests(PhotonBaseTest):
    """Group-caching tests for a StandardScaler -> PCA -> SVC PhotonPipeline.

    Each test counts the ``*.p`` files in the pipeline's cache folder to check
    when new cache entries are written and when existing ones are reused.
    """

    @classmethod
    def setUpClass(cls) -> None:
        # NOTE(review): presumably the base class derives its folders from
        # cls.file - confirm against PhotonBaseTest.
        cls.file = __file__
        super(CachedPhotonPipelineTests, cls).setUpClass()

    def setUp(self):
        super(CachedPhotonPipelineTests, self).setUp()

        # Photon Version
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {'n_components': [3, 10, 50]},
                              random_state=3)
        svm = PipelineElement("SVC", {'kernel': ['rbf', 'linear']},
                              random_state=3)

        self.pipe = PhotonPipeline([('StandardScaler', ss),
                                    ('PCA', pca),
                                    ('SVC', svm)])
        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"

        # start every test from an empty cache so the exact file counts hold
        CacheManager.clear_cache_files(self.cache_folder_path)
        self.pipe.cache_folder = self.cache_folder_path

        self.config1 = {'PCA__n_components': 4,
                        'SVC__C': 3,
                        'SVC__kernel': 'rbf'}
        self.config2 = {'PCA__n_components': 7,
                        'SVC__C': 1,
                        'SVC__kernel': 'linear'}

        self.X, self.y = load_breast_cancer(return_X_y=True)

    def _count_cache_files(self):
        """Return the number of ``*.p`` files in the pipe's cache folder."""
        return len(glob.glob(os.path.join(self.pipe.cache_folder, "*.p")))

    def test_group_caching(self):
        # transform one config
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        # two results should be cached
        # (one standard scaler output + one pca output)
        self.assertEqual(self._count_cache_files(), 2)

        # transform second config
        self.pipe.set_params(**self.config2)
        self.pipe.fit(self.X, self.y)
        self.pipe.transform(self.X, self.y)
        # three results should be cached (one standard scaler output, since
        # its config hasn't changed, + two pca outputs)
        self.assertEqual(self._count_cache_files(), 3)

        # now transform with config 1 again, results should be loaded
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_2, y_2, kwargs_2 = self.pipe.transform(self.X, self.y)
        self.assertTrue(np.array_equal(X_new, X_2))
        self.assertTrue(np.array_equal(y_new, y_2))
        self.assertTrue(np.array_equal(kwargs_new, kwargs_2))

        # results should be the same as when caching is deactivated
        self.pipe.caching = False
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_uc, y_uc, kwargs_uc = self.pipe.transform(self.X, self.y)
        self.assertTrue(np.array_equal(X_uc, X_2))
        self.assertTrue(np.array_equal(y_uc, y_2))
        self.assertTrue(np.array_equal(kwargs_uc, kwargs_2))

    def test_empty_hyperparameters(self):
        # test if one can use it when only default parameters are given and
        # the hyperparameter space is empty
        self.pipe.set_params(**{})
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        # two results should be cached
        # (one standard scaler output + one pca output)
        self.assertEqual(self._count_cache_files(), 2)

        self.pipe.set_params(**{})
        self.pipe.fit(self.X, self.y)
        X_new2, y_new2, kwargs_new2 = self.pipe.transform(self.X, self.y)
        # assert nothing happened in the cache folder
        self.assertEqual(self._count_cache_files(), 2)
        self.assertTrue(np.array_equal(X_new, X_new2))
        self.assertTrue(np.array_equal(y_new, y_new2))
        self.assertTrue(np.array_equal(kwargs_new, kwargs_new2))
class InnerFoldTests(PhotonBaseTest):
    """Tests for InnerFoldManager on a StandardScaler -> PCA -> RidgeClassifier pipe.

    setUp prepares a fixed config, a 4-split KFold and the matching inner-fold
    bookkeeping; each test builds an InnerFoldManager from these and checks
    one aspect of the result returned by ``fit``.
    """

    def setUp(self):
        super(InnerFoldTests, self).setUp()
        self.pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("RidgeClassifier", PipelineElement("RidgeClassifier")),
        ])
        self.config = {
            "PCA__n_components": 5,
            "RidgeClassifier__solver": "svd",
            "RidgeClassifier__random_state": 42,
        }
        self.outer_fold_id = "TestID"
        self.inner_cv = KFold(n_splits=4)
        # return_X_y must be given by keyword: current scikit-learn releases
        # reject it as a positional argument
        self.X, self.y = load_breast_cancer(return_X_y=True)
        # NOTE(review): positional arguments kept as-is; the last two look
        # like calculate_metrics_per_fold / calculate_metrics_across_folds
        # (see test_process_fit_results) - confirm against Hyperpipe.
        self.cross_validation = Hyperpipe.CrossValidation(
            self.inner_cv, None, True, 0.2, True, False)
        # one FoldInfo per KFold split, stored under the outer fold id
        self.cross_validation.inner_folds = {
            self.outer_fold_id: {
                i: FoldInfo(i, i + 1, train, test)
                for i, (train, test) in enumerate(
                    self.inner_cv.split(self.X, self.y))
            }
        }
        self.optimization = Hyperpipe.Optimization(
            "grid_search", {}, ["accuracy", "recall", "specificity"],
            "accuracy", None)

    def _make_fold_manager(self, **kwargs):
        """Return an InnerFoldManager built from the fixtures of setUp.

        Extra keyword arguments (e.g. ``optimization_constraints``,
        ``raise_error``) are forwarded to the InnerFoldManager constructor.
        """
        return InnerFoldManager(
            self.pipe.copy_me,
            self.config,
            self.optimization,
            self.cross_validation,
            self.outer_fold_id,
            **kwargs
        )

    def test_fit_against_sklearn(self):
        test_pipe = self._make_fold_manager()

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertIsNotNone(photon_results_config_item.computation_start_time)
        self.assertIsNotNone(photon_results_config_item.computation_end_time)

        # now sklearn: same steps, same config, same folds
        sklearn_pipe = Pipeline([
            ("StandardScaler", StandardScaler()),
            ("PCA", PCA()),
            ("RidgeClassifier", RidgeClassifier()),
        ])
        sklearn_pipe.set_params(**self.config)

        folds = self.cross_validation.inner_folds[self.outer_fold_id]
        for fold_obj in folds.values():
            train_X = self.X[fold_obj.train_indices]
            test_X = self.X[fold_obj.test_indices]
            train_y = self.y[fold_obj.train_indices]
            test_y = self.y[fold_obj.test_indices]

            sklearn_pipe.fit(train_X, train_y)
            sklearn_predictions = sklearn_pipe.predict(test_X)
            sklearn_feature_importances = sklearn_pipe.named_steps[
                "RidgeClassifier"].coef_

            # photon's inner_folds list is indexed with fold_nr - 1
            photon_fold = photon_results_config_item.inner_folds[
                fold_obj.fold_nr - 1]
            photon_test_results = photon_fold.validation

            self.assertTrue(
                np.array_equal(sklearn_predictions,
                               photon_test_results.y_pred))

            for fi, sklearn_score in enumerate(
                    sklearn_feature_importances[0]):
                self.assertAlmostEqual(
                    sklearn_score,
                    photon_fold.feature_importances[0][fi])

            accuracy = accuracy_score(test_y, sklearn_predictions)
            self.assertEqual(photon_test_results.metrics["accuracy"],
                             accuracy)

            recall = recall_score(test_y, sklearn_predictions)
            self.assertEqual(photon_test_results.metrics["recall"], recall)

    def test_performance_constraints(self):
        # test if the constraints are considered
        # A: for a single constraint
        test_pipe = self._make_fold_manager(
            optimization_constraints=MinimumPerformance(
                "accuracy", 0.95, "first"))

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        # the first fold has an accuracy of 0.874 so we expect the test_pipe
        # to stop calculating after the first fold, which means the result
        # holds only one inner fold
        self.assertEqual(len(photon_results_config_item.inner_folds), 1)

        # B: for a list of constraints, accuracy should pass
        # (0.874 in first fold > accuracy threshold) but specificity should
        # stop the computation (0.78 in first fold < specificity threshold)
        test_pipe = self._make_fold_manager(
            optimization_constraints=[
                MinimumPerformance("accuracy", 0.85, "first"),
                MinimumPerformance("specificity", 0.8, "first"),
            ])

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertEqual(len(photon_results_config_item.inner_folds), 1)

        # C: for a list of constraints, all should pass
        test_pipe = self._make_fold_manager(
            optimization_constraints=[
                MinimumPerformance("accuracy", 0.75, "all"),
                MinimumPerformance("specificity", 0.75, "all"),
            ])

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertEqual(len(photon_results_config_item.inner_folds), 4)

    def test_raise_error(self):
        # case A: raise_error = False -> we expect continuation of the
        # computation
        test_pipe = self._make_fold_manager(raise_error=False)

        # fitting with an unequal number of samples and targets must not
        # propagate an error while raise_error is False
        test_pipe.fit(self.X, self.y[:10])

        # case B: raise_error = True -> the error has to surface
        test_pipe.raise_error = True
        with self.assertRaises(IndexError):
            test_pipe.fit(self.X, self.y[:10])

    def test_save_predictions(self):
        # assert that we have the predictions stored
        test_pipe = self._make_fold_manager()

        # in case we want to have metrics calculated across folds, we need to
        # temporarily store the predictions
        # NOTE(review): test_process_fit_results sets this flag on
        # cross_validation_infos, here it is set on optimization_infos -
        # confirm which object is intended.
        test_pipe.optimization_infos.calculate_metrics_across_folds = True
        config_item = test_pipe.fit(self.X, self.y)

        for inner_fold in config_item.inner_folds:
            self.assertEqual(len(inner_fold.training.y_pred),
                             inner_fold.number_samples_training)
            self.assertEqual(len(inner_fold.validation.y_pred),
                             inner_fold.number_samples_validation)

    def test_save_feature_importances(self):
        test_pipe = self._make_fold_manager()

        # we expect the feature importances to be of length 5 because the
        # input is reduced to 5 dimensions by the PCA
        output_config = test_pipe.fit(self.X, self.y)
        for inner_fold in output_config.inner_folds:
            self.assertEqual(len(inner_fold.feature_importances[0]), 5)

    def test_process_fit_results(self):
        test_pipe = self._make_fold_manager()

        # fit three times, once per combination of the two aggregation flags
        test_pipe.cross_validation_infos.calculate_metrics_across_folds = True
        test_pipe.cross_validation_infos.calculate_metrics_per_fold = False
        across_folds_config_item = test_pipe.fit(self.X, self.y)

        test_pipe.cross_validation_infos.calculate_metrics_across_folds = False
        test_pipe.cross_validation_infos.calculate_metrics_per_fold = True
        per_fold_config_item = test_pipe.fit(self.X, self.y)

        test_pipe.cross_validation_infos.calculate_metrics_across_folds = True
        test_pipe.cross_validation_infos.calculate_metrics_per_fold = True
        across_and_per_folds_config_item = test_pipe.fit(self.X, self.y)

        def assert_fold_operations(expected_operations, returned_metric_list):
            # every metric has to appear exactly with the expected operations
            expected_returns = {
                metric + "__" + str(operation)
                for metric in self.optimization.metrics
                for operation in expected_operations
            }
            returned_formatted_metrics = {
                m.metric_name + "__" + str(m.operation)
                for m in returned_metric_list
            }
            self.assertEqual(expected_returns, returned_formatted_metrics)

        # if we have both, then we have mean and std over the folds + one raw
        # value across folds for each metric
        num_of_metrics = len(test_pipe.optimization_infos.metrics)
        self.assertEqual(len(across_and_per_folds_config_item.metrics_train),
                         2 * num_of_metrics + num_of_metrics)
        self.assertEqual(len(across_and_per_folds_config_item.metrics_test),
                         2 * num_of_metrics + num_of_metrics)
        all_operations = [FoldOperations.RAW, FoldOperations.MEAN,
                          FoldOperations.STD]
        assert_fold_operations(
            all_operations, across_and_per_folds_config_item.metrics_train)
        assert_fold_operations(
            all_operations, across_and_per_folds_config_item.metrics_test)

        # if we have across folds only, then it should be 3, one for each
        # metric
        self.assertEqual(len(across_folds_config_item.metrics_train),
                         num_of_metrics)
        self.assertEqual(len(across_folds_config_item.metrics_test),
                         num_of_metrics)
        assert_fold_operations([FoldOperations.RAW],
                               across_folds_config_item.metrics_train)
        assert_fold_operations([FoldOperations.RAW],
                               across_folds_config_item.metrics_test)

        # if we have per fold only, then it should be 6, one for mean and std
        # for each of the three metrics
        self.assertEqual(len(per_fold_config_item.metrics_train),
                         2 * num_of_metrics)
        self.assertEqual(len(per_fold_config_item.metrics_test),
                         2 * num_of_metrics)
        assert_fold_operations([FoldOperations.MEAN, FoldOperations.STD],
                               per_fold_config_item.metrics_train)
        assert_fold_operations([FoldOperations.MEAN, FoldOperations.STD],
                               per_fold_config_item.metrics_test)

    def test_extract_feature_importances(self):
        # one machine with coef_
        self.pipe.fit(self.X, self.y)
        f_importances_coef = self.pipe.feature_importances_
        self.assertIsNotNone(f_importances_coef)
        self.assertIsInstance(f_importances_coef, list)

        # one machine with feature_importances_
        f_imp_pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("DecisionTreeClassifier",
             PipelineElement("DecisionTreeClassifier")),
        ])
        f_imp_pipe.fit(self.X, self.y)
        f_importances = f_imp_pipe.feature_importances_
        self.assertIsNotNone(f_importances)
        self.assertIsInstance(f_importances, list)

        # one machine that has no feature importances
        no_f_imp_pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("SVC", PipelineElement("SVC", kernel="rbf")),
        ])
        no_f_imp_pipe.fit(self.X, self.y)
        no_f_imps = no_f_imp_pipe.feature_importances_
        self.assertIsNone(no_f_imps)
def test_combi_from_single_and_group_caching(self):
    # Combines subject-wise caching inside a NeuroBranch with group caching
    # of the surrounding pipeline and checks the number of cache files after
    # each fit. Cleanup runs in a finally block so that a failing assertion
    # does not leave cache files behind for later exact-count checks.

    # 1. load data
    test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "../test_data/")
    X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                 extension=".nii")
    nr_of_expected_pickles_per_config = len(X)
    y = np.random.randn(len(X))

    # 2. specify cache directories
    cache_folder_base = self.cache_folder_path
    cache_folder_neuro = os.path.join(cache_folder_base,
                                      "subject_caching_test")

    CacheManager.clear_cache_files(cache_folder_base)
    CacheManager.clear_cache_files(cache_folder_neuro)

    try:
        # 3. set up Neuro Branch
        nb = NeuroBranch("SubjectCaching", nr_of_processes=3)
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)
        nb += PipelineElement("BrainMask", batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # 4. setup usual pipeline
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]})
        svm = PipelineElement("SVR", {"kernel": ["rbf", "linear"]})

        pipe = PhotonPipeline([("NeuroBranch", nb),
                               ("StandardScaler", ss),
                               ("PCA", pca),
                               ("SVR", svm)])
        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            # fit with the given config, then count the pickles per folder
            pipe.set_params(**config)
            pipe.fit(X, y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder_base, "*.p")))
            self.assertEqual(nr_of_generated_cache_files,
                             expected_nr_of_files_group)

            nr_of_generated_cache_files_subject = len(
                glob.glob(os.path.join(cache_folder_neuro, "*.p")))
            self.assertEqual(nr_of_generated_cache_files_subject,
                             expected_nr_subject)

        config1 = {
            "NeuroBranch__ResampleImages__voxel_size": 5,
            "PCA__n_components": 7,
            "SVR__C": 2,
        }
        config2 = {
            "NeuroBranch__ResampleImages__voxel_size": 3,
            "PCA__n_components": 4,
            "SVR__C": 5,
        }

        # first config: we expect a cached file for the standard scaler and
        # one for the pca, and two files (one resampler, one brain mask) for
        # each input file
        transform_and_check_folder(config1, 2,
                                   2 * nr_of_expected_pickles_per_config)

        # second config: we expect two cached files for the standard scaler
        # (one for the 5 voxel input and one for the 3 voxel input) and two
        # for the pcas of the first and second config; the subject cache is
        # expected to hold four files per input file
        transform_and_check_folder(config2, 4,
                                   4 * nr_of_expected_pickles_per_config)

        # when we transform with the first config again, nothing should
        # happen
        transform_and_check_folder(config1, 4,
                                   4 * nr_of_expected_pickles_per_config)

        # fitting with an empty config is expected to raise a ValueError,
        # so the counts passed here are never checked
        with self.assertRaises(ValueError):
            transform_and_check_folder(
                {}, 6, 6 * nr_of_expected_pickles_per_config)
    finally:
        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)
def test_inverse_tansform(self):
    # Part 1: a PhotonPipeline's inverse_transform must reproduce what the
    # equivalent sklearn pipeline returns.
    reference_pipe = SKPipeline([("SS", self.sk_ss), ("PCA", self.sk_pca)])
    reference_pipe.fit(self.X, self.y)
    reference_transformed = reference_pipe.transform(self.X)
    expected_inverse = reference_pipe.inverse_transform(reference_transformed)

    photon_pipe = PhotonPipeline([("SS", self.p_ss), ("PCA", self.p_pca)])
    photon_pipe.fit(self.X, self.y)
    # photon returns a 3-tuple; only the first entry (the data) is compared
    photon_transformed, _, _ = photon_pipe.transform(self.X)
    photon_inverse, _, _ = photon_pipe.inverse_transform(photon_transformed)

    self.assertTrue(np.array_equal(expected_inverse, photon_inverse))

    # Part 2: now including a stack - mapping the feature importances back
    # has to yield as many columns as the original data
    pca_stack = Stack("stack", [self.p_pca])
    stack_pipeline = PhotonPipeline([
        ("stack", pca_stack),
        ("StandardScaler", PipelineElement("StandardScaler")),
        ("LinearSVC", PipelineElement("LinearSVC")),
    ])
    stack_pipeline.fit(self.X, self.y)

    importances = stack_pipeline.feature_importances_
    inversed_data, _, _ = stack_pipeline.inverse_transform(importances)
    self.assertEqual(inversed_data.shape[1], self.X.shape[1])
class CachedPhotonPipelineTests(PhotonBaseTest):
    """Caching tests: group caching of a PhotonPipeline, subject-wise caching
    of a NeuroBranch, and the combination of both.

    All tests count the ``*.p`` files in a cache folder, so each one needs to
    start from an empty folder and to clean up even when it fails.
    """

    def setUp(self):
        super(CachedPhotonPipelineTests, self).setUp()

        # Photon Version
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]},
                              random_state=3)
        svm = PipelineElement("SVC", {"kernel": ["rbf", "linear"]},
                              random_state=3)

        self.pipe = PhotonPipeline([("StandardScaler", ss),
                                    ("PCA", pca),
                                    ("SVC", svm)])
        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"

        # start every test from an empty cache so the exact file counts hold
        CacheManager.clear_cache_files(self.cache_folder_path)
        self.pipe.cache_folder = self.cache_folder_path

        self.config1 = {
            "PCA__n_components": 4,
            "SVC__C": 3,
            "SVC__kernel": "rbf"
        }
        self.config2 = {
            "PCA__n_components": 7,
            "SVC__C": 1,
            "SVC__kernel": "linear"
        }

        # return_X_y must be given by keyword: current scikit-learn releases
        # reject it as a positional argument
        self.X, self.y = load_breast_cancer(return_X_y=True)

    @staticmethod
    def _count_pickles(folder):
        """Return the number of ``*.p`` files directly inside ``folder``."""
        return len(glob.glob(os.path.join(folder, "*.p")))

    def test_group_caching(self):
        # transform one config
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        # two results should be cached
        # (one standard scaler output + one pca output)
        self.assertEqual(self._count_pickles(self.pipe.cache_folder), 2)

        # transform second config
        self.pipe.set_params(**self.config2)
        self.pipe.fit(self.X, self.y)
        self.pipe.transform(self.X, self.y)
        # three results should be cached (one standard scaler output, since
        # its config hasn't changed, + two pca outputs)
        self.assertEqual(self._count_pickles(self.pipe.cache_folder), 3)

        # now transform with config 1 again, results should be loaded
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_2, y_2, kwargs_2 = self.pipe.transform(self.X, self.y)
        self.assertTrue(np.array_equal(X_new, X_2))
        self.assertTrue(np.array_equal(y_new, y_2))
        self.assertTrue(np.array_equal(kwargs_new, kwargs_2))

        # results should be the same as when caching is deactivated
        self.pipe.caching = False
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_uc, y_uc, kwargs_uc = self.pipe.transform(self.X, self.y)
        self.assertTrue(np.array_equal(X_uc, X_2))
        self.assertTrue(np.array_equal(y_uc, y_2))
        self.assertTrue(np.array_equal(kwargs_uc, kwargs_2))

    def test_empty_hyperparameters(self):
        # test if one can use it when only default parameters are given and
        # the hyperparameter space is empty
        self.pipe.set_params(**{})
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        # two results should be cached
        # (one standard scaler output + one pca output)
        self.assertEqual(self._count_pickles(self.pipe.cache_folder), 2)

        self.pipe.set_params(**{})
        self.pipe.fit(self.X, self.y)
        X_new2, y_new2, kwargs_new2 = self.pipe.transform(self.X, self.y)
        # assert nothing happened in the cache folder
        self.assertEqual(self._count_pickles(self.pipe.cache_folder), 2)
        self.assertTrue(np.array_equal(X_new, X_new2))
        self.assertTrue(np.array_equal(y_new, y_new2))
        self.assertTrue(np.array_equal(kwargs_new, kwargs_new2))

    def test_single_subject_caching(self):
        nb = NeuroBranch("subject_caching_test")
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)

        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        y = np.random.randn(len(X))

        cache_folder = os.path.join(self.cache_folder_path,
                                    "subject_caching_test")
        # remove leftovers of earlier runs, otherwise the counts are off
        CacheManager.clear_cache_files(cache_folder)
        nb.base_element.cache_folder = cache_folder

        nr_of_expected_pickles_per_config = len(X)

        def transform_and_check_folder(config, expected_nr_of_files):
            nb.set_params(**config)
            nb.transform(X, y)
            self.assertEqual(self._count_pickles(cache_folder),
                             expected_nr_of_files)

        try:
            # fit with first config
            # expect one cache file per input file
            transform_and_check_folder({"ResampleImages__voxel_size": 5},
                                       nr_of_expected_pickles_per_config)

            # after fitting with second config, we expect two times the
            # number of input files to be in cache
            transform_and_check_folder({"ResampleImages__voxel_size": 10},
                                       2 * nr_of_expected_pickles_per_config)

            # fit with first config again, we expect no further cache files
            # to be generated, because they already exist
            transform_and_check_folder({"ResampleImages__voxel_size": 5},
                                       2 * nr_of_expected_pickles_per_config)
        finally:
            # clean up afterwards, also when an assertion failed
            CacheManager.clear_cache_files(cache_folder)

    def test_combi_from_single_and_group_caching(self):
        # 1. load data
        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        nr_of_expected_pickles_per_config = len(X)
        y = np.random.randn(len(X))

        # 2. specify cache directories
        cache_folder_base = self.cache_folder_path
        cache_folder_neuro = os.path.join(cache_folder_base,
                                          "subject_caching_test")

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

        try:
            # 3. set up Neuro Branch
            nb = NeuroBranch("SubjectCaching", nr_of_processes=3)
            # increase complexity by adding batching
            nb += PipelineElement("ResampleImages", batch_size=4)
            nb += PipelineElement("BrainMask", batch_size=4)
            nb.base_element.cache_folder = cache_folder_neuro

            # 4. setup usual pipeline
            ss = PipelineElement("StandardScaler", {})
            pca = PipelineElement("PCA", {"n_components": [3, 10, 50]})
            svm = PipelineElement("SVR", {"kernel": ["rbf", "linear"]})

            pipe = PhotonPipeline([("NeuroBranch", nb),
                                   ("StandardScaler", ss),
                                   ("PCA", pca),
                                   ("SVR", svm)])
            pipe.caching = True
            pipe.fold_id = "12345643463434"
            pipe.cache_folder = cache_folder_base

            def transform_and_check_folder(config,
                                           expected_nr_of_files_group,
                                           expected_nr_subject):
                # fit with the given config, then count pickles per folder
                pipe.set_params(**config)
                pipe.fit(X, y)
                self.assertEqual(self._count_pickles(cache_folder_base),
                                 expected_nr_of_files_group)
                self.assertEqual(self._count_pickles(cache_folder_neuro),
                                 expected_nr_subject)

            config1 = {
                "NeuroBranch__ResampleImages__voxel_size": 5,
                "PCA__n_components": 7,
                "SVR__C": 2,
            }
            config2 = {
                "NeuroBranch__ResampleImages__voxel_size": 3,
                "PCA__n_components": 4,
                "SVR__C": 5,
            }

            # first config: we expect a cached file for the standard scaler
            # and one for the pca, and two files (one resampler, one brain
            # mask) for each input file
            transform_and_check_folder(
                config1, 2, 2 * nr_of_expected_pickles_per_config)

            # second config: we expect two cached files for the standard
            # scaler (one for the 5 voxel input and one for the 3 voxel
            # input) and two for the pcas of the first and second config;
            # the subject cache is expected to hold four files per input file
            transform_and_check_folder(
                config2, 4, 4 * nr_of_expected_pickles_per_config)

            # when we transform with the first config again, nothing should
            # happen
            transform_and_check_folder(
                config1, 4, 4 * nr_of_expected_pickles_per_config)

            # fitting with an empty config is expected to raise a
            # ValueError, so the counts passed here are never checked
            with self.assertRaises(ValueError):
                transform_and_check_folder(
                    {}, 6, 6 * nr_of_expected_pickles_per_config)
        finally:
            # clean up afterwards, also when an assertion failed
            CacheManager.clear_cache_files(cache_folder_base)
            CacheManager.clear_cache_files(cache_folder_neuro)