Beispiel #1
0
    def setUp(self):
        """Prepare a cached scaler -> PCA -> SVC pipeline and two configs."""
        super(CachedPhotonPipelineTests, self).setUp()

        # Photon version of the pipeline, built step by step.
        steps = [
            ('StandardScaler', PipelineElement("StandardScaler", {})),
            ('PCA', PipelineElement("PCA", {'n_components': [3, 10, 50]},
                                    random_state=3)),
            ('SVC', PipelineElement("SVC", {'kernel': ['rbf', 'linear']},
                                    random_state=3)),
        ]
        self.pipe = PhotonPipeline(steps)

        # Switch caching on and start from an empty cache folder.
        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"
        CacheManager.clear_cache_files(self.cache_folder_path)
        self.pipe.cache_folder = self.cache_folder_path

        # Two distinct hyperparameter configurations for the tests.
        self.config1 = dict(PCA__n_components=4, SVC__C=3, SVC__kernel='rbf')
        self.config2 = dict(PCA__n_components=7, SVC__C=1,
                            SVC__kernel='linear')

        self.X, self.y = load_breast_cancer(return_X_y=True)
Beispiel #2
0
    def setUp(self):
        """Prepare a cached scaler -> PCA -> SVC pipeline and two configs.

        The breast-cancer data is requested with ``return_X_y`` passed by
        keyword: recent scikit-learn releases no longer accept it as a
        positional argument.
        """
        super(CachedPhotonPipelineTests, self).setUp()
        # Photon Version
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]},
                              random_state=3)
        svm = PipelineElement("SVC", {"kernel": ["rbf", "linear"]},
                              random_state=3)

        self.pipe = PhotonPipeline([("StandardScaler", ss), ("PCA", pca),
                                    ("SVC", svm)])

        # Enable caching and point the pipeline at the test cache folder.
        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"
        self.pipe.cache_folder = self.cache_folder_path

        self.config1 = {
            "PCA__n_components": 4,
            "SVC__C": 3,
            "SVC__kernel": "rbf",
        }

        self.config2 = {
            "PCA__n_components": 7,
            "SVC__C": 1,
            "SVC__kernel": "linear",
        }

        self.X, self.y = load_breast_cancer(return_X_y=True)
Beispiel #3
0
 def test_random_state(self):
     """After fitting, elements must carry the pipeline's random_state."""
     steps = [("SS", self.p_ss),
              ("PCA", PipelineElement('PCA')),
              ("SVC", self.p_dt)]
     pipeline = PhotonPipeline(steps)
     pipeline.random_state = 666
     pipeline.fit(self.X, self.y)

     # Element passed in by reference.
     self.assertEqual(self.p_dt.random_state, pipeline.random_state)
     # Element created inline (second step of the pipeline).
     pca_element = pipeline.elements[1][-1]
     self.assertEqual(pca_element.random_state, pipeline.random_state)
     self.assertEqual(self.p_dt.random_state, 666)
Beispiel #4
0
 def setUp(self):
     """Create the pipeline, config, folds and optimization settings.

     The breast-cancer data is requested with ``return_X_y`` passed by
     keyword: recent scikit-learn releases no longer accept it as a
     positional argument.
     """
     super(InnerFoldTests, self).setUp()
     self.pipe = PhotonPipeline([
         ("StandardScaler", PipelineElement("StandardScaler")),
         ("PCA", PipelineElement("PCA")),
         ("RidgeClassifier", PipelineElement("RidgeClassifier")),
     ])
     self.config = {
         "PCA__n_components": 5,
         "RidgeClassifier__solver": "svd",
         "RidgeClassifier__random_state": 42,
     }
     self.outer_fold_id = "TestID"
     self.inner_cv = KFold(n_splits=4)
     self.X, self.y = load_breast_cancer(return_X_y=True)
     self.cross_validation = Hyperpipe.CrossValidation(
         self.inner_cv, None, True, 0.2, True, False)
     # One FoldInfo per inner split, keyed by the zero-based split index
     # and stored under the single outer fold id used by these tests.
     self.cross_validation.inner_folds = {
         self.outer_fold_id: {
             i: FoldInfo(i, i + 1, train, test)
             for i, (train,
                     test) in enumerate(self.inner_cv.split(self.X, self.y))
         }
     }
     self.optimization = Hyperpipe.Optimization(
         "grid_search", {}, ["accuracy", "recall", "specificity"],
         "accuracy", None)
Beispiel #5
0
 def setUp(self):
     """Create the pipeline, config, folds and optimization settings."""
     super(InnerFoldTests, self).setUp()

     step_names = ('StandardScaler', 'PCA', 'RidgeClassifier')
     self.pipe = PhotonPipeline(
         [(step, PipelineElement(step)) for step in step_names])
     self.config = dict(PCA__n_components=5,
                        RidgeClassifier__solver='svd',
                        RidgeClassifier__random_state=42)

     self.outer_fold_id = 'TestID'
     self.inner_cv = KFold(n_splits=4)
     self.X, self.y = load_breast_cancer(return_X_y=True)
     self.cross_validation = Hyperpipe.CrossValidation(
         self.inner_cv, None, True, 0.2, True, False, False, None)

     # One FoldInfo per inner split, keyed by the zero-based split index.
     fold_infos = {}
     splits = self.inner_cv.split(self.X, self.y)
     for idx, (train, test) in enumerate(splits):
         fold_infos[idx] = FoldInfo(idx, idx + 1, train, test)
     self.cross_validation.inner_folds = {self.outer_fold_id: fold_infos}

     metrics = ['accuracy', 'recall', 'specificity']
     self.optimization = Hyperpipe.Optimization(
         'grid_search', {}, metrics, 'accuracy', None)
Beispiel #6
0
    def test_copy_me(self):
        """Copy a nested pipeline and check the copy's structure.

        The pipeline contains a switch, a stack with a nested branch and a
        callback element. Dedicated unittest assertions are used so that a
        failure reports the offending values instead of ``False is not true``.
        """
        switch = Switch("my_copy_switch")
        switch += PipelineElement("StandardScaler")
        switch += PipelineElement("RobustScaler", test_disabled=True)

        stack = Stack("RandomStack")
        stack += PipelineElement("SVC")
        branch = Branch('Random_Branch')
        pca_hyperparameters = {'n_components': [5, 10]}
        branch += PipelineElement("PCA", hyperparameters=pca_hyperparameters)
        branch += PipelineElement("DecisionTreeClassifier")
        stack += branch

        photon_pipe = PhotonPipeline([("SimpleImputer", PipelineElement("SimpleImputer")),
                                      ("my_copy_switch", switch),
                                      ('RandomStack', stack),
                                      ('Callback1', CallbackElement('tmp_callback', np.mean)),
                                      ("PhotonVotingClassifier", PipelineElement("PhotonVotingClassifier"))])

        copy_of_the_pipe = photon_pipe.copy_me()

        self.assertEqual(photon_pipe.random_state, copy_of_the_pipe.random_state)
        self.assertEqual(len(copy_of_the_pipe.elements), 5)
        self.assertEqual(copy_of_the_pipe.elements[2][1].name, "RandomStack")
        self.assertTrue(copy_of_the_pipe.named_steps["my_copy_switch"].elements[1].test_disabled)
        # stack -> branch -> PCA: hyperparameter keys carry the element name prefix
        self.assertDictEqual(copy_of_the_pipe.elements[2][1].elements[1].elements[0].hyperparameters,
                             {"PCA__n_components": [5, 10]})
        self.assertIsInstance(copy_of_the_pipe.elements[3][1], CallbackElement)
        self.assertEqual(copy_of_the_pipe.named_steps["tmp_callback"].delegate_function, np.mean)
Beispiel #7
0
    def test_y_and_covariates_transformation(self):
        """Check how X, y and kwargs travel through a transforming element."""
        features = np.ones((200, 50))
        targets = np.ones((200, )) + 2
        covariates = {"sample1": np.ones((200, 5))}
        dummy = self.dummy_photon_element

        pipeline = PhotonPipeline([("DummyTransformer", dummy)])

        # Without y, every y-transformer is expected to be skipped, so all
        # three outputs must equal the inputs.
        out_X, out_y, out_kwargs = pipeline.transform(features, None,
                                                      **covariates)
        for actual, expected in ((out_X, features),
                                 (out_y, None),
                                 (out_kwargs, covariates)):
            self.assertTrue(np.array_equal(actual, expected))

        # With y, the y-transformers must be applied.
        Xt, yt, kwargst = pipeline.transform(features, targets, **covariates)

        # The wrapped element must have received exactly the data passed in.
        received = dummy.base_element
        self.assertTrue(np.array_equal(features, received.X))
        self.assertTrue(np.array_equal(targets, received.y))
        self.assertTrue(
            np.array_equal(covariates["sample1"], received.kwargs["sample1"]))

        # The outputs must reflect the transformation.
        self.assertTrue(np.array_equal(Xt, features - 1))
        self.assertTrue(np.array_equal(yt, targets + 1))
        self.assertTrue("sample1_edit" in kwargst)
        expected_edit = covariates["sample1"] + 5
        self.assertTrue(np.array_equal(kwargst["sample1_edit"], expected_edit))
Beispiel #8
0
    def test_add_preprocessing(self):
        """A preprocessing object must be inserted as the first pipeline step."""
        preprocessing = Preprocessing()
        preprocessing += PipelineElement('LabelEncoder')

        pipeline = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
        pipeline._add_preprocessing(preprocessing)

        # Two original steps plus the preprocessing step.
        self.assertEqual(len(pipeline.named_steps), 3)
        _, leading_element = pipeline.elements[0][0], pipeline.elements[0][1]
        self.assertTrue(leading_element == preprocessing)
        self.assertTrue(pipeline.named_steps['Preprocessing'] == preprocessing)
Beispiel #9
0
    def test_predict_proba(self):
        """PHOTON and scikit-learn pipelines must yield equal probabilities."""
        # Reference probabilities from the plain scikit-learn pipeline.
        reference = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_dt)])
        reference.fit(self.X, self.y)
        expected = reference.predict_proba(self.X)

        # Same steps wrapped as PHOTON elements.
        candidate = PhotonPipeline([("SS", self.p_ss), ("SVC", self.p_dt)])
        candidate.fit(self.X, self.y)
        actual = candidate.predict_proba(self.X)

        self.assertTrue(np.array_equal(expected, actual))
Beispiel #10
0
    def test_extract_feature_importances(self):
        """Check ``feature_importances_`` for three kinds of final estimator.

        Dedicated unittest assertions are used so that a failure reports the
        offending value instead of ``False is not true``.
        """
        # one machine with coef_ (self.pipe is built outside this method)
        self.pipe.fit(self.X, self.y)
        f_importances_coef = self.pipe.feature_importances_
        self.assertIsNotNone(f_importances_coef)
        self.assertIsInstance(f_importances_coef, list)

        # one machine with feature_importances_
        f_imp_pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("DecisionTreeClassifier",
             PipelineElement("DecisionTreeClassifier")),
        ])
        f_imp_pipe.fit(self.X, self.y)
        f_importances = f_imp_pipe.feature_importances_
        self.assertIsNotNone(f_importances)
        self.assertIsInstance(f_importances, list)

        # one machine that has no feature importances
        no_f_imp_pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("SVC", PipelineElement("SVC", kernel="rbf")),
        ])
        no_f_imp_pipe.fit(self.X, self.y)
        no_f_imps = no_f_imp_pipe.feature_importances_
        self.assertIsNone(no_f_imps)
Beispiel #11
0
    def objective_function(self, cfg):
        """Return ``1 - mean accuracy`` of a 3-fold CV run for ``cfg``.

        Entries of ``cfg`` whose value is falsy are dropped before the
        remaining ones are applied to a fresh scaler -> PCA -> SVC pipeline.
        """
        cfg = {key: cfg[key] for key in cfg if cfg[key]}

        steps = [
            ("StandardScaler", PipelineElement("StandardScaler", {})),
            ("PCA", PipelineElement("PCA", {}, random_state=3)),
            ("SVC", PipelineElement("SVC", {}, random_state=3, gamma="auto")),
        ]
        my_pipe = PhotonPipeline(steps)
        my_pipe.set_params(**cfg)

        scorer = make_scorer(accuracy_score, greater_is_better=True)
        fold_scores = cross_val_score(my_pipe, self.X, self.y, cv=3,
                                      scoring=scorer)
        print("run")
        # Accuracy is turned into a loss so that lower is better.
        return 1 - np.mean(fold_scores)
Beispiel #12
0
    def test_predict_with_training_flag(self):
        """Compare hand-edited labels with labels edited inside the pipeline."""
        # Reference: labels are shifted by hand before fitting.
        reference = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_svc)])
        shifted_y = self.y + 1
        reference.fit(self.X, shifted_y)
        expected_pred = reference.predict(self.X)

        # Candidate: the "YT" element edits the labels during the pipeline run.
        steps = [("SS", self.p_ss),
                 ("YT", self.dummy_photon_element),
                 ("SVC", self.p_svm)]
        candidate = PhotonPipeline(steps)
        candidate.fit(self.X, self.y)
        actual_pred = candidate.predict(self.X)

        # The y-transformer must have seen the standardized data.
        scaled_X = self.sk_ss.transform(self.X)
        seen_by_transformer = self.dummy_photon_element.base_element.X
        self.assertTrue(np.array_equal(scaled_X, seen_by_transformer))

        self.assertTrue(np.array_equal(expected_pred, actual_pred))
Beispiel #13
0
        def objective_function_simple(self, cfg):
            """Return ``1 - mean accuracy`` over the first outer fold's inner folds.

            Entries of ``cfg`` whose value is falsy are dropped; the rest are
            applied to a fresh scaler -> PCA -> SVC pipeline per inner fold.
            """
            cfg = {key: cfg[key] for key in cfg if cfg[key]}

            # Restrict the data to the training part of the first outer fold.
            first_outer_fold = list(
                self.pipe.cross_validation.outer_folds.values())[0]
            self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
                self.X, self.y, kwargs=None,
                indices=first_outer_fold.train_indices)

            first_inner_folds = list(
                self.pipe.cross_validation.inner_folds.values())[0]
            scores = []
            for fold in list(first_inner_folds.values()):
                steps = [
                    ('StandardScaler', PipelineElement("StandardScaler", {})),
                    ('PCA', PipelineElement("PCA", {}, random_state=42)),
                    ('SVC', PipelineElement("SVC", {}, random_state=42,
                                            gamma='auto')),
                ]
                my_pipe = PhotonPipeline(steps)
                my_pipe.set_params(**cfg)

                train_idx = fold.train_indices
                my_pipe.fit(self._validation_X[train_idx, :],
                            self._validation_y[train_idx])

                test_idx = fold.test_indices
                y_true = self._validation_y[test_idx]
                y_pred = my_pipe.predict(self._validation_X[test_idx, :])
                scores.append(accuracy_score(y_true, y_pred))
            # Accuracy is turned into a loss so that lower is better.
            return 1 - np.mean(scores)
Beispiel #14
0
    def objective_function(cfg):
        """Fit a scaler -> SVC pipeline with ``cfg`` and return its accuracy.

        ``seed``, ``X``, ``y``, ``X_train`` and ``y_true`` are not defined
        here; presumably they come from the enclosing scope -- TODO confirm.
        """
        steps = [('StandardScaler', StandardScaler()), ('SVC', SVC())]
        pipeline = PhotonPipeline(steps)
        pipeline.random_state = seed
        pipeline.set_params(**cfg)
        pipeline.fit(X, y)

        predictions = pipeline.predict(X_train)
        return accuracy_score(predictions, y_true)
Beispiel #15
0
    def test_no_estimator(self):
        """Without a final estimator, predict must return the transformed data."""
        pipeline = PhotonPipeline([("StandardScaler", self.p_ss),
                                   ("PCA", self.p_pca)])
        pipeline.fit(self.X, self.y)
        transformed, _, _ = pipeline.transform(self.X)
        predicted = pipeline.predict(self.X)

        self.assertTrue(np.array_equal(predicted, transformed))

        # Rebuild the same result with the plain scikit-learn objects.
        self.sk_ss.fit(self.X)
        scaled = self.sk_ss.transform(self.X)
        self.sk_pca.fit(scaled)
        reference = self.sk_pca.transform(scaled)

        for photon_output in (transformed, predicted):
            self.assertTrue(np.array_equal(photon_output, reference))
Beispiel #16
0
    def setUp(self):
        """Create CV info, optimization info, data and a regression pipeline."""
        super(OuterFoldTests, self).setUp()

        # Cross-validation splitters: five inner shuffles, one outer shuffle.
        self.fold_nr_inner_cv = 5
        self.inner_cv = ShuffleSplit(n_splits=self.fold_nr_inner_cv,
                                     random_state=42)
        self.outer_cv = ShuffleSplit(n_splits=1, test_size=0.2,
                                     random_state=42)
        cv_settings = dict(inner_cv=self.inner_cv,
                           outer_cv=self.outer_cv,
                           eval_final_performance=True,
                           test_size=0.2,
                           calculate_metrics_per_fold=True,
                           calculate_metrics_across_folds=False,
                           learning_curves=False,
                           learning_curves_cut=None)
        self.cv_info = Hyperpipe.CrossValidation(**cv_settings)

        # NOTE(review): load_boston was removed from recent scikit-learn
        # releases; this fixture needs a scikit-learn version that ships it.
        self.X, self.y = load_boston(return_X_y=True)
        self.outer_fold_id = "TestFoldOuter1"
        self.cv_info.outer_folds = {
            self.outer_fold_id: FoldInfo(0, 1, train, test)
            for train, test in self.outer_cv.split(self.X, self.y)
        }

        self.config_num = 2
        optimization_settings = dict(
            metrics=['mean_absolute_error', 'mean_squared_error'],
            best_config_metric='mean_absolute_error',
            optimizer_input='grid_search',
            optimizer_params={},
            performance_constraints=None)
        self.optimization_info = Hyperpipe.Optimization(
            **optimization_settings)

        scaler = PipelineElement('StandardScaler')
        pca = PipelineElement('PCA', {'n_components': [4, 7]})
        tree = PipelineElement('DecisionTreeRegressor', random_state=42)
        self.elements = [scaler, pca, tree]
        # Each step is registered under its element's own name.
        self.pipe = PhotonPipeline(
            [(element.name, element) for element in self.elements])
Beispiel #17
0
    def test_regular_use(self):
        """Compare PHOTON pipeline output with its elements and scikit-learn."""
        pipeline = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
        pipeline.fit(self.X, self.y)

        pipe_Xt, _, _ = pipeline.transform(self.X)
        pipe_pred = pipeline.predict(self.X)

        # Elements are handed over by reference, so the fit above must have
        # fitted them as well and they can be used directly.
        element_Xt, _, _ = self.p_pca.transform(self.X)
        element_pred = self.p_svm.predict(element_Xt)

        self.assertTrue(np.array_equal(pipe_Xt, element_Xt))
        self.assertTrue(np.array_equal(pipe_pred, element_pred))

        # The equivalent scikit-learn pipeline must predict the same labels.
        reference = SKPipeline([('PCA', self.sk_pca), ("SVC", self.sk_svc)])
        reference.fit(self.X, self.y)
        reference_pred = reference.predict(self.X)
        self.assertTrue(np.array_equal(pipe_pred, reference_pred))
Beispiel #18
0
        def objective_function_switch(self, cfg):
            """Return ``1 - mean accuracy`` for the estimator selected in ``cfg``.

            ``cfg["Estimator_switch"]`` chooses between an SVC (value
            ``'svc'``) and a random forest (any other value); the remaining
            truthy entries are applied as pipeline parameters.
            """
            cfg = {key: cfg[key] for key in cfg if cfg[key]}

            # Restrict the data to the training part of the first outer fold.
            first_outer_fold = list(
                self.pipe.cross_validation.outer_folds.values())[0]
            self._validation_X, self._validation_y, _ = PhotonDataHelper.split_data(
                self.X, self.y, kwargs=None,
                indices=first_outer_fold.train_indices)

            # The switch entry is not a pipeline parameter, so take it out.
            switch = cfg["Estimator_switch"]
            del cfg["Estimator_switch"]

            first_inner_folds = list(
                self.pipe.cross_validation.inner_folds.values())[0]
            scores = []
            for fold in list(first_inner_folds.values()):
                sc = PipelineElement("StandardScaler", {})
                pca = PipelineElement("PCA", {}, random_state=42)
                if switch == 'svc':
                    name = 'SVC'
                    est_kwargs = {'random_state': 42, 'gamma': 'auto'}
                else:
                    name = "RandomForestClassifier"
                    est_kwargs = {'random_state': 42}
                est = PipelineElement(name, {}, **est_kwargs)

                my_pipe = PhotonPipeline([('StandardScaler', sc),
                                          ('PCA', pca),
                                          (name, est)])
                my_pipe.set_params(**cfg)

                train_idx = fold.train_indices
                my_pipe.fit(self._validation_X[train_idx, :],
                            self._validation_y[train_idx])

                test_idx = fold.test_indices
                y_true = self._validation_y[test_idx]
                y_pred = my_pipe.predict(self._validation_X[test_idx, :])
                scores.append(accuracy_score(y_true, y_pred))
            # Accuracy is turned into a loss so that lower is better.
            return 1 - np.mean(scores)
    def test_combi_from_single_and_group_caching(self):
        """Count cache files written by a pipeline that has a parallel branch.

        Two cache folders are used: the base folder of the pipeline and a
        sub-folder assigned to the branch's base element. After each fit the
        number of ``*.p`` files in both folders is compared with an expected
        count. The checks build on each other because cache files accumulate
        between calls, so the order of the calls matters.
        """

        # Cache directories: the pipeline's base folder and a sub-folder for
        # the branch.
        cache_folder_base = self.cache_folder_path
        cache_folder_neuro = os.path.join(cache_folder_base,
                                          'subject_caching_test')

        # Start from empty cache folders.
        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

        # Set up the parallel ("neuro") branch.
        nb = ParallelBranch("SubjectCaching", nr_of_processes=3)
        # Increase complexity by adding batching.
        nb += PipelineElement.create("ResampleImages",
                                     StupidAdditionTransformer(), {},
                                     batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # Set up the usual pipeline steps that follow the branch.
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {'n_components': [3, 10, 50]})
        svm = PipelineElement("SVR", {'kernel': ['rbf', 'linear']})

        pipe = PhotonPipeline([('NeuroBranch', nb), ('StandardScaler', ss),
                               ('PCA', pca), ('SVR', svm)])

        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            # Apply the config, fit, then compare the number of "*.p" files
            # in the base folder and in the branch folder with the
            # expected counts.
            pipe.set_params(**config)
            pipe.fit(self.X, self.y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder_base, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files == expected_nr_of_files_group)

            nr_of_generated_cache_files_subject = len(
                glob.glob(os.path.join(cache_folder_neuro, "*.p")))
            self.assertTrue(
                nr_of_generated_cache_files_subject == expected_nr_subject)

        config1 = {
            'NeuroBranch__ResampleImages__voxel_size': 5,
            'PCA__n_components': 7,
            'SVR__C': 2
        }
        config2 = {
            'NeuroBranch__ResampleImages__voxel_size': 3,
            'PCA__n_components': 4,
            'SVR__C': 5
        }

        # First config: we expect one cached file each for the standard
        # scaler and the PCA in the base folder, and
        # self.nr_of_expected_pickles_per_config files in the branch folder.
        transform_and_check_folder(config1, 2,
                                   self.nr_of_expected_pickles_per_config)

        # Second config: we expect two cached files for the standard scaler
        # (one for the 5-voxel input, one for the 3-voxel input) and two for
        # the PCAs of the first and second config, and twice the per-config
        # number of files in the branch folder.
        transform_and_check_folder(config2, 4,
                                   2 * self.nr_of_expected_pickles_per_config)

        # Fitting with the first config again must not create any new files.
        transform_and_check_folder(config1, 4,
                                   2 * self.nr_of_expected_pickles_per_config)

        # Fitting with an empty config is expected to raise a ValueError; the
        # expected counts passed here are therefore never compared.
        # NOTE(review): an earlier comment said new cache entries should be
        # generated for this case -- confirm which behavior is intended.
        with self.assertRaises(ValueError):
            transform_and_check_folder({}, 6, 4 *
                                       self.nr_of_expected_pickles_per_config)

        # Leave both cache folders empty for the next test.
        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)
Beispiel #20
0
    def setUp(self):
        """Build pipelines with and without callback elements.

        Fixes compared with the previous version:

        * In ``callback_after_callback_pipeline`` a misplaced parenthesis
          nested the ``('SVR', ...)`` step inside the scaler's tuple,
          producing one 3-tuple instead of two ``(name, element)`` steps.
          The scaler and the SVR are now separate steps.
        * ``return_X_y`` is passed to ``load_breast_cancer`` by keyword,
          because recent scikit-learn releases reject it positionally.
        """
        def callback(X, y=None, **kwargs):
            # The data must still have its full shape at this point.
            self.assertEqual(X.shape, (569, 30))
            print("Shape of transformed data: {}".format(X.shape))

        def predict_callback(X, y=None, **kwargs):
            # Placed after the estimator: expects one value per sample.
            self.assertEqual(X.shape, (569, ))
            print('Shape of predictions: {}'.format(X.shape))

        def callback_test_equality(X, y=None, **kwargs):
            # The data handed to the callback must equal the original data.
            self.assertTrue(np.array_equal(self.X, X))
            if y is not None:
                self.assertListEqual(self.y.tolist(), y.tolist())

        self.X, self.y = load_breast_cancer(return_X_y=True)

        self.clean_pipeline = PhotonPipeline(elements=[
            ('PCA', PipelineElement('PCA')),
            ('LogisticRegression', PipelineElement('LogisticRegression')),
        ])
        self.callback_pipeline = PhotonPipeline(elements=[
            ('First', CallbackElement('First', callback)),
            ('PCA', PipelineElement('PCA')),
            ('Second', CallbackElement('Second', callback)),
            ('LogisticRegression', PipelineElement('LogisticRegression')),
        ])
        self.clean_branch_pipeline = PhotonPipeline(elements=[
            ('MyBranch', Branch('MyBranch', [PipelineElement('PCA')])),
            ('LogisticRegression', PipelineElement('LogisticRegression')),
        ])
        self.callback_branch_pipeline = PhotonPipeline(elements=[
            ('First', CallbackElement('First', callback)),
            ('MyBranch', Branch('MyBranch', [
                CallbackElement('Second', callback),
                PipelineElement('PCA'),
            ])),
            ('Fourth', CallbackElement('Fourth', callback)),
            ('LogisticRegression', PipelineElement('LogisticRegression')),
        ])
        self.callback_branch_pipeline_error = PhotonPipeline(elements=[
            ('First', CallbackElement('First', callback)),
            ('MyBranch', Branch('MyBranch', [
                CallbackElement('Second', callback),
                PipelineElement('PCA'),
                CallbackElement('Third', callback),
            ])),
            ('Fourth', CallbackElement('Fourth', callback)),
            ('LogisticRegression', PipelineElement('LogisticRegression')),
            ('Fifth', CallbackElement('Fifth', predict_callback)),
        ])
        # test that data is unaffected from pipeline
        # NOTE(review): the step name 'StandarcScaler' keeps its original
        # spelling in case other code looks the step up by that name.
        self.callback_after_callback_pipeline = PhotonPipeline([
            ('Callback1', CallbackElement('Callback1', callback)),
            ('Callback2', CallbackElement('Callback2',
                                          callback_test_equality)),
            ('StandarcScaler', PipelineElement('StandardScaler')),
            ('SVR', PipelineElement('SVR')),
        ])
Beispiel #21
0
class InnerFoldTests(PhotonBaseTest):
    @classmethod
    def setUpClass(cls) -> None:
        """Store this module's path on the class, then run the base class setup."""
        # The path is set before the parent's setUpClass is invoked.
        cls.file = __file__
        super().setUpClass()

    def setUp(self):
        """Create the pipeline, config, folds and optimization settings."""
        super(InnerFoldTests, self).setUp()

        step_names = ('StandardScaler', 'PCA', 'RidgeClassifier')
        self.pipe = PhotonPipeline(
            [(step, PipelineElement(step)) for step in step_names])
        self.config = dict(PCA__n_components=5,
                           RidgeClassifier__solver='svd',
                           RidgeClassifier__random_state=42)

        self.outer_fold_id = 'TestID'
        self.inner_cv = KFold(n_splits=4)
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.cross_validation = Hyperpipe.CrossValidation(
            self.inner_cv, None, True, 0.2, True, False, False, None)

        # One FoldInfo per inner split, keyed by the zero-based split index.
        fold_infos = {}
        splits = self.inner_cv.split(self.X, self.y)
        for idx, (train, test) in enumerate(splits):
            fold_infos[idx] = FoldInfo(idx, idx + 1, train, test)
        self.cross_validation.inner_folds = {self.outer_fold_id: fold_infos}

        metrics = ['accuracy', 'recall', 'specificity']
        self.optimization = Hyperpipe.Optimization(
            'grid_search', {}, metrics, 'accuracy', None)

    def test_fit_against_sklearn(self):
        """Compare InnerFoldManager results fold by fold with scikit-learn."""
        manager = InnerFoldManager(self.pipe.copy_me, self.config,
                                   self.optimization, self.cross_validation,
                                   self.outer_fold_id)

        config_item = manager.fit(self.X, self.y)
        self.assertIsNotNone(config_item.computation_start_time)
        self.assertIsNotNone(config_item.computation_end_time)

        # Reference: the same steps as a plain scikit-learn pipeline.
        reference = Pipeline([('StandardScaler', StandardScaler()),
                              ('PCA', PCA()),
                              ('RidgeClassifier', RidgeClassifier())])
        reference.set_params(**self.config)

        folds = self.cross_validation.inner_folds[self.outer_fold_id]
        for fold in folds.values():
            train_idx, test_idx = fold.train_indices, fold.test_indices
            train_X, test_X = self.X[train_idx], self.X[test_idx]
            train_y, test_y = self.y[train_idx], self.y[test_idx]

            reference.fit(train_X, train_y)
            expected_pred = reference.predict(test_X)
            expected_coef = reference.named_steps['RidgeClassifier'].coef_

            # fold_nr is one-based, the result list is zero-based.
            photon_fold = config_item.inner_folds[fold.fold_nr - 1]
            validation = photon_fold.validation

            self.assertTrue(np.array_equal(expected_pred, validation.y_pred))

            for position, coefficient in enumerate(expected_coef[0]):
                self.assertAlmostEqual(
                    coefficient,
                    photon_fold.feature_importances[0][position])

            self.assertEqual(validation.metrics['accuracy'],
                             accuracy_score(test_y, expected_pred))
            self.assertEqual(validation.metrics['recall'],
                             recall_score(test_y, expected_pred))

    def test_performance_constraints(self):
        """Check that performance constraints stop the inner-fold loop early.

        The three scenarios share one local helper instead of repeating the
        manager construction, and the fold counts are checked with
        ``assertEqual`` so that a failure reports the actual count.
        """

        def fit_with(constraints):
            # Fit a fresh manager that differs only in its constraints.
            manager = InnerFoldManager(
                self.pipe.copy_me,
                self.config,
                self.optimization,
                self.cross_validation,
                self.outer_fold_id,
                optimization_constraints=constraints)
            return manager.fit(self.X, self.y)

        # A: a single constraint. The first fold has an accuracy of 0.874,
        # below the 0.95 threshold, so we expect the computation to stop
        # after the first inner fold.
        result = fit_with(MinimumPerformance('accuracy', 0.95, 'first'))
        self.assertEqual(len(result.inner_folds), 1)

        # B: a list of constraints. Accuracy passes (0.874 in the first fold
        # is above 0.85) but specificity stops the computation (0.78 in the
        # first fold is below 0.8).
        result = fit_with([
            MinimumPerformance('accuracy', 0.85, 'first'),
            MinimumPerformance('specificity', 0.8, 'first'),
        ])
        self.assertEqual(len(result.inner_folds), 1)

        # C: a list of constraints that all pass, so all four inner folds
        # are computed.
        result = fit_with([
            MinimumPerformance('accuracy', 0.75, 'all'),
            MinimumPerformance('specificity', 0.75, 'all'),
        ])
        self.assertEqual(len(result.inner_folds), 4)

    def test_raise_error(self):
        """Check both settings of the manager's ``raise_error`` flag."""
        manager = InnerFoldManager(self.pipe.copy_me, self.config,
                                   self.optimization, self.cross_validation,
                                   self.outer_fold_id, raise_error=False)

        # Ten targets for the full feature matrix: an unequal number of
        # samples and targets, which should produce an error while fitting.
        truncated_y = self.y[:10]

        # Case A: raise_error=False -> the call is expected to return
        # without raising.
        manager.fit(self.X, truncated_y)

        # Case B: raise_error=True -> the error must reach the caller.
        manager.raise_error = True
        with self.assertRaises(IndexError):
            manager.fit(self.X, truncated_y)

    def test_save_predictions(self):
        """Check that predictions are stored for every inner fold.

        ``calculate_metrics_across_folds`` is a cross-validation option (it
        is set on ``cross_validation_infos`` elsewhere in this class), so it
        is switched on there. Setting it on ``optimization_infos`` merely
        created an unused attribute and left the option off.
        """
        test_pipe = InnerFoldManager(self.pipe.copy_me, self.config,
                                     self.optimization, self.cross_validation,
                                     self.outer_fold_id)

        # in case we want to have metrics calculated across folds, we need to
        # temporarily store the predictions
        test_pipe.cross_validation_infos.calculate_metrics_across_folds = True
        config_item = test_pipe.fit(self.X, self.y)

        for inner_fold in config_item.inner_folds:
            self.assertEqual(len(inner_fold.training.y_pred),
                             inner_fold.number_samples_training)
            self.assertEqual(len(inner_fold.validation.y_pred),
                             inner_fold.number_samples_validation)

    def test_save_feature_importances(self):
        """Check the length of the feature importances stored per inner fold."""
        fold_manager = InnerFoldManager(
            self.pipe.copy_me,
            self.config,
            self.optimization,
            self.cross_validation,
            self.outer_fold_id,
        )

        # The PCA reduces the input to 5 dimensions, so 5 importances are
        # expected for every inner fold.
        expected_length = 5
        fitted_config = fold_manager.fit(self.X, self.y)
        for fold in fitted_config.inner_folds:
            first_importances = fold.feature_importances[0]
            self.assertEqual(len(first_importances), expected_length)

    def test_process_fit_results(self):
        """Check which aggregated metrics a fit returns for each metric mode.

        The same manager is fitted three times: with metrics calculated across
        folds only, per fold only, and with both enabled. For each mode the
        number of returned train/test metrics and their fold operations are
        verified.
        """
        test_pipe = InnerFoldManager(self.pipe.copy_me, self.config,
                                     self.optimization, self.cross_validation,
                                     self.outer_fold_id)

        def fit_with(across_folds, per_fold):
            """Set both metric flags on the manager and return the fit result."""
            infos = test_pipe.cross_validation_infos
            infos.calculate_metrics_across_folds = across_folds
            infos.calculate_metrics_per_fold = per_fold
            return test_pipe.fit(self.X, self.y)

        def assert_fold_operations(expected_operations, returned_metric_list):
            """Assert that exactly every metric/operation pair was returned."""
            expected_returns = {
                metric + "__" + str(operation)
                for metric in self.optimization.metrics
                for operation in expected_operations
            }
            returned_formatted_metrics = {
                m.metric_name + "__" + str(m.operation)
                for m in returned_metric_list
            }
            # assertSetEqual reports the differing entries on failure
            self.assertSetEqual(expected_returns, returned_formatted_metrics)

        # keep the original fit order: across folds, per fold, then both
        across_folds_config_item = fit_with(True, False)
        per_fold_config_item = fit_with(False, True)
        across_and_per_folds_config_item = fit_with(True, True)

        num_of_metrics = len(test_pipe.optimization_infos.metrics)
        raw_only = [FoldOperations.RAW]
        mean_and_std = [FoldOperations.MEAN, FoldOperations.STD]

        # (fit result, expected number of metrics, expected fold operations)
        expectations = [
            # both: mean and std over the folds + one raw value across folds
            (across_and_per_folds_config_item,
             2 * num_of_metrics + num_of_metrics,
             raw_only + mean_and_std),
            # across folds only: one raw value for each metric
            (across_folds_config_item, num_of_metrics, raw_only),
            # per fold only: mean and std for each metric
            (per_fold_config_item, 2 * num_of_metrics, mean_and_std),
        ]

        for config_item, expected_count, operations in expectations:
            for returned_metrics in (config_item.metrics_train,
                                     config_item.metrics_test):
                self.assertEqual(len(returned_metrics), expected_count)
                assert_fold_operations(operations, returned_metrics)

    def test_extract_feature_importances(self):
        """Check ``feature_importances_`` for three different final estimators.

        The importances are expected to be a list for the first two pipelines
        and ``None`` for the pipeline ending in an rbf-kernel SVC.
        """
        # one machine with coef_
        self.pipe.fit(self.X, self.y)
        f_importances_coef = self.pipe.feature_importances_
        self.assertIsNotNone(f_importances_coef)
        self.assertIsInstance(f_importances_coef, list)

        # one machine with feature_importances_
        f_imp_pipe = PhotonPipeline([
            ('StandardScaler', PipelineElement('StandardScaler')),
            ('PCA', PipelineElement('PCA')),
            ('DecisionTreeClassifier',
             PipelineElement('DecisionTreeClassifier'))
        ])
        f_imp_pipe.fit(self.X, self.y)
        f_importances = f_imp_pipe.feature_importances_
        self.assertIsNotNone(f_importances)
        self.assertIsInstance(f_importances, list)

        # one machine that has no feature importances
        no_f_imp_pipe = PhotonPipeline([
            ('StandardScaler', PipelineElement('StandardScaler')),
            ('PCA', PipelineElement('PCA')),
            ('SVC', PipelineElement('SVC', kernel='rbf'))
        ])
        no_f_imp_pipe.fit(self.X, self.y)
        self.assertIsNone(no_f_imp_pipe.feature_importances_)

    def test_learning_curves(self):
        """Fit hyperpipes with different learning-curve settings and check the curves."""

        def check_hyperpipe(learning_curves, learning_curves_cut):
            """Build, fit and verify one hyperpipe for the given settings."""
            # Without an explicit cut, compare against the cut this test
            # expects as the default.
            if learning_curves and learning_curves_cut is None:
                learning_curves_cut = FloatRange(0, 1, 'range', 0.2)

            settings = OutputSettings(project_folder=self.tmp_folder_path,
                                      save_output=False)
            hyperpipe = Hyperpipe(
                'test_pipe',
                learning_curves=learning_curves,
                learning_curves_cut=learning_curves_cut,
                metrics=['accuracy', 'recall', 'specificity'],
                best_config_metric='accuracy',
                inner_cv=self.inner_cv,
                output_settings=settings,
            )

            self.assertEqual(hyperpipe.cross_validation.learning_curves,
                             learning_curves)
            if not learning_curves:
                self.assertIsNone(
                    hyperpipe.cross_validation.learning_curves_cut)
            else:
                self.assertEqual(
                    hyperpipe.cross_validation.learning_curves_cut,
                    learning_curves_cut)

            hyperpipe += PipelineElement('StandardScaler')
            hyperpipe += PipelineElement('PCA', {'n_components': [1, 2]},
                                         random_state=42)
            hyperpipe += PipelineElement('SVC',
                                         {'C': [0.1], 'kernel': ['linear']},
                                         random_state=42)
            hyperpipe.fit(self.X, self.y)

            first_outer_fold = hyperpipe.results_handler.results.outer_folds[0]
            for tested_config in first_outer_fold.tested_config_list:
                for fold_index in range(self.inner_cv.n_splits):
                    curves = tested_config.inner_folds[
                        fold_index].learning_curves

                    if not learning_curves:
                        # no curves are recorded when the feature is off
                        self.assertEqual(curves, [])
                        continue

                    # one learning point per value of the cut
                    self.assertEqual(len(curves),
                                     len(learning_curves_cut.values))
                    for learning_point in curves:
                        test_metrics = list(learning_point[1].keys())
                        train_metrics = list(learning_point[2].keys())
                        self.assertEqual(hyperpipe.optimization.metrics,
                                         test_metrics)
                        self.assertEqual(hyperpipe.optimization.metrics,
                                         train_metrics)

        # hyperpipe with properly set learning curves
        check_hyperpipe(learning_curves=True,
                        learning_curves_cut=FloatRange(0, 1, 'range', 0.5))
        # hyperpipe without cut (default cut should be used here)
        check_hyperpipe(learning_curves=True, learning_curves_cut=None)
        # hyperpipe with cut despite learning_curves being False
        check_hyperpipe(learning_curves=False,
                        learning_curves_cut=FloatRange(0, 1, 'range', 0.5))
Beispiel #22
0
class CachedPhotonPipelineTests(PhotonBaseTest):
    """Tests for the caching of transformer outputs in a ``PhotonPipeline``."""

    @classmethod
    def setUpClass(cls) -> None:
        """Store this test file's path on the class, then run the base setup."""
        cls.file = __file__
        super().setUpClass()

    def setUp(self):
        """Build a caching scaler/PCA/SVC pipeline, two configs and the data."""
        super().setUp()
        # Photon Version
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {'n_components': [3, 10, 50]}, random_state=3)
        svm = PipelineElement("SVC", {'kernel': ['rbf', 'linear']}, random_state=3)

        self.pipe = PhotonPipeline([('StandardScaler', ss),
                                    ('PCA', pca),
                                    ('SVC', svm)])

        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"
        # clear the cache folder before the pipeline uses it, so the file
        # counts asserted in the tests are not affected by earlier runs
        CacheManager.clear_cache_files(self.cache_folder_path)
        self.pipe.cache_folder = self.cache_folder_path

        self.config1 = {'PCA__n_components': 4,
                        'SVC__C': 3,
                        'SVC__kernel': 'rbf'}

        self.config2 = {'PCA__n_components': 7,
                        'SVC__C': 1,
                        'SVC__kernel': 'linear'}

        self.X, self.y = load_breast_cancer(return_X_y=True)

    def _count_cache_files(self):
        """Return the number of ``*.p`` files in the pipeline's cache folder."""
        return len(glob.glob(os.path.join(self.pipe.cache_folder, "*.p")))

    def _fit_and_transform(self, config):
        """Apply ``config``, fit the pipeline and return ``(X, y, kwargs)``."""
        self.pipe.set_params(**config)
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        return X_new, y_new, kwargs_new

    def _assert_same_output(self, first, second):
        """Assert that two ``(X, y, kwargs)`` transform results are equal."""
        for first_part, second_part in zip(first, second):
            self.assertTrue(np.array_equal(first_part, second_part))

    def test_group_caching(self):
        """Check cache file counts and that cached results match uncached ones."""
        # transform one config
        first_output = self._fit_and_transform(self.config1)
        # two results should be cached (one standard scaler output + one pca output)
        self.assertEqual(self._count_cache_files(), 2)

        # transform second config
        self._fit_and_transform(self.config2)
        # three results should be cached (one standard scaler output, as its
        # config hasn't changed, + two pca outputs)
        self.assertEqual(self._count_cache_files(), 3)

        # now transform with config 1 again, results should be loaded
        reloaded_output = self._fit_and_transform(self.config1)
        self._assert_same_output(first_output, reloaded_output)

        # results should be the same as when caching is deactivated
        self.pipe.caching = False
        uncached_output = self._fit_and_transform(self.config1)
        self._assert_same_output(uncached_output, reloaded_output)

    def test_empty_hyperparameters(self):
        """Check that caching works when only default parameters are used."""
        # test if one can use it when only default parameters are given and
        # hyperparameter space is empty
        first_output = self._fit_and_transform({})
        # two results should be cached (one standard scaler output + one pca output)
        self.assertEqual(self._count_cache_files(), 2)

        second_output = self._fit_and_transform({})
        # assert nothing happened in the cache folder
        self.assertEqual(self._count_cache_files(), 2)
        self._assert_same_output(first_output, second_output)
Beispiel #23
0
class InnerFoldTests(PhotonBaseTest):
    """Tests for ``InnerFoldManager`` with a scaler/PCA/RidgeClassifier pipeline."""

    def setUp(self):
        """Create the pipeline, its config, the inner folds and the settings."""
        super().setUp()
        self.pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("RidgeClassifier", PipelineElement("RidgeClassifier")),
        ])
        self.config = {
            "PCA__n_components": 5,
            "RidgeClassifier__solver": "svd",
            "RidgeClassifier__random_state": 42,
        }
        self.outer_fold_id = "TestID"
        self.inner_cv = KFold(n_splits=4)
        # return_X_y has to be passed by keyword: scikit-learn >= 1.0 no longer
        # accepts positional arguments for this loader.
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.cross_validation = Hyperpipe.CrossValidation(
            self.inner_cv, None, True, 0.2, True, False)
        # one FoldInfo per split of the inner cv, registered under the outer fold id
        self.cross_validation.inner_folds = {
            self.outer_fold_id: {
                i: FoldInfo(i, i + 1, train, test)
                for i, (train,
                        test) in enumerate(self.inner_cv.split(self.X, self.y))
            }
        }
        self.optimization = Hyperpipe.Optimization(
            "grid_search", {}, ["accuracy", "recall", "specificity"],
            "accuracy", None)

    def _build_manager(self, **kwargs):
        """Return an ``InnerFoldManager`` for a copy of the test pipeline.

        Extra keyword arguments are forwarded to the ``InnerFoldManager``
        constructor.
        """
        return InnerFoldManager(
            self.pipe.copy_me,
            self.config,
            self.optimization,
            self.cross_validation,
            self.outer_fold_id,
            **kwargs,
        )

    def test_fit_against_sklearn(self):
        """Compare predictions, importances and metrics with a sklearn pipeline."""
        test_pipe = self._build_manager()

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertIsNotNone(photon_results_config_item.computation_start_time)
        self.assertIsNotNone(photon_results_config_item.computation_end_time)

        # now sklearn.
        sklearn_pipe = Pipeline([
            ("StandardScaler", StandardScaler()),
            ("PCA", PCA()),
            ("RidgeClassifier", RidgeClassifier()),
        ])
        sklearn_pipe.set_params(**self.config)
        inner_folds = self.cross_validation.inner_folds[self.outer_fold_id]
        for fold_obj in inner_folds.values():
            train_X = self.X[fold_obj.train_indices]
            test_X = self.X[fold_obj.test_indices]
            train_y = self.y[fold_obj.train_indices]
            test_y = self.y[fold_obj.test_indices]

            sklearn_pipe.fit(train_X, train_y)
            sklearn_predictions = sklearn_pipe.predict(test_X)
            sklearn_feature_importances = sklearn_pipe.named_steps[
                "RidgeClassifier"].coef_

            # fold_nr is i + 1 (see setUp) while the result list is indexed from 0
            photon_fold = photon_results_config_item.inner_folds[
                fold_obj.fold_nr - 1]
            photon_test_results = photon_fold.validation

            self.assertTrue(
                np.array_equal(sklearn_predictions,
                               photon_test_results.y_pred))

            for fi, sklearn_feature_importance_score in enumerate(
                    sklearn_feature_importances[0]):
                self.assertAlmostEqual(
                    sklearn_feature_importance_score,
                    photon_fold.feature_importances[0][fi],
                )

            accuracy = accuracy_score(test_y, sklearn_predictions)
            self.assertEqual(photon_test_results.metrics["accuracy"], accuracy)

            recall = recall_score(test_y, sklearn_predictions)
            self.assertEqual(photon_test_results.metrics["recall"], recall)

    def test_performance_constraints(self):
        """Check that performance constraints limit the number of fitted folds."""
        # test if the constraints are considered
        # A: for a single constraint
        test_pipe = self._build_manager(
            optimization_constraints=MinimumPerformance(
                "accuracy", 0.95, "first"))

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        # the first fold has an accuracy of 0.874, so we expect the test_pipe to
        # stop calculating after the first fold, which means the result holds
        # only one inner fold
        self.assertEqual(len(photon_results_config_item.inner_folds), 1)

        # B: for a list of constraints, accuracy should pass (0.874 in first fold > accuracy threshold)
        # but specificity should stop the computation (0.78 in first fold < specificity threshold)
        test_pipe = self._build_manager(optimization_constraints=[
            MinimumPerformance("accuracy", 0.85, "first"),
            MinimumPerformance("specificity", 0.8, "first"),
        ])

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertEqual(len(photon_results_config_item.inner_folds), 1)

        # C: for a list of constraints, all should pass
        test_pipe = self._build_manager(optimization_constraints=[
            MinimumPerformance("accuracy", 0.75, "all"),
            MinimumPerformance("specificity", 0.75, "all"),
        ])

        photon_results_config_item = test_pipe.fit(self.X, self.y)
        self.assertEqual(len(photon_results_config_item.inner_folds), 4)

    def test_raise_error(self):
        """Check that ``raise_error`` controls whether a failing fit propagates."""
        # case A: raise_error = False -> we expect continuation of the computation
        test_pipe = self._build_manager(raise_error=False)

        # computing with an unequal number of samples and targets fails
        # internally, but must not raise while raise_error is off
        test_pipe.fit(self.X, self.y[:10])

        # case B: raise_error = True -> the same call has to raise
        test_pipe.raise_error = True
        with self.assertRaises(IndexError):
            test_pipe.fit(self.X, self.y[:10])

    def test_save_predictions(self):
        """Check that every inner fold stores one prediction per sample."""
        # assert that we have the predictions stored
        test_pipe = self._build_manager()

        # in case we want to have metrics calculated across folds, we need to
        # temporarily store the predictions
        # NOTE(review): the flag is set on optimization_infos here, whereas
        # test_process_fit_results sets it on cross_validation_infos -- confirm
        # which object the manager actually reads.
        test_pipe.optimization_infos.calculate_metrics_across_folds = True
        config_item = test_pipe.fit(self.X, self.y)

        for inner_fold in config_item.inner_folds:
            self.assertEqual(len(inner_fold.training.y_pred),
                             inner_fold.number_samples_training)
            self.assertEqual(len(inner_fold.validation.y_pred),
                             inner_fold.number_samples_validation)

    def test_save_feature_importances(self):
        """Check the length of the feature importances stored per inner fold."""
        test_pipe = self._build_manager()

        # we expect the feature importances to be of length 5 because the
        # config sets PCA__n_components to 5
        output_config = test_pipe.fit(self.X, self.y)
        for inner_fold in output_config.inner_folds:
            self.assertEqual(len(inner_fold.feature_importances[0]), 5)

    def test_process_fit_results(self):
        """Check which aggregated metrics a fit returns for each metric mode.

        The same manager is fitted with metrics calculated across folds only,
        per fold only, and with both enabled.
        """
        test_pipe = self._build_manager()

        def fit_with(across_folds, per_fold):
            """Set both metric flags on the manager and return the fit result."""
            infos = test_pipe.cross_validation_infos
            infos.calculate_metrics_across_folds = across_folds
            infos.calculate_metrics_per_fold = per_fold
            return test_pipe.fit(self.X, self.y)

        def assert_fold_operations(expected_operations, returned_metric_list):
            """Assert that exactly every metric/operation pair was returned."""
            expected_returns = {
                metric + "__" + str(operation)
                for metric in self.optimization.metrics
                for operation in expected_operations
            }
            returned_formatted_metrics = {
                m.metric_name + "__" + str(m.operation)
                for m in returned_metric_list
            }
            # assertSetEqual reports the differing entries on failure
            self.assertSetEqual(expected_returns, returned_formatted_metrics)

        # keep the original fit order: across folds, per fold, then both
        across_folds_config_item = fit_with(True, False)
        per_fold_config_item = fit_with(False, True)
        across_and_per_folds_config_item = fit_with(True, True)

        num_of_metrics = len(test_pipe.optimization_infos.metrics)
        raw_only = [FoldOperations.RAW]
        mean_and_std = [FoldOperations.MEAN, FoldOperations.STD]

        # (fit result, expected number of metrics, expected fold operations)
        expectations = [
            # both: mean and std over the folds + one raw value across folds
            (across_and_per_folds_config_item,
             2 * num_of_metrics + num_of_metrics,
             raw_only + mean_and_std),
            # across folds only: one raw value for each metric
            (across_folds_config_item, num_of_metrics, raw_only),
            # per fold only: mean and std for each metric
            (per_fold_config_item, 2 * num_of_metrics, mean_and_std),
        ]

        for config_item, expected_count, operations in expectations:
            for returned_metrics in (config_item.metrics_train,
                                     config_item.metrics_test):
                self.assertEqual(len(returned_metrics), expected_count)
                assert_fold_operations(operations, returned_metrics)

    def test_extract_feature_importances(self):
        """Check ``feature_importances_`` for three different final estimators."""
        # one machine with coef_
        self.pipe.fit(self.X, self.y)
        f_importances_coef = self.pipe.feature_importances_
        self.assertIsNotNone(f_importances_coef)
        self.assertIsInstance(f_importances_coef, list)

        # one machine with feature_importances_
        f_imp_pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("DecisionTreeClassifier",
             PipelineElement("DecisionTreeClassifier")),
        ])
        f_imp_pipe.fit(self.X, self.y)
        f_importances = f_imp_pipe.feature_importances_
        self.assertIsNotNone(f_importances)
        self.assertIsInstance(f_importances, list)

        # one machine that has no feature importances
        no_f_imp_pipe = PhotonPipeline([
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("PCA", PipelineElement("PCA")),
            ("SVC", PipelineElement("SVC", kernel="rbf")),
        ])
        no_f_imp_pipe.fit(self.X, self.y)
        self.assertIsNone(no_f_imp_pipe.feature_importances_)
Beispiel #24
0
    def test_combi_from_single_and_group_caching(self):
        """Check cache file counts for a NeuroBranch combined with group caching.

        The NeuroBranch writes to its own cache folder while the surrounding
        pipeline writes to the base cache folder; after each fit the number of
        ``*.p`` files in both folders is compared with the expected counts.
        """

        # 1. load data
        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        nr_of_expected_pickles_per_config = len(X)
        y = np.random.randn(len(X))

        # 2. specify cache directories
        cache_folder_base = self.cache_folder_path
        cache_folder_neuro = os.path.join(cache_folder_base,
                                          "subject_caching_test")

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

        # 3. set up Neuro Branch
        nb = NeuroBranch("SubjectCaching", nr_of_processes=3)
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)
        nb += PipelineElement("BrainMask", batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # 4. setup usual pipeline
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]})
        svm = PipelineElement("SVR", {"kernel": ["rbf", "linear"]})

        pipe = PhotonPipeline([("NeuroBranch", nb), ("StandardScaler", ss),
                               ("PCA", pca), ("SVR", svm)])

        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            """Fit with ``config`` and compare the file counts of both folders."""
            pipe.set_params(**config)
            pipe.fit(X, y)
            nr_of_generated_cache_files = len(
                glob.glob(os.path.join(cache_folder_base, "*.p")))
            # assertEqual reports both counts when they differ
            self.assertEqual(nr_of_generated_cache_files,
                             expected_nr_of_files_group)

            nr_of_generated_cache_files_subject = len(
                glob.glob(os.path.join(cache_folder_neuro, "*.p")))
            self.assertEqual(nr_of_generated_cache_files_subject,
                             expected_nr_subject)

        config1 = {
            "NeuroBranch__ResampleImages__voxel_size": 5,
            "PCA__n_components": 7,
            "SVR__C": 2,
        }
        config2 = {
            "NeuroBranch__ResampleImages__voxel_size": 3,
            "PCA__n_components": 4,
            "SVR__C": 5,
        }

        # first config we expect to have a cached_file for the standard scaler and the pca
        # and we expect to have two files (one resampler, one brain mask) for each input data
        transform_and_check_folder(config1, 2,
                                   2 * nr_of_expected_pickles_per_config)

        # second config we expect to have two cached files for the standard scaler (one for the
        # 5 voxel input and one for the 3 voxel input) and two files for the first and second
        # config pcas, and we expect four files per input data in the neuro branch cache folder
        transform_and_check_folder(config2, 4,
                                   4 * nr_of_expected_pickles_per_config)

        # when we transform with the first config again, nothing should happen
        transform_and_check_folder(config1, 4,
                                   4 * nr_of_expected_pickles_per_config)

        # when we use an empty config, a ValueError is expected; the file
        # counts passed here can only be checked if no error is raised, in
        # which case assertRaises fails the test
        with self.assertRaises(ValueError):
            transform_and_check_folder({}, 6,
                                       6 * nr_of_expected_pickles_per_config)

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)
Beispiel #25
0
    def test_inverse_tansform(self):
        """Compare inverse transforms with sklearn, then invert through a stack."""
        # simple pipe: the sklearn pipeline provides the reference result
        reference_pipe = SKPipeline([("SS", self.sk_ss), ("PCA", self.sk_pca)])
        reference_pipe.fit(self.X, self.y)
        reference_transformed = reference_pipe.transform(self.X)
        expected_inverse = reference_pipe.inverse_transform(
            reference_transformed)

        photon_pipe = PhotonPipeline([("SS", self.p_ss), ("PCA", self.p_pca)])
        photon_pipe.fit(self.X, self.y)
        photon_transformed, _y, _kwargs = photon_pipe.transform(self.X)
        photon_inverse, _y, _kwargs = photon_pipe.inverse_transform(
            photon_transformed)

        inverses_match = np.array_equal(expected_inverse, photon_inverse)
        self.assertTrue(inverses_match)

        # now including stack
        stacked_steps = [
            ("stack", Stack("stack", [self.p_pca])),
            ("StandardScaler", PipelineElement("StandardScaler")),
            ("LinearSVC", PipelineElement("LinearSVC")),
        ]
        stacked_pipe = PhotonPipeline(stacked_steps)
        stacked_pipe.fit(self.X, self.y)
        # the inverse transform of the feature importances is expected to have
        # as many columns as the original input
        importances = stacked_pipe.feature_importances_
        restored, _y, _kwargs = stacked_pipe.inverse_transform(importances)
        self.assertEqual(restored.shape[1], self.X.shape[1])
Beispiel #26
0
class CachedPhotonPipelineTests(PhotonBaseTest):
    """Tests for the caching of PhotonPipeline and NeuroBranch.

    Most checks count the ``*.p`` files found in the cache folders after
    fitting / transforming with different hyperparameter configurations.
    """

    @staticmethod
    def _count_cache_files(folder):
        """Return the number of ``*.p`` files located directly in *folder*."""
        return len(glob.glob(os.path.join(folder, "*.p")))

    def setUp(self):
        """Create a caching StandardScaler -> PCA -> SVC pipeline, two configs and the data."""
        super(CachedPhotonPipelineTests, self).setUp()
        # Photon Version
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]},
                              random_state=3)
        svm = PipelineElement("SVC", {"kernel": ["rbf", "linear"]},
                              random_state=3)

        self.pipe = PhotonPipeline([("StandardScaler", ss), ("PCA", pca),
                                    ("SVC", svm)])

        self.pipe.caching = True
        self.pipe.fold_id = "12345643463434"
        # NOTE(review): cache_folder_path is presumably provided by PhotonBaseTest - confirm
        self.pipe.cache_folder = self.cache_folder_path

        self.config1 = {
            "PCA__n_components": 4,
            "SVC__C": 3,
            "SVC__kernel": "rbf"
        }

        self.config2 = {
            "PCA__n_components": 7,
            "SVC__C": 1,
            "SVC__kernel": "linear"
        }

        # return_X_y has to be given by keyword: current scikit-learn
        # releases declare it keyword-only and reject a positional ``True``
        self.X, self.y = load_breast_cancer(return_X_y=True)

    def test_group_caching(self):
        """Check cache file counts per config and that cached results equal uncached ones."""
        # transform one config
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        # two files should be cached (one standard scaler output + one pca output)
        self.assertEqual(self._count_cache_files(self.pipe.cache_folder), 2)

        # transform second config
        self.pipe.set_params(**self.config2)
        self.pipe.fit(self.X, self.y)
        self.pipe.transform(self.X, self.y)
        # three files should be cached (one standard scaler output, as its
        # config hasn't changed, + two pca outputs)
        self.assertEqual(self._count_cache_files(self.pipe.cache_folder), 3)

        # now transform with config 1 again, results should be loaded
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_2, y_2, kwargs_2 = self.pipe.transform(self.X, self.y)
        self.assertTrue(np.array_equal(X_new, X_2))
        self.assertTrue(np.array_equal(y_new, y_2))
        self.assertTrue(np.array_equal(kwargs_new, kwargs_2))

        # results should be the same as when caching is deactivated
        self.pipe.caching = False
        self.pipe.set_params(**self.config1)
        self.pipe.fit(self.X, self.y)
        X_uc, y_uc, kwargs_uc = self.pipe.transform(self.X, self.y)
        self.assertTrue(np.array_equal(X_uc, X_2))
        self.assertTrue(np.array_equal(y_uc, y_2))
        self.assertTrue(np.array_equal(kwargs_uc, kwargs_2))

    def test_empty_hyperparameters(self):
        """Check that caching works when set_params receives no hyperparameters."""
        # test if one can use it when only default parameters are given and hyperparameter space is empty
        self.pipe.set_params(**{})
        self.pipe.fit(self.X, self.y)
        X_new, y_new, kwargs_new = self.pipe.transform(self.X, self.y)
        # two files should be cached (one standard scaler output + one pca output)
        self.assertEqual(self._count_cache_files(self.pipe.cache_folder), 2)

        self.pipe.set_params(**{})
        self.pipe.fit(self.X, self.y)
        X_new2, y_new2, kwargs_new2 = self.pipe.transform(self.X, self.y)
        # assert nothing happened in the cache folder
        self.assertEqual(self._count_cache_files(self.pipe.cache_folder), 2)
        self.assertTrue(np.array_equal(X_new, X_new2))
        self.assertTrue(np.array_equal(y_new, y_new2))
        self.assertTrue(np.array_equal(kwargs_new, kwargs_new2))

    def test_single_subject_caching(self):
        """Check the number of NeuroBranch cache files for repeated and new configs."""
        nb = NeuroBranch("subject_caching_test")
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)

        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        y = np.random.randn(len(X))

        cache_folder = os.path.join(self.cache_folder_path,
                                    "subject_caching_test")
        nb.base_element.cache_folder = cache_folder

        nr_of_expected_pickles_per_config = len(X)

        def transform_and_check_folder(config, expected_nr_of_files):
            # apply the config, transform, then compare the cache file count
            nb.set_params(**config)
            nb.transform(X, y)
            self.assertEqual(self._count_cache_files(cache_folder),
                             expected_nr_of_files)

        try:
            # fit with first config
            # expect one cache file per input file
            transform_and_check_folder({"ResampleImages__voxel_size": 5},
                                       nr_of_expected_pickles_per_config)

            # after fitting with second config, we expect two times the number of input files to be in cache
            transform_and_check_folder({"ResampleImages__voxel_size": 10},
                                       2 * nr_of_expected_pickles_per_config)

            # fit with first config again, we expect no further cache files to be generated, because they exist
            transform_and_check_folder({"ResampleImages__voxel_size": 5},
                                       2 * nr_of_expected_pickles_per_config)
        finally:
            # clean up afterwards, also when one of the checks above failed
            CacheManager.clear_cache_files(cache_folder)

    def test_combi_from_single_and_group_caching(self):
        """Check cache file counts when a cached NeuroBranch feeds a cached pipeline."""
        # 1. load data
        test_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "../test_data/")
        X = AtlasLibrary().get_nii_files_from_folder(test_folder,
                                                     extension=".nii")
        nr_of_expected_pickles_per_config = len(X)
        y = np.random.randn(len(X))

        # 2. specify cache directories
        cache_folder_base = self.cache_folder_path
        cache_folder_neuro = os.path.join(cache_folder_base,
                                          "subject_caching_test")

        CacheManager.clear_cache_files(cache_folder_base)
        CacheManager.clear_cache_files(cache_folder_neuro)

        # 3. set up Neuro Branch
        nb = NeuroBranch("SubjectCaching", nr_of_processes=3)
        # increase complexity by adding batching
        nb += PipelineElement("ResampleImages", batch_size=4)
        nb += PipelineElement("BrainMask", batch_size=4)
        nb.base_element.cache_folder = cache_folder_neuro

        # 4. setup usual pipeline
        ss = PipelineElement("StandardScaler", {})
        pca = PipelineElement("PCA", {"n_components": [3, 10, 50]})
        svm = PipelineElement("SVR", {"kernel": ["rbf", "linear"]})

        pipe = PhotonPipeline([("NeuroBranch", nb), ("StandardScaler", ss),
                               ("PCA", pca), ("SVR", svm)])

        pipe.caching = True
        pipe.fold_id = "12345643463434"
        pipe.cache_folder = cache_folder_base

        def transform_and_check_folder(config, expected_nr_of_files_group,
                                       expected_nr_subject):
            # apply the config, fit, then compare the file counts of both cache folders
            pipe.set_params(**config)
            pipe.fit(X, y)
            self.assertEqual(self._count_cache_files(cache_folder_base),
                             expected_nr_of_files_group)
            self.assertEqual(self._count_cache_files(cache_folder_neuro),
                             expected_nr_subject)

        config1 = {
            "NeuroBranch__ResampleImages__voxel_size": 5,
            "PCA__n_components": 7,
            "SVR__C": 2,
        }
        config2 = {
            "NeuroBranch__ResampleImages__voxel_size": 3,
            "PCA__n_components": 4,
            "SVR__C": 5,
        }

        try:
            # first config we expect to have a cached_file for the standard scaler and the pca
            # and we expect to have two files (one resampler, one brain mask) for each input data
            transform_and_check_folder(config1, 2,
                                       2 * nr_of_expected_pickles_per_config)

            # second config we expect to have two cached_file for the standard scaler (one time for 5 voxel input and one
            # time for 3 voxel input) and two files for the first and second config pcas,
            # and we expect to have 2 * nr of input data for resampler plus one time masker
            transform_and_check_folder(config2, 4,
                                       4 * nr_of_expected_pickles_per_config)

            # when we transform with the first config again, nothing should happen
            transform_and_check_folder(config1, 4,
                                       4 * nr_of_expected_pickles_per_config)

            # an empty config is expected to end in a ValueError here.
            # NOTE(review): an earlier comment anticipated new cache entries
            # (6 group files, 6 per input) instead - confirm which step raises
            with self.assertRaises(ValueError):
                transform_and_check_folder({}, 6,
                                           6 * nr_of_expected_pickles_per_config)
        finally:
            # remove the generated files even if a check above failed
            CacheManager.clear_cache_files(cache_folder_base)
            CacheManager.clear_cache_files(cache_folder_neuro)