def test_confounder_removal_statistically(self):
        """Check the correlation structure after confounder removal.

        Column 0 of ``self.z`` serves as target, columns 1 and 2 as features
        and column 3 as the confounder.
        """
        cr = PipelineElement("ConfounderRemoval", {},
                             standardize_covariates=False)
        features = self.z[:, 1:3]
        target = self.z[:, 0]
        confounder_kwargs = {"confounder": self.z[:, 3]}
        cr.fit(features, target, **confounder_kwargs)

        # use transform to write data to cache
        z_transformed = cr.transform(features, **confounder_kwargs)

        # column layout: 0 = target, 1-2 = transformed features, 3 = confounder
        stacked = np.concatenate(
            [
                target.reshape(-1, 1),
                z_transformed[0],
                self.z[:, 3].reshape(-1, 1),
            ],
            axis=1,
        )
        corr = np.corrcoef(stacked, rowvar=False)

        # correlation between target and feature should be lower than 0.25 in this case
        for feature_column in (1, 2):
            self.assertLess(corr[feature_column, 0], 0.25)
        # correlation between covariate and feature should be near zero
        for feature_column in (1, 2):
            self.assertAlmostEqual(corr[3, feature_column], 0)
Exemple #2
0
    def test_predict_proba(self):
        """Check predict_proba for the SVC element and a GaussianProcessClassifier.

        The SVC pipeline element is expected to return ``None``; the
        GaussianProcessClassifier is expected to return the reference
        probabilities for the first sample.
        """
        self.svc_pipe_element.fit(self.X, self.y)
        self.assertIsNone(self.svc_pipe_element.predict_proba(self.X))

        gpc = PipelineElement('GaussianProcessClassifier')
        gpc.fit(self.X, self.y)
        # compare with a tolerance: the probabilities come out of a numerical
        # optimisation, so bit-exact equality is not a stable expectation
        np.testing.assert_array_almost_equal(
            gpc.predict_proba(self.X)[0],
            np.asarray([0.5847072926551391, 0.4152927073448609]))
Exemple #3
0
    def test_copy_me(self):
        """Exercise ``copy_me`` on configured, fitted and custom pipeline elements."""
        original = PipelineElement(
            'SVC', {'C': [0.1, 1], 'kernel': ['rbf', 'sigmoid']})
        original.set_params(C=0.1, kernel='sigmoid')
        duplicate = original.copy_me()

        # same settings, but the copy must wrap its own base element
        self.assertEqual(original.random_state, duplicate.random_state)
        self.assertNotEqual(duplicate.base_element, original.base_element)
        self.assertDictEqual(elements_to_dict(duplicate),
                             elements_to_dict(original))
        self.assertEqual(duplicate.base_element.C, original.base_element.C)

        # a copy taken from the fitted element must describe itself exactly
        # like the copy taken before fitting
        fitted = original.fit(self.X, self.y)
        duplicate_after_fit = fitted.copy_me()
        self.assertDictEqual(elements_to_dict(duplicate),
                             elements_to_dict(duplicate_after_fit))

        # hyperparameters are carried over; parameter values are not shared
        fresh = PipelineElement(
            'SVC', {'C': [0.1, 1], 'kernel': ['rbf', 'sigmoid']})
        fresh_copy = fresh.copy_me()
        expected_hyperparameters = {
            'SVC__C': [0.1, 1],
            'SVC__kernel': ['rbf', 'sigmoid'],
        }
        self.assertDictEqual(fresh_copy.hyperparameters,
                             expected_hyperparameters)
        fresh_copy.base_element.C = 3
        self.assertNotEqual(fresh.base_element.C, fresh_copy.base_element.C)

        # test custom element
        custom = PipelineElement.create(
            'CustomElement',
            base_element=DummyNeedsCovariatesEstimator(),
            hyperparameters={})
        custom_copy = custom.copy_me()
        self.assertDictEqual(elements_to_dict(custom),
                             elements_to_dict(custom_copy))

        # copying is expected to raise for this base element
        uncopyable = PipelineElement.create(
            'MyUnDeepcopyableObject',
            base_element=GridSearchOptimizer(),
            hyperparameters={})
        with self.assertRaises(Exception):
            uncopyable.copy_me()
Exemple #4
0
class SwitchTests(unittest.TestCase):
    """Tests for ``Switch``: configurations, delegation to the selected
    element, copying, estimator type detection and adding elements."""

    def setUp(self):
        """Load the data and build the elements, branches and switches used below."""
        # return_X_y is passed by keyword; recent scikit-learn releases no
        # longer accept it as a positional argument
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.svc = PipelineElement('SVC', {
            'C': [0.1, 1],
            'kernel': ['rbf', 'sigmoid']
        })
        self.tree = PipelineElement('DecisionTreeClassifier',
                                    {'min_samples_split': [2, 3, 4]})
        self.gpc = PipelineElement('GaussianProcessClassifier')
        self.pca = PipelineElement('PCA')

        # branches and switches only ever receive copies of the elements above
        self.estimator_branch = Branch('estimator_branch',
                                       [self.tree.copy_me()])
        self.transformer_branch = Branch('transformer_branch',
                                         [self.pca.copy_me()])

        self.estimator_switch = Switch(
            'estimator_switch',
            [self.svc.copy_me(),
             self.tree.copy_me(),
             self.gpc.copy_me()])
        self.estimator_switch_with_branch = Switch(
            'estimator_switch_with_branch',
            [self.tree.copy_me(),
             self.estimator_branch.copy_me()])
        self.transformer_switch_with_branch = Switch(
            'transformer_switch_with_branch',
            [self.pca.copy_me(),
             self.transformer_branch.copy_me()])
        self.switch_in_switch = Switch('Switch_in_switch', [
            self.transformer_branch.copy_me(),
            self.transformer_switch_with_branch.copy_me()
        ])

    def test_init(self):
        """The name given to the constructor is stored on the switch."""
        self.assertEqual(self.estimator_switch.name, 'estimator_switch')

    def test_hyperparams(self):
        """Check the configurations a switch derives from its elements."""
        # assert number of different configs to test
        # each config combi for each element: 4 for SVC, 3 for the decision
        # tree and 1 for the GaussianProcessClassifier = 8
        self.assertEqual(
            len(self.estimator_switch.pipeline_element_configurations), 3)
        self.assertEqual(
            len(self.estimator_switch.pipeline_element_configurations[0]), 4)
        self.assertEqual(
            len(self.estimator_switch.pipeline_element_configurations[1]), 3)

        # hyperparameters: (element index, configuration index) tuples
        expected_choices = [(0, 0), (0, 1), (0, 2), (0, 3),
                            (1, 0), (1, 1), (1, 2),
                            (2, 0)]
        self.assertDictEqual(
            self.estimator_switch.hyperparameters,
            {'estimator_switch__current_element': expected_choices})

        # config grid: one single-entry dict per choice, in the same order
        self.assertListEqual(
            self.estimator_switch.generate_config_grid(),
            [{'estimator_switch__current_element': choice}
             for choice in expected_choices])

    def test_set_params(self):
        """Check set_params for tuple-style and name-style configurations."""
        # test for grid search
        false_config = {'current_element': 1}
        with self.assertRaises(ValueError):
            self.estimator_switch.set_params(**false_config)

        correct_config = {'current_element': (0, 1)}
        self.estimator_switch.set_params(**correct_config)
        self.assertEqual(self.estimator_switch.base_element.base_element.C,
                         0.1)
        self.assertEqual(
            self.estimator_switch.base_element.base_element.kernel, 'sigmoid')

        # test for other optimizers
        smac_config = {'SVC__C': 2, 'SVC__kernel': 'rbf'}
        self.estimator_switch.set_params(**smac_config)
        self.assertEqual(self.estimator_switch.base_element.base_element.C, 2)
        self.assertEqual(
            self.estimator_switch.base_element.base_element.kernel, 'rbf')

    def test_fit(self):
        """Fitting through the switch must match fitting the tree directly."""
        # numpy's global RNG is reset before each fit so both runs start equal
        np.random.seed(42)
        self.estimator_switch.set_params(**{'current_element': (1, 0)})
        self.estimator_switch.fit(self.X, self.y)
        np.random.seed(42)
        self.tree.set_params(**{'min_samples_split': 2})
        self.tree.fit(self.X, self.y)
        np.testing.assert_array_equal(
            self.tree.base_element.feature_importances_,
            self.estimator_switch.base_element.feature_importances_)

    def test_transform(self):
        """Transforming through the switch must match the plain PCA element."""
        self.transformer_switch_with_branch.set_params(
            **{'current_element': (0, 0)})
        self.transformer_switch_with_branch.fit(self.X, self.y)
        self.pca.fit(self.X, self.y)

        switch_Xt, _, _ = self.transformer_switch_with_branch.transform(self.X)
        pca_Xt, _, _ = self.pca.transform(self.X)
        self.assertTrue(np.array_equal(pca_Xt, switch_Xt))

    def test_predict(self):
        """Predictions through the switch must match the plain tree element."""
        self.estimator_switch.set_params(**{'current_element': (1, 0)})
        np.random.seed(42)
        self.estimator_switch.fit(self.X, self.y)
        self.tree.set_params(**{'min_samples_split': 2})
        np.random.seed(42)
        self.tree.fit(self.X, self.y)

        switch_preds = self.estimator_switch.predict(self.X)
        tree_preds = self.tree.predict(self.X)
        self.assertTrue(np.array_equal(switch_preds, tree_preds))

    def test_predict_proba(self):
        """predict_proba through the switch must match a standalone GPC element."""
        gpc = PipelineElement('GaussianProcessClassifier')
        svc = PipelineElement('SVC')
        switch = Switch('EstimatorSwitch', [gpc, svc])
        switch.set_params(**{'current_element': (0, 0)})
        np.random.seed(42)
        switch_probas = switch.fit(self.X, self.y).predict_proba(self.X)
        np.random.seed(42)
        gpc_probas = self.gpc.fit(self.X, self.y).predict_proba(self.X)
        self.assertTrue(np.array_equal(switch_probas, gpc_probas))

    def test_inverse_transform(self):
        """inverse_transform through the switch must match PCA and restore X."""
        self.transformer_switch_with_branch.set_params(
            **{'current_element': (0, 0)})
        self.transformer_switch_with_branch.fit(self.X, self.y)
        self.pca.fit(self.X, self.y)
        Xt_pca, _, _ = self.pca.transform(self.X)
        Xt_switch, _, _ = self.transformer_switch_with_branch.transform(self.X)
        X_pca, _, _ = self.pca.inverse_transform(Xt_pca)
        X_switch, _, _ = self.transformer_switch_with_branch.inverse_transform(
            Xt_switch)

        self.assertTrue(np.array_equal(Xt_pca, Xt_switch))
        self.assertTrue(np.array_equal(X_pca, X_switch))
        np.testing.assert_almost_equal(X_switch, self.X)

    def test_base_element(self):
        """base_element must be the selected element itself, not a copy."""
        switch = Switch('switch', [self.svc, self.tree])
        switch.set_params(**{'current_element': (1, 1)})
        self.assertIs(switch.base_element, self.tree)
        self.assertIs(switch.base_element.base_element, self.tree.base_element)

        # other optimizer
        switch.set_params(**{'DecisionTreeClassifier__min_samples_split': 2})
        self.assertIs(switch.base_element, self.tree)
        self.assertIs(switch.base_element.base_element, self.tree.base_element)

    def test_copy_me(self):
        """Copies must describe themselves identically but hold distinct elements."""
        switches = [
            self.estimator_switch, self.estimator_switch_with_branch,
            self.transformer_switch_with_branch, self.switch_in_switch
        ]

        for switch in switches:
            copy = switch.copy_me()

            self.assertEqual(switch.random_state, copy.random_state)

            for i, copied_element in enumerate(copy.elements):
                self.assertNotEqual(copied_element, switch.elements[i])

            # separate names keep the loop variable intact
            switch_dict = elements_to_dict(switch)
            copy_dict = elements_to_dict(copy)

            self.assertDictEqual(copy_dict, switch_dict)

    def test_estimator_type(self):
        """Check _estimator_type for consistent and mixed element combinations."""
        pca = PipelineElement('PCA')
        ica = PipelineElement('FastICA')
        svc = PipelineElement('SVC')
        svr = PipelineElement('SVR')
        tree_class = PipelineElement('DecisionTreeClassifier')
        tree_reg = PipelineElement('DecisionTreeRegressor')

        # mixing a transformer with an estimator is rejected
        switch = Switch('MySwitch', [pca, svr])
        with self.assertRaises(NotImplementedError):
            _ = switch._estimator_type

        # mixing a classifier with a regressor is rejected
        switch = Switch('MySwitch', [svc, svr])
        with self.assertRaises(NotImplementedError):
            _ = switch._estimator_type

        switch = Switch('MySwitch', [pca, ica])
        self.assertIsNone(switch._estimator_type)

        switch = Switch('MySwitch', [tree_class, svc])
        self.assertEqual(switch._estimator_type, 'classifier')

        switch = Switch('MySwitch', [tree_reg, svr])
        self.assertEqual(switch._estimator_type, 'regressor')

        self.assertEqual(self.estimator_switch._estimator_type, 'classifier')
        self.assertEqual(self.estimator_switch_with_branch._estimator_type,
                         'classifier')
        self.assertIsNone(self.transformer_switch_with_branch._estimator_type)
        self.assertIsNone(self.switch_in_switch._estimator_type)

    def test_add(self):
        """Check element bookkeeping and renaming of duplicate element names."""
        self.assertEqual(len(self.estimator_switch.elements), 3)
        self.assertEqual(len(self.switch_in_switch.elements), 2)
        self.assertEqual(len(self.transformer_switch_with_branch.elements), 2)

        self.assertEqual(
            list(self.estimator_switch.elements_dict.keys()),
            ['SVC', 'DecisionTreeClassifier', 'GaussianProcessClassifier'])
        self.assertEqual(
            list(self.switch_in_switch.elements_dict.keys()),
            ['transformer_branch', 'transformer_switch_with_branch'])

        # both ways of filling a switch must work without raising
        Switch('MySwitch',
               [PipelineElement('PCA'),
                PipelineElement('FastICA')])
        incremental_switch = Switch('MySwitch2')
        incremental_switch += PipelineElement('PCA')
        incremental_switch += PipelineElement('FastICA')

        # test doubled names
        with self.assertRaises(ValueError):
            self.estimator_switch += self.estimator_switch.elements[0]
        self.estimator_switch += PipelineElement("SVC")
        self.assertEqual(self.estimator_switch.elements[-1].name, "SVC2")
        self.estimator_switch += PipelineElement(
            "SVC", hyperparameters={'kernel': ['polynomial', 'sigmoid']})
        self.assertEqual(self.estimator_switch.elements[-1].name, "SVC3")
        self.estimator_switch += PipelineElement("SVR")
        self.assertEqual(self.estimator_switch.elements[-1].name, "SVR")
        self.estimator_switch += PipelineElement("SVC")
        self.assertEqual(self.estimator_switch.elements[-1].name, "SVC4")

        # check that hyperparameters are renamed respectively
        self.assertEqual(
            self.estimator_switch.pipeline_element_configurations[4][0]
            ["SVC3__kernel"], 'polynomial')

    def test_feature_importances(self):
        """Check feature_importances_ for several selected elements."""
        self.estimator_switch.set_params(**{'current_element': (1, 0)})
        self.estimator_switch.fit(self.X, self.y)
        self.assertEqual(len(self.estimator_switch.feature_importances_),
                         self.X.shape[1])

        self.estimator_switch_with_branch.set_params(
            **{'current_element': (1, 0)})
        self.estimator_switch_with_branch.fit(self.X, self.y)
        self.assertEqual(
            len(self.estimator_switch_with_branch.feature_importances_),
            self.X.shape[1])

        self.estimator_switch.set_params(**{'current_element': (2, 0)})
        self.estimator_switch.fit(self.X, self.y)
        # NOTE(review): the switch was just fitted, but the assertion reads the
        # branch created in setUp -- possibly meant self.estimator_switch; confirm
        self.assertIsNone(self.estimator_branch.feature_importances_)

        self.switch_in_switch.set_params(**{'current_element': (1, 0)})
        self.switch_in_switch.fit(self.X, self.y)
        self.assertIsNone(self.switch_in_switch.feature_importances_)
        # NOTE(review): set_params is called on estimator_switch while
        # switch_in_switch is re-fitted and checked -- confirm the intended target
        self.estimator_switch.set_params(**{'current_element': (1, 0)})
        self.switch_in_switch.fit(self.X, self.y)
        self.assertIsNone(self.switch_in_switch.feature_importances_)
Exemple #5
0
class PipelineElementTests(unittest.TestCase):
    """Tests for ``PipelineElement``: creation, delegation to the wrapped
    estimator, hyperparameter handling, copying and type checks."""

    def setUp(self):
        """Create a PCA and an SVC element and load the breast cancer data."""
        self.pca_pipe_element = PipelineElement('PCA',
                                                {'n_components': [1, 2]},
                                                test_disabled=True)
        self.svc_pipe_element = PipelineElement('SVC', {
            'C': [0.1, 1],
            'kernel': ['rbf', 'sigmoid']
        })
        # return_X_y is passed by keyword; recent scikit-learn releases no
        # longer accept it as a positional argument
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.kwargs = {'covariates': self.y}
        # expected outputs of the dummy elements: every input shifted by one
        self.Xt = self.X + 1
        self.yt = self.y + 1
        self.kwargst = {'covariates': self.y + 1}

    def test_create_failure(self):
        """An unknown element name must raise a NameError."""
        with self.assertRaises(NameError):
            PipelineElement('NONSENSEName', {})

    def test_pipeline_element_create(self):
        """Check base element, test_disabled flag and name after creation."""
        # the wrapped object is an sklearn PCA
        self.assertIsInstance(self.pca_pipe_element.base_element, PCA)

        # test_disabled is passed correctly
        self.assertTrue(self.pca_pipe_element.test_disabled)

        # correct name
        self.assertEqual(self.pca_pipe_element.name, 'PCA')

    def test_fit(self):
        """Fitting must populate the wrapped estimators with the reference values."""
        self.pca_pipe_element.fit(self.X, self.y)
        self.assertEqual(self.pca_pipe_element.base_element.components_.shape,
                         (30, 30))
        # fitted floats are compared with a tolerance instead of bit-exactly
        self.assertAlmostEqual(
            self.pca_pipe_element.base_element.components_[0, 0],
            0.005086232018734175)

        self.svc_pipe_element.fit(self.X, self.y)
        # numpy's helper is used so the check does not depend on whether
        # _intercept_ is a scalar or an array
        np.testing.assert_almost_equal(
            self.svc_pipe_element.base_element._intercept_,
            -0.3753900173819406)

    def test_transform(self):
        """transform returns a 3-tuple whose first entry is the transformed data."""
        self.pca_pipe_element.fit(self.X, self.y)

        Xt, _, _ = self.pca_pipe_element.transform(self.X)
        self.assertEqual(Xt.shape, (569, 30))
        self.assertAlmostEqual(Xt[0, 0], 1160.1425737041347)

    def test_predict(self):
        """predict returns one prediction per sample."""
        self.svc_pipe_element.fit(self.X, self.y)

        yt = self.svc_pipe_element.predict(self.X)
        self.assertEqual(yt.shape, (569, ))
        self.assertEqual(yt[21], 1)

    def test_predict_proba(self):
        """Check predict_proba for the SVC element and a GaussianProcessClassifier."""
        self.svc_pipe_element.fit(self.X, self.y)
        self.assertIsNone(self.svc_pipe_element.predict_proba(self.X))

        gpc = PipelineElement('GaussianProcessClassifier')
        gpc.fit(self.X, self.y)
        # compare with a tolerance: the probabilities come out of a numerical
        # optimisation, so bit-exact equality is not a stable expectation
        np.testing.assert_array_almost_equal(
            gpc.predict_proba(self.X)[0],
            np.asarray([0.5847072926551391, 0.4152927073448609]))

    def test_inverse_transform(self):
        """inverse_transform must restore the original data."""
        Xt, _, _ = self.pca_pipe_element.fit(self.X, self.y).transform(self.X)
        X, _, _ = self.pca_pipe_element.inverse_transform(Xt)
        np.testing.assert_array_almost_equal(X, self.X)

    def test_one_hyperparameter_setup(self):
        """Check hyperparameters and config grid for one parameter plus test_disabled."""
        # hyperparameter names are prefixed with the element name
        self.assertDictEqual(self.pca_pipe_element.hyperparameters, {
            'PCA__n_components': [1, 2],
            'PCA__disabled': [False, True]
        })

        # config_grid is created as expected
        self.assertListEqual(self.pca_pipe_element.generate_config_grid(),
                             [{
                                 'PCA__n_components': 1,
                                 'PCA__disabled': False
                             }, {
                                 'PCA__n_components': 2,
                                 'PCA__disabled': False
                             }, {
                                 'PCA__disabled': True
                             }])

    def test_more_hyperparameters_setup(self):
        """Check hyperparameters and config grid for two parameters."""
        # hyperparameter names are prefixed with the element name
        self.assertDictEqual(self.svc_pipe_element.hyperparameters, {
            'SVC__C': [0.1, 1],
            'SVC__kernel': ['rbf', 'sigmoid']
        })

        # config_grid is created as expected
        self.assertListEqual(self.svc_pipe_element.generate_config_grid(),
                             [{
                                 'SVC__C': 0.1,
                                 'SVC__kernel': 'rbf'
                             }, {
                                 'SVC__C': 0.1,
                                 'SVC__kernel': 'sigmoid'
                             }, {
                                 'SVC__C': 1,
                                 'SVC__kernel': 'rbf'
                             }, {
                                 'SVC__C': 1,
                                 'SVC__kernel': 'sigmoid'
                             }])

    def test_no_hyperparameters(self):
        """Without hyperparameters the wrapped PCA must equal a default sklearn PCA."""
        pca_sklearn_element = PCA()
        pca_photon_element = PipelineElement('PCA')

        self.assertDictEqual(pca_sklearn_element.__dict__,
                             pca_photon_element.base_element.__dict__)

    def test_set_params(self):
        """set_params forwards known parameters and rejects unknown ones."""
        config = {'n_components': 3, 'disabled': False}
        self.pca_pipe_element.set_params(**config)
        self.assertFalse(self.pca_pipe_element.disabled)
        self.assertEqual(self.pca_pipe_element.base_element.n_components, 3)
        with self.assertRaises(ValueError):
            self.pca_pipe_element.set_params(**{'any_weird_param': 1})

    def test_set_random_state(self):
        """Setting random_state on a branch must reach nested elements."""
        # we handle all elements in one method that is inherited so we capture them all in this test
        random_state = 53
        my_branch = Branch("random_state_branch")
        my_branch += PipelineElement("StandardScaler")
        my_switch = Switch("transformer_Switch")
        my_switch += PipelineElement("LassoFeatureSelection")
        my_switch += PipelineElement("PCA")
        my_branch += my_switch
        my_stack = Stack("Estimator_Stack")
        my_stack += PipelineElement("SVR")
        my_stack += PipelineElement("Ridge")
        my_branch += my_stack
        my_branch += PipelineElement("ElasticNet")

        my_branch.random_state = random_state
        # checked on the PCA inside the switch and the Ridge inside the stack,
        # both on the pipeline element and on the wrapped estimator
        self.assertEqual(my_switch.elements[1].random_state, random_state)
        self.assertEqual(my_switch.elements[1].base_element.random_state,
                         random_state)
        self.assertEqual(my_stack.elements[1].random_state, random_state)
        self.assertEqual(my_stack.elements[1].base_element.random_state,
                         random_state)

    def test_adjusted_delegate_call_transformer(self):
        """transform must hand y and kwargs only to transformers that need them."""
        # check standard transformer
        trans = PipelineElement.create('Transformer',
                                       base_element=DummyTransformer(),
                                       hyperparameters={})
        X, y, kwargs = trans.transform(self.X, self.y, **self.kwargs)
        self.assertTrue(np.array_equal(
            X, self.Xt))  # only X should be transformed
        self.assertTrue(np.array_equal(y, self.y))
        self.assertDictEqual(kwargs, self.kwargs)

        # check transformer needs y
        trans = PipelineElement.create('NeedsYTransformer',
                                       base_element=DummyNeedsYTransformer(),
                                       hyperparameters={})
        X, y, kwargs = trans.transform(self.X, self.y, **self.kwargs)
        self.assertTrue(np.array_equal(X, self.Xt))
        self.assertTrue(np.array_equal(y, self.yt))
        self.assertDictEqual(kwargs, self.kwargs)

        trans = PipelineElement.create('NeedsYTransformer',
                                       base_element=DummyNeedsYTransformer(),
                                       hyperparameters={})
        X, y, kwargs = trans.transform(self.X,
                                       self.y)  # this time without any kwargs
        self.assertTrue(np.array_equal(X, self.Xt))
        self.assertTrue(np.array_equal(y, self.yt))
        self.assertDictEqual(kwargs, {})

        # check transformer needs covariates
        trans = PipelineElement.create(
            'NeedsCovariatesTransformer',
            base_element=DummyNeedsCovariatesTransformer(),
            hyperparameters={})
        X, y, kwargs = trans.transform(self.X, **self.kwargs)
        self.assertTrue(np.array_equal(X, self.Xt))
        self.assertTrue(
            np.array_equal(kwargs['covariates'], self.kwargst['covariates']))
        self.assertIsNone(y)

        # check transformer needs covariates and needs y
        trans = PipelineElement.create(
            'NeedsCovariatesAndYTransformer',
            base_element=DummyNeedsCovariatesAndYTransformer(),
            hyperparameters={})
        X, y, kwargs = trans.transform(self.X, self.y, **self.kwargs)
        self.assertTrue(np.array_equal(X, self.Xt))
        self.assertTrue(np.array_equal(y, self.yt))
        self.assertTrue(
            np.array_equal(kwargs['covariates'], self.kwargst['covariates']))

    def test_adjusted_delegate_call_estimator(self):
        """predict must work for estimators with and without covariates."""
        # check standard estimator
        est = PipelineElement.create('Estimator',
                                     base_element=DummyEstimator(),
                                     hyperparameters={})
        y = est.predict(self.X)
        # the dummy's predictions are expected to equal self.Xt (X + 1)
        self.assertTrue(np.array_equal(y, self.Xt))

        # check estimator needs covariates
        est = PipelineElement.create(
            'Estimator',
            base_element=DummyNeedsCovariatesEstimator(),
            hyperparameters={})
        X = est.predict(self.X, **self.kwargs)
        # the dummy's predictions are expected to equal self.Xt (X + 1)
        self.assertTrue(np.array_equal(X, self.Xt))

    def test_predict_when_no_transform(self):
        """Calling transform on an estimator element yields its predictions as X."""
        # check standard estimator
        est = PipelineElement.create('Estimator',
                                     base_element=DummyEstimator(),
                                     hyperparameters={})
        X, y, kwargs = est.transform(self.X)
        # the returned X is expected to equal self.Xt (X + 1)
        self.assertTrue(np.array_equal(X, self.Xt))
        self.assertIsNone(y)

        # check estimator needs covariates
        est = PipelineElement.create(
            'Estimator',
            base_element=DummyNeedsCovariatesEstimator(),
            hyperparameters={})
        X, y, kwargs = est.transform(self.X, **self.kwargs)
        # the returned X is expected to equal self.Xt (X + 1)
        self.assertTrue(np.array_equal(X, self.Xt))
        # the covariates are expected to come back unchanged here
        self.assertTrue(
            np.array_equal(kwargs['covariates'], self.kwargs['covariates']))
        self.assertIsNone(y)

    def test_copy_me(self):
        """Exercise copy_me on configured, fitted and custom pipeline elements."""
        svc = PipelineElement('SVC', {
            'C': [0.1, 1],
            'kernel': ['rbf', 'sigmoid']
        })
        svc.set_params(**{'C': 0.1, 'kernel': 'sigmoid'})
        copy = svc.copy_me()

        # same settings, but the copy must wrap its own base element
        self.assertEqual(svc.random_state, copy.random_state)
        self.assertNotEqual(copy.base_element, svc.base_element)
        self.assertDictEqual(elements_to_dict(copy), elements_to_dict(svc))
        self.assertEqual(copy.base_element.C, svc.base_element.C)

        # check if copies are still the same, even when making a copy of a fitted PipelineElement
        copy_after_fit = svc.fit(self.X, self.y).copy_me()
        self.assertDictEqual(elements_to_dict(copy),
                             elements_to_dict(copy_after_fit))

        svc = PipelineElement('SVC', {
            'C': [0.1, 1],
            'kernel': ['rbf', 'sigmoid']
        })
        copy = svc.copy_me()
        self.assertDictEqual(copy.hyperparameters, {
            'SVC__C': [0.1, 1],
            'SVC__kernel': ['rbf', 'sigmoid']
        })
        # changing the copy must not leak into the original
        copy.base_element.C = 3
        self.assertNotEqual(svc.base_element.C, copy.base_element.C)

        # test custom element
        custom_element = PipelineElement.create(
            'CustomElement',
            base_element=DummyNeedsCovariatesEstimator(),
            hyperparameters={})
        copy = custom_element.copy_me()
        self.assertDictEqual(elements_to_dict(custom_element),
                             elements_to_dict(copy))

        # copying is expected to raise for this base element
        custom_element2 = PipelineElement.create(
            'MyUnDeepcopyableObject',
            base_element=GridSearchOptimizer(),
            hyperparameters={})
        with self.assertRaises(Exception):
            custom_element2.copy_me()

    def test_estimator_type(self):
        """Check _estimator_type for sklearn elements and for invalid dummies."""
        estimator = PipelineElement('SVC')
        self.assertEqual(estimator._estimator_type, 'classifier')

        estimator = PipelineElement('SVR')
        self.assertEqual(estimator._estimator_type, 'regressor')

        estimator = PipelineElement('PCA')
        self.assertIsNone(estimator._estimator_type)

        # each of these dummies must be rejected when the type is requested
        estimator = PipelineElement.create('Dummy', DummyEstimatorWrongType(),
                                           {})
        with self.assertRaises(NotImplementedError):
            _ = estimator._estimator_type

        estimator = PipelineElement.create('Dummy',
                                           DummyTransformerWithPredict(), {})
        with self.assertRaises(NotImplementedError):
            _ = estimator._estimator_type

        estimator = PipelineElement.create('Dummy', DummyEstimatorNoPredict(),
                                           {})
        with self.assertRaises(NotImplementedError):
            _ = estimator._estimator_type

    def test_sanity_check_item_for_add(self):
        """Only PHOTON elements may be added to PHOTON containers."""
        valid_type = PipelineElement('StandardScaler')
        valid_type2 = CallbackElement('my_callback', None)
        invalid_type = StandardScaler()
        invalid_type2 = Preprocessing()

        PipelineElement.sanity_check_element_type_for_building_photon_pipes(
            valid_type, PipelineElement)
        PipelineElement.sanity_check_element_type_for_building_photon_pipes(
            valid_type2, PipelineElement)

        with self.assertRaises(TypeError):
            PipelineElement.sanity_check_element_type_for_building_photon_pipes(
                invalid_type, PipelineElement)

        with self.assertRaises(TypeError):
            PipelineElement.sanity_check_element_type_for_building_photon_pipes(
                invalid_type2, PipelineElement)

        classes_to_test = [Stack, Switch, Branch, Preprocessing]
        for photon_class in classes_to_test:
            # Preprocessing is constructed without a name, the others with one
            if photon_class is Preprocessing:
                instance = photon_class()
            else:
                instance = photon_class('tmp_instance')
            instance.add(valid_type)
            instance.add(valid_type2)
            with self.assertRaises(TypeError):
                instance.add(invalid_type)
            with self.assertRaises(TypeError):
                instance.add(invalid_type2)
Exemple #6
0
class ConfounderRemovalTests(PhotonBaseTest):
    """Tests for the ``ConfounderRemoval`` pipeline element.

    The element's output is compared with an OLS residualisation computed by
    hand with statsmodels, and with correlation bounds on synthetic data.
    """

    @staticmethod
    def _residualize_by_ols(data, design_matrix):
        """Return ``data`` with the OLS fit on ``design_matrix`` removed per column.

        :param data: 2D array; every column is regressed separately.
        :param design_matrix: regressors, as passed to ``sm.OLS`` as ``exog``.
        :return: array with the shape of ``data`` holding the residuals.
        """
        residuals = np.empty(data.shape)
        for i in range(data.shape[1]):
            column = np.squeeze(data[:, i])
            # fit
            model = sm.OLS(endog=column, exog=design_matrix).fit()
            # transform
            residuals[:, i] = np.asarray(
                column - np.matmul(design_matrix, np.squeeze(model.params)))
        return residuals

    def setUp(self):
        """Build the hyperpipe, the random confounders and the reference results."""
        super(ConfounderRemovalTests, self).setUp()
        # return_X_y is passed by keyword; recent scikit-learn releases no
        # longer accept it as a positional argument
        self.X, self.y = load_breast_cancer(return_X_y=True)
        self.X_train = self.X[:100]
        self.y_train = self.y[:100]
        self.shuffle_split = ShuffleSplit(test_size=0.2,
                                          n_splits=1,
                                          random_state=15)
        settings = OutputSettings(project_folder=self.tmp_folder_path)
        # KFold gets no random_state: it has no effect without shuffle=True,
        # and recent scikit-learn releases reject that combination
        self.pipe = Hyperpipe("confounder_pipe",
                              outer_cv=self.shuffle_split,
                              inner_cv=KFold(n_splits=3),
                              metrics=["accuracy"],
                              best_config_metric="accuracy",
                              output_settings=settings)
        self.pipe += PipelineElement("StandardScaler")
        self.cr = PipelineElement("ConfounderRemoval")
        self.pipe += self.cr
        self.pipe += PipelineElement("SVC")
        # NOTE(review): all random draws in this method are unseeded, so the
        # fixtures differ between test runs
        self.random_confounders = np.random.randn(self.X.shape[0], 1)

        # do confounder removal by hand
        self.multiple_confounders = np.random.randn(self.X.shape[0], 2) * 10
        ols_confounder = sm.add_constant(self.multiple_confounders)
        self.X_transformed = self._residualize_by_ols(self.X, ols_confounder)

        # prepare caching
        self.confounder_train = self.multiple_confounders[:100]
        ols_confounder_train = sm.add_constant(self.confounder_train)
        self.X_train_transformed = self._residualize_by_ols(
            self.X_train, ols_confounder_train)

        # prepare confounder removal with standardization of covariates:
        # every covariate column is scaled on its own
        scaled_covs = np.asarray([
            StandardScaler().fit_transform(cov.reshape(-1, 1)).squeeze()
            for cov in self.multiple_confounders.T
        ]).T
        scaled_covs = sm.add_constant(scaled_covs)
        self.X_transformed_standardized = self._residualize_by_ols(
            self.X, scaled_covs)

        # prepare statistical testing of confounder removal
        # Generate samples from four independent normally distributed random
        # variables (with mean 0 and std. dev. 1).
        x = norm.rvs(size=(4, 300))

        # desired covariance matrix
        r = np.array([
            [1, .9, .9, .9],
            [.9, 1, .9, .9],
            [.9, .9, 1, .9],
            [.9, .9, .9, 1],
        ])
        c = cholesky(r, lower=True)

        # convert the data to correlated random variables
        self.z = np.dot(c, x).T

    def test_confounder_removal_statistically(self):
        """Check the correlation structure after confounder removal.

        Column 0 of ``self.z`` serves as target, columns 1 and 2 as features
        and column 3 as the confounder.
        """
        cr = PipelineElement("ConfounderRemoval", {},
                             standardize_covariates=False)
        cr.fit(self.z[:, 1:3], self.z[:, 0], **{'confounder': self.z[:, 3]})

        # use transform to write data to cache
        z_transformed = cr.transform(self.z[:, 1:3],
                                     **{'confounder': self.z[:, 3]})
        # column layout: 0 = target, 1-2 = transformed features, 3 = confounder
        stacked = np.concatenate(
            [
                self.z[:, 0].reshape(-1, 1),
                z_transformed[0],
                self.z[:, 3].reshape(-1, 1),
            ],
            axis=1,
        )
        corr = np.corrcoef(stacked, rowvar=False)
        # correlation between target and feature should be lower than 0.25 in this case
        # correlation between covariate and feature should be near zero
        self.assertLess(corr[1, 0], 0.25)
        self.assertLess(corr[2, 0], 0.25)
        self.assertAlmostEqual(corr[3, 1], 0)
        self.assertAlmostEqual(corr[3, 2], 0)

    def test_multiple_confounders(self):
        """The element must reproduce the hand-computed OLS residuals."""
        self.cr.fit(self.X, self.y,
                    **{'confounder': self.multiple_confounders})
        X_transformed = self.cr.transform(
            self.X, **{'confounder': self.multiple_confounders})
        np.testing.assert_array_almost_equal(X_transformed[0],
                                             self.X_transformed)

    def test_standardize_covariates(self):
        """The element must reproduce the residuals computed with scaled covariates."""
        self.cr.fit(self.X, self.y,
                    **{'confounder': self.multiple_confounders})
        X_transformed = self.cr.transform(
            self.X, **{'confounder': self.multiple_confounders})
        np.testing.assert_array_almost_equal(X_transformed[0],
                                             self.X_transformed_standardized)

    def test_use(self):
        """Fitting and transforming a hyperpipe with the element must not raise."""
        self.pipe.fit(self.X, self.y,
                      **{'confounder': self.random_confounders})
        self.pipe.transform(self.X,
                            **{'confounder': self.random_confounders})

    def test_dimensions(self):
        """A confounder with fewer samples than X must raise a ValueError."""
        with self.assertRaises(ValueError):
            self.cr.fit(self.X,
                        self.y,
                        confounder=np.random.randn(self.X.shape[0] - 10, 2))

    def test_key_error(self):
        """Passing the data under a keyword other than 'confounder' must raise a KeyError."""
        with self.assertRaises(KeyError):
            self.cr.fit(self.X,
                        self.y,
                        covariate=np.random.randn(self.X.shape[0] - 10, 2))