def test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)
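        # The transformed output should keep the dtype of the input data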

        configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = RandomTreesEmbedding(
            random_state=1,
            **{hp_name: default[hp_name]
               for hp_name in default})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)

        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = RandomTreesEmbedding(
            random_state=1,
            **{hp_name: default[hp_name]
               for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
Example #2
    def test_predict_proba_batched(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = ParamSklearnClassifier(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
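        # With batch_size set, predict_proba should process the test set in chunks
        # of 20 and still return the same probabilities as the unbatched call above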
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
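        # Turn the multiclass digits targets into a two-column multilabel problem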
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #3
    def test_predict_batched(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = ParamSklearnClassifier(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
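        # Wrap the fitted pipeline in a mock that forwards predict() calls
        # while recording how often it is invoked during batched prediction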
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, ), prediction.shape)
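        # 1647 test samples with batch_size=20 -> ceil(1647 / 20) = 83 predict calls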
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)

        configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = RandomTreesEmbedding(random_state=1,
                                            **{hp_name: default[hp_name]
                                               for hp_name in default})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)

        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = RandomTreesEmbedding(random_state=1,
                                            **{hp_name: default[hp_name]
                                               for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
Example #7
    def _test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)

        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)

        # Sparse
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        self.assertEqual(X_train.dtype, np.float32)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        X_train = X_train.astype(np.float64)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
Example #8
    def test_predict_proba_batched_sparse(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": 'True',
                "preprocessor:__choice__": "no_preprocessing",
                'classifier:random_forest:bootstrap': 'True',
                'classifier:random_forest:criterion': 'gini',
                'classifier:random_forest:max_depth': 'None',
                'classifier:random_forest:min_samples_split': 2,
                'classifier:random_forest:min_samples_leaf': 2,
                'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                'classifier:random_forest:max_features': 0.5,
                'classifier:random_forest:max_leaf_nodes': 'None',
                'classifier:random_forest:n_estimators': 100,
                "rescaling:__choice__": "min/max"
            })
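        # A fully specified random forest configuration for the sparse search space
        # defined above (dataset_properties={'sparse': True})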

        # Multiclass
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #9
    def test_predict_proba_batched_sparse(self):
        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(cs,
                               values={"balancing:strategy": "none",
                                       "classifier:__choice__": "random_forest",
                                       "imputation:strategy": "mean",
                                       "one_hot_encoding:minimum_fraction": 0.01,
                                       "one_hot_encoding:use_minimum_fraction": 'True',
                                       "preprocessor:__choice__": "no_preprocessing",
                                       'classifier:random_forest:bootstrap': 'True',
                                       'classifier:random_forest:criterion': 'gini',
                                       'classifier:random_forest:max_depth': 'None',
                                       'classifier:random_forest:min_samples_split': 2,
                                       'classifier:random_forest:min_samples_leaf': 2,
                                       'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                                       'classifier:random_forest:max_features': 0.5,
                                       'classifier:random_forest:max_leaf_nodes': 'None',
                                       'classifier:random_forest:n_estimators': 100,
                                       "rescaling:__choice__": "min/max"})

        # Multiclass
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = ParamSklearnClassifier(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, list)
        self.assertEqual(2, len(prediction))
        self.assertEqual((1647, 10), prediction[0].shape)
        self.assertEqual((1647, 10), prediction[1].shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)

        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)

        # Sparse
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        self.assertEqual(X_train.dtype, np.float32)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        X_train = X_train.astype(np.float64)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
    def setUp(self):
        self.X_train, self.Y_train, self.X_test, self.Y_test = \
            get_dataset('iris')

        eliminate_class_two = self.Y_train != 2
        self.X_train = self.X_train[eliminate_class_two]
        self.Y_train = self.Y_train[eliminate_class_two]
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(
            SelectPercentileClassification)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
        self.assertFalse((transformation == 0).all())

        transformation, original = _test_preprocessing(
            SelectPercentileClassification, make_sparse=True)
        self.assertTrue(scipy.sparse.issparse(transformation))
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = SelectPercentileClassification(
            random_state=1,
            **{
                hp_name: default[hp_name]
                for hp_name in default if default[hp_name] is not None
            })

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(
            X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
Example #13
 def test_fit(self):
     X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
     automl = autosklearn.automl.AutoML(self.output, self.output, 10, 10)
     automl.fit(X_train, Y_train)
     score = automl.score(X_test, Y_test)
     self.assertGreaterEqual(score, 0.9)
     self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
Example #14
 def test_fit(self):
     X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
     automl = autosklearn.automl.AutoML(self.output, self.output, 10, 10)
     automl.fit(X_train, Y_train)
     score = automl.score(X_test, Y_test)
     self.assertGreaterEqual(score, 0.9)
     self.assertEqual(automl.task_, MULTICLASS_CLASSIFICATION)
Example #15
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(Nystroem)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], 100)
        self.assertFalse((transformation == 0).all())

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = Nystroem(random_state=1,
                                **{
                                    hp_name: default[hp_name]
                                    for hp_name in default
                                    if default[hp_name] is not None
                                })

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(
            X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], 100)
Example #16
    def test_fit_pSMAC(self):
        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=1,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        true_targets_ensemble = np.load(true_targets_ensemble_path)
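        # Flip a single entry so the dummy predictions below do not match the
        # loaded targets perfectly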
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(output, '.auto-sklearn',
                                              'predictions_ensemble',
                                              'predictions_ensemble_1_00030.npy')
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        probas_test = np.zeros((len(Y_test), 3), dtype=float)
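        # One-hot encode the test targets so the dummy predictor reproduces them exactly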
        for i, value in enumerate(Y_test):
            probas_test[i, value] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        backend = Backend(output, output)
        backend.save_model(dummy, 30, 1)

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=2,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)
        automl.run_ensemble_builder(0, 1, 50).wait()

        score = automl.score(X_test, Y_test)

        self.assertEqual(len(os.listdir(os.path.join(output, '.auto-sklearn',
                                                     'ensemble_indices'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(SelectPercentileClassification)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1]/2))
        self.assertFalse((transformation == 0).all())

        transformation, original = _test_preprocessing(SelectPercentileClassification, make_sparse=True)
        self.assertTrue(scipy.sparse.issparse(transformation))
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1]/2))

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = SelectPercentileClassification(random_state=1,
                            **{hp_name: default[hp_name] for hp_name in
                               default if default[hp_name] is not None})

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {'metric': 'bac_metric', 'task': MULTICLASS_CLASSIFICATION,
                  'is_sparse': False, 'target_num': 3}
        D.data = {'X_train': X_train, 'Y_train': Y_train,
                  'X_valid': X_valid, 'X_test': X_test}
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(D.info,
            include_estimators=['ridge'],
            include_preprocessors=['select_rates'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i, end=' ')
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                print()
                continue
            err[i] = evaluator.predict()
            print(err[i])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)

        print("Number of times it was worse than random guessing: " +
              str(np.sum(err > 1)))
    def test_default_configuration(self):
        configuration_space = RidgeRegression.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        configuration_space_preproc = RandomKitchenSinks.get_hyperparameter_search_space()
        default_preproc = configuration_space_preproc.get_default_configuration()

        for i in range(10):
            # This should be a bad result
            predictions, targets = _test_regressor(RidgeRegression,)
            self.assertAlmostEqual(0.32614416980439365,
                sklearn.metrics.r2_score(y_true=targets, y_pred=predictions))

            # This should be much better
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes',
                                                           make_sparse=False)
            preprocessor = RandomKitchenSinks(
                random_state=1,
                **{hp_name: default_preproc[hp_name] for hp_name in
                   default_preproc if default_preproc[hp_name] is not None})

            transformer = preprocessor.fit(X_train, Y_train)
            X_train_transformed = transformer.transform(X_train)
            X_test_transformed = transformer.transform(X_test)

            regressor = RidgeRegression(
                random_state=1,
                **{hp_name: default[hp_name] for hp_name in
                   default if default[hp_name] is not None})
            predictor = regressor.fit(X_train_transformed, Y_train)
            predictions = predictor.predict(X_test_transformed)

            self.assertAlmostEqual(0.37183512452087852,
                sklearn.metrics.r2_score(y_true=Y_test, y_pred=predictions))
    def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": BAC_METRIC, "task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(D.info, include_estimators=["lda"], include_preprocessors=["pca"])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration, all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)

        print("Number of times it was worse than random guessing:" + str(np.sum(err > 1)))
Example #21
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
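            # Cap n_iter for passive_aggressive and sgd (presumably to keep the test fast)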
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = ParamSklearnClassifier(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0] or \
                        "removed all features" in e.args[0] or \
                        "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example #23
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(SelectRates)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], 3)
        self.assertFalse((transformation == 0).all())

        transformation, original = _test_preprocessing(SelectRates,
                                                       make_sparse=True)
        self.assertTrue(scipy.sparse.issparse(transformation))
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = SelectRates.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = SelectRates(random_state=1,
                                   **{
                                       hp_name: default[hp_name]
                                       for hp_name in default
                                       if default[hp_name] is not None
                                   })

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(
            X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        # I don't know why it's 52 here and not 32, which would be half of the
        # number of features; it seems to be related to a runtime warning
        # raised by sklearn
        self.assertEqual(transformation.shape[1], 52)
Example #25
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(SelectRates)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], 3)
        self.assertFalse((transformation == 0).all())

        transformation, original = _test_preprocessing(SelectRates, make_sparse=True)
        self.assertTrue(scipy.sparse.issparse(transformation))
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = SelectRates.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = SelectRates(
            random_state=1, **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None}
        )

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        # I don't know why it's 52 here and not 32, which would be half of the
        # number of features; it seems to be related to a runtime warning
        # raised by sklearn
        self.assertEqual(transformation.shape[1], 52)
Example #26
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {'metric': 'bac_metric', 'task': MULTICLASS_CLASSIFICATION,
                  'is_sparse': False, 'target_num': 3}
        D.data = {'X_train': X_train, 'Y_train': Y_train,
                  'X_valid': X_valid, 'X_test': X_test}
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(D.info,
            include_estimators=['ridge'],
            include_preprocessors=['select_rates'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i, end=' ')
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = CVEvaluator(D_, configuration,
                                    with_predictions=True)

            if not self._fit(evaluator):
                print()
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_
            print(err[i], configuration['classifier'])

            num_targets = len(np.unique(Y_train))
            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that ten models were trained
            self.assertEqual(len(evaluator.models), 10)
            self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0])
            self.assertEqual(Y_optimization_pred.shape[1], num_targets)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_valid_pred.shape[1], num_targets)
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            self.assertEqual(Y_test_pred.shape[1], num_targets)
            # Test some basic statistics of the dataset
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.01)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.01)
                num_models_better_than_random += 1
        self.assertGreater(num_models_better_than_random, 5)
Example #27
 def test_default_configuration(self):
     for i in range(2):
         cs = ParamSklearnClassifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = ParamSklearnClassifier(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example #28
 def test_fit_OneHotEncoder(self):
     # Test if the OneHotEncoder is called
     X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
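     # Append an index column that will be declared CATEGORICAL via feat_type below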
     X_train = np.hstack((X_train,
                          np.arange(X_train.shape[0]).reshape((-1, 1))))
     cls = autosklearn.AutoSklearnClassifier(time_left_for_this_task=5,
                                             per_run_time_limit=5)
     cls.fit(X_train, Y_train,
             feat_type=['NUMERICAL', 'NUMERICAL', 'NUMERICAL', 'NUMERICAL',
                        'CATEGORICAL'])
     self.assertEqual([False, False, False, False, True],
                      cls._ohe.categorical_features)
Example #29
 def test_default_configuration(self):
     for i in range(2):
         cs = ParamSklearnClassifier.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
         auto = ParamSklearnClassifier(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(X_test)
         self.assertAlmostEqual(
             0.9599999999999995,
             sklearn.metrics.accuracy_score(predictions, Y_test))
         scores = auto.predict_proba(X_test)
Example #30
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(output, output, 12, 12)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #31
 def test_default_configuration(self):
     for i in range(2):
         cs = ParamSklearnRegressor.get_hyperparameter_search_space()
         default = cs.get_default_configuration()
         X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes')
         auto = ParamSklearnRegressor(default)
         auto = auto.fit(X_train, Y_train)
         predictions = auto.predict(copy.deepcopy(X_test))
         # The lower the worse
         r2_score = sklearn.metrics.r2_score(Y_test, predictions)
         self.assertAlmostEqual(0.41626416529791199, r2_score)
         model_score = auto.score(copy.deepcopy(X_test), Y_test)
         self.assertEqual(model_score, r2_score)
Example #32
    def _test_helper(self, Preprocessor, dataset=None, make_sparse=False):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset,
                                                       make_sparse=make_sparse)
        original_X_train = X_train.copy()
        configuration_space = Preprocessor.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = Preprocessor(random_state=1,
                                    **{hp_name: default[hp_name] for hp_name in
                                       default if default[hp_name] is not None})
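        # The Preprocessor passed in is a choice component; the concrete
        # algorithm sits behind .choice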
        preprocessor = preprocessor.choice
        transformer = preprocessor.fit(X_train, Y_train)
        return transformer.transform(X_train), original_X_train
Example #33
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")

        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": BAC_METRIC, "task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(
            D.info, include_estimators=["extra_trees"], include_preprocessors=["select_rates"]
        )

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = CVEvaluator(D_, configuration, with_predictions=True)

            if not self._fit(evaluator):
                print()
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = evaluator.predict()
            err[i] = e_
            print(err[i], configuration["classifier:__choice__"])

            num_targets = len(np.unique(Y_train))
            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that ten models were trained
            self.assertEqual(len(evaluator.models), 10)
            self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0])
            self.assertEqual(Y_optimization_pred.shape[1], num_targets)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_valid_pred.shape[1], num_targets)
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            self.assertEqual(Y_test_pred.shape[1], num_targets)
            # Test some basic statistics of the dataset
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.01)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.01)
                num_models_better_than_random += 1
        self.assertGreater(num_models_better_than_random, 5)
Example #34
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = ParamSklearnClassifier.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            cls = ParamSklearnClassifier(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in e.args[0] or \
                        "removed all features" in e.args[0] or \
                        "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example #36
    def test_metalearning(self):
        dataset_name = 'digits'

        initial_challengers = {
            ACC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            AUC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'",
            BAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            F1_METRIC: "--initial-challengers \" "
                       "-balancing:strategy 'weighting' "
                       "-classifier:__choice__ 'proj_logit'",
            PAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'"
        }

        for metric in initial_challengers:
            configuration_space = get_configuration_space(
                {
                    'metric': metric,
                    'task': MULTICLASS_CLASSIFICATION,
                    'is_sparse': False
                },
                include_preprocessors=['no_preprocessing'])

            X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
            categorical = [False] * X_train.shape[1]

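            # Meta-features computed on the raw and on the encoded data are
            # turned into initial challenger strings for the SMAC call.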
            meta_features_label = calc_meta_features(X_train, Y_train,
                                                     categorical, dataset_name)
            meta_features_encoded_label = calc_meta_features_encoded(X_train,
                                                                     Y_train,
                                                                     categorical,
                                                                     dataset_name)
            initial_configuration_strings_for_smac = \
                create_metalearning_string_for_smac_call(
                    meta_features_label,
                    meta_features_encoded_label,
                    configuration_space, dataset_name, metric,
                    MULTICLASS_CLASSIFICATION, False, 1, None)

            print(metric)
            print(initial_configuration_strings_for_smac[0])
            self.assertTrue(initial_configuration_strings_for_smac[
                                0].startswith(initial_challengers[metric]))
Example #37
    def test_evaluate_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

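        # Hold out the first 25 test points as a validation split.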
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': AUC_METRIC,
            'task': BINARY_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 2
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Example #38
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

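        # Stub model that always returns the same two-class probabilities;
        # for a binary task the evaluator should keep only the
        # positive-class column.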
        class Dummy2(object):

            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': task_type,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
Example #39
    def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': 'bac_metric',
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'target_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['ridge'],
            include_preprocessors=['select_rates'])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration,
                                         all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)

        print('Number of times it was worse than random guessing:' +
              str(np.sum(err > 1)))
Example #40
    def test_fit_OneHotEncoder(self):
        # Test if the OneHotEncoder is called
        X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
        X_train = np.hstack((X_train, np.arange(X_train.shape[0]).reshape(
            (-1, 1))))
        cls = autosklearn.AutoSklearnClassifier(time_left_for_this_task=5,
                                                per_run_time_limit=5)
        cls.fit(X_train,
                Y_train,
                feat_type=[
                    "NUMERICAL", "NUMERICAL", "NUMERICAL", "NUMERICAL",
                    "CATEGORICAL"
                ])
        self.assertEqual([False, False, False, False, True],
                         cls.ohe_.categorical_features)
Example #41
    def test_predict_batched(self):
        cs = ParamSklearnRegressor.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = ParamSklearnRegressor(default)

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
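        # Wrap the fitted pipeline in a mock so the number of batched
        # predict calls can be counted.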
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((356,), prediction.shape)
        self.assertEqual(18, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #42
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), '.test')

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': 'bac_metric',
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'target_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
        D.basename = 'test'

        configuration_space = get_configuration_space(D.info)

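        # Sample configurations until one fits successfully, then check
        # that the optimization predictions were written to disk.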
        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(D, configuration,
                                         with_predictions=True,
                                         all_scoring_functions=True,
                                         output_dir=output_dir,
                                         output_y_test=True)

            if not self._fit(evaluator):
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(os.path.exists(os.path.join(output_dir,
                                                        'y_optimization.npy')))
            break
Example #43
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), '.test')

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset('boston')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(D, configuration,
                                         with_predictions=True,
                                         all_scoring_functions=True,
                                         output_dir=output_dir,
                                         output_y_test=True)

            if not self._fit(evaluator):
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(os.path.exists(os.path.join(
                output_dir, '.auto-sklearn', 'true_targets_ensemble.npy')))
            break
Example #44
    def _test_helper(self, Preprocessor, dataset=None, make_sparse=False):
        X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset,
                                                       make_sparse=make_sparse)
        original_X_train = X_train.copy()
        configuration_space = Preprocessor.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = Preprocessor(random_state=1,
                                    **{
                                        hp_name: default[hp_name]
                                        for hp_name in default
                                        if default[hp_name] is not None
                                    })
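        # The component wrapper keeps the concrete preprocessor in its
        # .choice attribute.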
        preprocessor = preprocessor.choice
        transformer = preprocessor.fit(X_train, Y_train)
        return transformer.transform(X_train), original_X_train
Example #45
    def test_default_configuration_predict_proba(self):
        for i in range(10):
            predictions, targets = _test_classifier_predict_proba(
                LibSVM_SVC,
                sparse=True,
                dataset='digits',
                train_size_maximum=500)
            self.assertAlmostEqual(
                4.6680593525563063,
                sklearn.metrics.log_loss(targets, predictions))

        for i in range(10):
            predictions, targets = _test_classifier_predict_proba(
                LibSVM_SVC, sparse=True, dataset='iris')
            self.assertAlmostEqual(
                0.8649665185853217,
                sklearn.metrics.log_loss(targets, predictions))

        # Binary (two-class) case
        for i in range(10):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
            remove_training_data = Y_train == 2
            remove_test_data = Y_test == 2
            X_train = X_train[~remove_training_data]
            Y_train = Y_train[~remove_training_data]
            X_test = X_test[~remove_test_data]
            Y_test = Y_test[~remove_test_data]
            ss = sklearn.preprocessing.StandardScaler()
            X_train = ss.fit_transform(X_train)
            configuration_space = LibSVM_SVC.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()

            cls = LibSVM_SVC(random_state=1,
                             **{
                                 hp_name: default[hp_name]
                                 for hp_name in default
                                 if default[hp_name] is not None
                             })

            cls = cls.fit(X_train, Y_train)
            prediction = cls.predict_proba(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.log_loss(Y_test, prediction),
                0.69323680119641773)
Example #46
    def test_default_configuration_regression(self):
        for i in range(5):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes')
            configuration_space = FastICA.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = FastICA(
                random_state=1,
                **{hp_name: default[hp_name]
                   for hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = Ridge()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.r2_score(Y_test, predictions)
            self.assertAlmostEqual(accuracy, 0.32614416980439365)
Example #47
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), ".test")

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {'metric': 'bac_metric', 'task': MULTICLASS_CLASSIFICATION,
                  'is_sparse': False, 'target_num': 3}
        D.data = {'X_train': X_train, 'Y_train': Y_train,
                  'X_valid': X_valid, 'X_test': X_test}
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
        D.basename = "test"

        configuration_space = get_configuration_space(D.info)

        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(D, configuration,
                                         with_predictions=True,
                                         all_scoring_functions=True,
                                         output_dir=output_dir,
                                         output_y_test=True)

            if not self._fit(evaluator):
                print()
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(os.path.exists(os.path.join(output_dir,
                                                        "y_optimization.npy")))
            break
Example #48
    def test_default_configuration(self):
        configuration_space = RidgeRegression.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        configuration_space_preproc = RandomKitchenSinks.get_hyperparameter_search_space(
        )
        default_preproc = configuration_space_preproc.get_default_configuration(
        )

        for i in range(10):
            # This should give a bad result
            predictions, targets = _test_regressor(RidgeRegression)
            self.assertAlmostEqual(
                0.32614416980439365,
                sklearn.metrics.r2_score(y_true=targets, y_pred=predictions))

            # This should be much better
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes',
                                                           make_sparse=False)
            preprocessor = RandomKitchenSinks(
                random_state=1,
                **{
                    hp_name: default_preproc[hp_name]
                    for hp_name in default_preproc
                    if default_preproc[hp_name] is not None
                })

            transformer = preprocessor.fit(X_train, Y_train)
            X_train_transformed = transformer.transform(X_train)
            X_test_transformed = transformer.transform(X_test)

            regressor = RidgeRegression(random_state=1,
                                        **{
                                            hp_name: default[hp_name]
                                            for hp_name in default
                                            if default[hp_name] is not None
                                        })
            predictor = regressor.fit(X_train_transformed, Y_train)
            predictions = predictor.predict(X_test_transformed)

            self.assertAlmostEqual(
                0.37183512452087852,
                sklearn.metrics.r2_score(y_true=Y_test, y_pred=predictions))
Example #49
    def test_default_configuration_classify(self):
        for i in range(3):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = GEM.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = GEM(
                random_state=1,
                **{hp_name: default[hp_name]
                   for hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = ProjLogitCLassifier(max_epochs=5, random_state=1)
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertGreaterEqual(accuracy, 0.94)
Example #50
    def test_default_configuration_negative_values(self):
        # Custom preprocessing test to check whether clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
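        # StandardScaler introduces negative values; the MultinomialNB
        # component is expected to clip them to zero.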
        configuration_space = MultinomialNB.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        cls = MultinomialNB(random_state=1,
                            **{
                                hp_name: default[hp_name]
                                for hp_name in default
                                if default[hp_name] is not None
                            })

        cls = cls.fit(X_train, Y_train)
        prediction = cls.predict(X_test)
        self.assertAlmostEqual(np.nanmean(prediction == Y_test),
                               0.88888888888888884)
Example #51
    def test_default_configuration_classify(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = ExtraTreesPreprocessor.get_hyperparameter_search_space(
            )
            default = configuration_space.get_default_configuration()
            preprocessor = ExtraTreesPreprocessor(
                random_state=1,
                **{hp_name: default[hp_name]
                   for hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2)
Example #52
    def test_default_configuration_classify(self):
        for i in range(3):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = FeatureAgglomeration.get_hyperparameter_search_space(
            )
            default = configuration_space.get_default_configuration()
            preprocessor = FeatureAgglomeration(
                random_state=1,
                **{hp_name: default[hp_name]
                   for hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RandomForestClassifier(random_state=1)
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.8026715)
Example #53
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        class Dummy2(object):
            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {'metric': 'bac_metric', 'task': task_type,
                  'is_sparse': False, 'target_num': 3}
        D.data = {'X_train': X_train, 'Y_train': Y_train,
                  'X_valid': X_valid, 'X_test': X_test}
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info, include_estimators=['ridge'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
Example #54
    def test_default_configuration_classify(self):
        for i in range(5):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = KernelPCA.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = KernelPCA(random_state=1,
                                     **{
                                         hp_name: default[hp_name]
                                         for hp_name in default
                                         if default[hp_name] is not None
                                     })
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.096539162112932606)
Example #55
    def test_min_max_scaler_sparse_boston_data(self):
        # Use the boston housing dataset because column three is one-hot
        # encoded. This is important to test because the standard sklearn
        # scaler would set all values of the one-hot encoded column to zero,
        # while we keep them at 1.
        X_train, Y_train, X_test, Y_test = get_dataset('boston',
                                                       make_sparse=True)
        num_data_points = len(X_train.data)
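        # After scaling, each column's maximum should be 1 except the
        # first, whose expected maximum on this sparse split is 0.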
        expected_max_values = [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        expected_max_values = np.array(expected_max_values).reshape((1, -1))

        scaler = MinMaxScaler()
        scaler.fit(X_train, Y_train)
        transformation = scaler.transform(X_train)

        assert_array_almost_equal(
            np.array(transformation.todense().min(axis=0)), np.zeros((1, 13)))
        assert_array_almost_equal(
            np.array(transformation.todense().max(axis=0)),
            expected_max_values)
        # Test that the matrix is still sparse
        self.assertTrue(sparse.issparse(transformation))
        self.assertEqual(num_data_points, len(transformation.data))
Example #56
    def test_default_configuration_classify(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=True)
            configuration_space = TruncatedSVD.get_hyperparameter_search_space(
            )
            default = configuration_space.get_default_configuration()
            preprocessor = TruncatedSVD(random_state=1,
                                        **{
                                            hp_name: default[hp_name]
                                            for hp_name in default
                                            if default[hp_name] is not None
                                        })
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.44201578627808136, places=2)