def test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)

        configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = RandomTreesEmbedding(random_state=1,
                                            **{hp_name: default[hp_name]
                                               for hp_name in default})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)

        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = RandomTreesEmbedding.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = RandomTreesEmbedding(random_state=1,
                                            **{hp_name: default[hp_name]
                                               for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
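
The dict comprehension over the default configuration recurs throughout these tests. A small helper along the following lines (hypothetical, not part of auto-sklearn) could build any component from its search-space defaults:

def make_default_component(component_cls, random_state=1):
    # Hypothetical helper: instantiate a component from the default
    # configuration of its hyperparameter search space. Hyperparameters whose
    # default is None are dropped, matching the pattern used in later examples.
    cs = component_cls.get_hyperparameter_search_space()
    default = cs.get_default_configuration()
    kwargs = {hp_name: default[hp_name]
              for hp_name in default
              if default[hp_name] is not None}
    return component_cls(random_state=random_state, **kwargs)
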
Example #2
    def test_predict_proba_batched(self):
        # Multiclass
        cls = SimpleClassificationPipeline(include={'classifier': ['sgd']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')

        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(include={'classifier': ['lda']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = unittest.mock.Mock(wraps=cls.steps[-1][1].predict_proba)
        cls.steps[-1][-1].predict_proba = cls_predict
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
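
The expected call counts follow from the batch arithmetic: with 1647 test samples and batch_size=20 there are ceil(1647 / 20) = 83 batches, so the 84 calls expected here presumably include one extra probe call used to determine the output shape, while the plain predict tests further below expect exactly 83. A quick sanity check on the batch count (only the two numbers above are assumed):

import math

n_samples, batch_size = 1647, 20
n_batches = math.ceil(n_samples / batch_size)
assert n_batches == 83  # the predict_proba tests expect one additional call (84)
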
Example #3
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, ((1647, 10)))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #4
    def test_predict_proba_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()

        # Multiclass
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(default)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        Y_train_ = np.zeros((Y_train.shape[0], 10))
        for i, y in enumerate(Y_train):
            Y_train_[i][y] = 1
        Y_train = Y_train_
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(prediction.shape, ((1647, 10)))
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
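
The explicit loop above builds a one-hot indicator matrix; an equivalent construction with NumPy indexing (assuming integer class labels in 0..9) is shown below. Note that this differs from the complemented encoding (1 if i != y else 0) used in the earlier multilabel examples.

import numpy as np

Y_train = np.array([3, 0, 7])         # example integer labels
Y_train_onehot = np.eye(10)[Y_train]  # row i has a single 1 at column Y_train[i]
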
Example #5
    def test_predict_batched(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        cls = SimpleClassificationPipeline(default)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def _test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)

        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)

        # Sparse
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        self.assertEqual(X_train.dtype, np.float32)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        X_train = X_train.astype(np.float64)
        configuration_space = Nystroem.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = Nystroem(random_state=1,
                                **{hp.hyperparameter.name: hp.value
                                   for hp in default.values.values()})
        preprocessor.fit(X_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
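
Unlike most of the other examples, this one uses the older ConfigSpace API (hp.hyperparameter.name and default.values.values()). With the dictionary-style Configuration access used elsewhere in this file, the same instantiation would presumably read:

preprocessor = Nystroem(random_state=1,
                        **{hp_name: default[hp_name]
                           for hp_name in default
                           if default[hp_name] is not None})
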
Example #7
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})

        config = Configuration(cs,
                               values={"balancing:strategy": "none",
                                       "classifier:__choice__": "random_forest",
                                       "imputation:strategy": "mean",
                                       "one_hot_encoding:minimum_fraction": 0.01,
                                       "one_hot_encoding:use_minimum_fraction": 'True',
                                       "preprocessor:__choice__": "no_preprocessing",
                                       'classifier:random_forest:bootstrap': 'True',
                                       'classifier:random_forest:criterion': 'gini',
                                       'classifier:random_forest:max_depth': 'None',
                                       'classifier:random_forest:min_samples_split': 2,
                                       'classifier:random_forest:min_samples_leaf': 2,
                                       'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                                       'classifier:random_forest:max_features': 0.5,
                                       'classifier:random_forest:max_leaf_nodes': 'None',
                                       'classifier:random_forest:n_estimators': 100,
                                       "rescaling:__choice__": "min/max"})

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train_ = np.zeros((Y_train.shape[0], 10))
        for i, y in enumerate(Y_train):
            Y_train_[i][y] = 1
        Y_train = Y_train_
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, ((1647, 10)))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #8
    def test_predict_proba_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(dataset_properties={"sparse": True})

        config = Configuration(
            cs,
            values={
                "balancing:strategy": "none",
                "classifier:__choice__": "random_forest",
                "imputation:strategy": "mean",
                "one_hot_encoding:minimum_fraction": 0.01,
                "one_hot_encoding:use_minimum_fraction": "True",
                "preprocessor:__choice__": "no_preprocessing",
                "classifier:random_forest:bootstrap": "True",
                "classifier:random_forest:criterion": "gini",
                "classifier:random_forest:max_depth": "None",
                "classifier:random_forest:min_samples_split": 2,
                "classifier:random_forest:min_samples_leaf": 2,
                "classifier:random_forest:min_weight_fraction_leaf": 0.0,
                "classifier:random_forest:max_features": 0.5,
                "classifier:random_forest:max_leaf_nodes": "None",
                "classifier:random_forest:n_estimators": 100,
                "rescaling:__choice__": "min/max",
            },
        )

        # Multiclass
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        # The object behind the last step in the pipeline
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual((1647, 10), prediction.shape)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        cls = SimpleClassificationPipeline(config)
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits", make_sparse=True)
        Y_train = np.array([[1 if i != y else 0 for i in range(10)]
                            for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict_proba(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1])
        cls.pipeline_.steps[-1] = ("estimator", cls_predict)
        prediction = cls.predict_proba(X_test, batch_size=20)
        self.assertEqual(prediction.shape, ((1647, 10)))
        self.assertIsInstance(prediction, np.ndarray)
        self.assertEqual(84, cls_predict.predict_proba.call_count)
        assert_array_almost_equal(prediction_, prediction)
    def test_preprocessing_dtype(self):
        # Dense
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        self.assertEqual(X_train.dtype, np.float32)

        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_train = X_train.astype(np.float64)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)

        # Sparse
        # np.float32
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        self.assertEqual(X_train.dtype, np.float32)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float32)

        # np.float64
        X_train, Y_train, X_test, Y_test = get_dataset("iris", make_sparse=True)
        X_train = X_train.astype(np.float64)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        preprocessor = SelectPercentileClassification(random_state=1,
                                                      **{hp_name: default[hp_name]
                                                         for hp_name in default})
        preprocessor.fit(X_train, Y_train)
        Xt = preprocessor.transform(X_train)
        self.assertEqual(Xt.dtype, np.float64)
Example #10
    def test_predict_batched_sparse(self):
        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        config = Configuration(cs,
            values={"balancing:strategy": "none",
                    "classifier:__choice__": "random_forest",
                    "imputation:strategy": "mean",
                    "one_hot_encoding:minimum_fraction": 0.01,
                    "one_hot_encoding:use_minimum_fraction": "True",
                    "preprocessor:__choice__": "no_preprocessing",
                    'classifier:random_forest:bootstrap': 'True',
                    'classifier:random_forest:criterion': 'gini',
                    'classifier:random_forest:max_depth': 'None',
                    'classifier:random_forest:min_samples_split': 2,
                    'classifier:random_forest:min_samples_leaf': 2,
                    'classifier:random_forest:max_features': 0.5,
                    'classifier:random_forest:max_leaf_nodes': 'None',
                    'classifier:random_forest:n_estimators': 100,
                    'classifier:random_forest:min_weight_fraction_leaf': 0.0,
                    "rescaling:__choice__": "min/max"})
        cls = SimpleClassificationPipeline(config)

        # Multiclass
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647,), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)

        # Multilabel
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                       make_sparse=True)
        Y_train = np.array([(y, 26 - y) for y in Y_train])
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((1647, 2), prediction.shape)
        self.assertEqual(83, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Example #11
    def test_default_configuration(self):
        configuration_space = RidgeRegression.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()
        configuration_space_preproc = RandomKitchenSinks.get_hyperparameter_search_space()
        default_preproc = configuration_space_preproc.get_default_configuration()

        for i in range(10):
            # This should be a bad result
            predictions, targets = _test_regressor(RidgeRegression,)
            self.assertAlmostEqual(0.32614416980439365,
                sklearn.metrics.r2_score(y_true=targets, y_pred=predictions))

            # This should be much better
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes',
                                                           make_sparse=False)
            preprocessor = RandomKitchenSinks(
                random_state=1,
                **{hp_name: default_preproc[hp_name] for hp_name in
                   default_preproc if default_preproc[hp_name] is not None})

            transformer = preprocessor.fit(X_train, Y_train)
            X_train_transformed = transformer.transform(X_train)
            X_test_transformed = transformer.transform(X_test)

            regressor = RidgeRegression(
                random_state=1,
                **{hp_name: default[hp_name] for hp_name in
                   default if default[hp_name] is not None})
            predictor = regressor.fit(X_train_transformed, Y_train)
            predictions = predictor.predict(X_test_transformed)

            self.assertAlmostEqual(0.37183512452087852,
                sklearn.metrics.r2_score(y_true=Y_test, y_pred=predictions))
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(SelectPercentileClassification)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1]/2))
        self.assertFalse((transformation == 0).all())

        transformation, original = _test_preprocessing(SelectPercentileClassification, make_sparse=True)
        self.assertTrue(scipy.sparse.issparse(transformation))
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1]/2))

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = SelectPercentileClassification.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = SelectPercentileClassification(random_state=1,
                            **{hp_name: default[hp_name] for hp_name in
                               default if default[hp_name] is not None})

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))
Example #13
    def test_cv_results(self):
        # TODO restructure and actually use real SMAC output from a long run
        # to do this unittest!
        tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
        output = os.path.join(self.test_dir, '..', '.out_cv_results')
        self._setUp(tmp)
        self._setUp(output)
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                    per_run_time_limit=5,
                                    output_folder=output,
                                    tmp_folder=tmp,
                                    shared_mode=False,
                                    seed=1,
                                    initial_configurations_via_metalearning=0,
                                    ensemble_size=0)
        cls.fit(X_train, Y_train)
        cv_results = cls.cv_results_
        self.assertIsInstance(cv_results, dict)
        self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
        self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
        self.assertIsInstance(cv_results['params'], list)
        self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)
        # all() is needed here; asserting on a non-empty list would pass trivially
        self.assertTrue(all(isinstance(val, npma.MaskedArray) for key, val in
                            cv_results.items() if key.startswith('param_')))
        del cls
        self._tearDown(tmp)
        self._tearDown(output)
Example #14
    def test_configurations_signed_data(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
            dataset_properties={'signed': True})

        print(cs)

        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
            cls = SimpleClassificationPipeline(config, random_state=1)
            print(config)
            try:
                cls.fit(X_train, Y_train)
                X_test_ = X_test.copy()
                predictions = cls.predict(X_test)
                self.assertIsInstance(predictions, np.ndarray)
                predicted_probabilities = cls.predict_proba(X_test_)
                self.assertIsInstance(predicted_probabilities, np.ndarray)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                       e.args[0] or \
                       "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    print(traceback.format_exc())
                    raise e
            except MemoryError as e:
                continue
Example #15
    def test_default_configuration(self):
        transformation, original = _test_preprocessing(SelectRates)
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], 3)
        self.assertFalse((transformation == 0).all())

        transformation, original = _test_preprocessing(SelectRates, make_sparse=True)
        self.assertTrue(scipy.sparse.issparse(transformation))
        self.assertEqual(transformation.shape[0], original.shape[0])
        self.assertEqual(transformation.shape[1], int(original.shape[1] / 2))

        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset="digits")
        original_X_train = X_train.copy()
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = SelectRates.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = SelectRates(
            random_state=1, **{hp_name: default[hp_name] for hp_name in default if default[hp_name] is not None}
        )

        transformer = preprocessor.fit(X_train, Y_train)
        transformation, original = transformer.transform(X_train), original_X_train
        self.assertEqual(transformation.shape[0], original.shape[0])
        # I don't know why it's 52 here and not 32, which would be half of the
        # number of features. It seems to be related to a runtime warning
        # raised by sklearn.
        self.assertEqual(transformation.shape[1], 52)
Example #16
def get_regression_datamanager():
    X_train, Y_train, X_test, Y_test = get_dataset('boston')
    indices = list(range(X_train.shape[0]))
    np.random.seed(1)
    np.random.shuffle(indices)
    X_train = X_train[indices]
    Y_train = Y_train[indices]

    X_valid = X_test[:200, ]
    Y_valid = Y_test[:200, ]
    X_test = X_test[200:, ]
    Y_test = Y_test[200:, ]

    D = Dummy()
    D.info = {
        'metric': R2_METRIC,
        'task': REGRESSION,
        'is_sparse': False,
        'label_num': 1
    }
    D.data = {
        'X_train': X_train,
        'Y_train': Y_train.reshape((-1, 1)),
        'X_valid': X_valid,
        'Y_valid': Y_valid.reshape((-1, 1)),
        'X_test': X_test,
        'Y_test': Y_test.reshape((-1, 1))
    }
    D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical',
                   'numerical', 'numerical', 'numerical', 'numerical',
                   'numerical', 'numerical', 'numerical']
    return D
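
The Dummy object used by this and the following data-manager fixtures is presumably just an empty attribute container defined elsewhere in the test module; a minimal stand-in would be:

class Dummy(object):
    # Bare container: the fixtures attach .info, .data and .feat_type
    # to an instance of it.
    pass
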
Example #17
    def setUp(self):
        self.X_train, self.Y_train, self.X_test, self.Y_test = \
            get_dataset('iris')

        eliminate_class_two = self.Y_train != 2
        self.X_train = self.X_train[eliminate_class_two]
        self.Y_train = self.Y_train[eliminate_class_two]
Example #18
def get_multiclass_classification_datamanager():
    X_train, Y_train, X_test, Y_test = get_dataset('iris')
    indices = list(range(X_train.shape[0]))
    np.random.seed(1)
    np.random.shuffle(indices)
    X_train = X_train[indices]
    Y_train = Y_train[indices]

    X_valid = X_test[:25, ]
    Y_valid = Y_test[:25, ]
    X_test = X_test[25:, ]
    Y_test = Y_test[25:, ]

    D = Dummy()
    D.info = {
        'metric': BAC_METRIC,
        'task': MULTICLASS_CLASSIFICATION,
        'is_sparse': False,
        'label_num': 3
    }
    D.data = {
        'X_train': X_train,
        'Y_train': Y_train,
        'X_valid': X_valid,
        'Y_valid': Y_valid,
        'X_test': X_test,
        'Y_test': Y_test
    }
    D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
    return D
Example #19
    def test_fit_pSMAC(self):
        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=1,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        true_targets_ensemble = np.load(true_targets_ensemble_path)
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(output, '.auto-sklearn',
                                              'predictions_ensemble',
                                              'predictions_ensemble_1_00030.npy')
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        probas_test = np.zeros((len(Y_test), 3), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        backend = Backend(output, output)
        backend.save_model(dummy, 30, 1)

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=2,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)
        automl.run_ensemble_builder(0, 1, 50).wait()

        score = automl.score(X_test, Y_test)

        self.assertEqual(len(os.listdir(os.path.join(output, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #20
    def test_default_configuration(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
            auto = SimpleClassificationPipeline()
            auto = auto.fit(X_train, Y_train)
            predictions = auto.predict(X_test)
            self.assertAlmostEqual(
                0.94, sklearn.metrics.accuracy_score(predictions, Y_test))
            scores = auto.predict_proba(X_test)
Example #21
    def test_default_configuration_iterative_fit(self):
        classifier = SimpleClassificationPipeline(
            include={'classifier': ['random_forest'],
                     'preprocessor': ['no_preprocessing']})
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
        XT = classifier.fit_transformer(X_train, Y_train)
        for i in range(1, 11):
            classifier.iterative_fit(X_train, Y_train)
            self.assertEqual(
                classifier.steps[-1][-1].choice.estimator.n_estimators, i)
Example #22
    def test_default_configuration_multilabel(self):
        for i in range(2):
            cs = SimpleClassificationPipeline.get_hyperparameter_search_space(
                dataset_properties={"multilabel": True})
            default = cs.get_default_configuration()
            X_train, Y_train, X_test, Y_test = get_dataset(
                dataset="iris", make_multilabel=True)
            auto = SimpleClassificationPipeline(default)
            auto = auto.fit(X_train, Y_train)
            predictions = auto.predict(X_test)
            self.assertAlmostEqual(
                0.9599999999999995,
                sklearn.metrics.accuracy_score(predictions, Y_test))
            scores = auto.predict_proba(X_test)
Example #23
    def test_configurations_sparse(self):
        # Use a limit of ~4GiB
        limit = 4000 * 1024 * 1024
        resource.setrlimit(resource.RLIMIT_AS, (limit, limit))

        cs = SimpleRegressionPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True})
        print(cs)
        for i in range(10):
            config = cs.sample_configuration()
            config._populate_values()
            if 'classifier:passive_aggressive:n_iter' in config and \
                    config['classifier:passive_aggressive:n_iter'] is not None:
                config._values['classifier:passive_aggressive:n_iter'] = 5
            if 'classifier:sgd:n_iter' in config and \
                    config['classifier:sgd:n_iter'] is not None:
                config._values['classifier:sgd:n_iter'] = 5

            print(config)
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston',
                                                           make_sparse=True)
            cls = SimpleRegressionPipeline(config, random_state=1)
            try:
                cls.fit(X_train, Y_train)
                predictions = cls.predict(X_test)
            except ValueError as e:
                if "Floating-point under-/overflow occurred at epoch" in \
                        e.args[0] or \
                                "removed all features" in e.args[0] or \
                                "all features are discarded" in e.args[0]:
                    continue
                else:
                    print(config)
                    traceback.print_tb(sys.exc_info()[2])
                    raise e
            except RuntimeWarning as e:
                if "invalid value encountered in sqrt" in e.args[0]:
                    continue
                elif "divide by zero encountered in" in e.args[0]:
                    continue
                elif "invalid value encountered in divide" in e.args[0]:
                    continue
                elif "invalid value encountered in true_divide" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
            except UserWarning as e:
                if "FastICA did not converge" in e.args[0]:
                    continue
                else:
                    print(config)
                    raise e
Example #24
    def test_default_configuration(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes')
            auto = SimpleRegressionPipeline()
            auto = auto.fit(X_train, Y_train)
            predictions = auto.predict(copy.deepcopy(X_test))
            # The lower the worse
            r2_score = sklearn.metrics.r2_score(Y_test, predictions)
            self.assertAlmostEqual(0.339, r2_score, places=3)
            model_score = auto.score(copy.deepcopy(X_test), Y_test)
            self.assertAlmostEqual(model_score, r2_score, places=5)
Example #25
    def test_default_configuration(self):
        for i in range(2):
            cs = SimpleRegressionPipeline.get_hyperparameter_search_space()
            default = cs.get_default_configuration()
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes')
            auto = SimpleRegressionPipeline(default)
            auto = auto.fit(X_train, Y_train)
            predictions = auto.predict(copy.deepcopy(X_test))
            # The lower the worse
            r2_score = sklearn.metrics.r2_score(Y_test, predictions)
            self.assertAlmostEqual(0.41732302035060087, r2_score)
            model_score = auto.score(copy.deepcopy(X_test), Y_test)
            self.assertEqual(model_score, r2_score)
Example #26
    def _test_helper(self, Preprocessor, dataset=None, make_sparse=False):
        X_train, Y_train, X_test, Y_test = get_dataset(
            dataset=dataset, make_sparse=make_sparse)
        original_X_train = X_train.copy()
        configuration_space = Preprocessor.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        preprocessor = Preprocessor(random_state=1,
                                    **{hp_name: default[hp_name] for hp_name in
                                       default if default[hp_name] is not None})
        preprocessor = preprocessor.choice
        transformer = preprocessor.fit(X_train, Y_train)
        return transformer.transform(X_train), original_X_train
Example #27
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 15, 5)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #28
    def test_regression_methods_returns_self(self):
        X_train, y_train, X_test, y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      ensemble_size=0)

        automl_fitted = automl.fit(X_train, y_train)
        self.assertIs(automl, automl_fitted)

        automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(automl, automl_ensemble_fitted)

        automl_refitted = automl.refit(X_train.copy(), y_train.copy())
        self.assertIs(automl, automl_refitted)
Example #29
    def test_evaluate_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': AUC_METRIC,
            'task': BINARY_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 2
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Example #30
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        class Dummy2(object):

            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': task_type,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
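
The expected values [[0.9], [0.3]] are the positive-class column of the two-column probability matrix returned by Dummy2, which suggests the evaluator keeps only that column for binary tasks. The slicing itself is straightforward:

import numpy as np

probas = np.array([[0.1, 0.9], [0.7, 0.3]])
positive_class = probas[:, 1].reshape((-1, 1))  # -> [[0.9], [0.3]]
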
Example #31
    def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info, include_estimators=['lda'], include_preprocessors=['pca'])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_,
                                         configuration,
                                         all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)
Example #32
    def test_do_dummy_prediction(self):
        datasets = {
            'breast_cancer': BINARY_CLASSIFICATION,
            'wine': MULTICLASS_CLASSIFICATION,
            'diabetes': REGRESSION,
        }

        for name, task in datasets.items():
            backend_api = self._create_backend('test_do_dummy_prediction')

            X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
            datamanager = XYDataManager(
                X_train, Y_train,
                X_test, Y_test,
                task=task,
                dataset_name=name,
                feat_type=None,
            )

            auto = autosklearn.automl.AutoML(
                backend_api, 20, 5,
                initial_configurations_via_metalearning=25,
                metric=accuracy,
            )
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend.save_datamanager(datamanager)
            D = backend_api.load_datamanager()

            # Check if the data manager is correctly loaded
            self.assertEqual(D.info['task'], datamanager.info['task'])

            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the temporary directory.
            self.assertFalse(os.path.exists(os.path.join(os.getcwd(),
                                                         '.auto-sklearn')))
            self.assertTrue(os.path.exists(os.path.join(
                backend_api.temporary_directory, '.auto-sklearn', 'predictions_ensemble',
                'predictions_ensemble_1_1_0.0.npy')))

            del auto
            self._tearDown(backend_api.temporary_directory)
            self._tearDown(backend_api.output_directory)
Example #33
    def test_evaluate_multilabel_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        Y_train = np.array(convert_to_bin(Y_train, 3))
        Y_train[:, -1] = 1
        Y_test = np.array(convert_to_bin(Y_test, 3))
        Y_test[:, -1] = 1

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': F1_METRIC,
            'task': MULTILABEL_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            print(err[i])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
Example #34
def test_can_pickle_classifier(tmp_dir, output_dir, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=30,
                                   per_run_time_limit=5,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                      initial_predictions)
    assert initial_accuracy >= 0.75
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True

    # Test pickle
    dump_file = os.path.join(output_dir, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                       restored_predictions)
    assert restored_accuracy >= 0.75
    assert initial_accuracy == restored_accuracy

    # Test joblib
    dump_file = os.path.join(output_dir, 'automl.dump.joblib')

    joblib.dump(automl, dump_file)

    restored_automl = joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                       restored_predictions)
    assert restored_accuracy >= 0.75
    assert initial_accuracy == restored_accuracy
Example #35
    def test_evaluate_regression(self):
        X_train, Y_train, X_test, Y_test = get_dataset('boston')

        X_valid = X_test[:200, ]
        Y_valid = Y_test[:200, ]
        X_test = X_test[200:, ]
        Y_test = Y_test[200:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 1
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = [
            'numerical', 'Numerical', 'numerical', 'numerical', 'numerical',
            'numerical', 'numerical', 'numerical', 'numerical', 'numerical',
            'numerical'
        ]

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
Example #36
    def test_delete_non_candidate_models(self):
        backend_api = self._create_backend(
            'test_delete', delete_tmp_folder_after_terminate=False)

        seed = 555
        X, Y, _, _ = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend_api,
            time_left_for_this_task=30,
            per_run_time_limit=5,
            ensemble_nbest=3,
            seed=seed,
            initial_configurations_via_metalearning=0,
            resampling_strategy='holdout',
            include_estimators=['sgd'],
            include_preprocessors=['no_preprocessing'],
            metric=accuracy,
        )

        automl.fit(X, Y, task=MULTICLASS_CLASSIFICATION,
                   X_test=X, y_test=Y)

        # Assert at least one model file has been deleted and that there were no
        # deletion errors
        log_file_path = glob.glob(os.path.join(
            backend_api.temporary_directory, 'AutoML(' + str(seed) + '):*.log'))
        with open(log_file_path[0]) as log_file:
            log_content = log_file.read()
            self.assertIn('Deleted files of non-candidate model', log_content)
            self.assertNotIn('Failed to delete files of non-candidate model', log_content)
            self.assertNotIn('Failed to lock model', log_content)

        # Assert that the files of the models used by the ensemble weren't deleted
        model_files = backend_api.list_all_models(seed=seed)
        model_files_idx = set()
        for m_file in model_files:
            # Extract the model identifiers from the filename
            m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
            model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2])))
        ensemble_members_idx = set(automl.ensemble_.identifiers_)
        self.assertTrue(ensemble_members_idx.issubset(model_files_idx))

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Ejemplo n.º 37
0
    def test_default_configuration_predict_proba_individual(self):
        # Leave this additional test here
        for i in range(2):
            predictions, targets = _test_classifier_predict_proba(
                LibSVM_SVC,
                sparse=True,
                dataset='digits',
                train_size_maximum=500)
            self.assertAlmostEqual(
                5.4706296711768925,
                sklearn.metrics.log_loss(targets, predictions))

        for i in range(2):
            predictions, targets = _test_classifier_predict_proba(
                LibSVM_SVC, sparse=True, dataset='iris')
            self.assertAlmostEqual(
                0.84333924656905945,
                sklearn.metrics.log_loss(targets, predictions))

        # 2 class
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
            remove_training_data = Y_train == 2
            remove_test_data = Y_test == 2
            X_train = X_train[~remove_training_data]
            Y_train = Y_train[~remove_training_data]
            X_test = X_test[~remove_test_data]
            Y_test = Y_test[~remove_test_data]
            ss = sklearn.preprocessing.StandardScaler()
            X_train = ss.fit_transform(X_train)
            configuration_space = LibSVM_SVC.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()

            cls = LibSVM_SVC(random_state=1,
                             **{
                                 hp_name: default[hp_name]
                                 for hp_name in default
                                 if default[hp_name] is not None
                             })

            cls = cls.fit(X_train, Y_train)
            prediction = cls.predict_proba(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.log_loss(Y_test, prediction),
                0.69323680119641773)
Ejemplo n.º 38
0
def test_do_dummy_prediction(dask_client, datasets):

    name, task = datasets

    X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
    datamanager = XYDataManager(
        X_train, Y_train,
        X_test, Y_test,
        task=task,
        dataset_name=name,
        feat_type={i: 'numerical' for i in range(X_train.shape[1])},
    )

    auto = autosklearn.automl.AutoML(
        20, 5,
        initial_configurations_via_metalearning=25,
        metric=accuracy,
        dask_client=dask_client,
        delete_tmp_folder_after_terminate=False,
    )
    auto._backend = auto._create_backend()

    # Make a dummy logger
    auto._logger_port = 9020
    auto._logger = unittest.mock.Mock()
    auto._logger.info.return_value = None

    auto._backend.save_datamanager(datamanager)
    D = auto._backend.load_datamanager()

    # Check if the data manager is correctly loaded
    assert D.info['task'] == datamanager.info['task']
    auto._do_dummy_prediction(D, 1)

    # Ensure that the dummy predictions are not in the current working
    # directory, but in the temporary directory.
    assert not os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn'))
    assert os.path.exists(os.path.join(
        auto._backend.temporary_directory, '.auto-sklearn', 'runs', '1_1_0.0',
        'predictions_ensemble_1_1_0.0.npy')
    )

    auto._clean_logger()

    del auto
Ejemplo n.º 39
0
    def test_can_pickle_classifier(self):
        output = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)

        initial_predictions = automl.predict(X_test)
        initial_accuracy = sklearn.metrics.accuracy_score(
            Y_test, initial_predictions)
        self.assertGreaterEqual(initial_accuracy, 0.75)

        # Test pickle
        dump_file = os.path.join(output, 'automl.dump.pkl')

        with open(dump_file, 'wb') as f:
            pickle.dump(automl, f)

        with open(dump_file, 'rb') as f:
            restored_automl = pickle.load(f)

        restored_predictions = restored_automl.predict(X_test)
        restored_accuracy = sklearn.metrics.accuracy_score(
            Y_test, restored_predictions)
        self.assertGreaterEqual(restored_accuracy, 0.75)

        self.assertEqual(initial_accuracy, restored_accuracy)

        # Test joblib
        dump_file = os.path.join(output, 'automl.dump.joblib')

        sklearn.externals.joblib.dump(automl, dump_file)

        restored_automl = sklearn.externals.joblib.load(dump_file)

        restored_predictions = restored_automl.predict(X_test)
        restored_accuracy = sklearn.metrics.accuracy_score(
            Y_test, restored_predictions)
        self.assertGreaterEqual(restored_accuracy, 0.75)

        self.assertEqual(initial_accuracy, restored_accuracy)
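Note: sklearn.externals.joblib, used in the joblib part of this test, was removed in scikit-learn 0.23; the standalone joblib package exposes the same dump/load API. A minimal sketch of the equivalent persistence step, assuming joblib is installed and automl is the already fitted classifier from above:

import os
import joblib

# Same round-trip as above, but with the standalone joblib package instead
# of the removed sklearn.externals.joblib module.
dump_file = os.path.join(output, 'automl.dump.joblib')
joblib.dump(automl, dump_file)

restored_automl = joblib.load(dump_file)
restored_predictions = restored_automl.predict(X_test)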
Ejemplo n.º 40
0
def test_delete_non_candidate_models(backend, dask_client):

    seed = 555
    X, Y, _, _ = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend,
        time_left_for_this_task=60,
        per_run_time_limit=5,
        ensemble_nbest=3,
        seed=seed,
        initial_configurations_via_metalearning=0,
        resampling_strategy='holdout',
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'],
        metric=accuracy,
        dask_client=dask_client,
        # Force models to be deleted, i.e. reduce max_models_on_disc from
        # its default of 50 to 3 to make sure we actually delete models.
        max_models_on_disc=3,
    )

    automl.fit(X, Y, task=MULTICLASS_CLASSIFICATION,
               X_test=X, y_test=Y)

    # Assert at least one model file has been deleted and that there were no
    # deletion errors
    log_file_path = glob.glob(os.path.join(
        backend.temporary_directory, 'AutoML(' + str(seed) + '):*.log'))
    with open(log_file_path[0]) as log_file:
        log_content = log_file.read()
        assert 'Deleted files of non-candidate model' in log_content, log_content
        assert 'Failed to delete files of non-candidate model' not in log_content, log_content
        assert 'Failed to lock model' not in log_content, log_content

    # Assert that the files of the models used by the ensemble weren't deleted
    model_files = backend.list_all_models(seed=seed)
    model_files_idx = set()
    for m_file in model_files:
        # Extract the model identifiers from the filename
        m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
        model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2])))
    ensemble_members_idx = set(automl.ensemble_.identifiers_)
    assert ensemble_members_idx.issubset(model_files_idx), (ensemble_members_idx, model_files_idx)

    del automl
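For reference, the identifier extraction above assumes model filenames of the form <seed>.<num_run>.<budget>.model; a small worked example with a hypothetical filename:

import os

# Hypothetical filename, following the <seed>.<num_run>.<budget>.model pattern
# that the parsing loop above expects.
m_file = '/tmp/.auto-sklearn/models/555.3.0.0.model'
parts = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
identifier = (int(parts[0]), int(parts[1]), float(parts[2]))
print(identifier)  # (555, 3, 0.0) -> (seed, num_run, budget)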
Ejemplo n.º 41
0
    def test_fit(self):
        backend_api = self._create_backend('test_fit')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(backend_api, 20, 5)
        automl.fit(
            X_train,
            Y_train,
            metric=accuracy,
            task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Ejemplo n.º 42
0
    def test_regression(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
        output = os.path.join(self.test_dir, '..', '.out_regression_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=20,
                                      per_run_time_limit=5,
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356, ))
        score = mean_squared_error(Y_test, predictions)
        # On average np.sqrt(30) away from the target -> ~5.5 on average
        self.assertGreaterEqual(score, -30)
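A note on the assertion above: score >= -30 only makes sense if mean_squared_error here is auto-sklearn's scorer object, which presumably negates the raw error so that larger values are better; the comment's arithmetic is simply that an error of 30 corresponds to predictions roughly sqrt(30) units off on average:

import numpy as np

# An MSE of 30 means predictions are, on average, about sqrt(30) ~= 5.5
# units away from the regression target.
print(np.sqrt(30))  # 5.477...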
Ejemplo n.º 43
0
    def test_produce_zero_scaling(self):
        from autosklearn.pipeline.classification import SimpleClassificationPipeline
        from autosklearn.pipeline import util as putil
        p = SimpleClassificationPipeline(configuration={
            'balancing:strategy': 'weighting',
            'classifier:__choice__': 'qda',
            'classifier:qda:reg_param': 2.992955287687101,
            'imputation:strategy': 'most_frequent',
            'one_hot_encoding:use_minimum_fraction': 'False',
            'preprocessor:__choice__': 'gem',
            'preprocessor:gem:N': 18,
            'preprocessor:gem:precond': 0.12360249797270745,
            'rescaling:__choice__': 'none'})
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        self.assertRaisesRegex(ValueError,
                               'Numerical problems in QDA. '
                               'QDA.scalings_ contains values <= 0.0',
                               p.fit, X_train, Y_train)
Ejemplo n.º 44
0
    def test_fit(self):

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Ejemplo n.º 45
0
    def test_multilabel(self):
        output = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris', make_multilabel=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, 3))
        score = f1_macro(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)
        probs = automl.predict_proba(X_train)
        self.assertAlmostEqual(np.mean(probs), 0.33333333333333331)
Ejemplo n.º 46
0
    def test_predict_batched_sparse(self):
        cs = SimpleRegressionPipeline.get_hyperparameter_search_space(
            dataset_properties={'sparse': True},
            include={'regressor': ['decision_tree']})
        default = cs.get_default_configuration()
        cls = SimpleRegressionPipeline(default)

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='diabetes',
                                                       make_sparse=True)
        cls.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = cls.predict(X_test_)
        cls_predict = unittest.mock.Mock(wraps=cls.pipeline_)
        cls.pipeline_ = cls_predict
        prediction = cls.predict(X_test, batch_size=20)
        self.assertEqual((292, ), prediction.shape)
        self.assertEqual(15, cls_predict.predict.call_count)
        assert_array_almost_equal(prediction_, prediction)
Ejemplo n.º 47
0
    def test_predict_batched(self):
        include = {'regressor': ['decision_tree']}
        cs = SimpleRegressionPipeline(
            include=include).get_hyperparameter_search_space()
        default = cs.get_default_configuration()
        regressor = SimpleRegressionPipeline(default, include=include)

        X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston')
        regressor.fit(X_train, Y_train)
        X_test_ = X_test.copy()
        prediction_ = regressor.predict(X_test_)
        mock_predict = unittest.mock.Mock(
            wraps=regressor.steps[-1][-1].predict)
        regressor.steps[-1][-1].predict = mock_predict
        prediction = regressor.predict(X_test, batch_size=20)
        self.assertEqual((356, ), prediction.shape)
        self.assertEqual(18, mock_predict.call_count)
        np.testing.assert_array_almost_equal(prediction_, prediction)
Ejemplo n.º 48
0
    def test_trials_callback_execution(self):
        trials_summary_fname = os.path.join(tempfile.gettempdir(),
                                            "trials.csv")
        X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')
        cls = AutoSklearnClassifier(
            time_left_for_this_task=30,
            initial_configurations_via_metalearning=0,
            per_run_time_limit=10,
            memory_limit=1024,
            delete_tmp_folder_after_terminate=False,
            n_jobs=1,
            include_estimators=["sgd"],
            include_preprocessors=["no_preprocessing"],
            get_trials_callback=AutoMLTrialsCallBack(trials_summary_fname))
        cls.fit(X_train, Y_train, X_test, Y_test)
        trials = pd.read_csv(trials_summary_fname)
        assert trials.shape[0] > 0, \
            f"Auto-Sklearn explored {trials.shape[0] - 1} trials"
Ejemplo n.º 49
0
    def test_fit_roar(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 20, 5,
                                           initial_configurations_via_metalearning=0,
                                           configuration_mode='ROAR')
        automl.fit(X_train, Y_train, metric=accuracy)
        # print(automl.show_models(), flush=True)
        # print(automl.cv_results_, flush=True)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Ejemplo n.º 50
0
    def test_fit(self):
        if self.travis:
            self.skipTest('This test does currently not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 15, 15)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Ejemplo n.º 51
0
    def _test_helper(self, Preprocessor, dataset=None, make_sparse=False):
        X_train, Y_train, X_test, Y_test = get_dataset(
            dataset=dataset,
            make_sparse=make_sparse,
        )

        dataset_properties = {'sparse': make_sparse}

        original_X_train = X_train.copy()
        configuration_space = Preprocessor(dataset_properties).\
            get_hyperparameter_search_space(dataset_properties)
        default = configuration_space.get_default_configuration()

        preprocessor = Preprocessor(dataset_properties, random_state=1)
        preprocessor.set_hyperparameters(default)
        preprocessor = preprocessor.choice
        transformer = preprocessor.fit(X_train, Y_train)
        return transformer.transform(X_train), original_X_train
Ejemplo n.º 52
0
    def test_default_configuration_predict_proba(self):
        for i in range(10):
            predictions, targets = _test_classifier_predict_proba(
                LibSVM_SVC,
                sparse=True,
                dataset='digits',
                train_size_maximum=500)
            self.assertAlmostEqual(
                4.6680593525563063,
                sklearn.metrics.log_loss(targets, predictions))

        for i in range(10):
            predictions, targets = _test_classifier_predict_proba(
                LibSVM_SVC, sparse=True, dataset='iris')
            self.assertAlmostEqual(
                0.8649665185853217,
                sklearn.metrics.log_loss(targets, predictions))

        # 2 class
        for i in range(10):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='iris')
            remove_training_data = Y_train == 2
            remove_test_data = Y_test == 2
            X_train = X_train[~remove_training_data]
            Y_train = Y_train[~remove_training_data]
            X_test = X_test[~remove_test_data]
            Y_test = Y_test[~remove_test_data]
            ss = sklearn.preprocessing.StandardScaler()
            X_train = ss.fit_transform(X_train)
            configuration_space = LibSVM_SVC.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()

            cls = LibSVM_SVC(random_state=1,
                             **{
                                 hp_name: default[hp_name]
                                 for hp_name in default
                                 if default[hp_name] is not None
                             })

            cls = cls.fit(X_train, Y_train)
            prediction = cls.predict_proba(X_test)
            self.assertAlmostEqual(
                sklearn.metrics.log_loss(Y_test, prediction),
                0.69323680119641773)
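As a sanity check on the two-class reference value: a log loss of about 0.693 is close to ln 2 ~ 0.6931, which is exactly what a constant 50/50 prediction yields, suggesting the default SVC configuration is barely more informative than a uniform guess on this reduced split:

import numpy as np
import sklearn.metrics

y_true = np.array([0, 1, 0, 1])
uniform = np.full((len(y_true), 2), 0.5)  # predict 50/50 for every sample
print(sklearn.metrics.log_loss(y_true, uniform))  # ~0.6931 == ln(2)
print(np.log(2))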
Ejemplo n.º 53
0
    def test_exceptions_inside_log_in_smbo(self, smbo_run_mock):

        # Make sure that any exception during the AutoML fit due to
        # SMAC are properly captured in a log file
        backend_api = self._create_backend('test_exceptions_inside_log')
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)

        automl = autosklearn.automl.AutoML(
            backend_api,
            20,
            5,
            metric=accuracy,
        )

        output_file = 'test_exceptions_inside_log.log'
        setup_logger(output_file=output_file)
        logger = get_logger('test_exceptions_inside_log')

        # Create a custom exception to prevent other errors from slipping in
        class MyException(Exception):
            pass

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        # The first call is on dummy predictor failure
        message = str(np.random.randint(100)) + '_run_smbo'
        smbo_run_mock.side_effect = MyException(message)

        with unittest.mock.patch(
                'autosklearn.automl.AutoML._get_logger') as mock:
            mock.return_value = logger
            with self.assertRaises(MyException):
                automl.fit(
                    X_train,
                    Y_train,
                    task=MULTICLASS_CLASSIFICATION,
                )
            with open(output_file) as f:
                self.assertTrue(message in f.read())

        # Cleanup
        os.unlink(output_file)
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Ejemplo n.º 54
0
    def test_fit_roar(self):
        def get_roar_object_callback(
                scenario_dict,
                seed,
                ta,
                ta_kwargs,
                **kwargs
        ):
            """Random online adaptive racing.

            http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
            scenario = Scenario(scenario_dict)
            return ROAR(
                scenario=scenario,
                rng=seed,
                tae_runner=ta,
                tae_runner_kwargs=ta_kwargs,
            )

        backend_api = self._create_backend('test_fit_roar')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            initial_configurations_via_metalearning=0,
            get_smac_object_callback=get_roar_object_callback,
            metric=accuracy,
        )
        setup_logger()
        automl._logger = get_logger('test_fit_roar')
        automl.fit(
            X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Ejemplo n.º 55
0
    def test_default_configuration_negative_values(self):
        # Custom preprocessing test to check if clipping to zero works
        X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits')
        ss = sklearn.preprocessing.StandardScaler()
        X_train = ss.fit_transform(X_train)
        configuration_space = MultinomialNB.get_hyperparameter_search_space()
        default = configuration_space.get_default_configuration()

        cls = MultinomialNB(random_state=1,
                            **{
                                hp_name: default[hp_name]
                                for hp_name in default
                                if default[hp_name] is not None
                            })

        cls = cls.fit(X_train, Y_train)
        prediction = cls.predict(X_test)
        self.assertAlmostEqual(np.nanmean(prediction == Y_test),
                               0.88888888888888884)
Ejemplo n.º 56
0
    def test_default_configuration_classify(self):
        for i in range(2):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = ExtraTreesPreprocessor.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = ExtraTreesPreprocessor(random_state=1,
                                                  **{hp_name: default[hp_name]
                                                     for hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RidgeClassifier()
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2)
Ejemplo n.º 57
0
def test_fit(dask_client, backend):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
Ejemplo n.º 58
0
def test_load_best_individual_model(metric, backend, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=metric,
        dask_client=dask_client,
    )

    # We cannot easily mock a function sent to dask,
    # so for this test we create the whole set of models/ensembles
    # but prevent the ensemble from being loaded
    automl.fit(
        X_train,
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
    )
    automl._backend.load_ensemble = unittest.mock.MagicMock(return_value=None)

    # Simulate a failure (e.g. a memory error) during ensemble construction,
    # so that no ensemble can be loaded
    assert automl._backend.load_ensemble(automl._seed) is None

    # The load model is robust to this and loads the best model
    automl._load_models()
    assert automl.ensemble_ is not None

    # Only one model is available for the ensemble, so all weight must be on it
    get_models_with_weights = automl.get_models_with_weights()
    assert len(get_models_with_weights) == 1
    assert get_models_with_weights[0][0] == 1.0

    # Expected score ranges for this toy dataset
    if metric.name == 'balanced_accuracy':
        assert automl.score(X_test, Y_test) > 0.9
    elif metric.name == 'log_loss':
        # Seen values in github actions of 0.6978304740364537
        assert automl.score(X_test, Y_test) < 0.7
    else:
        raise ValueError(metric.name)

    del automl
Ejemplo n.º 59
0
    def test_default_configuration_classify(self):
        for i in range(3):
            X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits',
                                                           make_sparse=False)
            configuration_space = FeatureAgglomeration.get_hyperparameter_search_space()
            default = configuration_space.get_default_configuration()
            preprocessor = FeatureAgglomeration(random_state=1,
                                                **{hp_name: default[hp_name] for
                                                   hp_name in default})
            preprocessor.fit(X_train, Y_train)
            X_train_trans = preprocessor.transform(X_train)
            X_test_trans = preprocessor.transform(X_test)

            # fit a classifier on top
            classifier = RandomForestClassifier(random_state=1)
            predictor = classifier.fit(X_train_trans, Y_train)
            predictions = predictor.predict(X_test_trans)
            accuracy = sklearn.metrics.accuracy_score(predictions, Y_test)
            self.assertAlmostEqual(accuracy, 0.8026715)
Ejemplo n.º 60
0
def get_multilabel_classification_datamanager():
    X_train, Y_train, X_test, Y_test = get_dataset('iris')
    indices = list(range(X_train.shape[0]))
    np.random.seed(1)
    np.random.shuffle(indices)
    X_train = X_train[indices]
    Y_train = Y_train[indices]

    Y_train = np.array(convert_to_bin(Y_train, 3))
    # for i in range(Y_train_.shape[0]):
    #    Y_train_[:, Y_train[i]] = 1
    # Y_train = Y_train_
    Y_test = np.array(convert_to_bin(Y_test, 3))
    # for i in range(Y_test_.shape[0]):
    #    Y_test_[:, Y_test[i]] = 1
    # Y_test = Y_test_

    X_valid = X_test[:25, ]
    Y_valid = Y_test[:25, ]
    X_test = X_test[25:, ]
    Y_test = Y_test[25:, ]

    D = Dummy()
    D.info = {
        'task': MULTILABEL_CLASSIFICATION,
        'is_sparse': False,
        'label_num': 3
    }
    D.data = {
        'X_train': X_train,
        'Y_train': Y_train,
        'X_valid': X_valid,
        'Y_valid': Y_valid,
        'X_test': X_test,
        'Y_test': Y_test
    }
    D.feat_type = {
        0: 'numerical',
        1: 'Numerical',
        2: 'numerical',
        3: 'numerical'
    }
    return D
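For context, convert_to_bin (imported from the surrounding test utilities, not shown here) turns the integer iris labels into a binary indicator matrix with one column per class; a minimal equivalent sketch using scikit-learn, assuming three classes 0-2:

import numpy as np
from sklearn.preprocessing import label_binarize

def convert_to_bin_sketch(y, num_classes):
    # One row per sample, one column per class, 1 where the label matches.
    return label_binarize(y, classes=list(range(num_classes)))

print(convert_to_bin_sketch(np.array([0, 2, 1]), 3))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]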