Example #1
    def test_fit_roar(self):
        def get_roar_object_callback(scenario_dict, seed, ta, **kwargs):
            """Random online adaptive racing.

            http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
            scenario = Scenario(scenario_dict)
            return ROAR(
                scenario=scenario,
                rng=seed,
                tae_runner=ta,
            )

        backend_api = self._create_backend('test_fit_roar')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            initial_configurations_via_metalearning=0,
            get_smac_object_callback=get_roar_object_callback,
        )
        automl.fit(
            X_train,
            Y_train,
            metric=accuracy,
            task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
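
These snippets are excerpts from auto-sklearn's test modules, so their imports are implicit. A hedged sketch of the imports they appear to rely on (the putil path and the SMAC paths are assumptions, inferred from the identifiers above and older SMAC3 releases):

    import autosklearn.automl
    import autosklearn.pipeline.util as putil  # assumed home of get_dataset()
    from autosklearn.constants import BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
    from autosklearn.metrics import accuracy
    from smac.scenario.scenario import Scenario  # assumed SMAC3 path
    from smac.facade.roar_facade import ROAR     # assumed SMAC3 path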
Example #2
    def test_fit(self):
        X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
        automl = autosklearn.automl.AutoML(self.output, self.output, 10, 10)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.9)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
Example #3
    def test_binary_score(self):
        """
        Test the fix that makes binary classification prediction
        take index 1 of the second dimension of the prediction matrix.
        """

        output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
        self._setUp(output)

        data = sklearn.datasets.make_classification(
            n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
            n_repeated=2, n_clusters_per_class=2, random_state=1)
        X_train = data[0][:700]
        Y_train = data[1][:700]
        X_test = data[0][700:]
        Y_test = data[1][700:]

        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 15, 5)
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.5)

        del automl
        self._tearDown(output)
Example #4
    def test_binary_score(self):
        """
        Test the fix that makes binary classification prediction
        take index 1 of the second dimension of the prediction matrix.
        """
        if self.travis:
            self.skipTest('This test currently does not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
        self._setUp(output)

        data = sklearn.datasets.make_classification(n_samples=1000,
                                                    n_features=20,
                                                    n_redundant=5,
                                                    n_informative=5,
                                                    n_repeated=2,
                                                    n_clusters_per_class=2,
                                                    random_state=1)
        X_train = data[0][:700]
        Y_train = data[1][:700]
        X_test = data[0][700:]
        Y_test = data[1][700:]

        automl = autosklearn.automl.AutoML(output, output, 15, 15)
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.5)

        del automl
        self._tearDown(output)
Example #5
    def test_fit(self):
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(self.output, self.output, 10, 10)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.9)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
Example #6
    def test_binary_score(self):
        """
        Test the fix that makes binary classification prediction
        take index 1 of the second dimension of the prediction matrix.
        """
        if self.travis:
            self.skipTest('This test currently does not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
        self._setUp(output)

        data = sklearn.datasets.make_classification(
            n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
            n_repeated=2, n_clusters_per_class=2, random_state=1)
        X_train = data[0][:700]
        Y_train = data[1][:700]
        X_test = data[0][700:]
        Y_test = data[1][700:]

        automl = autosklearn.automl.AutoML(output, output, 15, 15)
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.5)

        del automl
        self._tearDown(output)
Example #7
    def test_binary_score_and_include(self):
        """
        Test the fix that makes binary classification prediction
        take index 1 of the second dimension of the prediction matrix.
        """
        backend_api = self._create_backend('test_binary_score_and_include')

        data = sklearn.datasets.make_classification(
            n_samples=400, n_features=10, n_redundant=1, n_informative=3,
            n_repeated=1, n_clusters_per_class=2, random_state=1)
        X_train = data[0][:200]
        Y_train = data[1][:200]
        X_test = data[0][200:]
        Y_test = data[1][200:]

        automl = autosklearn.automl.AutoML(backend_api, 20, 5,
                                           include_estimators=['sgd'],
                                           include_preprocessors=['no_preprocessing'])
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION,
                   metric=accuracy)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        # TODO: the assumption from above is not really tested here.
        # Also, the score method should be removed; it makes little sense.
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.4)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #8
def test_fail_if_feat_type_on_pandas_input(backend, dask_client):
    """We do not support feat type when pandas
    is provided as an input
    """
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )

    X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
    y_train = [1, 0]
    with pytest.raises(
            ValueError,
            match=""
            "providing the option feat_type to the fit method is not supported when using a Dataframe"
    ):
        automl.fit(
            X_train,
            y_train,
            task=BINARY_CLASSIFICATION,
            feat_type={
                1: 'Categorical',
                2: 'Numerical'
            },
        )
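
For contrast, a hedged sketch of the path this check leaves open: pass feat_type together with a plain numpy array rather than a DataFrame, reusing the automl instance constructed above. The list form and the 'Categorical'/'Numerical' labels mirror Example #32 below; whether this exact call is accepted by every auto-sklearn version shown here is an assumption.

    import numpy as np

    X_train = np.array([[1, 1], [1, 2]])
    y_train = [1, 0]
    automl.fit(
        X_train,                                 # numpy input, so feat_type is allowed
        y_train,
        task=BINARY_CLASSIFICATION,
        feat_type=['Categorical', 'Numerical'],  # one label per column
    )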
Example #9
def test_binary_score_and_include(backend, dask_client):
    """
    Test the fix that makes binary classification prediction
    take index 1 of the second dimension of the prediction matrix.
    """

    data = sklearn.datasets.make_classification(
        n_samples=400, n_features=10, n_redundant=1, n_informative=3,
        n_repeated=1, n_clusters_per_class=2, random_state=1)
    X_train = data[0][:200]
    Y_train = data[1][:200]
    X_test = data[0][200:]
    Y_test = data[1][200:]

    automl = autosklearn.automl.AutoML(
        backend, 20, 5,
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'],
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
    assert automl._task == BINARY_CLASSIFICATION

    # TODO: the assumption from above is not really tested here.
    # Also, the score method should be removed; it makes little sense.
    score = automl.score(X_test, Y_test)
    assert score >= 0.4

    del automl
Example #10
    def test_binary_score_and_include(self):
        """
        Test the fix that makes binary classification prediction
        take index 1 of the second dimension of the prediction matrix.
        """
        backend_api = self._create_backend('test_binary_score_and_include')

        data = sklearn.datasets.make_classification(
            n_samples=400, n_features=10, n_redundant=1, n_informative=3,
            n_repeated=1, n_clusters_per_class=2, random_state=1)
        X_train = data[0][:200]
        Y_train = data[1][:200]
        X_test = data[0][200:]
        Y_test = data[1][200:]

        automl = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            include_estimators=['sgd'],
            include_preprocessors=['no_preprocessing'],
            metric=accuracy,
        )
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        # TODO: the assumption from above is not really tested here.
        # Also, the score method should be removed; it makes little sense.
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.4)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #11
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(output, output, 12, 12)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #12
    def test_delete_non_candidate_models(self):
        backend_api = self._create_backend(
            'test_delete', delete_tmp_folder_after_terminate=False)

        seed = 555
        X, Y, _, _ = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend_api,
            time_left_for_this_task=30,
            per_run_time_limit=5,
            ensemble_nbest=3,
            seed=seed,
            initial_configurations_via_metalearning=0,
            resampling_strategy='holdout',
            include_estimators=['sgd'],
            include_preprocessors=['no_preprocessing'])

        automl.fit(X,
                   Y,
                   metric=accuracy,
                   task=MULTICLASS_CLASSIFICATION,
                   X_test=X,
                   y_test=Y)

        # Assert at least one model file has been deleted and that there were no
        # deletion errors
        log_file_path = glob.glob(
            os.path.join(backend_api.temporary_directory,
                         'AutoML(' + str(seed) + '):*.log'))
        with open(log_file_path[0]) as log_file:
            log_content = log_file.read()
            self.assertIn('Deleted files of non-candidate model', log_content)
            self.assertNotIn('Failed to delete files of non-candidate model',
                             log_content)
            self.assertNotIn('Failed to lock model', log_content)

        # Assert that the files of the models used by the ensemble weren't deleted
        model_files = backend_api.list_all_models(seed=seed)
        model_files_idx = set()
        for m_file in model_files:
            # Extract the model identifiers from the filename
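            # (filenames appear to follow '<seed>.<num_run>.<budget>.model',
            # inferred from the int/int/float split below)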
            m_file = os.path.split(m_file)[1].replace('.model',
                                                      '').split('.', 2)
            model_files_idx.add(
                (int(m_file[0]), int(m_file[1]), float(m_file[2])))
        ensemble_members_idx = set(automl.ensemble_.identifiers_)
        self.assertTrue(ensemble_members_idx.issubset(model_files_idx))

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #13
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 20, 5)
        automl.fit(X_train, Y_train, metric=accuracy)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #14
    def test_fit(self):
        backend_api = self._create_backend('test_fit')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(backend_api, 20, 5)
        automl.fit(
            X_train, Y_train, metric=accuracy, task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #15
def test_delete_non_candidate_models(dask_client):

    seed = 555
    X, Y, _, _ = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        delete_tmp_folder_after_terminate=False,
        time_left_for_this_task=60,
        per_run_time_limit=5,
        ensemble_nbest=3,
        seed=seed,
        initial_configurations_via_metalearning=0,
        resampling_strategy='holdout',
        include={
            'classifier': ['sgd'],
            'feature_preprocessor': ['no_preprocessing']
        },
        metric=accuracy,
        dask_client=dask_client,
        # Force models to be deleted by lowering max_models_on_disc
        # from its default of 50 to 3.
        max_models_on_disc=3,
    )

    automl.fit(X, Y, task=MULTICLASS_CLASSIFICATION, X_test=X, y_test=Y)

    # Assert at least one model file has been deleted and that there were no
    # deletion errors
    log_file_path = glob.glob(
        os.path.join(automl._backend.temporary_directory,
                     'AutoML(' + str(seed) + '):*.log'))
    with open(log_file_path[0]) as log_file:
        log_content = log_file.read()
        assert 'Deleted files of non-candidate model' in log_content, log_content
        assert 'Failed to delete files of non-candidate model' not in log_content, log_content
        assert 'Failed to lock model' not in log_content, log_content

    # Assert that the files of the models used by the ensemble weren't deleted
    model_files = automl._backend.list_all_models(seed=seed)
    model_files_idx = set()
    for m_file in model_files:
        # Extract the model identifiers from the filename
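        # (filenames appear to follow '<seed>.<num_run>.<budget>.model',
        # inferred from the int/int/float split below)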
        m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
        model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2])))
    ensemble_members_idx = set(automl.ensemble_.identifiers_)
    assert ensemble_members_idx.issubset(model_files_idx), (
        ensemble_members_idx, model_files_idx)

    del automl
Example #16
    def test_fit(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 30, 5)
        automl.fit(X_train, Y_train)
        #print(automl.show_models(), flush=True)
        #print(automl.cv_results_, flush=True)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #17
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):

    # The import and shutdown below are a workaround to reset the port
    # used to collect log messages. When this test runs concurrently with
    # several other tests it fails at random; shutting logging down
    # resets the logging singletons.
    import logging
    logging.shutdown()

    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    dataset_name = 'test_exceptions_inside_log'

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with pytest.raises(MyException):
        automl.fit(
            X_train,
            Y_train,
            task=MULTICLASS_CLASSIFICATION,
            dataset_name=dataset_name,
        )

    # Debugging aid: snapshot the temporary directory for later inspection
    import shutil
    shutil.copytree(backend.temporary_directory, '/tmp/trydebug')

    # make sure that the logfile was created
    logger_name = 'AutoML(%d):%s' % (1, dataset_name)
    logfile = os.path.join(backend.temporary_directory, logger_name + '.log')
    assert os.path.exists(logfile), automl._clean_logger()
    with open(logfile) as f:
        assert message in f.read(), automl._clean_logger()

    # Speed up the closing after forced crash
    automl._clean_logger()
Example #18
    def test_fit(self):
        if self.travis:
            self.skipTest('This test currently does not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(output, output, 15, 15)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #19
    def test_fit(self):
        if self.travis:
            self.skipTest('This test currently does not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(output, output, 15, 15)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #20
    def test_fit(self):
        backend_api = self._create_backend('test_fit')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(backend_api, 20, 5)
        automl.fit(
            X_train,
            Y_train,
            metric=accuracy,
            task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #21
    def test_fit_roar(self):
        output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(backend_api, 20, 5,
                                           initial_configurations_via_metalearning=0,
                                           configuration_mode='ROAR')
        automl.fit(X_train, Y_train, metric=accuracy)
        # print(automl.show_models(), flush=True)
        # print(automl.cv_results_, flush=True)
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #22
    def test_exceptions_inside_log_in_smbo(self, smbo_run_mock):

        # Make sure that any exception raised by SMAC during the AutoML
        # fit is properly captured in a log file
        backend_api = self._create_backend('test_exceptions_inside_log')
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)

        automl = autosklearn.automl.AutoML(
            backend_api,
            20,
            5,
            metric=accuracy,
        )

        output_file = 'test_exceptions_inside_log.log'
        setup_logger(output_file=output_file)
        logger = get_logger('test_exceptions_inside_log')

        # Create a custom exception to prevent other errors from slipping in
        class MyException(Exception):
            pass

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        # The first call is on dummy predictor failure
        message = str(np.random.randint(100)) + '_run_smbo'
        smbo_run_mock.side_effect = MyException(message)

        with unittest.mock.patch(
                'autosklearn.automl.AutoML._get_logger') as mock:
            mock.return_value = logger
            with self.assertRaises(MyException):
                automl.fit(
                    X_train,
                    Y_train,
                    task=MULTICLASS_CLASSIFICATION,
                )
            with open(output_file) as f:
                self.assertTrue(message in f.read())

        # Cleanup
        os.unlink(output_file)
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #23
def test_fit_roar(dask_client_single_worker, backend):
    def get_roar_object_callback(
            scenario_dict,
            seed,
            ta,
            ta_kwargs,
            dask_client,
            n_jobs,
            **kwargs
    ):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            dask_client=dask_client,
            n_jobs=n_jobs,
        )

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
        metric=accuracy,
        dask_client=dask_client_single_worker,
    )
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
Example #24
def test_load_best_individual_model(metric, backend, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=metric,
        dask_client=dask_client,
    )

    # We cannot easily mock a function sent to dask, so for this test we
    # create the whole set of models/ensembles but prevent them from
    # being loaded
    automl.fit(
        X_train,
        Y_train,
        task=MULTICLASS_CLASSIFICATION,
    )
    automl._backend.load_ensemble = unittest.mock.MagicMock(return_value=None)

    # A memory error occurs in the ensemble construction
    assert automl._backend.load_ensemble(automl._seed) is None

    # Model loading is robust to this and falls back to the best model
    automl._load_models()
    assert automl.ensemble_ is not None

    # Just one model is in the ensemble, and all weight must be on it
    get_models_with_weights = automl.get_models_with_weights()
    assert len(get_models_with_weights) == 1
    assert get_models_with_weights[0][0] == 1.0

    # Score thresholds tuned to this toy dataset
    if metric.name == 'balanced_accuracy':
        assert automl.score(X_test, Y_test) > 0.9
    elif metric.name == 'log_loss':
        # Seen values in github actions of 0.6978304740364537
        assert automl.score(X_test, Y_test) < 0.7
    else:
        raise ValueError(metric.name)

    del automl
Example #25
def test_input_and_target_types(dask_client, X, y, task):

    if task in CLASSIFICATION_TASKS:
        automl = AutoMLClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=5,
            dask_client=dask_client,
        )
    else:
        automl = AutoMLRegressor(
            time_left_for_this_task=15,
            per_run_time_limit=5,
            dask_client=dask_client,
        )
    # To save fitting time and only validate the inputs, we just return
    # the configuration space
    automl.fit(X, y, only_return_configuration_space=True)
    assert automl._task == task
    assert automl._metric.name == default_metric_for_task[task].name
Example #26
def test_fit(dask_client, backend):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
Example #27
    def test_load_best_individual_model(self):

        backend_api = self._create_backend('test_fit')

        for metric in [log_loss, balanced_accuracy]:
            X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
            automl = autosklearn.automl.AutoML(
                backend=backend_api,
                time_left_for_this_task=20,
                per_run_time_limit=5,
                metric=metric,
            )

            with unittest.mock.patch(
                'autosklearn.ensemble_builder.EnsembleBuilder.run'
            ) as mock_ensemble_run:
                mock_ensemble_run.side_effect = MemoryError
                automl.fit(
                    X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
                )

            # A memory error occurs in the ensemble construction
            self.assertIsNone(automl._backend.load_ensemble(automl._seed))

            # Model loading is robust to this and falls back to the best model
            automl._load_models()
            self.assertIsNotNone(automl.ensemble_)

            # Just one model is in the ensemble, and all weight must be on it
            get_models_with_weights = automl.get_models_with_weights()
            self.assertEqual(len(get_models_with_weights), 1)
            self.assertEqual(get_models_with_weights[0][0], 1.0)

            # Score thresholds tuned to this toy dataset
            if metric._sign < 0:
                self.assertLessEqual(automl.score(X_test, Y_test), 0.2)
            else:
                self.assertGreaterEqual(automl.score(X_test, Y_test), 0.8)

            del automl
            self._tearDown(backend_api.temporary_directory)
            self._tearDown(backend_api.output_directory)
Example #28
def test_fit(dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        seed=0,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(X_train, Y_train, task=MULTICLASS_CLASSIFICATION)
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
Example #29
    def test_fit(self):
        backend_api = self._create_backend('test_fit')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            metric=accuracy,
        )
        automl.fit(
            X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #30
    def test_binary_score_and_include(self):
        """
        Test the fix that makes binary classification prediction
        take index 1 of the second dimension of the prediction matrix.
        """

        output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
        self._setUp(output)

        data = sklearn.datasets.make_classification(n_samples=400,
                                                    n_features=10,
                                                    n_redundant=1,
                                                    n_informative=3,
                                                    n_repeated=1,
                                                    n_clusters_per_class=2,
                                                    random_state=1)
        X_train = data[0][:200]
        Y_train = data[1][:200]
        X_test = data[0][200:]
        Y_test = data[1][200:]

        backend_api = backend.create(output, output)
        automl = autosklearn.automl.AutoML(
            backend_api,
            30,
            5,
            include_estimators=['sgd'],
            include_preprocessors=['no_preprocessing'])
        automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
        #print(automl.show_models(), flush=True)
        #print(automl.cv_results_, flush=True)
        self.assertEqual(automl._task, BINARY_CLASSIFICATION)

        # TODO, the assumption from above is not really tested here
        # Also, the score method should be removed, it only makes little sense
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.4)

        del automl
        self._tearDown(output)
Example #31
    def test_fail_if_dtype_changes_automl(self):
        """We do not support changes in the input type.
        Once an estimator is fitted, the input data type must not change.
        """
        backend_api = self._create_backend('test_fail_feat_typechange')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            metric=accuracy,
        )

        X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
        y_train = [1, 0]
        automl.InputValidator.validate(X_train, y_train, is_classification=True)
        with self.assertRaisesRegex(ValueError,
                                    "Auto-sklearn previously received features of type"):
            automl.fit(
                X_train.to_numpy(), y_train,
                task=BINARY_CLASSIFICATION,
            )
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #32
    def test_fail_if_feat_type_on_pandas_input(self):
        """We do not support feat type when pandas
        is provided as an input
        """
        backend_api = self._create_backend('test_fail_feat_pandas')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            metric=accuracy,
        )

        X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
        y_train = [1, 0]
        with self.assertRaisesRegex(ValueError,
                                    "feat_type cannot be provided when using pandas"):
            automl.fit(
                X_train, y_train,
                task=BINARY_CLASSIFICATION,
                feat_type=['Categorical', 'Numerical'],
            )
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #33
def test_fail_if_dtype_changes_automl(backend, dask_client):
    """We do not support changes in the input type.
    Once an estimator is fitted, the input data type must not change.
    """
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )

    X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
    y_train = [1, 0]
    automl.InputValidator.validate(X_train, y_train, is_classification=True)
    with pytest.raises(
            ValueError,
            match="Auto-sklearn previously received features of type"):
        automl.fit(
            X_train.to_numpy(),
            y_train,
            task=BINARY_CLASSIFICATION,
        )
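
The supported counterpart, sketched under the same assumptions: pass the DataFrame itself so the fitted input type stays stable (a fresh AutoML instance is assumed here, since the failed fit above may leave state behind).

    automl.fit(
        X_train,  # still a DataFrame, matching what InputValidator saw
        y_train,
        task=BINARY_CLASSIFICATION,
    )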
Example #34
    def test_fit_roar(self):
        def get_roar_object_callback(
                scenario_dict,
                seed,
                ta,
                **kwargs
        ):
            """Random online adaptive racing.

            http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
            scenario = Scenario(scenario_dict)
            return ROAR(
                scenario=scenario,
                rng=seed,
                tae_runner=ta,
            )

        backend_api = self._create_backend('test_fit_roar')

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            initial_configurations_via_metalearning=0,
            get_smac_object_callback=get_roar_object_callback,
        )
        automl.fit(
            X_train, Y_train, metric=accuracy, task=MULTICLASS_CLASSIFICATION,
        )
        score = automl.score(X_test, Y_test)
        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #35
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):

    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    output_file = 'test_exceptions_inside_log.log'
    setup_logger(output_file=output_file)
    logger = get_logger('test_exceptions_inside_log')

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with unittest.mock.patch('autosklearn.automl.AutoML._get_logger') as mock:
        mock.return_value = logger
        with pytest.raises(MyException):
            automl.fit(
                X_train,
                Y_train,
                task=MULTICLASS_CLASSIFICATION,
            )
        with open(output_file) as f:
            assert message in f.read()

    # Cleanup
    os.unlink(output_file)
Example #36
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):

    # The import and shutdown below are a workaround to reset the port
    # used to collect log messages. When this test runs concurrently with
    # several other tests it fails at random; shutting logging down
    # resets the logging singletons.
    import logging
    logging.shutdown()

    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    dataset_name = 'test_exceptions_inside_log'

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with pytest.raises(MyException):
        automl.fit(
            X_train,
            Y_train,
            task=MULTICLASS_CLASSIFICATION,
            dataset_name=dataset_name,
        )

    # make sure that the logfile was created
    logger_name = 'AutoML(%d):%s' % (1, dataset_name)
    logger = logging.getLogger(logger_name)
    logfile = os.path.join(backend.temporary_directory, logger_name + '.log')
    assert os.path.exists(logfile), print_debug_information(automl) + str(automl._clean_logger())

    # Give the error message some time to be written to the
    # log file
    found_message = False
    for incr_tolerance in range(5):
        with open(logfile) as f:
            lines = f.readlines()
        if any(message in line for line in lines):
            found_message = True
            break
        else:
            time.sleep(incr_tolerance)

    # Speed up the closing after forced crash
    automl._clean_logger()

    if not found_message:
        pytest.fail("Did not find {} in the log file {} for logger {}/{}/{}".format(
            message,
            print_debug_information(automl),
            vars(automl._logger.logger),
            vars(logger),
            vars(logging.getLogger())
        ))
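
Distilled from the examples above, a minimal end-to-end sketch of the recurring pattern, using the keyword-argument constructor from the later examples (signatures differ across the auto-sklearn versions shown here, so treat the details as assumptions):

    import autosklearn.automl
    import autosklearn.pipeline.util as putil  # assumed home of get_dataset()
    from autosklearn.constants import MULTICLASS_CLASSIFICATION
    from autosklearn.metrics import accuracy

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        time_left_for_this_task=30,  # total budget in seconds
        per_run_time_limit=5,        # budget per configuration in seconds
        metric=accuracy,
    )
    automl.fit(X_train, Y_train, task=MULTICLASS_CLASSIFICATION)
    print(automl.score(X_test, Y_test))  # the tests above expect > 0.8 on iris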