def test_fit_roar(self):
    def get_roar_object_callback(scenario_dict, seed, ta, **kwargs):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
        )

    backend_api = self._create_backend('test_fit_roar')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend_api,
        time_left_for_this_task=20,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
    )
    automl.fit(
        X_train, Y_train, metric=accuracy, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fit(self):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(self.output, self.output, 10, 10)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.9)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
def test_binary_score(self): """ Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') self._setUp(output) data = sklearn.datasets.make_classification( n_samples=1000, n_features=20, n_redundant=5, n_informative=5, n_repeated=2, n_clusters_per_class=2, random_state=1) X_train = data[0][:700] Y_train = data[1][:700] X_test = data[0][700:] Y_test = data[1][700:] backend_api = backend.create(output, output) automl = autosklearn.automl.AutoML(backend_api, 15, 5) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) self.assertEqual(automl._task, BINARY_CLASSIFICATION) score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.5) del automl self._tearDown(output)
def test_binary_score(self): """ Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ if self.travis: self.skipTest('This test does currently not run on travis-ci. ' 'Make sure it runs locally on your machine!') output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') self._setUp(output) data = sklearn.datasets.make_classification(n_samples=1000, n_features=20, n_redundant=5, n_informative=5, n_repeated=2, n_clusters_per_class=2, random_state=1) X_train = data[0][:700] Y_train = data[1][:700] X_test = data[0][700:] Y_test = data[1][700:] automl = autosklearn.automl.AutoML(output, output, 15, 15) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) self.assertEqual(automl._task, BINARY_CLASSIFICATION) score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.5) del automl self._tearDown(output)
def test_fit(self):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(self.output, self.output, 10, 10)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.9)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
def test_binary_score(self): """ Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ if self.travis: self.skipTest('This test does currently not run on travis-ci. ' 'Make sure it runs locally on your machine!') output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') self._setUp(output) data = sklearn.datasets.make_classification( n_samples=1000, n_features=20, n_redundant=5, n_informative=5, n_repeated=2, n_clusters_per_class=2, random_state=1) X_train = data[0][:700] Y_train = data[1][:700] X_test = data[0][700:] Y_test = data[1][700:] automl = autosklearn.automl.AutoML(output, output, 15, 15) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) self.assertEqual(automl._task, BINARY_CLASSIFICATION) score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.5) del automl self._tearDown(output)
def test_binary_score_and_include(self): """ Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ backend_api = self._create_backend('test_binary_score_and_include') data = sklearn.datasets.make_classification( n_samples=400, n_features=10, n_redundant=1, n_informative=3, n_repeated=1, n_clusters_per_class=2, random_state=1) X_train = data[0][:200] Y_train = data[1][:200] X_test = data[0][200:] Y_test = data[1][200:] automl = autosklearn.automl.AutoML(backend_api, 20, 5, include_estimators=['sgd'], include_preprocessors=['no_preprocessing']) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION, metric=accuracy) self.assertEqual(automl._task, BINARY_CLASSIFICATION) # TODO, the assumption from above is not really tested here # Also, the score method should be removed, it only makes little sense score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.4) del automl self._tearDown(backend_api.temporary_directory) self._tearDown(backend_api.output_directory)
def test_fail_if_feat_type_on_pandas_input(backend, dask_client):
    """We do not support feat_type when a pandas DataFrame is provided as
    input."""
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )

    X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
    y_train = [1, 0]
    with pytest.raises(
        ValueError,
        match="providing the option feat_type to the fit method is not"
              " supported when using a Dataframe",
    ):
        automl.fit(
            X_train, y_train,
            task=BINARY_CLASSIFICATION,
            feat_type={1: 'Categorical', 2: 'Numerical'},
        )
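# The `backend` and `dask_client` arguments in the pytest-style tests above
# and below are fixtures assumed to be defined in a shared conftest. What
# follows is a minimal sketch of what such fixtures could look like; the
# implementations and the `autosklearn.util.backend` module path are
# assumptions, not the project's actual conftest. The two-argument
# `backend.create(output, output)` call mirrors how the older tests in this
# file construct their backend objects.
import tempfile

import pytest
from dask.distributed import Client, LocalCluster

from autosklearn.util import backend as backend_module  # assumed module path


@pytest.fixture
def backend():
    # Use one temporary directory for both the temporary and output folders,
    # as the older tests in this file do.
    tmp = tempfile.mkdtemp()
    yield backend_module.create(tmp, tmp)


@pytest.fixture
def dask_client():
    # A single in-process worker keeps the tests deterministic and easy to
    # debug.
    cluster = LocalCluster(n_workers=1, processes=False)
    client = Client(cluster)
    yield client
    client.close()
    cluster.close()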
def test_binary_score_and_include(backend, dask_client):
    """Test the fix for binary classification predictions taking index 1 of
    the second dimension of the prediction matrix."""
    data = sklearn.datasets.make_classification(
        n_samples=400, n_features=10, n_redundant=1, n_informative=3,
        n_repeated=1, n_clusters_per_class=2, random_state=1)
    X_train = data[0][:200]
    Y_train = data[1][:200]
    X_test = data[0][200:]
    Y_test = data[1][200:]

    automl = autosklearn.automl.AutoML(
        backend, 20, 5,
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'],
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
    assert automl._task == BINARY_CLASSIFICATION

    # TODO: the assumption from above is not really tested here.
    # Also, the score method should be removed; it makes little sense.
    score = automl.score(X_test, Y_test)
    assert score >= 0.4
    del automl
def test_binary_score_and_include(self): """ Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ backend_api = self._create_backend('test_binary_score_and_include') data = sklearn.datasets.make_classification( n_samples=400, n_features=10, n_redundant=1, n_informative=3, n_repeated=1, n_clusters_per_class=2, random_state=1) X_train = data[0][:200] Y_train = data[1][:200] X_test = data[0][200:] Y_test = data[1][200:] automl = autosklearn.automl.AutoML( backend_api, 20, 5, include_estimators=['sgd'], include_preprocessors=['no_preprocessing'], metric=accuracy, ) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) self.assertEqual(automl._task, BINARY_CLASSIFICATION) # TODO, the assumption from above is not really tested here # Also, the score method should be removed, it only makes little sense score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.4) del automl self._tearDown(backend_api.temporary_directory) self._tearDown(backend_api.output_directory)
def test_fit(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(output, output, 12, 12)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(output)
def test_delete_non_candidate_models(self):
    backend_api = self._create_backend(
        'test_delete', delete_tmp_folder_after_terminate=False)

    seed = 555
    X, Y, _, _ = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend_api,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        ensemble_nbest=3,
        seed=seed,
        initial_configurations_via_metalearning=0,
        resampling_strategy='holdout',
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'])

    automl.fit(X, Y, metric=accuracy, task=MULTICLASS_CLASSIFICATION,
               X_test=X, y_test=Y)

    # Assert at least one model file has been deleted and that there were no
    # deletion errors
    log_file_path = glob.glob(
        os.path.join(backend_api.temporary_directory,
                     'AutoML(' + str(seed) + '):*.log'))
    with open(log_file_path[0]) as log_file:
        log_content = log_file.read()
        self.assertIn('Deleted files of non-candidate model', log_content)
        self.assertNotIn('Failed to delete files of non-candidate model',
                         log_content)
        self.assertNotIn('Failed to lock model', log_content)

    # Assert that the files of the models used by the ensemble weren't deleted
    model_files = backend_api.list_all_models(seed=seed)
    model_files_idx = set()
    for m_file in model_files:
        # Extract the model identifiers from the filename
        m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
        model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2])))
    ensemble_members_idx = set(automl.ensemble_.identifiers_)
    self.assertTrue(ensemble_members_idx.issubset(model_files_idx))

    del automl
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fit(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 20, 5)
    automl.fit(X_train, Y_train, metric=accuracy)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(output)
def test_fit(self):
    backend_api = self._create_backend('test_fit')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(backend_api, 20, 5)
    automl.fit(
        X_train, Y_train, metric=accuracy, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_delete_non_candidate_models(dask_client):
    seed = 555
    X, Y, _, _ = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        delete_tmp_folder_after_terminate=False,
        time_left_for_this_task=60,
        per_run_time_limit=5,
        ensemble_nbest=3,
        seed=seed,
        initial_configurations_via_metalearning=0,
        resampling_strategy='holdout',
        include={
            'classifier': ['sgd'],
            'feature_preprocessor': ['no_preprocessing']
        },
        metric=accuracy,
        dask_client=dask_client,
        # Force models to be deleted: lower max_models_on_disc from its
        # default of 50 to 3.
        max_models_on_disc=3,
    )

    automl.fit(X, Y, task=MULTICLASS_CLASSIFICATION, X_test=X, y_test=Y)

    # Assert at least one model file has been deleted and that there were no
    # deletion errors
    log_file_path = glob.glob(
        os.path.join(automl._backend.temporary_directory,
                     'AutoML(' + str(seed) + '):*.log'))
    with open(log_file_path[0]) as log_file:
        log_content = log_file.read()
        assert 'Deleted files of non-candidate model' in log_content, log_content
        assert 'Failed to delete files of non-candidate model' not in log_content, log_content
        assert 'Failed to lock model' not in log_content, log_content

    # Assert that the files of the models used by the ensemble weren't deleted
    model_files = automl._backend.list_all_models(seed=seed)
    model_files_idx = set()
    for m_file in model_files:
        # Extract the model identifiers from the filename
        m_file = os.path.split(m_file)[1].replace('.model', '').split('.', 2)
        model_files_idx.add((int(m_file[0]), int(m_file[1]), float(m_file[2])))
    ensemble_members_idx = set(automl.ensemble_.identifiers_)
    assert ensemble_members_idx.issubset(model_files_idx), (
        ensemble_members_idx, model_files_idx)

    del automl
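# The filename parsing in the two tests above assumes model files on disk are
# named '<seed>.<num_run>.<budget>.model'; that convention is inferred from
# the split('.', 2) logic in the tests, not from auto-sklearn documentation.
# A small helper capturing the same assumption:
import os


def parse_model_filename(path):
    """Return (seed, num_run, budget) parsed from a '.model' file path."""
    name = os.path.split(path)[1].replace('.model', '')
    seed, num_run, budget = name.split('.', 2)
    return int(seed), int(num_run), float(budget)


# Example: parse_model_filename('555.3.0.0.model') == (555, 3, 0.0)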
def test_fit(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 30, 5)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(output)
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):
    # The import and shutdown below are a workaround to make sure we reset
    # the port used to collect messages. This test randomly fails when it
    # runs alongside several other tests; resetting the logging singletons
    # avoids that.
    import logging
    logging.shutdown()

    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    dataset_name = 'test_exceptions_inside_log'

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with pytest.raises(MyException):
        automl.fit(
            X_train,
            Y_train,
            task=MULTICLASS_CLASSIFICATION,
            dataset_name=dataset_name,
        )

    # Make sure that the logfile was created
    logger_name = 'AutoML(%d):%s' % (1, dataset_name)
    logfile = os.path.join(backend.temporary_directory, logger_name + '.log')
    assert os.path.exists(logfile), automl._clean_logger()
    with open(logfile) as f:
        assert message in f.read(), automl._clean_logger()

    # Speed up the closing after the forced crash
    automl._clean_logger()
def test_fit(self):
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(output, output, 15, 15)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(output)
def test_fit_roar(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(
        backend_api, 20, 5,
        initial_configurations_via_metalearning=0,
        configuration_mode='ROAR')
    automl.fit(X_train, Y_train, metric=accuracy)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(output)
def test_exceptions_inside_log_in_smbo(self, smbo_run_mock):
    # Make sure that any exception raised by SMAC during an AutoML fit is
    # properly captured in a log file
    backend_api = self._create_backend('test_exceptions_inside_log')

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)

    automl = autosklearn.automl.AutoML(
        backend_api,
        20,
        5,
        metric=accuracy,
    )

    output_file = 'test_exceptions_inside_log.log'
    setup_logger(output_file=output_file)
    logger = get_logger('test_exceptions_inside_log')

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with unittest.mock.patch('autosklearn.automl.AutoML._get_logger') as mock:
        mock.return_value = logger
        with self.assertRaises(MyException):
            automl.fit(
                X_train,
                Y_train,
                task=MULTICLASS_CLASSIFICATION,
            )
        with open(output_file) as f:
            self.assertTrue(message in f.read())

    # Cleanup
    os.unlink(output_file)
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fit_roar(dask_client_single_worker, backend):
    def get_roar_object_callback(
        scenario_dict,
        seed,
        ta,
        ta_kwargs,
        dask_client,
        n_jobs,
        **kwargs
    ):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
            tae_runner_kwargs=ta_kwargs,
            dask_client=dask_client,
            n_jobs=n_jobs,
        )

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
        metric=accuracy,
        dask_client=dask_client_single_worker,
    )
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
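# count_succeses and includes_train_scores are assumed to be small helpers
# from the test suite's utility module; below is a plausible sketch based on
# how they are used in these tests. The exact status strings and column
# names are assumptions:
import numpy as np


def count_succeses(cv_results):
    """Count runs in automl.cv_results_ whose status indicates success."""
    return np.sum([
        status in ('Success', 'Success (but do not advance to higher budget)')
        for status in cv_results['status']
    ])


def includes_train_scores(columns):
    """Check that performance_over_time_ tracks train as well as test scores."""
    return any('train' in column for column in columns)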
def test_load_best_individual_model(metric, backend, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=metric,
        dask_client=dask_client,
    )

    # We cannot easily mock a function sent to dask, so for this test we
    # create the whole set of models/ensembles but prevent the ensemble from
    # being loaded
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    automl._backend.load_ensemble = unittest.mock.MagicMock(return_value=None)

    # A memory error occurs in the ensemble construction
    assert automl._backend.load_ensemble(automl._seed) is None

    # Model loading is robust to this and falls back to the best individual
    # model
    automl._load_models()
    assert automl.ensemble_ is not None

    # Just one model is there for the ensemble, and all weight must be on it
    get_models_with_weights = automl.get_models_with_weights()
    assert len(get_models_with_weights) == 1
    assert get_models_with_weights[0][0] == 1.0

    # Match a toy dataset
    if metric.name == 'balanced_accuracy':
        assert automl.score(X_test, Y_test) > 0.9
    elif metric.name == 'log_loss':
        # Seen values in github actions of 0.6978304740364537
        assert automl.score(X_test, Y_test) < 0.7
    else:
        raise ValueError(metric.name)

    del automl
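# The `metric` argument above is presumably supplied via parametrization over
# the two metrics the assertions branch on; a sketch of the assumed wiring
# (the decorator placement is an assumption):
import pytest

from autosklearn.metrics import balanced_accuracy, log_loss


@pytest.mark.parametrize('metric', [balanced_accuracy, log_loss])
def test_load_best_individual_model_sketch(metric, backend, dask_client):
    ...  # body as in the test above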
def test_input_and_target_types(dask_client, X, y, task):
    if task in CLASSIFICATION_TASKS:
        automl = AutoMLClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=5,
            dask_client=dask_client,
        )
    else:
        automl = AutoMLRegressor(
            time_left_for_this_task=15,
            per_run_time_limit=5,
            dask_client=dask_client,
        )
    # To save fitting time, we only validate the inputs by returning just the
    # configuration space
    automl.fit(X, y, only_return_configuration_space=True)
    assert automl._task == task
    assert automl._metric.name == default_metric_for_task[task].name
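# X, y and task above are presumably parametrized; a plausible sketch of the
# cases (the exact parametrization in the real suite is an assumption):
import pytest
import sklearn.datasets

from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION

X_binary, y_binary = sklearn.datasets.make_classification(random_state=1)
X_regression, y_regression = sklearn.datasets.make_regression(random_state=1)


@pytest.mark.parametrize('X, y, task', [
    (X_binary, y_binary, BINARY_CLASSIFICATION),
    (X_regression, y_regression, REGRESSION),
])
def test_input_and_target_types_sketch(dask_client, X, y, task):
    ...  # body as in the test above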
def test_fit(dask_client, backend):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
def test_load_best_individual_model(self):
    backend_api = self._create_backend('test_fit')

    for metric in [log_loss, balanced_accuracy]:
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = autosklearn.automl.AutoML(
            backend=backend_api,
            time_left_for_this_task=20,
            per_run_time_limit=5,
            metric=metric,
        )

        with unittest.mock.patch(
            'autosklearn.ensemble_builder.EnsembleBuilder.run'
        ) as mock_ensemble_run:
            mock_ensemble_run.side_effect = MemoryError
            automl.fit(
                X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
            )

        # A memory error occurs in the ensemble construction
        self.assertIsNone(automl._backend.load_ensemble(automl._seed))

        # Model loading is robust to this and falls back to the best
        # individual model
        automl._load_models()
        self.assertIsNotNone(automl.ensemble_)

        # Just one model is there for the ensemble, and all weight must be
        # on it
        get_models_with_weights = automl.get_models_with_weights()
        self.assertEqual(len(get_models_with_weights), 1)
        self.assertEqual(get_models_with_weights[0][0], 1.0)

        # Match a toy dataset
        if metric._sign < 0:
            self.assertLessEqual(automl.score(X_test, Y_test), 0.2)
        else:
            self.assertGreaterEqual(automl.score(X_test, Y_test), 0.8)

        del automl

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fit(dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        seed=0,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )
    automl.fit(X_train, Y_train, task=MULTICLASS_CLASSIFICATION)

    score = automl.score(X_test, Y_test)
    assert score > 0.8
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(automl.performance_over_time_) is True
    assert automl._task == MULTICLASS_CLASSIFICATION

    del automl
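# performance_over_time_is_plausible is again assumed to be a test-suite
# helper; a sketch of what it plausibly checks. The 'Timestamp' column name
# is an assumption about the performance_over_time_ DataFrame:
def performance_over_time_is_plausible(pot):
    """At least one entry was recorded and timestamps increase monotonically."""
    if len(pot) < 1:
        return False
    if not pot['Timestamp'].is_monotonic_increasing:
        return False
    return True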
def test_fit(self):
    backend_api = self._create_backend('test_fit')

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = autosklearn.automl.AutoML(
        backend=backend_api,
        time_left_for_this_task=20,
        per_run_time_limit=5,
        metric=accuracy,
    )
    automl.fit(
        X_train, Y_train, task=MULTICLASS_CLASSIFICATION,
    )
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertGreater(self._count_succeses(automl.cv_results_), 0)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)
    del automl
    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_binary_score_and_include(self): """ Test fix for binary classification prediction taking the index 1 of second dimension in prediction matrix """ output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score') self._setUp(output) data = sklearn.datasets.make_classification(n_samples=400, n_features=10, n_redundant=1, n_informative=3, n_repeated=1, n_clusters_per_class=2, random_state=1) X_train = data[0][:200] Y_train = data[1][:200] X_test = data[0][200:] Y_test = data[1][200:] backend_api = backend.create(output, output) automl = autosklearn.automl.AutoML( backend_api, 30, 5, include_estimators=['sgd'], include_preprocessors=['no_preprocessing']) automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION) #print(automl.show_models(), flush=True) #print(automl.cv_results_, flush=True) self.assertEqual(automl._task, BINARY_CLASSIFICATION) # TODO, the assumption from above is not really tested here # Also, the score method should be removed, it only makes little sense score = automl.score(X_test, Y_test) self.assertGreaterEqual(score, 0.4) del automl self._tearDown(output)
def test_fail_if_dtype_changes_automl(self):
    """We do not support changes in the input type. Once an estimator is
    fitted, the data type should not change."""
    backend_api = self._create_backend('test_fail_feat_typechange')
    automl = autosklearn.automl.AutoML(
        backend=backend_api,
        time_left_for_this_task=20,
        per_run_time_limit=5,
        metric=accuracy,
    )

    X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
    y_train = [1, 0]
    automl.InputValidator.validate(X_train, y_train, is_classification=True)
    with self.assertRaisesRegex(
            ValueError,
            "Auto-sklearn previously received features of type"):
        automl.fit(
            X_train.to_numpy(), y_train,
            task=BINARY_CLASSIFICATION,
        )

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fail_if_feat_type_on_pandas_input(self):
    """We do not support feat_type when a pandas DataFrame is provided as
    input."""
    backend_api = self._create_backend('test_fail_feat_pandas')
    automl = autosklearn.automl.AutoML(
        backend=backend_api,
        time_left_for_this_task=20,
        per_run_time_limit=5,
        metric=accuracy,
    )

    X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
    y_train = [1, 0]
    with self.assertRaisesRegex(
            ValueError,
            "feat_type cannot be provided when using pandas"):
        automl.fit(
            X_train, y_train,
            task=BINARY_CLASSIFICATION,
            feat_type=['Categorical', 'Numerical'],
        )

    self._tearDown(backend_api.temporary_directory)
    self._tearDown(backend_api.output_directory)
def test_fail_if_dtype_changes_automl(backend, dask_client):
    """We do not support changes in the input type. Once an estimator is
    fitted, the data type should not change."""
    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=30,
        per_run_time_limit=5,
        metric=accuracy,
        dask_client=dask_client,
    )

    X_train = pd.DataFrame({'a': [1, 1], 'c': [1, 2]})
    y_train = [1, 0]
    automl.InputValidator.validate(X_train, y_train, is_classification=True)
    with pytest.raises(
        ValueError,
        match="Auto-sklearn previously received features of type",
    ):
        automl.fit(
            X_train.to_numpy(), y_train,
            task=BINARY_CLASSIFICATION,
        )
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):
    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    output_file = 'test_exceptions_inside_log.log'
    setup_logger(output_file=output_file)
    logger = get_logger('test_exceptions_inside_log')

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with unittest.mock.patch('autosklearn.automl.AutoML._get_logger') as mock:
        mock.return_value = logger
        with pytest.raises(MyException):
            automl.fit(
                X_train,
                Y_train,
                task=MULTICLASS_CLASSIFICATION,
            )
        with open(output_file) as f:
            assert message in f.read()

    # Cleanup
    os.unlink(output_file)
def test_exceptions_inside_log_in_smbo(smbo_run_mock, backend, dask_client):
    # The import and shutdown below are a workaround to make sure we reset
    # the port used to collect messages. This test randomly fails when it
    # runs alongside several other tests; resetting the logging singletons
    # avoids that.
    import logging
    logging.shutdown()

    automl = autosklearn.automl.AutoML(
        backend,
        20,
        5,
        metric=accuracy,
        dask_client=dask_client,
    )

    dataset_name = 'test_exceptions_inside_log'

    # Create a custom exception to prevent other errors from slipping in
    class MyException(Exception):
        pass

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    # The first call is on dummy predictor failure
    message = str(np.random.randint(100)) + '_run_smbo'
    smbo_run_mock.side_effect = MyException(message)

    with pytest.raises(MyException):
        automl.fit(
            X_train,
            Y_train,
            task=MULTICLASS_CLASSIFICATION,
            dataset_name=dataset_name,
        )

    # Make sure that the logfile was created
    logger_name = 'AutoML(%d):%s' % (1, dataset_name)
    logger = logging.getLogger(logger_name)
    logfile = os.path.join(backend.temporary_directory, logger_name + '.log')
    assert os.path.exists(logfile), (
        print_debug_information(automl) + str(automl._clean_logger()))

    # Give the error message some time to show up in the log file
    found_message = False
    for incr_tolerance in range(5):
        with open(logfile) as f:
            lines = f.readlines()
        if any(message in line for line in lines):
            found_message = True
            break
        else:
            time.sleep(incr_tolerance)

    # Speed up the closing after the forced crash
    automl._clean_logger()

    if not found_message:
        pytest.fail("Did not find {} in the log file {} for logger {}/{}/{}".format(
            message,
            print_debug_information(automl),
            vars(automl._logger.logger),
            vars(logger),
            vars(logging.getLogger())
        ))
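# print_debug_information is assumed to gather the run's log output so that a
# test failure message is actionable; a minimal sketch under that assumption:
def print_debug_information(automl):
    """Concatenate the contents of all log files in the temporary directory."""
    import glob
    import os
    contents = []
    for logfile in glob.glob(
            os.path.join(automl._backend.temporary_directory, '*.log')):
        with open(logfile) as f:
            contents.append(f.read())
    return '\n'.join(contents)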