def test_fit_roar(self):
    def get_roar_object_callback(scenario_dict, seed, ta, **kwargs):
        """Random online adaptive racing.

        http://ml.informatik.uni-freiburg.de/papers/11-LION5-SMAC.pdf"""
        scenario = Scenario(scenario_dict)
        return ROAR(
            scenario=scenario,
            rng=seed,
            tae_runner=ta,
        )

    output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(
        backend=backend_api,
        time_left_for_this_task=20,
        per_run_time_limit=5,
        initial_configurations_via_metalearning=0,
        get_smac_object_callback=get_roar_object_callback,
    )
    automl.fit(X_train, Y_train, metric=accuracy)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)

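# A minimal usage sketch, assuming the public AutoSklearnClassifier forwards
# get_smac_object_callback to AutoML the same way the build_automl functions
# below do. The import paths and the small time budgets are illustrative
# assumptions, not part of the test suite above.
import autosklearn.classification
from smac.facade.roar_facade import ROAR
from smac.scenario.scenario import Scenario


def get_roar_object_callback(scenario_dict, seed, ta, **kwargs):
    # Build a plain ROAR (random online adaptive racing) optimizer instead of
    # the default SMAC facade.
    return ROAR(scenario=Scenario(scenario_dict), rng=seed, tae_runner=ta)


clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=20,
    per_run_time_limit=5,
    initial_configurations_via_metalearning=0,
    get_smac_object_callback=get_roar_object_callback,
)
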
def test_binary_score(self):
    """
    Test fix for binary classification prediction
    taking the index 1 of second dimension in prediction matrix
    """
    output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
    self._setUp(output)

    data = sklearn.datasets.make_classification(
        n_samples=1000, n_features=20, n_redundant=5, n_informative=5,
        n_repeated=2, n_clusters_per_class=2, random_state=1)
    X_train = data[0][:700]
    Y_train = data[1][:700]
    X_test = data[0][700:]
    Y_test = data[1][700:]

    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 15, 5)
    automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION)
    self.assertEqual(automl._task, BINARY_CLASSIFICATION)

    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.5)

    del automl
    self._tearDown(output)

def test_file_output(self):
    self.output_dir = os.path.join(os.getcwd(), '.test_file_output')

    D = get_regression_datamanager()
    D.name = 'test'
    configuration_space = get_configuration_space(D.info)
    configuration = configuration_space.sample_configuration()

    backend_api = backend.create(self.output_dir, self.output_dir)
    evaluator = HoldoutEvaluator(D, backend_api, configuration,
                                 with_predictions=True,
                                 all_scoring_functions=True,
                                 output_y_test=True)

    loss, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
        evaluator.fit_predict_and_loss()
    evaluator.file_output(loss, Y_optimization_pred, Y_valid_pred,
                          Y_test_pred)

    self.assertTrue(os.path.exists(os.path.join(
        self.output_dir, '.auto-sklearn', 'true_targets_ensemble.npy')))

def test_do_dummy_prediction(self):
    for name in ['401_bac', '31_bac', 'adult', 'cadata']:
        output = os.path.join(self.test_dir, '..',
                              '.tmp_test_do_dummy_prediction')
        self._setUp(output)

        dataset = os.path.join(self.test_dir, '..', '.data', name)

        backend_api = backend.create(output, output)
        auto = autosklearn.automl.AutoML(
            backend_api, 20, 5,
            initial_configurations_via_metalearning=25)
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()
        D = load_data(dataset, backend_api)
        auto._backend.save_datamanager(D)
        auto._do_dummy_prediction(D, 1)

        # Ensure that the dummy predictions are not in the current working
        # directory, but in the output directory (under output)
        self.assertFalse(os.path.exists(os.path.join(
            os.getcwd(), '.auto-sklearn')))
        self.assertTrue(os.path.exists(os.path.join(
            output, '.auto-sklearn', 'predictions_ensemble',
            'predictions_ensemble_1_00001.npy')))

        del auto
        self._tearDown(output)

def test_do_dummy_prediction(self):
    for name in ['401_bac', '31_bac', 'adult', 'cadata']:
        output = os.path.join(self.test_dir, '..',
                              '.tmp_test_do_dummy_prediction')
        self._setUp(output)

        dataset = os.path.join(self.test_dir, '..', '.data', name)

        backend_api = backend.create(output, output)
        auto = autosklearn.automl.AutoML(
            backend_api, 15, 5,
            initial_configurations_via_metalearning=25)
        setup_logger()
        auto._logger = get_logger('test_do_dummy_predictions')
        auto._backend._make_internals_directory()
        D = load_data(dataset, backend_api)
        auto._backend.save_datamanager(D)
        auto._do_dummy_prediction(D, 1)

        # Ensure that the dummy predictions are not in the current working
        # directory, but in the output directory (under output)
        self.assertFalse(os.path.exists(os.path.join(
            os.getcwd(), '.auto-sklearn')))
        self.assertTrue(os.path.exists(os.path.join(
            output, '.auto-sklearn', 'predictions_ensemble',
            'predictions_ensemble_1_00001.npy')))

        del auto
        self._tearDown(output)

def test_binary_score_and_include(self):
    """
    Test fix for binary classification prediction
    taking the index 1 of second dimension in prediction matrix
    """
    output = os.path.join(self.test_dir, '..', '.tmp_test_binary_score')
    self._setUp(output)

    data = sklearn.datasets.make_classification(
        n_samples=400, n_features=10, n_redundant=1, n_informative=3,
        n_repeated=1, n_clusters_per_class=2, random_state=1)
    X_train = data[0][:200]
    Y_train = data[1][:200]
    X_test = data[0][200:]
    Y_test = data[1][200:]

    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(
        backend_api, 20, 5,
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'])
    automl.fit(X_train, Y_train, task=BINARY_CLASSIFICATION,
               metric=accuracy)
    self.assertEqual(automl._task, BINARY_CLASSIFICATION)

    # TODO: the assumption from above is not really tested here.
    # Also, the score method should be removed; it makes little sense.
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.4)

    del automl
    self._tearDown(output)

def test_eval_cv_on_subset(self):
    backend_api = backend.create(self.tmp_dir, self.tmp_dir)
    eval_cv(self.queue, self.configuration, self.data, backend_api,
            1, 1, 5, 45, True, False, True)
    info = self.queue.get()
    self.assertAlmostEqual(info[1], 0.063004032258064502)
    self.assertEqual(info[2], 1)

def build_automl(
    self,
    seed: int,
    shared_mode: bool,
    ensemble_size: int,
    initial_configurations_via_metalearning: int,
    tmp_folder: str,
    output_folder: str,
    smac_scenario_args: Optional[Dict] = None,
):
    if shared_mode:
        self.delete_output_folder_after_terminate = False
        self.delete_tmp_folder_after_terminate = False
        if tmp_folder is None:
            raise ValueError("If shared_mode == True tmp_folder must not "
                             "be None.")
        if output_folder is None:
            raise ValueError("If shared_mode == True output_folder must "
                             "not be None.")

    backend = create(
        temporary_directory=tmp_folder,
        output_directory=output_folder,
        delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
        delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
        shared_mode=shared_mode,
    )

    if smac_scenario_args is None:
        smac_scenario_args = self.smac_scenario_args

    automl = self._get_automl_class()(
        backend=backend,
        time_left_for_this_task=self.time_left_for_this_task,
        per_run_time_limit=self.per_run_time_limit,
        initial_configurations_via_metalearning=initial_configurations_via_metalearning,
        ensemble_size=ensemble_size,
        ensemble_nbest=self.ensemble_nbest,
        ensemble_memory_limit=self.ensemble_memory_limit,
        seed=seed,
        ml_memory_limit=self.ml_memory_limit,
        include_estimators=self.include_estimators,
        exclude_estimators=self.exclude_estimators,
        include_preprocessors=self.include_preprocessors,
        exclude_preprocessors=self.exclude_preprocessors,
        resampling_strategy=self.resampling_strategy,
        resampling_strategy_arguments=self.resampling_strategy_arguments,
        shared_mode=shared_mode,
        get_smac_object_callback=self.get_smac_object_callback,
        disable_evaluator_output=self.disable_evaluator_output,
        smac_scenario_args=smac_scenario_args,
        logging_config=self.logging_config,
        metadata_directory=self.metadata_directory,
        write_history=self.write_history,
        read_history=self.read_history,
    )

    return automl

def test_eval_holdout_on_subset(self):
    backend_api = backend.create(self.tmp_dir, self.tmp_dir)
    eval_holdout(self.queue, self.configuration, self.data, backend_api,
                 1, 1, 43, True, False, True)
    info = self.queue.get()
    self.assertAlmostEqual(info[1], 0.1)
    self.assertEqual(info[2], 1)

def test_eval_holdout_iterative_fit_no_timeout(self):
    backend_api = backend.create(self.tmp_dir, self.tmp_dir)
    eval_iterative_holdout(self.queue, self.configuration, self.data,
                           backend_api, 1, 1, None, True, False, True)
    info = self.queue.get()
    self.assertAlmostEqual(info[1], 0.05)
    self.assertEqual(info[2], 1)

def _create_backend(self, test_name):
    tmp = os.path.join(self.test_dir, '..', '.tmp._%s' % test_name)
    output = os.path.join(self.test_dir, '..', '.output._%s' % test_name)
    # Make sure the folders we wanna create do not already exist.
    self._setUp(tmp)
    self._setUp(output)
    backend = create(tmp, output)
    return backend

def test_eval_holdout(self):
    backend_api = backend.create(self.tmp_dir, self.tmp_dir)
    eval_holdout(self.queue, self.configuration, self.data, backend_api,
                 1, 1, None, True, False, True)
    info = self.queue.get()
    self.assertAlmostEqual(info[1], 0.05)
    self.assertEqual(info[2], 1)
    self.assertNotIn('bac_metric', info[3])

def test_automl_outputs(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_automl_outputs')
    self._setUp(output)
    name = '31_bac'
    dataset = os.path.join(self.test_dir, '..', '.data', name)
    data_manager_file = os.path.join(output, '.auto-sklearn',
                                     'datamanager.pkl')

    backend_api = backend.create(output, output)
    auto = autosklearn.automl.AutoML(
        backend_api, 20, 5,
        initial_configurations_via_metalearning=0,
        seed=100,
    )
    auto.fit_automl_dataset(dataset, accuracy)

    # Pickled data manager (without one hot encoding!)
    with open(data_manager_file, 'rb') as fh:
        D = pickle.load(fh)
        self.assertTrue(np.allclose(D.data['X_train'][0, :3],
                                    [1., 12., 2.]))

    # Check that all directories are there
    fixture = [
        'predictions_valid', 'true_targets_ensemble.npy',
        'start_time_100', 'datamanager.pkl', 'predictions_ensemble',
        'ensembles', 'predictions_test', 'models'
    ]
    self.assertEqual(
        sorted(os.listdir(os.path.join(output, '.auto-sklearn'))),
        sorted(fixture))

    # At least one ensemble prediction, one validation prediction, one test
    # prediction, one model and one ensemble
    fixture = os.listdir(
        os.path.join(output, '.auto-sklearn', 'predictions_ensemble'))
    self.assertIn('predictions_ensemble_100_00001.npy', fixture)

    fixture = os.listdir(os.path.join(output, '.auto-sklearn', 'models'))
    self.assertIn('100.1.model', fixture)

    fixture = os.listdir(os.path.join(output, '.auto-sklearn', 'ensembles'))
    self.assertIn('100.0000000000.ensemble', fixture)

    # Start time
    start_time_file_path = os.path.join(output, '.auto-sklearn',
                                        'start_time_100')
    with open(start_time_file_path, 'r') as fh:
        start_time = float(fh.read())
    self.assertGreaterEqual(time.time() - start_time, 10)

    del auto
    self._tearDown(output)

def test_eval_cv_all_loss_functions(self):
    backend_api = backend.create(self.tmp_dir, self.tmp_dir)
    eval_cv(self.queue, self.configuration, self.data, backend_api,
            1, 1, 5, None, True, True, True)
    info = self.queue.get()
    self.assertIn(
        'f1_metric: 0.0794451450189;pac_metric: 0.344745492187;'
        'acc_metric: 0.075;auc_metric: 0.0285222960152;'
        'bac_metric: 0.0796370967742;duration: ', info[3])
    self.assertAlmostEqual(info[1], 0.079637096774193727)
    self.assertEqual(info[2], 1)

def test_eval_holdout_all_loss_functions(self):
    backend_api = backend.create(self.tmp_dir, self.tmp_dir)
    eval_holdout(self.queue, self.configuration, self.data, backend_api,
                 1, 1, None, True, True, True)
    info = self.queue.get()
    self.assertIn(
        'f1_metric: 0.0480549199085;pac_metric: 0.135572680594;'
        'acc_metric: 0.0454545454545;auc_metric: 0.0;'
        'bac_metric: 0.05;duration: ', info[3])
    self.assertAlmostEqual(info[1], 0.05)
    self.assertEqual(info[2], 1)

def evaluate(input_directory, validation_files, test_files, ensemble_size=50):
    backend = create(input_directory,
                     input_directory + "_output",
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)

    valid_labels = backend.load_targets_ensemble()
    D = backend.load_datamanager()
    test_labels = D.data["Y_test"]

    score = balanced_accuracy

    # Read the modification time of the predictions file and compute the
    # interval to the first prediction file. This interval will be added to
    # the time we needed to build the ensemble.
    time_function_evaluation = validation_files[-1][-1]

    # Build the ensemble
    start = time.time()
    ensemble_selection = EnsembleSelection(
        ensemble_size=ensemble_size,
        task_type=D.info['task'],
        metric=score,
        random_state=np.random.RandomState())

    validation_predictions = np.array([v[0] for v in validation_files])
    test_predictions = np.array([t[0] for t in test_files])

    ensemble_selection.fit(validation_predictions, valid_labels,
                           identifiers=None)
    y_hat_ensemble = ensemble_selection.predict(
        np.array(validation_predictions))
    y_hat_test = ensemble_selection.predict(np.array(test_predictions))

    # Compute validation error
    ensemble_error = 1 - score(valid_labels, y_hat_ensemble)

    # Compute test error
    ensemble_test_error = 1 - score(test_labels, y_hat_test)

    ensemble_time = time.time() - start

    rval = {
        'ensemble_time': ensemble_time,
        'time_function_evaluation': time_function_evaluation,
        'ensemble_error': ensemble_error,
        'ensemble_test_error': ensemble_test_error,
    }

    return rval

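# A minimal usage sketch for evaluate() above, assuming validation_files and
# test_files are lists of tuples whose first element is a prediction array and
# whose last element is the prediction file's modification time (the two
# positions evaluate() indexes). The directory layout and the loading code
# below are illustrative assumptions only.
import glob
import os

import numpy as np

input_directory = '/tmp/autosklearn_run'  # hypothetical shared tmp directory
validation_files = [
    (np.load(fn), os.path.getmtime(fn))
    for fn in sorted(glob.glob(os.path.join(
        input_directory, '.auto-sklearn', 'predictions_ensemble', '*.npy')))
]
test_files = [
    (np.load(fn), os.path.getmtime(fn))
    for fn in sorted(glob.glob(os.path.join(
        input_directory, '.auto-sklearn', 'predictions_test', '*.npy')))
]

results = evaluate(input_directory, validation_files, test_files,
                   ensemble_size=50)
print(results['ensemble_error'], results['ensemble_test_error'])
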
def test_automl_outputs(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_automl_outputs')
    self._setUp(output)
    name = '31_bac'
    dataset = os.path.join(self.test_dir, '..', '.data', name)
    data_manager_file = os.path.join(output, '.auto-sklearn',
                                     'datamanager.pkl')

    backend_api = backend.create(output, output)
    auto = autosklearn.automl.AutoML(
        backend_api, 15, 5,
        initial_configurations_via_metalearning=25,
        seed=100)
    auto.fit_automl_dataset(dataset)

    # Pickled data manager (without one hot encoding!)
    with open(data_manager_file, 'rb') as fh:
        D = six.moves.cPickle.load(fh)
        self.assertTrue(np.allclose(D.data['X_train'][0, :3],
                                    [1., 12., 2.]))

    # Check that all directories are there
    fixture = ['predictions_valid', 'true_targets_ensemble.npy',
               'start_time_100', 'datamanager.pkl',
               'predictions_ensemble', 'ensembles', 'predictions_test',
               'models']
    self.assertEqual(
        sorted(os.listdir(os.path.join(output, '.auto-sklearn'))),
        sorted(fixture))

    # At least one ensemble prediction, one validation prediction, one test
    # prediction, one model and one ensemble
    fixture = os.listdir(os.path.join(output, '.auto-sklearn',
                                      'predictions_ensemble'))
    self.assertIn('predictions_ensemble_100_00001.npy', fixture)

    fixture = os.listdir(os.path.join(output, '.auto-sklearn', 'models'))
    self.assertIn('100.1.model', fixture)

    fixture = os.listdir(os.path.join(output, '.auto-sklearn', 'ensembles'))
    self.assertIn('100.0000000000.ensemble', fixture)

    # Start time
    start_time_file_path = os.path.join(output, '.auto-sklearn',
                                        'start_time_100')
    with open(start_time_file_path, 'r') as fh:
        start_time = float(fh.read())
    self.assertGreaterEqual(time.time() - start_time, 10)

    del auto
    self._tearDown(output)

def test_eval_partial_cv(self):
    results = [
        0.071428571428571508,
        0.071428571428571508,
        0.08333333333333337,
        0.16666666666666674,
        0.0,
    ]
    for fold in range(5):
        backend_api = backend.create(self.tmp_dir, self.tmp_dir)
        eval_partial_cv(self.queue, self.configuration, self.data,
                        backend_api, 1, 1, fold, 5, None, True, False, True)
        info = self.queue.get()
        results.append(info[1])
        self.assertAlmostEqual(info[1], results[fold])
        self.assertEqual(info[2], 1)

def test_fit(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 20, 5)
    automl.fit(X_train, Y_train, metric=accuracy)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)

def build_automl(
    self,
    seed: int,
    ensemble_size: int,
    initial_configurations_via_metalearning: int,
    tmp_folder: str,
    output_folder: str,
    smac_scenario_args: Optional[Dict] = None,
):
    backend = create(
        temporary_directory=tmp_folder,
        output_directory=output_folder,
        delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
        delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
    )

    if smac_scenario_args is None:
        smac_scenario_args = self.smac_scenario_args

    automl = self._get_automl_class()(
        backend=backend,
        time_left_for_this_task=self.time_left_for_this_task,
        per_run_time_limit=self.per_run_time_limit,
        initial_configurations_via_metalearning=initial_configurations_via_metalearning,
        ensemble_size=ensemble_size,
        ensemble_nbest=self.ensemble_nbest,
        max_models_on_disc=self.max_models_on_disc,
        ensemble_memory_limit=self.ensemble_memory_limit,
        seed=seed,
        ml_memory_limit=self.ml_memory_limit,
        include_estimators=self.include_estimators,
        exclude_estimators=self.exclude_estimators,
        include_preprocessors=self.include_preprocessors,
        exclude_preprocessors=self.exclude_preprocessors,
        resampling_strategy=self.resampling_strategy,
        resampling_strategy_arguments=self.resampling_strategy_arguments,
        n_jobs=self._n_jobs,
        dask_client=self.dask_client,
        get_smac_object_callback=self.get_smac_object_callback,
        disable_evaluator_output=self.disable_evaluator_output,
        smac_scenario_args=smac_scenario_args,
        logging_config=self.logging_config,
        metadata_directory=self.metadata_directory,
        metric=self._metric,
    )

    return automl

def test_fit(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 15, 5)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)

def test_iterative_holdout_not_iterative(self, pipeline_mock):
    # Regular fitting
    D = get_binary_classification_datamanager()
    D.name = 'test'

    Xt_fixture = 'Xt_fixture'
    pipeline_mock.estimator_supports_iterative_fit.return_value = False
    pipeline_mock.fit_transformer.return_value = Xt_fixture, {}
    pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
        [0.6, 0.4], (len(X), 1))
    pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
    pipeline_mock.get_additional_run_info.return_value = None
    output_dir = os.path.join(os.getcwd(),
                              '.test_iterative_holdout_not_iterative')

    configuration = unittest.mock.Mock(spec=Configuration)
    backend_api = backend.create(output_dir, output_dir)
    backend_api.load_datamanager = lambda: D
    queue_ = multiprocessing.Queue()

    evaluator = TrainEvaluator(backend_api, queue_,
                               configuration=configuration,
                               resampling_strategy='holdout-iterative-fit',
                               all_scoring_functions=False,
                               output_y_hat_optimization=True,
                               metric=accuracy)
    evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
    evaluator.file_output.return_value = (None, None)

    evaluator.fit_predict_and_loss(iterative=True)
    self.assertEqual(evaluator.file_output.call_count, 1)

    rval = evaluator.queue.get(timeout=1)
    self.assertAlmostEqual(rval['loss'], 0.47826086956521741)
    self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

    self.assertEqual(pipeline_mock.iterative_fit.call_count, 0)
    # Three predict_proba calls because of the holdout, the validation and
    # the test set; the model is fit only once (no iterative fitting).
    self.assertEqual(evaluator.model.predict_proba.call_count, 3)
    self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 23)
    self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
                     D.data['Y_valid'].shape[0])
    self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
                     D.data['Y_test'].shape[0])
    self.assertEqual(evaluator.file_output.call_count, 1)
    self.assertEqual(evaluator.model.fit.call_count, 1)

def test_finish_up_model_predicts_NaN(self):
    '''Tests by handing in predictions which contain NaNs'''
    rs = np.random.RandomState(1)
    D = get_multiclass_classification_datamanager()
    output_dir = os.path.join(os.getcwd(),
                              '.test_finish_up_model_predicts_NaN')
    try:
        shutil.rmtree(output_dir)
    except:
        pass
    backend_api = backend.create(output_dir, output_dir)

    ae = AbstractEvaluator(Datamanager=D, backend=backend_api,
                           output_y_test=False)
    ae.Y_optimization = rs.rand(33, 3)
    predictions_ensemble = rs.rand(33, 3)
    predictions_test = rs.rand(25, 3)
    predictions_valid = rs.rand(25, 3)

    # NaNs in prediction ensemble
    predictions_ensemble[5, 2] = np.NaN
    _, loss, _, additional_run_info = ae.finish_up(
        0.1, predictions_ensemble, predictions_valid, predictions_test)
    self.assertEqual(loss, 2.0)
    self.assertEqual(additional_run_info, 'Model predictions for '
                                          'optimization set contains NaNs.')

    # NaNs in prediction validation
    predictions_ensemble[5, 2] = 0.5
    predictions_valid[5, 2] = np.NaN
    _, loss, _, additional_run_info = ae.finish_up(
        0.1, predictions_ensemble, predictions_valid, predictions_test)
    self.assertEqual(loss, 2.0)
    self.assertEqual(additional_run_info, 'Model predictions for '
                                          'validation set contains NaNs.')

    # NaNs in prediction test
    predictions_valid[5, 2] = 0.5
    predictions_test[5, 2] = np.NaN
    _, loss, _, additional_run_info = ae.finish_up(
        0.1, predictions_ensemble, predictions_valid, predictions_test)
    self.assertEqual(loss, 2.0)
    self.assertEqual(additional_run_info, 'Model predictions for '
                                          'test set contains NaNs.')

    self.assertEqual(len(os.listdir(os.path.join(output_dir,
                                                 '.auto-sklearn'))), 0)

def test_fit(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 30, 5)
    automl.fit(X_train, Y_train)
    # print(automl.show_models(), flush=True)
    # print(automl.cv_results_, flush=True)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)

def test_holdout(self, pipeline_mock):
    D = get_binary_classification_datamanager()
    D.name = 'test'

    pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
        [0.6, 0.4], (len(X), 1))
    pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
    pipeline_mock.get_additional_run_info.return_value = None
    output_dir = os.path.join(os.getcwd(), '.test_holdout')

    configuration = unittest.mock.Mock(spec=Configuration)
    backend_api = backend.create(output_dir, output_dir)
    backend_api.load_datamanager = lambda: D
    queue_ = multiprocessing.Queue()

    evaluator = TrainEvaluator(
        backend_api, queue_,
        configuration=configuration,
        resampling_strategy='holdout',
        resampling_strategy_args={'train_size': 0.66},
        all_scoring_functions=False,
        output_y_hat_optimization=True,
        metric=accuracy,
        subsample=50)
    evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
    evaluator.file_output.return_value = (None, None)
    evaluator.fit_predict_and_loss()

    rval = get_last_result(evaluator.queue)
    result = rval['loss']
    self.assertEqual(len(rval), 3)
    self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

    self.assertEqual(evaluator.file_output.call_count, 1)
    self.assertEqual(result, 0.45833333333333337)
    self.assertEqual(pipeline_mock.fit.call_count, 1)
    # three calls because of the holdout, the validation and the test set
    self.assertEqual(pipeline_mock.predict_proba.call_count, 3)
    self.assertEqual(evaluator.file_output.call_count, 1)
    self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], 24)
    self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
                     D.data['Y_valid'].shape[0])
    self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
                     D.data['Y_test'].shape[0])
    self.assertEqual(evaluator.model.fit.call_count, 1)

def test_cv(self, pipeline_mock):
    D = get_binary_classification_datamanager()

    pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
        [0.6, 0.4], (len(X), 1))
    pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
    pipeline_mock.get_additional_run_info.return_value = None
    output_dir = os.path.join(os.getcwd(), '.test_cv')

    configuration = unittest.mock.Mock(spec=Configuration)
    backend_api = backend.create(output_dir, output_dir)
    backend_api.load_datamanager = lambda: D
    queue_ = multiprocessing.Queue()

    evaluator = TrainEvaluator(backend_api, queue_,
                               configuration=configuration,
                               resampling_strategy='cv',
                               resampling_strategy_args={'folds': 5},
                               all_scoring_functions=False,
                               output_y_hat_optimization=True,
                               metric=accuracy)
    evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
    evaluator.file_output.return_value = (None, None)
    evaluator.fit_predict_and_loss()

    rval = get_last_result(evaluator.queue)
    result = rval['loss']
    self.assertEqual(len(rval), 3)
    self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

    self.assertEqual(evaluator.file_output.call_count, 1)
    self.assertEqual(result, 0.46376811594202894)
    self.assertEqual(pipeline_mock.fit.call_count, 5)
    # Fifteen calls: five folds, each predicting on the holdout, the
    # validation and the test set.
    self.assertEqual(pipeline_mock.predict_proba.call_count, 15)
    self.assertEqual(evaluator.file_output.call_args[0][0].shape[0],
                     D.data['Y_train'].shape[0])
    self.assertEqual(evaluator.file_output.call_args[0][1].shape[0],
                     D.data['Y_valid'].shape[0])
    self.assertEqual(evaluator.file_output.call_args[0][2].shape[0],
                     D.data['Y_test'].shape[0])
    # The model prior to fitting is saved, this cannot be directly tested
    # because of the way the mock module is used. Instead, we test whether
    # the if block in which model assignment is done is accessed
    self.assertTrue(evaluator._added_empty_model)

def _create_backend(self, test_name, delete_tmp_folder_after_terminate=True,
                    delete_output_folder_after_terminate=True):
    tmp = os.path.join(self.test_dir, '..', '.tmp._%s' % test_name)
    output = os.path.join(self.test_dir, '..', '.output._%s' % test_name)
    # Make sure the folders we wanna create do not already exist.
    self._setUp(tmp)
    self._setUp(output)
    backend = create(
        tmp,
        output,
        delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate,
        delete_output_folder_after_terminate=delete_output_folder_after_terminate,
    )
    return backend

def get_meta_learning_configs(X, y, task_type, dataset_name='default',
                              metric='accuracy', num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs

def test_fit(self):
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    output = os.path.join(self.test_dir, '..', '.tmp_test_fit')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(backend_api, 15, 15)
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)

def test_fit_roar(self):
    output = os.path.join(self.test_dir, '..', '.tmp_test_fit_roar')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    backend_api = backend.create(output, output)
    automl = autosklearn.automl.AutoML(
        backend_api, 20, 5,
        initial_configurations_via_metalearning=0,
        configuration_mode='ROAR')
    automl.fit(X_train, Y_train, metric=accuracy)
    # print(automl.show_models(), flush=True)
    # print(automl.cv_results_, flush=True)
    score = automl.score(X_test, Y_test)
    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)

def build_automl(self):
    if self.shared_mode:
        self.delete_output_folder_after_terminate = False
        self.delete_tmp_folder_after_terminate = False
        if self.tmp_folder is None:
            raise ValueError("If shared_mode == True tmp_folder must not "
                             "be None.")
        if self.output_folder is None:
            raise ValueError("If shared_mode == True output_folder must "
                             "not be None.")

    backend = create(temporary_directory=self.tmp_folder,
                     output_directory=self.output_folder,
                     delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
                     delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
                     shared_mode=self.shared_mode)

    automl = self._get_automl_class()(
        backend=backend,
        time_left_for_this_task=self.time_left_for_this_task,
        per_run_time_limit=self.per_run_time_limit,
        initial_configurations_via_metalearning=self.initial_configurations_via_metalearning,
        ensemble_size=self.ensemble_size,
        ensemble_nbest=self.ensemble_nbest,
        seed=self.seed,
        ml_memory_limit=self.ml_memory_limit,
        include_estimators=self.include_estimators,
        exclude_estimators=self.exclude_estimators,
        include_preprocessors=self.include_preprocessors,
        exclude_preprocessors=self.exclude_preprocessors,
        resampling_strategy=self.resampling_strategy,
        resampling_strategy_arguments=self.resampling_strategy_arguments,
        shared_mode=self.shared_mode,
        get_smac_object_callback=self.get_smac_object_callback,
        disable_evaluator_output=self.disable_evaluator_output,
        smac_scenario_args=self.smac_scenario_args,
        # Enable the positive-direction arguments
        direction_args=self.direction_args,
    )
    return automl

def build_automl(self):
    if self.shared_mode:
        self.delete_output_folder_after_terminate = False
        self.delete_tmp_folder_after_terminate = False
        if self.tmp_folder is None:
            raise ValueError("If shared_mode == True tmp_folder must not "
                             "be None.")
        if self.output_folder is None:
            raise ValueError("If shared_mode == True output_folder must "
                             "not be None.")

    backend = create(temporary_directory=self.tmp_folder,
                     output_directory=self.output_folder,
                     delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
                     delete_output_folder_after_terminate=self.delete_output_folder_after_terminate)

    automl = autosklearn.automl.AutoML(
        backend=backend,
        time_left_for_this_task=self.time_left_for_this_task,
        per_run_time_limit=self.per_run_time_limit,
        log_dir=backend.temporary_directory,
        initial_configurations_via_metalearning=self.initial_configurations_via_metalearning,
        ensemble_size=self.ensemble_size,
        ensemble_nbest=self.ensemble_nbest,
        seed=self.seed,
        ml_memory_limit=self.ml_memory_limit,
        include_estimators=self.include_estimators,
        exclude_estimators=self.exclude_estimators,
        include_preprocessors=self.include_preprocessors,
        exclude_preprocessors=self.exclude_preprocessors,
        resampling_strategy=self.resampling_strategy,
        resampling_strategy_arguments=self.resampling_strategy_arguments,
        delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
        delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
        shared_mode=self.shared_mode,
        configuration_mode=self.configuration_mode,
        disable_evaluator_output=self.disable_evaluator_output)
    return automl

def backend(request):
    test_dir = os.path.dirname(__file__)
    tmp = os.path.join(
        test_dir, '.tmp__%s__%s' % (request.module.__name__,
                                    request.node.name))
    output = os.path.join(
        test_dir, '.output__%s__%s' % (request.module.__name__,
                                       request.node.name))

    for dir in (tmp, output):
        for i in range(10):
            if os.path.exists(dir):
                try:
                    shutil.rmtree(dir)
                    break
                except OSError:
                    time.sleep(1)

    # Make sure the folders we wanna create do not already exist.
    backend = create(
        tmp,
        output,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
    )

    def get_finalizer(tmp_dir, output_dir):
        def session_run_at_end():
            for dir in (tmp_dir, output_dir):
                for i in range(10):
                    if os.path.exists(dir):
                        try:
                            shutil.rmtree(dir)
                            break
                        except OSError:
                            time.sleep(1)
        return session_run_at_end

    request.addfinalizer(get_finalizer(tmp, output))

    return backend

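# A minimal usage sketch, assuming the backend() function above is registered
# as a pytest fixture (the @pytest.fixture decorator is not shown in this
# excerpt). A test then receives a fresh, self-cleaning Backend simply by
# naming the fixture as a parameter; the assertions below rely only on
# attributes used elsewhere in this file (temporary_directory,
# _make_internals_directory). The test name is illustrative.
def test_backend_has_temporary_directory(backend):
    backend._make_internals_directory()
    assert os.path.exists(backend.temporary_directory)
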
def test_partial_cv(self, pipeline_mock):
    D = get_binary_classification_datamanager()
    kfold = StratifiedKFold(random_state=1, n_splits=5, shuffle=True)

    pipeline_mock.predict_proba.side_effect = lambda X, batch_size: np.tile(
        [0.6, 0.4], (len(X), 1))
    pipeline_mock.side_effect = lambda **kwargs: pipeline_mock
    output_dir = os.path.join(os.getcwd(), '.test_partial_cv')
    D = get_binary_classification_datamanager()
    D.name = 'test'

    configuration = unittest.mock.Mock(spec=Configuration)
    backend_api = backend.create(output_dir, output_dir)
    queue_ = multiprocessing.Queue()

    evaluator = TrainEvaluator(D, backend_api, queue_,
                               configuration=configuration,
                               cv=kfold,
                               all_scoring_functions=False,
                               output_y_hat_optimization=True,
                               metric=accuracy)

    evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output)
    evaluator.file_output.return_value = (None, None)

    evaluator.partial_fit_predict_and_loss(1)

    rval = evaluator.queue.get(timeout=1)
    self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1)

    self.assertEqual(evaluator.file_output.call_count, 0)
    self.assertEqual(rval['loss'], 0.46666666666666667)
    self.assertEqual(pipeline_mock.fit.call_count, 1)
    self.assertEqual(pipeline_mock.predict_proba.call_count, 3)
    # The model prior to fitting is saved, this cannot be directly tested
    # because of the way the mock module is used. Instead, we test whether
    # the if block in which model assignment is done is accessed
    self.assertTrue(evaluator._added_empty_model)

def build_automl(self):
    if self.shared_mode:
        self.delete_output_folder_after_terminate = False
        self.delete_tmp_folder_after_terminate = False
        if self.tmp_folder is None:
            raise ValueError("If shared_mode == True tmp_folder must not "
                             "be None.")
        if self.output_folder is None:
            raise ValueError("If shared_mode == True output_folder must "
                             "not be None.")

    backend = create(temporary_directory=self.tmp_folder,
                     output_directory=self.output_folder,
                     delete_tmp_folder_after_terminate=self.delete_tmp_folder_after_terminate,
                     delete_output_folder_after_terminate=self.delete_output_folder_after_terminate,
                     shared_mode=self.shared_mode)

    automl = self._get_automl_class()(
        backend=backend,
        time_left_for_this_task=self.time_left_for_this_task,
        per_run_time_limit=self.per_run_time_limit,
        initial_configurations_via_metalearning=self.initial_configurations_via_metalearning,
        ensemble_size=self.ensemble_size,
        ensemble_nbest=self.ensemble_nbest,
        seed=self.seed,
        ml_memory_limit=self.ml_memory_limit,
        include_estimators=self.include_estimators,
        exclude_estimators=self.exclude_estimators,
        include_preprocessors=self.include_preprocessors,
        exclude_preprocessors=self.exclude_preprocessors,
        resampling_strategy=self.resampling_strategy,
        resampling_strategy_arguments=self.resampling_strategy_arguments,
        shared_mode=self.shared_mode,
        get_smac_object_callback=self.get_smac_object_callback,
        disable_evaluator_output=self.disable_evaluator_output,
        smac_scenario_args=self.smac_scenario_args,
    )
    return automl