def test_fit_pSMAC(self):
    """Two pSMAC runs share one directory; the second run's ensemble
    builder must pick up a planted near-perfect dummy model."""
    output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    # First classifier populates the shared tmp/output directory.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        output_folder=output,
        tmp_folder=output,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct
    true_targets_ensemble_path = os.path.join(
        output, '.auto-sklearn', 'true_targets_ensemble.npy')
    true_targets_ensemble = np.load(true_targets_ensemble_path)
    # Flip the last target so the dummy is not a perfect 100% match.
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0

    # One-hot probabilities that reproduce the (almost) true targets.
    probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
    for row, label in enumerate(true_targets_ensemble):
        probas[row, label] = 1.0
    dummy_predictions_path = os.path.join(
        output, '.auto-sklearn', 'predictions_ensemble',
        'predictions_ensemble_1_00030.npy')
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 3), dtype=float)
    for row, label in enumerate(Y_test):
        probas_test[row, label] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    backend = Backend(output, output)
    backend.save_model(dummy, 30, 1)

    # Second classifier (different seed) reuses the same directory.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        output_folder=output,
        tmp_folder=output,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)
    automl.run_ensemble_builder(0, 1, 50).wait()

    score = automl.score(X_test, Y_test)

    self.assertEqual(
        len(os.listdir(os.path.join(output, '.auto-sklearn',
                                    'ensemble_indices'))),
        1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)
def test_fit_pSMAC(self):
    """Shared-mode (pSMAC) round trip: a second classifier run in the same
    directory must build an ensemble that sees a planted dummy model."""
    output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
    self._setUp(output)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    # Run 1: fill the shared directory with predictions/targets.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        output_folder=output,
        tmp_folder=output,
        shared_mode=True,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)

    # Create a 'dummy model' for the first run, which has an accuracy of
    # more than 99%; it should be in the final ensemble if the ensemble
    # building of the second AutoSklearn classifier works correct
    true_targets_ensemble_path = os.path.join(
        output, '.auto-sklearn', 'true_targets_ensemble.npy')
    true_targets_ensemble = np.load(true_targets_ensemble_path)
    # Perturb the final label so the dummy is only *almost* perfect.
    true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0

    probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
    for idx, label in enumerate(true_targets_ensemble):
        probas[idx, label] = 1.0
    dummy_predictions_path = os.path.join(
        output, '.auto-sklearn', 'predictions_ensemble',
        'predictions_ensemble_1_00030.npy')
    with open(dummy_predictions_path, 'wb') as fh:
        np.save(fh, probas)

    probas_test = np.zeros((len(Y_test), 3), dtype=float)
    for idx, label in enumerate(Y_test):
        probas_test[idx, label] = 1.0

    dummy = ArrayReturningDummyPredictor(probas_test)
    backend = Backend(output, output)
    backend.save_model(dummy, 30, 1)

    # Run 2: new seed, same shared directory.
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        output_folder=output,
        tmp_folder=output,
        shared_mode=True,
        seed=2,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    automl.fit(X_train, Y_train)
    automl.run_ensemble_builder(0, 1, 50).wait()

    score = automl.score(X_test, Y_test)

    self.assertEqual(
        len(os.listdir(os.path.join(output, '.auto-sklearn', 'ensembles'))),
        1)
    self.assertGreaterEqual(score, 0.90)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(output)
def test_classification_pandas_support(self):
    """Fit on a pandas DataFrame (mixed cat/num columns) and check that
    scoring, refitting, and prediction all beat the majority-class rate.

    Fixes:
    - ``X.dropna('columns')`` passed ``axis`` positionally, which was
      deprecated in pandas 1.0 and removed in pandas 2.0; pass it as a
      keyword instead.
    - ``assertTrue(a > b)`` / ``assertTrue(isinstance(...))`` replaced
      with ``assertGreater`` / ``assertIsInstance`` so failures report
      the actual values instead of just ``False is not true``.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna(axis='columns')

    # This test only make sense if input is dataframe
    self.assertIsInstance(X, pd.DataFrame)
    self.assertIsInstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        seed=5,
    )

    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    self.assertGreater(automl.score(X, y), 0.555)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl._automl[0].InputValidator.encode_target(y)
    prediction = automl._automl[0].InputValidator.encode_target(
        automl.predict(X))
    self.assertGreater(accuracy(y, prediction), 0.555)
def test_fit(self):
    """Smoke test: fit on iris, require a decent score and a multiclass task."""
    work_dir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(work_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=5,
        tmp_folder=work_dir,
        output_folder=work_dir,
    )
    automl.fit(X_train, Y_train)

    self.assertGreaterEqual(automl.score(X_test, Y_test), 0.8)
    self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(work_dir)
def test_fit(self):
    """Smoke test: fit on iris, print the models found, and check score/task."""
    run_dir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(run_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=5,
        tmp_folder=run_dir,
        output_folder=run_dir,
    )
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)

    # Debugging aid: show which models ended up in the ensemble.
    print(automl.show_models())

    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(run_dir)
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit on a pandas DataFrame (mixed cat/num columns) and verify score,
    refit/predict, successful runs, and plausible performance-over-time.

    Fix: ``X.dropna('columns')`` passed ``axis`` positionally, which was
    deprecated in pandas 1.0 and removed in pandas 2.0; pass it as a
    keyword instead.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna(axis='columns')

    # This test only make sense if input is dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit on a pandas DataFrame (mixed cat/num columns); on a weak score,
    surface the AutoML log to aid debugging.

    Fix: ``X.dropna('columns')`` passed ``axis`` positionally, which was
    deprecated in pandas 1.0 and removed in pandas 2.0; pass it as a
    keyword instead.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna(axis='columns')

    # This test only make sense if input is dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    # Attach the AutoML log contents to the assertion message on failure.
    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, extract_msg_from_log(log_file_path)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
def test_fit(self):
    """Fit on iris (skipped on travis-ci) and check score and task type."""
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    work_dir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(work_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        tmp_folder=work_dir,
        output_folder=work_dir,
    )
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)

    # Debugging aid: show which models ended up in the ensemble.
    print(automl.show_models())

    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(work_dir)
def test_fit(self):
    """Fit on iris (skipped on travis-ci); check score and the wrapped task."""
    if self.travis:
        self.skipTest('This test does currently not run on travis-ci. '
                      'Make sure it runs locally on your machine!')

    run_dir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
    self._setUp(run_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        tmp_folder=run_dir,
        output_folder=run_dir,
    )
    automl.fit(X_train, Y_train)
    score = automl.score(X_test, Y_test)

    # Debugging aid: show which models ended up in the ensemble.
    print(automl.show_models())

    self.assertGreaterEqual(score, 0.8)
    self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

    del automl
    self._tearDown(run_dir)