def test_regression_pandas_support(self):
    """Fit, score and refit an ``AutoSklearnRegressor`` on pandas input.

    Fetches OpenML dataset 41514 as a DataFrame/Series pair, fits the
    regressor directly on it and requires a score strictly above 0.5 both
    after ``fit`` (via ``automl.score``) and after ``refit`` (via ``r2`` on
    the predictions). Training data is reused for evaluation on purpose.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # This test only makes sense if the input really is pandas data.
    # assertIsInstance reports the offending type on failure.
    self.assertIsInstance(X, pd.DataFrame)
    self.assertIsInstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
    )

    # fit is expected to succeed on the raw pandas objects; every
    # assertion below relies on a fitted model.
    automl.fit(X, y)

    # Make sure that we are at least better than random.
    # We use the same X_train == X_test to test code quality.
    # assertGreater shows the actual score when the check fails.
    self.assertGreater(automl.score(X, y), 0.5)

    automl.refit(X, y)

    # Make sure that we are at least better than random after refitting.
    self.assertGreater(r2(y, automl.predict(X)), 0.5)
def test_cv_regression(self):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor.

    The regressor is trained on the 'boston' dataset with
    ``resampling_strategy='cv'``; the test then checks the prediction
    shape, an r2 score of at least 0.1 and that ``_count_succeses``
    reports at least one entry for ``cv_results_``.
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit_cv')
    output = os.path.join(self.test_dir, '..', '.out_regression_fit_cv')
    # NOTE(review): _setUp/_tearDown are defined outside this block;
    # presumably they prepare and remove the given directory - confirm.
    self._setUp(tmp)
    self._setUp(output)

    try:
        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'boston', train_size_maximum=300)
        automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                      per_run_time_limit=10,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (206, ))
        score = r2(Y_test, predictions)
        # Printed so the values are available in the captured output
        # when the score assertion below fails.
        print(Y_test)
        print(predictions)
        self.assertGreaterEqual(score, 0.1)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)
    finally:
        # Always clean up, even when fitting or an assertion fails;
        # the nested finally guarantees the second tear-down still runs
        # if the first one raises.
        try:
            self._tearDown(tmp)
        finally:
            self._tearDown(output)
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor.

    Trains on the 'boston' dataset with ``resampling_strategy='cv'`` and
    checks the prediction shape, an r2 score of at least 0.1 and that
    ``count_succeses`` reports at least one entry for ``cv_results_``.
    Failing score/success assertions carry the message extracted from the
    AutoML log file found in ``tmp_dir``.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                  per_run_time_limit=10,
                                  resampling_strategy='cv',
                                  tmp_folder=tmp_dir,
                                  dask_client=dask_client,
                                  output_folder=output_dir)

    automl.fit(X_train, Y_train)

    # Log file path. Fail with an explicit message instead of a bare
    # IndexError when no log file matches the pattern.
    log_files = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))
    assert log_files, "No 'AutoML*.log' file found in {}".format(tmp_dir)
    log_file_path = log_files[0]

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )
    score = r2(Y_test, predictions)
    assert score >= 0.1, extract_msg_from_log(log_file_path)
    assert count_succeses(
        automl.cv_results_) > 0, extract_msg_from_log(log_file_path)
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score and refit an ``AutoSklearnRegressor`` on pandas input.

    OpenML dataset 41514 is loaded as a DataFrame/Series pair and passed
    straight to the regressor. The score after ``fit`` must be at least
    0.5, the r2 of the predictions after ``refit`` must exceed 0.5, and
    ``count_succeses`` must report at least one entry for ``cv_results_``.
    """
    features, target = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The scenario is only meaningful when the data really is pandas.
    assert isinstance(features, pd.DataFrame)
    assert isinstance(target, pd.Series)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    # Fitting on the raw pandas objects is expected to succeed; the
    # checks below all rely on a fitted model.
    regressor.fit(features, target)

    # Sanity threshold: do at least better than random. Training data is
    # deliberately reused for evaluation to exercise the code path.
    fit_score = regressor.score(features, target)
    assert fit_score >= 0.5, print_debug_information(regressor)

    regressor.refit(features, target)

    # Same sanity threshold after refitting, this time via r2 directly.
    refit_score = r2(target, regressor.predict(features))
    assert refit_score > 0.5, print_debug_information(regressor)

    n_success = count_succeses(regressor.cv_results_)
    assert n_success > 0, print_debug_information(regressor)
def test_cv_regression(tmp_dir, dask_client):
    """
    Checks that a regressor can be fitted when the resampling strategy
    is 'cv'.

    Besides prediction shape, r2 score (>= 0.1) and the number of
    successes reported for ``cv_results_``, the test also validates
    ``performance_over_time_`` through ``includes_train_scores`` and
    ``performance_over_time_is_plausible``.
    """
    dataset = putil.get_dataset('boston', train_size_maximum=300)
    X_train, Y_train, X_test, Y_test = dataset

    settings = dict(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )
    regressor = AutoSklearnRegressor(**settings)
    regressor.fit(X_train, Y_train)

    predicted = regressor.predict(X_test)
    assert predicted.shape == (206, )

    test_r2 = r2(Y_test, predicted)
    assert test_r2 >= 0.1, print_debug_information(regressor)

    n_success = count_succeses(regressor.cv_results_)
    assert n_success > 0, print_debug_information(regressor)

    # Both helpers must return exactly True, not merely a truthy value.
    has_train_scores = includes_train_scores(
        regressor.performance_over_time_.columns)
    assert has_train_scores is True

    is_plausible = performance_over_time_is_plausible(
        regressor.performance_over_time_)
    assert is_plausible is True