def regression(self, metric="r2"): """ Perform auto_regression. Args: metric (str): The evaluation metric of regression. This will be mapped by AutoSklearnML.get_regression_metric to an instance of :class:`autosklearn.metrics.Scorer` as created by :meth:`autosklearn.metrics.make_scorer`. Default metric: "r2". Other supported metrics: "mean_squared_error", "mean_absolute_error", "median_absolute_error" Returns: """ auto_regressor = AutoSklearnRegressor(**self.auto_sklearn_kwargs) regression_metric = AutoSklearnML.get_regression_metric(metric) auto_regressor.fit(self._X_train, self._y_train, metric=regression_metric, dataset_name=self.dataset_name) print(auto_regressor.show_models()) if self.auto_sklearn_kwargs["resampling_strategy"] == "cv": auto_regressor.refit(self._X_train.copy(), self._y_train.copy()) prediction_train = auto_regressor.predict(self._X_train) print("training set {} score: {}".format( metric, regression_metric._score_func(self._y_train, prediction_train))) prediction_test = auto_regressor.predict(self._X_test) print("test set {} score: {}".format( metric, regression_metric._score_func(self._y_test, prediction_test))) with open( os.path.join(self.auto_sklearn_kwargs['output_folder'], 'best_auto_sklearn_output.log'), 'a+') as wf: wf.write('The best model is : \n') wf.write(auto_regressor.show_models()) wf.write("\ntraining set {} score: {}\n".format( metric, regression_metric._score_func(self._y_train, prediction_train))) wf.write('\n') wf.write("test set {} score: {}".format( metric, regression_metric._score_func(self._y_test, prediction_test))) dump_file = os.path.join(self.auto_sklearn_kwargs['output_folder'], 'automl_regressor.dump.pkl') with open(dump_file, 'wb') as f: pickle.dump(auto_regressor, f) return auto_regressor
def test_regression_pandas_support(tmp_dir, dask_client): X, y = sklearn.datasets.fetch_openml( data_id=41514, # diabetes return_X_y=True, as_frame=True, ) # This test only make sense if input is dataframe assert isinstance(X, pd.DataFrame) assert isinstance(y, pd.Series) automl = AutoSklearnRegressor( time_left_for_this_task=40, per_run_time_limit=5, dask_client=dask_client, tmp_folder=tmp_dir, ) # Make sure we error out because y is not encoded automl.fit(X, y) # Make sure that at least better than random. # We use same X_train==X_test to test code quality assert automl.score(X, y) >= 0.5, print_debug_information(automl) automl.refit(X, y) # Make sure that at least better than random. assert r2(y, automl.predict(X)) > 0.5, print_debug_information(automl) assert count_succeses( automl.cv_results_) > 0, print_debug_information(automl)
def test_cv_regression(tmp_dir, dask_client): """ Makes sure that when using a cv strategy, we are able to fit a regressor """ X_train, Y_train, X_test, Y_test = putil.get_dataset( 'boston', train_size_maximum=300) automl = AutoSklearnRegressor( time_left_for_this_task=60, per_run_time_limit=10, resampling_strategy='cv', tmp_folder=tmp_dir, dask_client=dask_client, ) automl.fit(X_train, Y_train) predictions = automl.predict(X_test) assert predictions.shape == (206, ) score = r2(Y_test, predictions) assert score >= 0.1, print_debug_information(automl) assert count_succeses( automl.cv_results_) > 0, print_debug_information(automl) assert includes_train_scores(automl.performance_over_time_.columns) is True assert performance_over_time_is_plausible( automl.performance_over_time_) is True
def test_regression(tmp_dir, dask_client): X_train, Y_train, X_test, Y_test = putil.get_dataset('boston') automl = AutoSklearnRegressor( time_left_for_this_task=30, per_run_time_limit=5, tmp_folder=tmp_dir, dask_client=dask_client, ) automl.fit(X_train, Y_train) predictions = automl.predict(X_test) assert predictions.shape == (356, ) score = mean_squared_error(Y_test, predictions) # On average np.sqrt(30) away from the target -> ~5.5 on average # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds # constraint. With more time_left_for_this_task this is no longer an issue assert score >= -37, print_debug_information(automl) assert count_succeses(automl.cv_results_) > 0