def test_cv_regression(tmp_dir, dask_client):
    """
    Check that a regressor can be fitted with the 'cv' resampling strategy.

    The regressor is trained on the 'boston' dataset returned by
    ``putil.get_dataset`` (requested with ``train_size_maximum=300``) and
    must produce 206 test predictions with an r2 score of at least 0.1,
    at least one successful run and a plausible performance history that
    includes train scores.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    estimator_kwargs = dict(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )
    regressor = AutoSklearnRegressor(**estimator_kwargs)
    regressor.fit(X_train, Y_train)

    y_hat = regressor.predict(X_test)
    assert y_hat.shape == (206, )

    # The debug information is only built when an assertion fails.
    test_r2 = r2(Y_test, y_hat)
    assert test_r2 >= 0.1, print_debug_information(regressor)

    n_successful_runs = count_succeses(regressor.cv_results_)
    assert n_successful_runs > 0, print_debug_information(regressor)

    has_train_scores = includes_train_scores(
        regressor.performance_over_time_.columns)
    assert has_train_scores is True

    history_is_plausible = performance_over_time_is_plausible(
        regressor.performance_over_time_)
    assert history_is_plausible is True
def train_model(data, target_label, duration, regressor=True):
    """Fit an auto-sklearn model on the preprocessed "smiles" column.

    NOTE(review): ``score`` and ``self`` are not parameters of this
    function; they are presumably captured from an enclosing scope
    (e.g. this is a nested helper inside a method) -- confirm against the
    surrounding code.

    Args:
        data: Object exposing ``to_df()``; the resulting frame must provide
            a ``"smiles"`` column and a ``target_label`` column.
        target_label: Name of the column used as the training target.
        duration: Forwarded as ``time_left_for_this_task``.
        regressor: When true an ``AutoSklearnRegressor`` is built,
            otherwise an ``AutoSklearnClassifier``.

    Returns:
        The fitted auto-sklearn model.
    """
    frame = data.to_df()

    estimator_cls = AutoSklearnRegressor if regressor else AutoSklearnClassifier
    model = estimator_cls(
        time_left_for_this_task=duration,
        memory_limit=9216,
    )

    features = self.preprocessor.transform(frame["smiles"])
    targets = frame[target_label]
    if score:
        # Hold out 15% of the rows (fixed seed) to report a validation score.
        Xt, Xv, yt, yv = train_test_split(
            features,
            targets,
            test_size=0.15,
            random_state=18,
        )
    else:
        Xt, yt = features, targets

    model.fit(Xt, yt)

    if score:
        print(f"Score on {target_label}: {model.score(Xv, yv)}")
    return model
def regression(self, metric="r2"):
    """
    Perform auto_regression.

    Builds an ``AutoSklearnRegressor`` from ``self.auto_sklearn_kwargs``,
    fits it on the training data, prints and logs the train/test scores
    and pickles the fitted regressor into the configured output folder.

    Args:
        metric (str): The evaluation metric of regression.
            This will be mapped by AutoSklearnML.get_regression_metric
            to an instance of :class:`autosklearn.metrics.Scorer` as
            created by :meth:`autosklearn.metrics.make_scorer`.
            Default metric: "r2".
            Other supported metrics: "mean_squared_error",
                                     "mean_absolute_error",
                                     "median_absolute_error"

    Returns:
        The fitted ``AutoSklearnRegressor`` (additionally refitted on the
        training data when ``resampling_strategy`` is ``"cv"``).

    Raises:
        KeyError: If ``self.auto_sklearn_kwargs`` has no
            ``'output_folder'`` entry.
    """
    # Resolve the output location up front so that a missing key fails
    # before the time budget is spent on fitting, not after.
    output_folder = self.auto_sklearn_kwargs['output_folder']

    auto_regressor = AutoSklearnRegressor(**self.auto_sklearn_kwargs)
    regression_metric = AutoSklearnML.get_regression_metric(metric)
    auto_regressor.fit(self._X_train, self._y_train,
                       metric=regression_metric,
                       dataset_name=self.dataset_name)
    print(auto_regressor.show_models())

    # "resampling_strategy" may be absent from the kwargs (the estimator is
    # then built without it), so use .get() instead of indexing to avoid a
    # KeyError once the fit has already completed.
    if self.auto_sklearn_kwargs.get("resampling_strategy") == "cv":
        auto_regressor.refit(self._X_train.copy(), self._y_train.copy())

    # NOTE(review): this relies on the scorer's private ``_score_func``
    # attribute, presumably to get the raw metric value -- confirm.
    prediction_train = auto_regressor.predict(self._X_train)
    train_score = regression_metric._score_func(self._y_train,
                                                prediction_train)
    print("training set {} score: {}".format(metric, train_score))

    prediction_test = auto_regressor.predict(self._X_test)
    test_score = regression_metric._score_func(self._y_test,
                                               prediction_test)
    print("test set {} score: {}".format(metric, test_score))

    # Append to the log so results of earlier runs are preserved.
    log_file = os.path.join(output_folder, 'best_auto_sklearn_output.log')
    with open(log_file, 'a+') as wf:
        wf.write('The best model is : \n')
        wf.write(auto_regressor.show_models())
        wf.write("\ntraining set {} score: {}\n".format(metric, train_score))
        wf.write('\n')
        wf.write("test set {} score: {}".format(metric, test_score))

    dump_file = os.path.join(output_folder, 'automl_regressor.dump.pkl')
    with open(dump_file, 'wb') as f:
        pickle.dump(auto_regressor, f)

    return auto_regressor
def test_regression_pandas_support(tmp_dir, dask_client):
    """
    Check that the regressor can be fitted, scored and refitted directly
    on a pandas DataFrame / Series pair fetched from OpenML.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the inputs really are pandas objects
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
    )

    # Fit directly on the pandas inputs
    regressor.fit(X, y)

    # The model must be at least better than random.
    # The training data is reused for scoring on purpose (X_train == X_test)
    in_sample_score = regressor.score(X, y)
    assert in_sample_score >= 0.5, print_debug_information(regressor)

    regressor.refit(X, y)

    # After the refit the model must still be better than random.
    refit_r2 = r2(y, regressor.predict(X))
    assert refit_r2 > 0.5, print_debug_information(regressor)

    n_successful_runs = count_succeses(regressor.cv_results_)
    assert n_successful_runs > 0, print_debug_information(regressor)
def test_autosklearn_regression_methods_returns_self(dask_client):
    """
    Check that ``fit``, ``fit_ensemble`` and ``refit`` each return the
    estimator instance they were called on.
    """
    # Only the training split is needed here; the test split is ignored.
    X_train, y_train, _X_test, _y_test = putil.get_dataset('boston')

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        dask_client=dask_client,
        ensemble_size=0,
    )

    assert regressor is regressor.fit(X_train, y_train)

    assert regressor is regressor.fit_ensemble(y_train, ensemble_size=5)

    assert regressor is regressor.refit(X_train.copy(), y_train.copy())
def test_regression(tmp_dir, dask_client):
    """
    Check that a regressor with default resampling can be fitted on the
    'boston' dataset and reaches an acceptable mean squared error score.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )
    score = mean_squared_error(Y_test, predictions)

    # On average np.sqrt(30) away from the target -> ~5.5 on average
    # Results with select rates drops avg score to a range of -32.40 to -37,
    # on 30 seconds constraint. With more time_left_for_this_task this is
    # no longer an issue
    assert score >= -37, print_debug_information(automl)
    # Attach the debug information here as well, so a run without any
    # successful model is as easy to diagnose as a bad score.
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
def test_type_of_target(mock_estimator):
    """
    Check which target types the classifier and the regressor accept.

    Illegal target types must make ``fit`` raise a ``ValueError`` with the
    matching message; legal target types must not raise ``ValueError``.

    NOTE(review): ``mock_estimator`` is not referenced in the body; it is
    presumably a fixture requested for its patching side effects -- confirm.
    """
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])

    # One sample target per possible target type
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    # ---- Classifier -------------------------------------------------------
    classifier = AutoSklearnClassifier(ensemble_size=0)
    classifier.automl_ = unittest.mock.Mock()
    classifier.automl_.InputValidator = unittest.mock.Mock()
    classifier.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal for classification: multiclass-multioutput, continuous,
    # continuous-multioutput.
    classifier_rejects = (
        (y_multiclass_multioutput,
         r".*Classification with data of type"
         r" multiclass-multioutput is not supported.*"),
        (y_continuous,
         r".*Classification with data of type"
         r" continuous is not supported.*"),
        (y_continuous_multioutput,
         r".*Classification with data of type"
         r" continuous-multioutput is not supported.*"),
    )
    for illegal_target, expected_msg in classifier_rejects:
        with pytest.raises(ValueError, match=expected_msg):
            classifier.fit(X=X, y=illegal_target)

    # Legal for classification: binary, multiclass, multilabel-indicator.
    classifier_accepts = (
        (y_binary,
         "cls.fit() raised ValueError while fitting binary targets"),
        (y_multiclass,
         "cls.fit() raised ValueError while fitting multiclass targets"),
        (y_multilabel,
         "cls.fit() raised ValueError while fitting "
         "multilabel-indicator targets"),
    )
    for legal_target, failure_msg in classifier_accepts:
        try:
            classifier.fit(X, legal_target)
        except ValueError:
            pytest.fail(failure_msg)

    # ---- Regressor --------------------------------------------------------
    regressor = AutoSklearnRegressor(ensemble_size=0)

    # Illegal for regression: multilabel-indicator, multiclass-multioutput.
    regressor_rejects = (
        (y_multilabel,
         r".*Regression with data of type"
         r" multilabel-indicator is not supported.*"),
        (y_multiclass_multioutput,
         r".*Regression with data of type"
         r" multiclass-multioutput is not supported.*"),
    )
    for illegal_target, expected_msg in regressor_rejects:
        with pytest.raises(ValueError, match=expected_msg):
            regressor.fit(
                X=X,
                y=illegal_target,
            )

    # Legal for regression: continuous, multiclass, continuous-multioutput,
    # binary.
    regressor_accepts = (
        (y_continuous,
         "reg.fit() raised ValueError while fitting continuous targets"),
        (y_multiclass,
         "reg.fit() raised ValueError while fitting multiclass targets"),
        (y_continuous_multioutput,
         "reg.fit() raised ValueError while fitting "
         "continuous_multioutput targets"),
        (y_binary,
         "reg.fit() raised ValueError while fitting binary targets"),
    )
    for legal_target, failure_msg in regressor_accepts:
        try:
            regressor.fit(X, legal_target)
        except ValueError:
            pytest.fail(failure_msg)