Beispiel #1
0
    def regression(self, metric="r2"):
        """
        Fit an auto-sklearn regressor, report its scores and persist it.

        An ``AutoSklearnRegressor`` is built from ``self.auto_sklearn_kwargs``
        and fitted on the training split. The train and test scores are
        printed, appended to ``best_auto_sklearn_output.log`` and the fitted
        regressor is pickled to ``automl_regressor.dump.pkl``; both files are
        placed in ``self.auto_sklearn_kwargs['output_folder']``.

        Args:
            metric (str): The evaluation metric of regression.
                 This will be mapped by AutoSklearnML.get_regression_metric
                 to an instance of :class:`autosklearn.metrics.Scorer` as
                 created by :meth:`autosklearn.metrics.make_scorer`.
                 Default metric: "r2".
                 Other supported metrics: "mean_squared_error",
                                          "mean_absolute_error",
                                          "median_absolute_error"

        Returns:
            The fitted ``AutoSklearnRegressor`` (the same object that is
            pickled to disk).

        Raises:
            KeyError: If ``self.auto_sklearn_kwargs`` has no
                ``'output_folder'`` entry.
        """
        auto_regressor = AutoSklearnRegressor(**self.auto_sklearn_kwargs)
        regression_metric = AutoSklearnML.get_regression_metric(metric)
        auto_regressor.fit(self._X_train,
                           self._y_train,
                           metric=regression_metric,
                           dataset_name=self.dataset_name)
        print(auto_regressor.show_models())

        # A configuration without an explicit resampling strategy is simply
        # "not cv"; .get() avoids a KeyError after the expensive fit.
        # NOTE(review): the refit is presumably needed because models fitted
        # under cross-validation cannot predict directly - see the
        # auto-sklearn documentation of ``refit``.
        if self.auto_sklearn_kwargs.get("resampling_strategy") == "cv":
            auto_regressor.refit(self._X_train.copy(), self._y_train.copy())

        # Score each split once and reuse the formatted line for both the
        # console and the log file.
        # NOTE(review): this relies on the private ``_score_func`` attribute
        # of the scorer returned by get_regression_metric.
        prediction_train = auto_regressor.predict(self._X_train)
        train_report = "training set {} score: {}".format(
            metric,
            regression_metric._score_func(self._y_train, prediction_train))
        print(train_report)

        prediction_test = auto_regressor.predict(self._X_test)
        test_report = "test set {} score: {}".format(
            metric,
            regression_metric._score_func(self._y_test, prediction_test))
        print(test_report)

        output_folder = self.auto_sklearn_kwargs['output_folder']

        with open(os.path.join(output_folder,
                               'best_auto_sklearn_output.log'), 'a+') as wf:
            wf.write('The best model is : \n')
            wf.write(auto_regressor.show_models())
            wf.write("\n" + train_report + "\n")
            wf.write('\n')
            # The file is opened in append mode: terminate the record so the
            # next run does not continue on the same line.
            wf.write(test_report + "\n")

        dump_file = os.path.join(output_folder, 'automl_regressor.dump.pkl')

        with open(dump_file, 'wb') as f:
            pickle.dump(auto_regressor, f)

        return auto_regressor
def test_regression_pandas_support(tmp_dir, dask_client):
    """Fit, score and refit a regressor on pandas inputs.

    The data is fetched from OpenML as a DataFrame / Series pair and is
    used both for training and for evaluation.
    """
    features, target = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the inputs really are pandas objects.
    assert isinstance(features, pd.DataFrame)
    assert isinstance(target, pd.Series)

    estimator_options = dict(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
    )
    regressor = AutoSklearnRegressor(**estimator_options)

    # Fitting directly on the pandas objects is expected to succeed.
    regressor.fit(features, target)

    # The model must do better than random. Training and evaluation data
    # are deliberately the same: this checks the code path, not
    # generalisation.
    fit_score = regressor.score(features, target)
    assert fit_score >= 0.5, print_debug_information(regressor)

    regressor.refit(features, target)

    # After the refit the model must still do better than random.
    refit_predictions = regressor.predict(features)
    assert r2(target, refit_predictions) > 0.5, \
        print_debug_information(regressor)

    successful_runs = count_succeses(regressor.cv_results_)
    assert successful_runs > 0, print_debug_information(regressor)
def test_cv_regression(tmp_dir, dask_client):
    """
    Check that a regressor can be fitted when the resampling strategy
    is cross-validation ('cv').
    """
    train_features, train_target, test_features, test_target = \
        putil.get_dataset('boston', train_size_maximum=300)

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )
    regressor.fit(train_features, train_target)

    # One prediction is expected for each of the 206 held-out samples.
    predicted = regressor.predict(test_features)
    assert predicted.shape == (206, )

    test_r2 = r2(test_target, predicted)
    assert test_r2 >= 0.1, print_debug_information(regressor)

    successful_runs = count_succeses(regressor.cv_results_)
    assert successful_runs > 0, print_debug_information(regressor)

    # The recorded performance history must contain the training scores
    # and pass the plausibility helper.
    has_train_scores = includes_train_scores(
        regressor.performance_over_time_.columns)
    assert has_train_scores is True

    history_is_plausible = performance_over_time_is_plausible(
        regressor.performance_over_time_)
    assert history_is_plausible is True
def test_regression(tmp_dir, dask_client):
    """Fit a regressor on the 'boston' dataset with default resampling.

    Checks the shape of the test predictions, a lower bound on the
    mean-squared-error score and that at least one run succeeded.
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )

    # NOTE(review): the negative bound below presumably means that
    # ``mean_squared_error`` is a scorer returning the negated MSE (greater
    # is better), not the plain sklearn function - confirm against the
    # file's imports.
    score = mean_squared_error(Y_test, predictions)

    # On average np.sqrt(30) away from the target -> ~5.5 on average.
    # Results with select rates drop the average score to a range of
    # -32.40 to -37 under the 30 seconds constraint. With more
    # time_left_for_this_task this is no longer an issue.
    assert score >= -37, print_debug_information(automl)

    # Emit the same debug information on failure as every other assertion
    # on the fitted estimator in this function.
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)