def test_cv_regression(tmp_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """

    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)
    automl = AutoSklearnRegressor(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (206, )
    score = r2(Y_test, predictions)
    assert score >= 0.1, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
Beispiel #2
0
        def train_model(data, target_label, duration, regressor=True):
            dataframe = data.to_df()
            if regressor:
                model = AutoSklearnRegressor(
                    time_left_for_this_task=duration, memory_limit=9216
                )
            else:
                model = AutoSklearnClassifier(
                    time_left_for_this_task=duration, memory_limit=9216
                )

            if score:
                Xt, Xv, yt, yv = train_test_split(
                    self.preprocessor.transform(dataframe["smiles"]),
                    dataframe[target_label],
                    test_size=0.15,
                    random_state=18,
                )
            else:
                Xt = self.preprocessor.transform(dataframe["smiles"])
                yt = dataframe[target_label]

            model.fit(Xt, yt)

            if score:
                print(f"Score on {target_label}: {model.score(Xv, yv)}")

            return model
Beispiel #3
0
    def regression(self, metric="r2"):
        """
        Perform auto_regression.
        Args:
            metric (str): The evaluation metric of regression.
                 This will be mapped by AutoSklearnML.get_regression_metric
                 to an instance of :class:`autosklearn.metrics.Scorer` as
                 created by :meth:`autosklearn.metrics.make_scorer`.
                 Default metric: "r2".
                 Other supported metrics: "mean_squared_error",
                                          "mean_absolute_error",
                                          "median_absolute_error"

        Returns:

        """
        auto_regressor = AutoSklearnRegressor(**self.auto_sklearn_kwargs)
        regression_metric = AutoSklearnML.get_regression_metric(metric)
        auto_regressor.fit(self._X_train,
                           self._y_train,
                           metric=regression_metric,
                           dataset_name=self.dataset_name)
        print(auto_regressor.show_models())

        if self.auto_sklearn_kwargs["resampling_strategy"] == "cv":
            auto_regressor.refit(self._X_train.copy(), self._y_train.copy())

        prediction_train = auto_regressor.predict(self._X_train)
        print("training set {} score: {}".format(
            metric,
            regression_metric._score_func(self._y_train, prediction_train)))

        prediction_test = auto_regressor.predict(self._X_test)
        print("test set {} score: {}".format(
            metric, regression_metric._score_func(self._y_test,
                                                  prediction_test)))

        with open(
                os.path.join(self.auto_sklearn_kwargs['output_folder'],
                             'best_auto_sklearn_output.log'), 'a+') as wf:
            wf.write('The best model is : \n')
            wf.write(auto_regressor.show_models())
            wf.write("\ntraining set {} score: {}\n".format(
                metric,
                regression_metric._score_func(self._y_train,
                                              prediction_train)))
            wf.write('\n')
            wf.write("test set {} score: {}".format(
                metric,
                regression_metric._score_func(self._y_test, prediction_test)))

        dump_file = os.path.join(self.auto_sklearn_kwargs['output_folder'],
                                 'automl_regressor.dump.pkl')

        with open(dump_file, 'wb') as f:
            pickle.dump(auto_regressor, f)

        return auto_regressor
def test_regression_pandas_support(tmp_dir, dask_client):

    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )
    # This test only make sense if input is dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnRegressor(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
    )

    # Make sure we error out because y is not encoded
    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) >= 0.5, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that at least better than random.
    assert r2(y, automl.predict(X)) > 0.5, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
def test_autosklearn_regression_methods_returns_self(dask_client):
    X_train, y_train, X_test, y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                  per_run_time_limit=5,
                                  dask_client=dask_client,
                                  ensemble_size=0)

    automl_fitted = automl.fit(X_train, y_train)
    assert automl is automl_fitted

    automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is automl_ensemble_fitted

    automl_refitted = automl.refit(X_train.copy(), y_train.copy())
    assert automl is automl_refitted
def test_regression(tmp_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (356, )
    score = mean_squared_error(Y_test, predictions)

    # On average np.sqrt(30) away from the target -> ~5.5 on average
    # Results with select rates drops avg score to a range of -32.40 to -37, on 30 seconds
    # constraint. With more time_left_for_this_task this is no longer an issue
    assert score >= -37, print_debug_information(automl)
    assert count_succeses(automl.cv_results_) > 0
def test_type_of_target(mock_estimator):
    # Test that classifier raises error for illegal target types.
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])
    # Possible target types
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal target types for classification: continuous,
    # multiclass-multioutput, continuous-multioutput.
    expected_msg = r".*Classification with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_multiclass_multioutput)

    expected_msg = r".*Classification with data of type"
    " continuous is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous)

    expected_msg = r".*Classification with data of type"
    " continuous-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous_multioutput)

    # Legal target types for classification: binary, multiclass,
    # multilabel-indicator.
    try:
        cls.fit(X, y_binary)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "binary targets")

    try:
        cls.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        cls.fit(X, y_multilabel)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multilabel-indicator targets")

    # Test that regressor raises error for illegal target types.
    reg = AutoSklearnRegressor(ensemble_size=0)
    # Illegal target types for regression: multilabel-indicator
    # multiclass-multioutput
    expected_msg = r".*Regression with data of type"
    " multilabel-indicator is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multilabel,
        )

    expected_msg = r".*Regression with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multiclass_multioutput,
        )

    # Legal target types: continuous, multiclass,
    # continuous-multioutput,
    # binary
    try:
        reg.fit(X, y_continuous)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous targets")

    try:
        reg.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        reg.fit(X, y_continuous_multioutput)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous_multioutput targets")

    try:
        reg.fit(X, y_binary)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "binary targets")