def test_binary(tmp_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_binary=True)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=40,
        delete_tmp_folder_after_terminate=False,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train,
               Y_train,
               X_test=X_test,
               y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
    assert includes_all_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
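# The tests in this collection rely on two pytest fixtures, `tmp_dir` and
# `dask_client`, that are not shown on this page. A minimal sketch of what they
# could look like (the auto-sklearn test suite defines its own, more elaborate
# versions; the code below is only an illustrative assumption):
import shutil
import tempfile

import pytest
from dask.distributed import Client


@pytest.fixture
def tmp_dir():
    # Fresh temporary folder per test, removed afterwards.
    dirname = tempfile.mkdtemp()
    yield dirname
    shutil.rmtree(dirname, ignore_errors=True)


@pytest.fixture
def dask_client():
    # Small local dask cluster for the parallel fitting tests.
    client = Client(n_workers=2, processes=False)
    yield client
    client.shutdown()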
Example 2
def test_cv_results(tmp_dir):
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(time_left_for_this_task=30,
                                per_run_time_limit=5,
                                tmp_folder=tmp_dir,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0,
                                scoring_functions=[
                                    autosklearn.metrics.precision,
                                    autosklearn.metrics.roc_auc
                                ])

    params = cls.get_params()
    original_params = copy.deepcopy(params)

    cls.fit(X_train, Y_train)
    cv_results = cls.cv_results_
    assert isinstance(cv_results, dict), type(cv_results)
    assert isinstance(cv_results['mean_test_score'],
                      np.ndarray), type(cv_results['mean_test_score'])
    assert isinstance(cv_results['mean_fit_time'],
                      np.ndarray), type(cv_results['mean_fit_time'])
    assert isinstance(cv_results['params'], list), type(cv_results['params'])
    assert isinstance(cv_results['rank_test_scores'],
                      np.ndarray), type(cv_results['rank_test_scores'])
    assert isinstance(cv_results['metric_precision'],
                      npma.MaskedArray), type(cv_results['metric_precision'])
    assert isinstance(cv_results['metric_roc_auc'],
                      npma.MaskedArray), type(cv_results['metric_roc_auc'])
    cv_result_items = [
        isinstance(val, npma.MaskedArray) for key, val in cv_results.items()
        if key.startswith('param_')
    ]
    assert all(cv_result_items), cv_results.items()

    # Compare the state of the model parameters with the original parameters
    new_params = clone(cls).get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # Taken from Sklearn code:
        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit." %
            (cls, param_name, original_value, new_value))

    # Comply with https://scikit-learn.org/dev/glossary.html#term-classes
    assert is_classifier(cls)
    assert hasattr(cls, 'classes_')
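# `cv_results_` follows the scikit-learn `GridSearchCV.cv_results_` convention
# (a dict of equal-length arrays, one entry per evaluated configuration), so it
# can be inspected as a DataFrame. Illustrative snippet, assuming a fitted `cls`
# as in the test above:
import pandas as pd

results_df = pd.DataFrame(cls.cv_results_)
# Best configurations first, according to the rank column checked above.
print(results_df.sort_values('rank_test_scores').head())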
def test_performance_over_time_no_ensemble(tmp_dir):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    cls.fit(X_train, Y_train, X_test, Y_test)

    performance_over_time = cls.performance_over_time_
    assert include_single_scores(performance_over_time.columns) is True
    assert performance_over_time_is_plausible(performance_over_time) is True
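# `performance_over_time_` is a pandas DataFrame with a 'Timestamp' column plus
# one column per tracked score (which columns appear depends on whether test
# data and/or an ensemble were used). A sketch of how it is typically plotted
# in the auto-sklearn examples (illustrative, assuming matplotlib is available):
import matplotlib.pyplot as plt

cls.performance_over_time_.plot(
    x='Timestamp',
    kind='line',
    legend=True,
    title='auto-sklearn performance over time',
    grid=True,
)
plt.xlabel('Time')
plt.ylabel('Score')
plt.show()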
Example 5
        def train_model(data, target_label, duration, regressor=True):
            # NOTE: `score` and `self.preprocessor` are not parameters here;
            # they come from the enclosing scope (this is a nested helper).
            dataframe = data.to_df()
            if regressor:
                model = AutoSklearnRegressor(
                    time_left_for_this_task=duration, memory_limit=9216
                )
            else:
                model = AutoSklearnClassifier(
                    time_left_for_this_task=duration, memory_limit=9216
                )

            if score:
                Xt, Xv, yt, yv = train_test_split(
                    self.preprocessor.transform(dataframe["smiles"]),
                    dataframe[target_label],
                    test_size=0.15,
                    random_state=18,
                )
            else:
                Xt = self.preprocessor.transform(dataframe["smiles"])
                yt = dataframe[target_label]

            model.fit(Xt, yt)

            if score:
                print(f"Score on {target_label}: {model.score(Xv, yv)}")

            return model
def test_can_pickle_classifier(tmp_dir, dask_client):
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        delete_tmp_folder_after_terminate=False,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )
    automl.fit(X_train, Y_train)

    initial_predictions = automl.predict(X_test)
    initial_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                      initial_predictions)
    assert initial_accuracy >= 0.75
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True

    # Test pickle
    dump_file = os.path.join(tmp_dir, 'automl.dump.pkl')

    with open(dump_file, 'wb') as f:
        pickle.dump(automl, f)

    with open(dump_file, 'rb') as f:
        restored_automl = pickle.load(f)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                       restored_predictions)
    assert restored_accuracy >= 0.75
    assert initial_accuracy == restored_accuracy

    # Test joblib
    dump_file = os.path.join(tmp_dir, 'automl.dump.joblib')

    joblib.dump(automl, dump_file)

    restored_automl = joblib.load(dump_file)

    restored_predictions = restored_automl.predict(X_test)
    restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                       restored_predictions)
    assert restored_accuracy >= 0.75
    assert initial_accuracy == restored_accuracy
Example 7
class AutoSklearnWrapper(Wrapper):
    def __init__(self,
                 preprocessor=None,
                 refit=True,
                 verbose=False,
                 retry_on_error=True,
                 **params):
        self.estimator = AutoSklearnClassifier(**dict(params))

        # Call to super
        super(AutoSklearnWrapper, self).__init__(estimator=self.estimator,
                                                 preprocessor=preprocessor,
                                                 refit=refit,
                                                 verbose=verbose,
                                                 retry_on_error=retry_on_error)

    def predict_proba(self, X):
        say("WARNING: predict_proba() not working well in Autosklearn. Raising AttributeError."
            )
        raise AttributeError()

    # Implementation of internal _fit
    def _fit(self, X, y, **fit_params):
        self.estimator.fit(X, y, **fit_params, metric=accuracy)

    # Implementation of internal _refit
    def _refit(self, X, y):
        self.estimator.fit(X, y)

    def _get_cv_results(self, estimator):
        # Get results and convert to lists, so that it is json serializable
        results = estimator.cv_results_
        lists = {key: (val if isinstance(val, list) else val.tolist())
                 for key, val in results.items()}

        # Store results
        cv_results_ = lists
        best_index_ = np.argmax(
            cv_results_['mean_test_score'])  # type: np.int64
        best_params_ = cv_results_['params'][best_index_]
        best_score_ = cv_results_['mean_test_score'][best_index_]

        return cv_results_, best_index_, best_params_, best_score_
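# The list conversion in `_get_cv_results` above exists because numpy arrays are
# not JSON serializable. Illustrative check, assuming a fitted wrapper instance
# named `wrapper` (hypothetical variable, not part of the original example):
import json

cv_results, best_index, best_params, best_score = wrapper._get_cv_results(
    wrapper.estimator)

# The converted dict holds plain lists and scalars, so simple summaries can be
# dumped directly.
print(json.dumps({'best_index': int(best_index), 'best_score': float(best_score)}))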
Example 8
def test_autosklearn_classification_methods_returns_self(dask_client):
    """
    Currently this method only tests that the methods of AutoSklearnClassifier
    is able to fit using fit(), fit_ensemble() and refit()
    """
    X_train, y_train, X_test, y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   per_run_time_limit=10,
                                   ensemble_size=0,
                                   dask_client=dask_client,
                                   exclude_preprocessors=['fast_ica'])

    automl_fitted = automl.fit(X_train, y_train)
    assert automl is automl_fitted

    automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is automl_ensemble_fitted

    automl_refitted = automl.refit(X_train.copy(), y_train.copy())
    assert automl is automl_refitted
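# Because fit(), fit_ensemble() and refit() all return self (as asserted above),
# the calls can be chained. Illustrative one-liner based on the same assertions:
predictions = (
    AutoSklearnClassifier(time_left_for_this_task=60, per_run_time_limit=10)
    .fit(X_train, y_train)
    .refit(X_train, y_train)
    .predict(X_test)
)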
def test_multilabel(tmp_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_multilabel=True)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, 3), print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True

    score = f1_macro(Y_test, predictions)
    assert score >= 0.9, print_debug_information(automl)

    probs = automl.predict_proba(X_train)
    assert np.mean(probs) == pytest.approx(0.33, rel=1e-1)
def test_autosklearn_anneal(as_frame):
    """
    This test makes sure that anneal dataset can be fitted and scored.
    This dataset is quite complex, with NaN, categorical and numerical columns
    so is a good testcase for unit-testing
    """
    X, y = sklearn.datasets.fetch_openml(data_id=2,
                                         return_X_y=True,
                                         as_frame=as_frame)
    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   ensemble_size=0,
                                   delete_tmp_folder_after_terminate=False,
                                   initial_configurations_via_metalearning=0,
                                   smac_scenario_args={'runcount_limit': 6},
                                   resampling_strategy='holdout-iterative-fit')

    if as_frame:
        # Let autosklearn calculate the feat types
        automl_fitted = automl.fit(X, y)
    else:
        X_, y_ = sklearn.datasets.fetch_openml(data_id=2,
                                               return_X_y=True,
                                               as_frame=True)
        feat_type = [
            'categorical' if X_[col].dtype.name == 'category' else 'numerical'
            for col in X_.columns
        ]
        automl_fitted = automl.fit(X, y, feat_type=feat_type)
    assert automl is automl_fitted

    automl_ensemble_fitted = automl.fit_ensemble(y, ensemble_size=5)
    assert automl is automl_ensemble_fitted

    # We want to make sure we can learn from this data.
    # This is a test to make sure the data format (numpy/pandas)
    # can be used in a meaningful way -- not meant for generalization,
    # hence we use the train dataset
    assert automl_fitted.score(X, y) > 0.75
Example 11
def test_binary(tmp_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_binary=True)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=40,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train,
               Y_train,
               X_test=X_test,
               y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
def test_classification_pandas_support(tmp_dir, dask_client):

    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop columns that contain NaN values
    X = X.dropna(axis='columns')

    # This test only makes sense if the input is a dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude={'classifier': ['libsvm_svc']},
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
    )

    automl.fit(X, y)

    # Make sure we do at least better than random.
    # We use the same X for train and test to exercise the code, not to measure generalization
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure we do at least better than random.
    # accuracy in sklearn needs valid predictions;
    # the threshold is 0.555 because the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
Example 13
def test_feat_type_wrong_arguments():

    # Every Auto-Sklearn estimator has a backend that allows a single
    # call to fit
    X = np.zeros((100, 100))
    y = np.zeros((100, ))

    cls = AutoSklearnClassifier(ensemble_size=0)
    expected_msg = r".*feat_type does not have same number of "
    "variables as X has features. 1 vs 100.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y, feat_type=[True])

    cls = AutoSklearnClassifier(ensemble_size=0)
    expected_msg = r".*feat_type must only contain strings.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y, feat_type=[True] * 100)

    cls = AutoSklearnClassifier(ensemble_size=0)
    expected_msg = r".*Only `Categorical` and `Numerical` are"
    "valid feature types, you passed `Car`.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y, feat_type=['Car'] * 100)
Example 14
    def classification(self, metric="accuracy"):
        """
        Perform auto_classification.
        Args:
            metric (str): The evaluation metric of classification.
                 This will be mapped by AutoSklearnML.get_classification_metric
                 to an instance of :class:`autosklearn.metrics.Scorer` as
                 created by :meth:`autosklearn.metrics.make_scorer`.
                 Default metric: "accuracy".
                 Other supported metrics: "balanced_accuracy", "f1",
                                          "roc_auc", "average_precision",
                                          "precision", "recall"

        Returns:

        """
        auto_classifier = AutoSklearnClassifier(**self.auto_sklearn_kwargs)
        classification_metric = AutoSklearnML.get_classification_metric(metric)
        auto_classifier.fit(self._X_train.copy(),
                            self._y_train.copy(),
                            metric=classification_metric,
                            dataset_name=self.dataset_name)

        print(auto_classifier.show_models())

        if self.auto_sklearn_kwargs["resampling_strategy"] == "cv":
            auto_classifier.refit(self._X_train.copy(), self._y_train.copy())

        prediction_train = auto_classifier.predict(self._X_train)
        print("training set {} score: {}".format(
            metric,
            classification_metric._score_func(self._y_train,
                                              prediction_train)))

        prediction_test = auto_classifier.predict(self._X_test)
        print("test set {} score: {}".format(
            metric,
            classification_metric._score_func(self._y_test, prediction_test)))

        with open(
                os.path.join(self.auto_sklearn_kwargs['output_folder'],
                             'best_auto_sklearn_output.log'), 'a+') as wf:
            wf.write('The best model is : \n')
            wf.write(auto_classifier.show_models())
            wf.write("\ntraining set {} score: {}\n".format(
                metric,
                classification_metric._score_func(self._y_train,
                                                  prediction_train)))
            wf.write('\n')
            wf.write("test set {} score: {}".format(
                metric,
                classification_metric._score_func(self._y_test,
                                                  prediction_test)))

        dump_file = os.path.join(self.auto_sklearn_kwargs['output_folder'],
                                 'automl_classification.dump.pkl')

        with open(dump_file, 'wb') as f:
            pickle.dump(auto_classifier, f)

        return auto_classifier
Example 15
def test_pass_categorical_and_numeric_columns_to_pipeline(
        dask_client, data_type, include_categorical):

    # Prepare the training data
    X, y = sklearn.datasets.make_classification()
    feat_type = None
    if 'pandas' in data_type:
        X = pd.DataFrame(X)
        y = pd.DataFrame(y, dtype="category")
        if include_categorical:
            cat_name = X.shape[1]
            X[cat_name] = 'A'
            X[cat_name] = X[cat_name].astype('category')
    elif 'numpy' in data_type:
        if include_categorical:
            feat_type = ['numerical' for x in range(np.shape(X)[1])]
            feat_type.append('categorical')
            temporal = np.zeros((X.shape[0], X.shape[1] + 1))
            temporal[:, :-1] = X
            X = temporal
    else:
        pytest.fail()

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.5, random_state=3)

    seed = 3
    automl = AutoSklearnClassifier(
        time_left_for_this_task=120,
        # Time left for task plays no role
        # only per run time limit
        per_run_time_limit=30,
        ensemble_size=0,
        dask_client=dask_client,
        include_estimators=['random_forest'],
        seed=seed,
    )
    config = automl.get_configuration_space(
        X_train,
        y_train,
        feat_type=feat_type,
        X_test=X_test,
        y_test=y_test,
    ).get_default_configuration()

    pipeline, run_info, run_value = automl.fit_pipeline(X=X_train,
                                                        y=y_train,
                                                        config=config,
                                                        feat_type=feat_type,
                                                        X_test=X_test,
                                                        y_test=y_test)

    # We should produce a decent result
    assert run_value.cost < 0.4, f"{run_value}/{run_value.additional_info}"
    prediction = pipeline.predict(
        automl.automl_.InputValidator.feature_validator.transform(X))
    assert np.shape(prediction)[0] == np.shape(y)[0]

    if include_categorical:
        expected_dict = {i: 'numerical' for i in range(np.shape(X)[1] - 1)}
        expected_dict[X.shape[1] - 1] = 'categorical'
    else:
        expected_dict = {i: 'numerical' for i in range(np.shape(X)[1])}
    assert expected_dict == pipeline.named_steps[
        'data_preprocessing'].feat_type
Example 16
def test_fit_n_jobs(tmp_dir):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

    # test parallel Classifier to predict classes, not only indices
    Y_train += 1
    Y_test += 1

    class get_smac_object_wrapper:
        def __call__(self, *args, **kwargs):
            self.n_jobs = kwargs['n_jobs']
            smac = get_smac_object(*args, **kwargs)
            self.dask_n_jobs = smac.solver.tae_runner.n_workers
            self.dask_client_n_jobs = len(
                smac.solver.tae_runner.client.scheduler_info()['workers'])
            return smac

    get_smac_object_wrapper_instance = get_smac_object_wrapper()

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=5,
        n_jobs=2,
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'],
        get_smac_object_callback=get_smac_object_wrapper_instance,
        max_models_on_disc=None,
    )
    automl.fit(X_train, Y_train)

    # Test that the argument is correctly passed to SMAC
    assert getattr(get_smac_object_wrapper_instance, 'n_jobs') == 2
    assert getattr(get_smac_object_wrapper_instance, 'dask_n_jobs') == 2
    assert getattr(get_smac_object_wrapper_instance, 'dask_client_n_jobs') == 2

    available_num_runs = set()
    for run_key, run_value in automl.automl_.runhistory_.data.items():
        if run_value.additional_info is not None and 'num_run' in run_value.additional_info:
            available_num_runs.add(run_value.additional_info['num_run'])
    available_predictions = set()
    predictions = glob.glob(
        os.path.join(automl.automl_._backend.get_runs_directory(), '*',
                     'predictions_ensemble*.npy'))
    seeds = set()
    for prediction in predictions:
        prediction = os.path.split(prediction)[1]
        match = re.match(MODEL_FN_RE,
                         prediction.replace("predictions_ensemble", ""))
        if match:
            num_run = int(match.group(2))
            available_predictions.add(num_run)
            seed = int(match.group(1))
            seeds.add(seed)

    # Remove the dummy prediction, it is not part of the runhistory
    available_predictions.remove(1)
    assert available_num_runs.issubset(available_predictions)

    assert len(seeds) == 1

    ensemble_dir = automl.automl_._backend.get_ensemble_dir()
    ensembles = os.listdir(ensemble_dir)

    seeds = set()
    for ensemble_file in ensembles:
        seeds.add(int(ensemble_file.split('.')[0].split('_')[0]))
    assert len(seeds) == 1

    assert count_succeses(automl.cv_results_) > 0
    # For travis-ci it is important that the client no longer exists
    assert automl.automl_._dask_client is None
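# `MODEL_FN_RE` is imported elsewhere in the test module and not shown here.
# auto-sklearn stores per-run ensemble predictions under names like
# 'predictions_ensemble_<seed>_<num_run>_<budget>.npy', so the pattern is
# presumably along these lines (an assumption, not the exact definition):
MODEL_FN_RE = r'_([0-9]+)_([0-9]+)_([0-9]+\.[0-9]*)\.npy'
# With the 'predictions_ensemble' prefix stripped (as done above), group(1) is
# the seed and group(2) the num_run.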
def test_pass_categorical_and_numeric_columns_to_pipeline(
        dask_client, data_type, include_categorical):

    # Prepare the training data
    X, y = sklearn.datasets.make_classification(random_state=0)
    X = cast(np.ndarray, X)

    n_features = X.shape[1]

    # If categorical, append a constant column of 0s as the last (categorical) column
    if include_categorical:
        X = np.insert(X, n_features, values=0, axis=1)

    if data_type == 'pandas':
        X = pd.DataFrame(X)
        y = pd.DataFrame(y, dtype="category")

        # Set the last column to categorical
        if include_categorical:
            X.loc[:, n_features] = X.loc[:, n_features].astype(
                'category')  # type: ignore

    # Specify the feature_types
    if data_type == 'numpy' and include_categorical:
        feat_type = ['numerical'] * n_features + ['categorical']
    else:
        feat_type = None

    # Create the splits
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.5, random_state=3)

    # Create Estimator
    # Time left for task plays no role for fit_pipeline
    automl = AutoSklearnClassifier(
        delete_tmp_folder_after_terminate=False,
        time_left_for_this_task=120,
        per_run_time_limit=30,
        ensemble_size=0,
        seed=0,
        dask_client=dask_client,
        include={'classifier': ['random_forest']},
    )

    config_space = automl.get_configuration_space(
        X_train,
        y_train,
        X_test=X_test,
        y_test=y_test,
        feat_type=feat_type,
    )
    config = config_space.get_default_configuration()

    pipeline, _, run_value = automl.fit_pipeline(
        X=X_train,
        y=y_train,
        X_test=X_test,
        y_test=y_test,
        config=config,
        feat_type=feat_type,
    )

    assert pipeline is not None, "Expected a pipeline from automl.fit_pipeline"

    feature_validator = automl.automl_.InputValidator.feature_validator  # type: ignore
    transformed_X_test = feature_validator.transform(X_test)
    predictions = pipeline.predict(transformed_X_test)

    # We should produce a half decent result
    assert run_value.cost < 0.40, f"Run value:\n {run_value}"

    # Outputs should be the correct length
    assert np.shape(predictions)[0] == np.shape(y_test)[0]

    n_columns = np.shape(X)[1]

    if include_categorical:
        expected_feat_types = {
            i: feature_type
            for i, feature_type in enumerate(['numerical'] * (n_columns - 1) +
                                             ['categorical'])
        }

    else:
        expected_feat_types = {
            i: feature_type
            for i, feature_type in enumerate(['numerical'] * n_columns)
        }

    pipeline_feat_types = pipeline.named_steps[
        'data_preprocessor'].choice.feat_type
    assert expected_feat_types == pipeline_feat_types
Example 18
def test_type_of_target(mock_estimator):
    # Test that classifier raises error for illegal target types.
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])
    # Possible target types
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal target types for classification: continuous,
    # multiclass-multioutput, continuous-multioutput.
    expected_msg = r".*Classification with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_multiclass_multioutput)

    expected_msg = r".*Classification with data of type"
    " continuous is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous)

    expected_msg = r".*Classification with data of type"
    " continuous-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous_multioutput)

    # Legal target types for classification: binary, multiclass,
    # multilabel-indicator.
    try:
        cls.fit(X, y_binary)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "binary targets")

    try:
        cls.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        cls.fit(X, y_multilabel)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multilabel-indicator targets")

    # Test that regressor raises error for illegal target types.
    reg = AutoSklearnRegressor(ensemble_size=0)
    # Illegal target types for regression: multilabel-indicator
    # multiclass-multioutput
    expected_msg = r".*Regression with data of type"
    " multilabel-indicator is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multilabel,
        )

    expected_msg = r".*Regression with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multiclass_multioutput,
        )

    # Legal target types: continuous, multiclass,
    # continuous-multioutput,
    # binary
    try:
        reg.fit(X, y_continuous)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous targets")

    try:
        reg.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        reg.fit(X, y_continuous_multioutput)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous_multioutput targets")

    try:
        reg.fit(X, y_binary)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "binary targets")
Example 19
# training_features = pd.read_csv('../../prepare-data/one-label/training.csv')
# training_features = pd.read_csv('../../prepare-data/one-label/simple/training.csv')
training_features = pd.read_csv(
    '../../prepare-data/one-label/simple/downgrade/postpaid/training.csv')
training_labels = training_features.pop('UpdatedIn90Days').values

# test_features = pd.read_csv('../../prepare-data/one-label/test.csv')
# test_features = pd.read_csv('../../prepare-data/one-label/simple/test.csv')
test_features = pd.read_csv(
    '../../prepare-data/one-label/simple/downgrade/postpaid/test.csv')
test_labels = test_features.pop('UpdatedIn90Days').values

# -----------------------------------------------------------------------------
# 2) Fit auto-classifier

clf = AutoSklearnClassifier()
clf.fit(training_features, training_labels)

# -----------------------------------------------------------------------------
# 3) Perform predictions on test set

actual = test_labels
predictions = clf.predict(test_features)

# -----------------------------------------------------------------------------
# 4) Show result scores; confusion matrix (most useful) and precision/recall

print('\nconfusion matrix')
# print(confusion_matrix(actual, predictions, labels = [0, 1, 2, 3, 4]))
# print(confusion_matrix(actual, predictions, labels = [0, 1, 2]))
print(confusion_matrix(actual, predictions, labels=[0, 1]))
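# The header above also promises precision/recall, which the snippet never
# prints; one minimal way to add it with scikit-learn would be:
from sklearn.metrics import classification_report

print('\nprecision / recall')
print(classification_report(actual, predictions, labels=[0, 1]))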