コード例 #1
0
    def test_classification_pandas_support(self):
        X, y = sklearn.datasets.fetch_openml(
            data_id=2,  # cat/num dataset
            return_X_y=True,
            as_frame=True,
        )

        # Drop NAN!!
        X = X.dropna('columns')

        # This test only make sense if input is dataframe
        self.assertTrue(isinstance(X, pd.DataFrame))
        self.assertTrue(isinstance(y, pd.Series))
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            exclude_estimators=['libsvm_svc'],
            seed=5,
        )

        automl.fit(X, y)

        # Make sure that at least better than random.
        # We use same X_train==X_test to test code quality
        self.assertTrue(automl.score(X, y) > 0.555)

        automl.refit(X, y)

        # Make sure that at least better than random.
        # accuracy in sklearn needs valid data
        # It should be 0.555 as the dataset is unbalanced.
        y = automl._automl[0].InputValidator.encode_target(y)
        prediction = automl._automl[0].InputValidator.encode_target(automl.predict(X))
        self.assertTrue(accuracy(y, prediction) > 0.555)
コード例 #2
0
    def test_binary(self):
        tmp = os.path.join(self.test_dir, '..', '.out_binary_fit')
        output = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
        self._setUp(output)
        self._setUp(tmp)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                             make_binary=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)

        automl.fit(X_train,
                   Y_train,
                   X_test=X_test,
                   y_test=Y_test,
                   dataset_name='binary_test_dataset')
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, ))
        score = accuracy(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)

        output_files = os.listdir(output)
        self.assertIn('binary_test_dataset_test_1.predict', output_files)
コード例 #3
0
def test_binary(tmp_dir, output_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_binary=True)
    automl = AutoSklearnClassifier(time_left_for_this_task=40,
                                   per_run_time_limit=10,
                                   tmp_folder=tmp_dir,
                                   dask_client=dask_client,
                                   output_folder=output_dir)

    automl.fit(X_train,
               Y_train,
               X_test=X_test,
               y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)

    output_files = glob.glob(
        os.path.join(output_dir, 'binary_test_dataset_test_*.predict'))
    assert len(output_files) > 0, (output_files,
                                   print_debug_information(automl))
コード例 #4
0
def test_binary(tmp_dir, dask_client):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                         make_binary=True)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=40,
        delete_tmp_folder_after_terminate=False,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
    )

    automl.fit(X_train,
               Y_train,
               X_test=X_test,
               y_test=Y_test,
               dataset_name='binary_test_dataset')

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)
    assert count_succeses(
        automl.cv_results_) > 0, print_debug_information(automl)
    assert includes_all_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
コード例 #5
0
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):

    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna('columns')

    # This test only make sense if input is dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, print_debug_information(automl)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    prediction = automl.predict(X)
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
    assert includes_train_scores(automl.performance_over_time_.columns) is True
    assert performance_over_time_is_plausible(
        automl.performance_over_time_) is True
コード例 #6
0
def test_classification_pandas_support(tmp_dir, output_dir, dask_client):

    X, y = sklearn.datasets.fetch_openml(
        data_id=2,  # cat/num dataset
        return_X_y=True,
        as_frame=True,
    )

    # Drop NAN!!
    X = X.dropna('columns')

    # This test only make sense if input is dataframe
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)
    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        exclude_estimators=['libsvm_svc'],
        dask_client=dask_client,
        seed=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )

    automl.fit(X, y)

    log_file_path = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))[0]

    # Make sure that at least better than random.
    # We use same X_train==X_test to test code quality
    assert automl.score(X, y) > 0.555, extract_msg_from_log(log_file_path)

    automl.refit(X, y)

    # Make sure that at least better than random.
    # accuracy in sklearn needs valid data
    # It should be 0.555 as the dataset is unbalanced.
    y = automl.automl_.InputValidator.encode_target(y)
    prediction = automl.automl_.InputValidator.encode_target(automl.predict(X))
    assert accuracy(y, prediction) > 0.555
    assert count_succeses(automl.cv_results_) > 0
コード例 #7
0
    def test_binary(self):
        tmp = os.path.join(self.test_dir, '..', '.out_binary_fit')
        output = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
        self._setUp(output)
        self._setUp(tmp)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris', make_binary=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)

        automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
                   dataset_name='binary_test_dataset')
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, ))
        score = accuracy(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)

        output_files = os.listdir(output)
        self.assertIn('binary_test_dataset_test_1.predict', output_files)