Example #1
def test_refit_shuffle_on_fail(backend, dask_client):

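    # Mock a model whose fit()/fit_transformer() fail twice before
    # succeeding, so refit() has to shuffle the data and retry; the
    # max_iter value is needed for the 'iterations' budget type below.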
    failing_model = unittest.mock.Mock()
    failing_model.fit.side_effect = [ValueError(), ValueError(), None]
    failing_model.fit_transformer.side_effect = [
        ValueError(), ValueError(), (None, {})]
    failing_model.get_max_iter.return_value = 100

    auto = AutoML(backend, 30, 5, dask_client=dask_client)  # 30s total, 5s per run
    ensemble_mock = unittest.mock.Mock()
    ensemble_mock.get_selected_model_identifiers.return_value = [(1, 1, 50.0)]
    auto.ensemble_ = ensemble_mock
    auto.InputValidator = InputValidator()
    for budget_type in [None, 'iterations']:
        auto._budget_type = budget_type

        auto.models_ = {(1, 1, 50.0): failing_model}

        # Make sure a valid 2D array is given to automl
        X = np.array([1, 2, 3]).reshape(-1, 1)
        y = np.array([1, 2, 3])
        auto.InputValidator.fit(X, y)
        auto.refit(X, y)

        assert failing_model.fit.call_count == 3
    assert failing_model.fit_transformer.call_count == 3

    del auto
Example #2
def test_subsample_classification_unique_labels_stay_in_training_set(task, y):
    n_samples = 10000
    X = np.random.random(size=(n_samples, 3))
    memory_limit = 1  # Force subsampling
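    # Stand-in logger handed to subsample_if_too_large below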
    mock = unittest.mock.Mock()

    # Make sure our test assumptions are correct
    assert len(y) == n_samples, "Ensure tests are correctly set up"

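    # Labels that occur exactly once; subsampling must keep every one of
    # them in the training set, which is the property under test.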
    values, counts = np.unique(y, axis=0, return_counts=True)
    unique_labels = [
        value for value, count in zip(values, counts) if count == 1
    ]
    assert len(unique_labels), "Ensure we have unique labels in the test"

    _, y_sampled = AutoML.subsample_if_too_large(X,
                                                 y,
                                                 logger=mock,
                                                 seed=1,
                                                 memory_limit=memory_limit,
                                                 task=task)

    assert len(y_sampled) < len(y), \
        "Ensure sampling took place"
    assert all(label in y_sampled for label in unique_labels), \
        "All unique labels present in the returned sampled set"
Example #3
def test_subsample_if_too_large(memory_limit, task):
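    # Expected number of rows left after subsampling, keyed by task and
    # memory limit (MB); None means no limit, so the full dataset survives.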
    fixture = {
        BINARY_CLASSIFICATION: {
            1: 436,
            10: 569,
            None: 569
        },
        MULTICLASS_CLASSIFICATION: {
            1: 204,
            10: 1797,
            None: 1797
        },
        MULTILABEL_CLASSIFICATION: {
            1: 204,
            10: 1797,
            None: 1797
        },
        REGRESSION: {
            1: 1310,
            10: 1326,
            None: 1326
        },
        MULTIOUTPUT_REGRESSION: {
            1: 1310,
            10: 1326,
            None: 1326
        }
    }
    mock = unittest.mock.Mock()
    if task == BINARY_CLASSIFICATION:
        X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    elif task == MULTICLASS_CLASSIFICATION:
        X, y = sklearn.datasets.load_digits(return_X_y=True)
    elif task == MULTILABEL_CLASSIFICATION:
        X, y_ = sklearn.datasets.load_digits(return_X_y=True)
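        # One-hot encode the digit labels to obtain a multilabel target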
        y = np.zeros((X.shape[0], 10))
        for i, j in enumerate(y_):
            y[i, j] = 1
    elif task == REGRESSION:
        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
        X = np.vstack((X, X, X))
        y = np.vstack([y.reshape((-1, 1))] * 3)
    elif task == MULTIOUTPUT_REGRESSION:
        X, y = sklearn.datasets.load_diabetes(return_X_y=True)
        y = np.vstack((y, y)).transpose()
        X = np.vstack((X, X, X))
        y = np.vstack((y, y, y))
    else:
        raise ValueError(task)

    assert X.shape[0] == y.shape[0]

    X_new, y_new = AutoML.subsample_if_too_large(X, y, mock, 1, memory_limit,
                                                 task)
    assert X_new.shape[0] == fixture[task][memory_limit]
    if memory_limit == 1:
        assert mock.warning.call_count == 1
    else:
        assert mock.warning.call_count == 0
Example #4
def test_subsample_if_too_large(memory_limit, precision, task):
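    # Expected row counts keyed by task, memory limit (MB) and feature
    # dtype; halving the float width roughly doubles the rows that fit.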
    fixture = {
        BINARY_CLASSIFICATION: {
            1: {float: 1310, np.float32: 2621, np.float64: 1310, np.float128: 655},
            100: {float: 12000, np.float32: 12000, np.float64: 12000, np.float128: 12000},
            None: {float: 12000, np.float32: 12000, np.float64: 12000, np.float128: 12000},
        },
        MULTICLASS_CLASSIFICATION: {
            1: {float: 204, np.float32: 409, np.float64: 204, np.float128: 102},
            100: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
            None: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
        },
        MULTILABEL_CLASSIFICATION: {
            1: {float: 204, np.float32: 409, np.float64: 204, np.float128: 102},
            100: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
            None: {float: 1797, np.float32: 1797, np.float64: 1797, np.float128: 1797},
        },
        REGRESSION: {
            1: {float: 655, np.float32: 1310, np.float64: 655, np.float128: 327},
            100: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
            None: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
        },
        MULTIOUTPUT_REGRESSION: {
            1: {float: 655, np.float32: 1310, np.float64: 655, np.float128: 327},
            100: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
            None: {float: 5000, np.float32: 5000, np.float64: 5000, np.float128: 5000},
        }
    }
    mock = unittest.mock.Mock()
    if task == BINARY_CLASSIFICATION:
        X, y = sklearn.datasets.make_hastie_10_2()
    elif task == MULTICLASS_CLASSIFICATION:
        X, y = sklearn.datasets.load_digits(return_X_y=True)
    elif task == MULTILABEL_CLASSIFICATION:
        X, y_ = sklearn.datasets.load_digits(return_X_y=True)
        y = np.zeros((X.shape[0], 10))
        for i, j in enumerate(y_):
            y[i, j] = 1
    elif task == REGRESSION:
        X, y = sklearn.datasets.make_friedman1(n_samples=5000, n_features=20)
    elif task == MULTIOUTPUT_REGRESSION:
        X, y = sklearn.datasets.make_friedman1(n_samples=5000, n_features=20)
        y = np.vstack((y, y)).transpose()
    else:
        raise ValueError(task)
    X = X.astype(precision)

    assert X.shape[0] == y.shape[0]

    X_new, y_new = AutoML.subsample_if_too_large(X, y, mock, 1, memory_limit, task)
    assert X_new.shape[0] == fixture[task][memory_limit][precision]
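    # dtypes wider than float32 are expected to warn twice (presumably a
    # precision reduction on top of the subsampling); float32 warns once.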
    if memory_limit == 1:
        if precision in (np.float128, np.float64, float):
            assert mock.warning.call_count == 2
        else:
            assert mock.warning.call_count == 1
    else:
        assert mock.warning.call_count == 0
Example #5
    def test_refit_shuffle_on_fail(self):
        backend_api = self._create_backend('test_refit_shuffle_on_fail')

        failing_model = unittest.mock.Mock()
        failing_model.fit.side_effect = [ValueError(), ValueError(), None]
        failing_model.fit_transformer.side_effect = [
            ValueError(), ValueError(), (None, {})]
        failing_model.get_max_iter.return_value = 100

        auto = AutoML(backend_api, 20, 5)
        ensemble_mock = unittest.mock.Mock()
        ensemble_mock.get_selected_model_identifiers.return_value = [(1, 1, 50.0)]
        auto.ensemble_ = ensemble_mock
        for budget_type in [None, 'iterations']:
            auto._budget_type = budget_type

            auto.models_ = {(1, 1, 50.0): failing_model}

            # Make sure a valid 2D array is given to automl
            X = np.array([1, 2, 3]).reshape(-1, 1)
            y = np.array([1, 2, 3])
            auto.refit(X, y)

            self.assertEqual(failing_model.fit.call_count, 3)
        self.assertEqual(failing_model.fit_transformer.call_count, 3)

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #6
def test_smbo_metalearning_configurations(backend, context, dask_client):

    # Get the inputs to the optimizer; only_return_configuration_space=True
    # makes fit() build and return the ConfigSpace without running a search
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
    config_space = AutoML(backend=backend,
                          metric=autosklearn.metrics.accuracy,
                          time_left_for_this_task=20,
                          per_run_time_limit=5).fit(
                              X_train,
                              Y_train,
                              task=BINARY_CLASSIFICATION,
                              only_return_configuration_space=True)
    watcher = StopWatch()

    # Create an optimizer
    smbo = AutoMLSMBO(
        config_space=config_space,
        dataset_name='iris',
        backend=backend,
        total_walltime_limit=10,
        func_eval_time_limit=5,
        memory_limit=4096,
        metric=autosklearn.metrics.accuracy,
        watcher=watcher,
        n_jobs=1,
        dask_client=dask_client,
        port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
        start_num_run=1,
        data_memory_limit=None,
        num_metalearning_cfgs=25,
        pynisher_context=context,
    )
    assert smbo.pynisher_context == context

    # Create the inputs to metalearning
    datamanager = XYDataManager(
        X_train,
        Y_train,
        X_test,
        Y_test,
        task=BINARY_CLASSIFICATION,
        dataset_name='iris',
        feat_type={i: 'numerical' for i in range(X_train.shape[1])},
    )
    backend.save_datamanager(datamanager)
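    # Point the optimizer at the just-saved data manager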
    smbo.task = BINARY_CLASSIFICATION
    smbo.reset_data_manager()
    metalearning_configurations = smbo.get_metalearning_suggestions()

    # We should have 25 metalearning configurations
    assert len(metalearning_configurations) == 25
    assert all(
        isinstance(config, Configuration)
        for config in metalearning_configurations
    )
Example #7
    def test_refit_shuffle_on_fail(self):
        output = os.path.join(self.test_dir, '..', '.tmp_refit_shuffle_on_fail')
        context = BackendContext(output, output, False, False)
        backend = Backend(context)

        failing_model = unittest.mock.Mock()
        failing_model.fit.side_effect = [ValueError(), ValueError(), None]

        auto = AutoML(backend, 30, 30)
        ensemble_mock = unittest.mock.Mock()
        auto.ensemble_ = ensemble_mock
        ensemble_mock.get_model_identifiers.return_value = [1]

        auto.models_ = {1: failing_model}

        X = np.array([1, 2, 3])
        y = np.array([1, 2, 3])
        auto.refit(X, y)

        self.assertEqual(failing_model.fit.call_count, 3)
Example #8
    def test_refit_shuffle_on_fail(self):
        backend_api = self._create_backend('test_refit_shuffle_on_fail')

        failing_model = unittest.mock.Mock()
        failing_model.fit.side_effect = [ValueError(), ValueError(), None]

        auto = AutoML(backend_api, 20, 5)
        ensemble_mock = unittest.mock.Mock()
        auto.ensemble_ = ensemble_mock
        ensemble_mock.get_selected_model_identifiers.return_value = [1]

        auto.models_ = {1: failing_model}

        X = np.array([1, 2, 3])
        y = np.array([1, 2, 3])
        auto.refit(X, y)

        self.assertEqual(failing_model.fit.call_count, 3)

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)
Example #9
    def test_refit_shuffle_on_fail(self):
        output = os.path.join(self.test_dir, '..', '.tmp_refit_shuffle_on_fail')
        context = BackendContext(output, output, False, False)
        backend = Backend(context)

        failing_model = unittest.mock.Mock()
        failing_model.fit.side_effect = [ValueError(), ValueError(), None]

        auto = AutoML(backend, 20, 5)
        ensemble_mock = unittest.mock.Mock()
        auto.ensemble_ = ensemble_mock
        ensemble_mock.get_model_identifiers.return_value = [1]

        auto.models_ = {1: failing_model}

        X = np.array([1, 2, 3])
        y = np.array([1, 2, 3])
        auto.refit(X, y)

        self.assertEqual(failing_model.fit.call_count, 3)
Example #10
    def test_refit_shuffle_on_fail(self):
        backend_api = self._create_backend('test_refit_shuffle_on_fail')

        failing_model = unittest.mock.Mock()
        failing_model.fit.side_effect = [ValueError(), ValueError(), None]

        auto = AutoML(backend_api, 20, 5)
        ensemble_mock = unittest.mock.Mock()
        auto.ensemble_ = ensemble_mock
        ensemble_mock.get_selected_model_identifiers.return_value = [1]

        auto.models_ = {1: failing_model}

        X = np.array([1, 2, 3])
        y = np.array([1, 2, 3])
        auto.refit(X, y)

        self.assertEqual(failing_model.fit.call_count, 3)

        del auto
        self._tearDown(backend_api.temporary_directory)
        self._tearDown(backend_api.output_directory)