Example #1
    def test_cv_results(self):
        # TODO restructure and actually use real SMAC output from a long run
        # to do this unittest!
        tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
        output = os.path.join(self.test_dir, '..', '.out_cv_results')
        self._setUp(tmp)
        self._setUp(output)
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                    per_run_time_limit=5,
                                    output_folder=output,
                                    tmp_folder=tmp,
                                    shared_mode=False,
                                    seed=1,
                                    initial_configurations_via_metalearning=0,
                                    ensemble_size=0)
        cls.fit(X_train, Y_train)
        cv_results = cls.cv_results_
        self.assertIsInstance(cv_results, dict)
        self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
        self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
        self.assertIsInstance(cv_results['params'], list)
        self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)
        # Use all() so every element is actually checked; asserting on a
        # non-empty list object is always truthy regardless of its contents.
        self.assertTrue(all(isinstance(val, npma.MaskedArray) for key, val in
                            cv_results.items() if key.startswith('param_')))
        del cls
        self._tearDown(tmp)
        self._tearDown(output)

    def test_fit_pSMAC(self):
        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=1,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        true_targets_ensemble = np.load(true_targets_ensemble_path)
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(output, '.auto-sklearn',
                                              'predictions_ensemble',
                                              'predictions_ensemble_1_00030.npy')
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        probas_test = np.zeros((len(Y_test), 3), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        backend = Backend(output, output)
        backend.save_model(dummy, 30, 1)

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=2,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)
        automl.run_ensemble_builder(0, 1, 50).wait()

        score = automl.score(X_test, Y_test)

        self.assertEqual(len(os.listdir(os.path.join(output, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #3
def spawn_classifier(seed, dataset_name):
    """Spawn a subprocess.

    auto-sklearn does not take care of spawning worker processes. This
    function, which is called several times in the main block, starts a new
    process that runs one instance of auto-sklearn.
    """

    # Use the initial configurations from meta-learning only in one out of
    # the four processes spawned. This prevents auto-sklearn from evaluating
    # the same configurations in four processes.
    if seed == 0:
        initial_configurations_via_metalearning = 25
    else:
        initial_configurations_via_metalearning = 0

    # Arguments that differ from other runs of auto-sklearn:
    # 1. all classifiers write to the same output directory
    # 2. shared_mode is set to True, this enables sharing of data between
    # models.
    # 3. all instances of the AutoSklearnClassifier must have a different seed!
    automl = AutoSklearnClassifier(
        time_left_for_this_task=120,  # sec., how long this seed's fit process should run
        per_run_time_limit=60,  # sec., each model may only take this long before it's killed
        ml_memory_limit=1024,  # MB, memory limit imposed on each call to an ML algorithm
        shared_mode=True,  # tmp folder will be shared between seeds
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        ensemble_size=0,  # ensembles will be built when all optimization runs are finished
        initial_configurations_via_metalearning=initial_configurations_via_metalearning,
        seed=seed,
    )
    automl.fit(X_train, y_train, dataset_name=dataset_name)

def spawn_classifier(seed, dataset_name):

    automl = AutoSklearnClassifier(time_left_for_this_task=600, # sec., how long this seed's fit process should run
                                   per_run_time_limit=60, # sec., each model may only take this long before it's killed
                                   ml_memory_limit=1024, # MB
                                   shared_mode=True, # tmp folder will be shared between seeds
                                   tmp_folder=tmp_folder,
                                   output_folder=output_folder,
                                   delete_tmp_folder_after_terminate=False,
                                   ensemble_size=0, # no need to build ensembles at this stage
                                   initial_configurations_via_metalearning=0, # let seeds profit from each other's results
                                   seed=seed)
    automl.fit(X_train, y_train, dataset_name=dataset_name)
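
A minimal sketch of the main block the docstring above refers to, assuming four worker processes and the 'digits' dataset name (both are illustrative choices, not taken from the snippet):

# Hypothetical main block: spawn four worker processes, each running one
# seeded auto-sklearn instance against the shared tmp/output folders.
import multiprocessing

if __name__ == '__main__':
    processes = []
    for seed in range(4):
        p = multiprocessing.Process(target=spawn_classifier,
                                    args=(seed, 'digits'))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()  # wait until every optimization run has finished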
Example #5
    def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
        automl = AutoSklearnClassifier()
        X = [[1], [2], [3]]
        y = [1, 2, 3]
        automl.fit(X, y)
        self.assertEqual(fit.call_count, 1)
        self.assertIsInstance(fit.call_args[0][0], np.ndarray)
        self.assertIsInstance(fit.call_args[0][1], np.ndarray)
        automl.refit(X, y)
        self.assertEqual(refit.call_count, 1)
        self.assertIsInstance(refit.call_args[0][0], np.ndarray)
        self.assertIsInstance(refit.call_args[0][1], np.ndarray)
        automl.fit_ensemble(y)
        self.assertEqual(fit_ensemble.call_count, 1)
        self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
Example #6
    def test_can_pickle_classifier(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
        output = os.path.join(self.test_dir, '..', '.out_can_pickle')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)
        automl.fit(X_train, Y_train)

        initial_predictions = automl.predict(X_test)
        initial_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                          initial_predictions)
        self.assertGreaterEqual(initial_accuracy, 0.75)

        # Test pickle
        dump_file = os.path.join(output, 'automl.dump.pkl')

        with open(dump_file, 'wb') as f:
            pickle.dump(automl, f)

        with open(dump_file, 'rb') as f:
            restored_automl = pickle.load(f)

        restored_predictions = restored_automl.predict(X_test)
        restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                           restored_predictions)
        self.assertGreaterEqual(restored_accuracy, 0.75)

        self.assertEqual(initial_accuracy, restored_accuracy)

        # Test joblib
        dump_file = os.path.join(output, 'automl.dump.joblib')

        sklearn.externals.joblib.dump(automl, dump_file)

        restored_automl = sklearn.externals.joblib.load(dump_file)

        restored_predictions = restored_automl.predict(X_test)
        restored_accuracy = sklearn.metrics.accuracy_score(Y_test,
                                                           restored_predictions)
        self.assertGreaterEqual(restored_accuracy, 0.75)

        self.assertEqual(initial_accuracy, restored_accuracy)
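
Note that sklearn.externals.joblib, used above, was deprecated in scikit-learn 0.21 and removed in 0.23. On newer versions the same round-trip would use the standalone joblib package; a minimal sketch, assuming a fitted automl estimator:

# Equivalent dump/load with the standalone joblib package
# (sklearn.externals.joblib no longer exists in scikit-learn >= 0.23).
import joblib

joblib.dump(automl, 'automl.dump.joblib')
restored_automl = joblib.load('automl.dump.joblib')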
Example #7
    def test_fit(self):

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=5,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._automl._automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #8
    def test_multilabel(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
        output = os.path.join(self.test_dir, '..', '.out_multilabel_fit')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris', make_multilabel=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, 3))
        score = f1_macro(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)
        probs = automl.predict_proba(X_train)
        self.assertAlmostEqual(np.mean(probs), 0.33, places=1)

    def test_fit(self):
        if self.travis:
            self.skipTest('This test currently does not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       tmp_folder=output,
                                       output_folder=output)
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #10
    def test_binary(self):
        tmp = os.path.join(self.test_dir, '..', '.out_binary_fit')
        output = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
        self._setUp(output)
        self._setUp(tmp)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris', make_binary=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)

        automl.fit(X_train, Y_train, X_test=X_test, y_test=Y_test,
                   dataset_name='binary_test_dataset')
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, ))
        score = accuracy(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)

        output_files = os.listdir(output)
        self.assertIn('binary_test_dataset_test_1.predict', output_files)
Example #11
    def test_classification_methods_returns_self(self):
        X_train, y_train, X_test, y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       ensemble_size=0)

        automl_fitted = automl.fit(X_train, y_train)
        self.assertIs(automl, automl_fitted)

        automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(automl, automl_ensemble_fitted)

        automl_refitted = automl.refit(X_train.copy(), y_train.copy())
        self.assertIs(automl, automl_refitted)
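
Because fit() and refit() return self, calls can be chained in the usual scikit-learn style. A small illustrative sketch (the 30-second budget is an arbitrary choice):

# Chaining works because fit() returns the estimator itself.
predictions = AutoSklearnClassifier(
    time_left_for_this_task=30,
).fit(X_train, y_train).predict(X_test)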
Example #12
def train(X, y):
    """example of auto-sklearn for a classification dataset"""
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=1
    )
    # define search
    model = AutoSklearnClassifier(
        time_left_for_this_task=30,
        # per_run_time_limit=30,
        # n_jobs=8,
    )
    # perform the search
    model.fit(X_train, y_train)
    # summarize
    print(model.sprint_statistics())
    # evaluate best model
    y_hat = model.predict(X_test)
    acc = accuracy_score(y_test, y_hat)
    print("Accuracy: %.3f" % acc)

    model_path = Path("./catanatron/players/estimator.pickle").resolve()
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
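
The model pickled by train() above can later be reloaded for inference. A minimal sketch, reusing the same model_path; X_new is a hypothetical placeholder for fresh feature rows:

# Reload the estimator pickled by train() and predict on new data.
import pickle
from pathlib import Path

model_path = Path("./catanatron/players/estimator.pickle").resolve()
with open(model_path, "rb") as f:
    model = pickle.load(f)
# y_hat = model.predict(X_new)  # X_new: new rows with the same columns as X
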
    def test_classification_pandas_support(self):
        X, y = sklearn.datasets.fetch_openml(
            data_id=2,  # cat/num dataset
            return_X_y=True,
            as_frame=True,
        )

        # Drop NaN columns
        X = X.dropna(axis='columns')

        # This test only makes sense if the input is a dataframe
        self.assertTrue(isinstance(X, pd.DataFrame))
        self.assertTrue(isinstance(y, pd.Series))
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            exclude_estimators=['libsvm_svc'],
            seed=5,
        )

        automl.fit(X, y)

        # Make sure the score is at least better than random.
        # We use the same data for train and test (X_train == X_test) to test code quality
        self.assertTrue(automl.score(X, y) > 0.555)

        automl.refit(X, y)

        # Make sure the predictions are at least better than random.
        # accuracy in sklearn needs validly encoded data.
        # The 0.555 threshold corresponds to the majority class of this
        # unbalanced dataset.
        y = automl.automl_.InputValidator.encode_target(y)
        prediction = automl.automl_.InputValidator.encode_target(
            automl.predict(X))
        self.assertTrue(accuracy(y, prediction) > 0.555)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)
Example #14
def spawn_classifier(seed, dataset_name):
    digits = sklearn.datasets.load_digits()
    X = digits.data
    y = digits.target
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    X_train = X[:1000]
    y_train = y[:1000]
    X_test = X[1000:]
    y_test = y[1000:]

    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   per_run_time_limit=60,
                                   ml_memory_limit=1024,
                                   shared_mode=True,
                                   tmp_folder=tmp_folder,
                                   output_folder=output_folder,
                                   delete_tmp_folder_after_terminate=False,
                                   ensemble_size=0,
                                   initial_configurations_via_metalearning=0,
                                   seed=seed)
    automl.fit(X_train, y_train, dataset_name=dataset_name)

    def test_classification_methods_returns_self(self):
        X_train, y_train, X_test, y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                       per_run_time_limit=10,
                                       ensemble_size=0,
                                       exclude_preprocessors=['fast_ica'])

        automl_fitted = automl.fit(X_train, y_train)
        self.assertIs(automl, automl_fitted)

        automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
        self.assertIs(automl, automl_ensemble_fitted)

        automl_refitted = automl.refit(X_train.copy(), y_train.copy())
        self.assertIs(automl, automl_refitted)
Example #16
def test_autosklearn_classification_methods_returns_self(dask_client):
    """
    Currently this test only checks that the AutoSklearnClassifier methods
    fit(), fit_ensemble() and refit() run and return self.
    """
    X_train, y_train, X_test, y_test = putil.get_dataset('iris')
    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   per_run_time_limit=10,
                                   ensemble_size=0,
                                   dask_client=dask_client,
                                   exclude_preprocessors=['fast_ica'])

    automl_fitted = automl.fit(X_train, y_train)
    assert automl is automl_fitted

    automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
    assert automl is automl_ensemble_fitted

    automl_refitted = automl.refit(X_train.copy(), y_train.copy())
    assert automl is automl_refitted
Example #17
def test_feat_type_wrong_arguments():
    cls = AutoSklearnClassifier(ensemble_size=0)
    X = np.zeros((100, 100))
    y = np.zeros((100, ))

    expected_msg = r".*Array feat_type does not have same number of "
    "variables as X has features. 1 vs 100.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y, feat_type=[True])

    expected_msg = r".*Array feat_type must only contain strings.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y, feat_type=[True] * 100)

    expected_msg = r".*Only `Categorical` and `Numerical` are"
    "valid feature types, you passed `Car`.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y, feat_type=['Car'] * 100)
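
Worth noting: in the original snippet each second string literal stood alone as a separate statement and was silently discarded, so the regex only ever contained its first half. The parenthesized form used above makes Python's implicit string concatenation apply; a tiny demonstration:

# Implicit string concatenation only happens inside a single expression.
msg = "first half "
"second half"          # a bare expression statement: evaluated and discarded
assert msg == "first half "

msg = ("first half "
       "second half")  # parentheses make the two literals one expression
assert msg == "first half second half"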
Example #18
}

X_train, y_train, X_test, y_test, cat = load_task(task_id)

if task_type == 'classification':
    automl = AutoSklearnClassifier(**automl_arguments)
    metric = balanced_accuracy
elif task_type == 'regression':
    automl = AutoSklearnRegressor(**automl_arguments)
    metric = r2
else:
    raise ValueError(task_type)

automl.fit(X_train,
           y_train,
           dataset_name=str(task_id),
           metric=metric,
           feat_type=cat)
data = automl._automl._backend.load_datamanager()
# Data manager can't be replaced with save_datamanager, it has to be deleted
# first
os.remove(automl._automl._backend._get_datamanager_pickle_filename())
data.data['X_test'] = X_test
data.data['Y_test'] = y_test
automl._automl._backend.save_datamanager(data)
trajectory = automl.trajectory_

incumbent_id_to_model = {}
incumbent_id_to_performance = {}
validated_trajectory = []
        raise ValueError("Wrong set type, should be `train` or `test`!")
    # when the task is binary.classification or regression, transform it to multilabel
    if task == 'regression':
        labels = regression_to_multilabel(labels)
    elif task == 'binary.classification':
        labels = binary_to_multilabel(labels)
    return features, labels


if __name__ == '__main__':
    input_dir = '../../../autodl-contrib/raw_datasets/automl'
    output_dir = '../'
    for dataset_name in ['dorothea', 'adult']:
        D = DataManager(dataset_name,
                        input_dir,
                        replace_missing=False,
                        verbose=verbose)
        X_test, Y_test = _prepare_metadata_features_and_labels(D,
                                                               set_type='test')
        X_train, Y_train = _prepare_metadata_features_and_labels(
            D, set_type='train')
        print(Y_test.shape)
        time_budget = 7200
        model = AutoSklearnClassifier(time_left_for_this_task=time_budget,
                                      per_run_time_limit=time_budget // 10)
        model.fit(X_train, Y_train)
        predict_path = os.path.join(output_dir, dataset_name + '.predict')
        Y_hat_test = model.predict_proba(X_test)
        print(Y_hat_test.shape)
        data_io.write(predict_path, Y_hat_test)

def run_experiment(
    working_directory,
    time_limit,
    per_run_time_limit,
    task_id,
    seed,
    use_metalearning,
):
    # set this to local dataset cache
    # openml.config.cache_directory = os.path.join(working_directory, "../cache")

    seed_dir = os.path.join(working_directory, str(seed))
    try:
        os.makedirs(seed_dir)
    except Exception:
        print("Directory {0} aleardy created.".format(seed_dir))

    tmp_dir = os.path.join(seed_dir, str(task_id))

    # With metalearning
    if use_metalearning is True:
        # path to the original metadata directory.
        metadata_directory = os.path.abspath(os.path.dirname(__file__))
        metadata_directory = os.path.join(
            metadata_directory, "../../../autosklearn/metalearning/files/")

        # Create new metadata directory not containing task_id.
        new_metadata_directory = os.path.abspath(
            os.path.join(working_directory, "metadata_%i" % task_id))

        try:
            os.makedirs(new_metadata_directory)
        except OSError:
            pass  # pass because new metadata is created for this task.

        # remove the given task id from metadata directory.
        remove_dataset(metadata_directory, new_metadata_directory, task_id)

        automl_arguments = {
            'time_left_for_this_task': time_limit,
            'per_run_time_limit': per_run_time_limit,
            'initial_configurations_via_metalearning': 25,
            'ensemble_size': 0,
            'seed': seed,
            'memory_limit': 3072,
            'resampling_strategy': 'holdout',
            'resampling_strategy_arguments': {
                'train_size': 0.67
            },
            'tmp_folder': tmp_dir,
            'delete_tmp_folder_after_terminate': False,
            'disable_evaluator_output': False,
            'metadata_directory': new_metadata_directory
        }

    # Without metalearning
    else:
        automl_arguments = {
            'time_left_for_this_task': time_limit,
            'per_run_time_limit': per_run_time_limit,
            'initial_configurations_via_metalearning': 0,
            'ensemble_size': 0,
            'seed': seed,
            'memory_limit': 3072,
            'resampling_strategy': 'holdout',
            'resampling_strategy_arguments': {
                'train_size': 0.67
            },
            'tmp_folder': tmp_dir,
            'delete_tmp_folder_after_terminate': False,
            'disable_evaluator_output': False,
        }

    automl = AutoSklearnClassifier(**automl_arguments)

    X_train, y_train, X_test, y_test, cat = load_task(task_id)

    automl.fit(X_train,
               y_train,
               dataset_name=str(task_id),
               X_test=X_test,
               y_test=y_test,
               metric=balanced_accuracy)
Example #21
    def test_type_of_target(self, mock_estimator):
        # Test that classifier raises error for illegal target types.
        X = np.array([
            [1, 2],
            [2, 3],
            [3, 4],
            [4, 5],
        ])
        # Possible target types
        y_binary = np.array([0, 0, 1, 1])
        y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
        y_multiclass = np.array([0, 1, 2, 0])
        y_multilabel = np.array([
            [0, 1],
            [1, 1],
            [1, 0],
            [0, 0],
        ])
        y_multiclass_multioutput = np.array([
            [0, 1],
            [1, 3],
            [2, 2],
            [5, 3],
        ])
        y_continuous_multioutput = np.array([
            [0.1, 1.5],
            [1.2, 3.5],
            [2.7, 2.7],
            [5.5, 3.9],
        ])

        cls = AutoSklearnClassifier()
        # Illegal target types for classification: continuous,
        # multiclass-multioutput, continuous-multioutput.
        self.assertRaisesRegex(
            ValueError,
            "classification with data of type"
            " multiclass-multioutput is not supported",
            cls.fit,
            X=X,
            y=y_multiclass_multioutput,
        )

        self.assertRaisesRegex(
            ValueError,
            "classification with data of type"
            " continuous is not supported",
            cls.fit,
            X=X,
            y=y_continuous,
        )

        self.assertRaisesRegex(
            ValueError,
            "classification with data of type"
            " continuous-multioutput is not supported",
            cls.fit,
            X=X,
            y=y_continuous_multioutput,
        )

        # Legal target types for classification: binary, multiclass,
        # multilabel-indicator.
        try:
            cls.fit(X, y_binary)
        except ValueError:
            self.fail("cls.fit() raised ValueError while fitting "
                      "binary targets")

        try:
            cls.fit(X, y_multiclass)
        except ValueError:
            self.fail("cls.fit() raised ValueError while fitting "
                      "multiclass targets")

        try:
            cls.fit(X, y_multilabel)
        except ValueError:
            self.fail("cls.fit() raised ValueError while fitting "
                      "multilabel-indicator targets")

        # Test that regressor raises error for illegal target types.
        reg = AutoSklearnRegressor()
        # Illegal target types for regression: multiclass-multioutput,
        # multilabel-indicator, continuous-multioutput.
        self.assertRaisesRegex(
            ValueError,
            "regression with data of type"
            " multiclass-multioutput is not supported",
            reg.fit,
            X=X,
            y=y_multiclass_multioutput,
        )

        self.assertRaisesRegex(
            ValueError,
            "regression with data of type"
            " multilabel-indicator is not supported",
            reg.fit,
            X=X,
            y=y_multilabel,
        )

        self.assertRaisesRegex(
            ValueError,
            "regression with data of type"
            " continuous-multioutput is not supported",
            reg.fit,
            X=X,
            y=y_continuous_multioutput,
        )
        # Legal target types: continuous, binary, multiclass
        try:
            reg.fit(X, y_continuous)
        except ValueError:
            self.fail("reg.fit() raised ValueError while fitting "
                      "continuous targets")

        try:
            reg.fit(X, y_binary)
        except ValueError:
            self.fail("reg.fit() raised ValueError while fitting "
                      "binary targets")

        try:
            reg.fit(X, y_multiclass)
        except ValueError:
            self.fail("reg.fit() raised ValueError while fitting "
                      "multiclass targets")
Example #22
    def test_fit_pSMAC(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

        # test parallel Classifier to predict classes, not only indices
        Y_train += 1
        Y_test += 1

        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        n_models_fit = len(automl.cv_results_['mean_test_score'])
        cv_results = automl.cv_results_['mean_test_score']

        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        n_models_fit_2 = len(automl.cv_results_['mean_test_score'])

        # Check that the results from the first run were actually read by the
        # second run
        self.assertGreater(n_models_fit_2, n_models_fit)
        for score in cv_results:
            self.assertIn(
                score,
                automl.cv_results_['mean_test_score'],
                msg=str((automl.cv_results_['mean_test_score'], cv_results)),
            )

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        with open(true_targets_ensemble_path, 'rb') as fh:
            true_targets_ensemble = np.load(fh, allow_pickle=True)
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        true_targets_ensemble = true_targets_ensemble.astype(int)
        probas = np.zeros((len(true_targets_ensemble), 2), dtype=float)

        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(
            tmp,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_0_999_0.0.npy',
        )
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        probas_test = np.zeros((len(Y_test), 2), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value - 1] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        context = BackendContext(tmp, output, False, False, True)
        backend = Backend(context)
        model_path = backend.get_model_path(seed=0, idx=999, budget=0.0)
        backend.save_model(model=dummy, filepath=model_path)

        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=3,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
            metric=accuracy,
        )
        automl.fit_ensemble(Y_train, task=BINARY_CLASSIFICATION,
                            precision='32',
                            dataset_name='breast_cancer',
                            ensemble_size=20,
                            ensemble_nbest=50,
                            )

        predictions = automl.predict(X_test)
        score = sklearn.metrics.accuracy_score(Y_test, predictions)

        self.assertEqual(len(os.listdir(os.path.join(tmp, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._automl[0]._task, BINARY_CLASSIFICATION)

        models = automl._automl[0].models_
        classifier_types = [type(c) for c in models.values()]
        self.assertIn(ArrayReturningDummyPredictor, classifier_types)

        del automl
        self._tearDown(tmp)
        self._tearDown(output)

    'mean_squared_error': mean_squared_error,
    'root_mean_squared_error': root_mean_squared_error,
    'mean_absolute_error': mean_absolute_error,
}[metric]
automl_arguments['metric'] = metric

if task_type == 'classification':
    automl = AutoSklearnClassifier(**automl_arguments)
elif task_type == 'regression':
    automl = AutoSklearnRegressor(**automl_arguments)
else:
    raise ValueError(task_type)

automl.fit(X_train,
           y_train,
           dataset_name=str(task_id),
           feat_type=cat,
           X_test=X_test,
           y_test=y_test)
trajectory = automl.trajectory_

incumbent_id_to_model = {}
incumbent_id_to_performance = {}
validated_trajectory = []

if is_test:
    memory_limit_factor = 1
else:
    memory_limit_factor = 2

print('Starting to validate configurations')
for i, entry in enumerate(trajectory):
Example #24
    def fit_automl(self, run_time):
        """Runs auto-sklearn on the uploaded data and prints results.

        Side effects:
            - Enables upload_widget

        Args:
            run_time (int): The run time for auto-sklearn in seconds.
        Returns:
            automl (AutoSklearnClassifier): fitted auto-sklearn model.
        """

        automl_args = {}

        automl_args['time_left_for_this_task'] = run_time
        # TODO functionality to load this from Mongo
        automl_args['metadata_directory'] = ".metalearning/metalearning_files/"
        #automl_args['metadata_directory'] = "../metalearning/metalearning_files/"

        automl = AutoSklearnClassifier(**automl_args)
        thread = threading.Thread(target=self.update_progress,
                                  args=(self.progress_widget, ))
        thread.start()

        # always load a copy of the latest dataset
        cur_data = self.data[-1].copy()

        y = cur_data.pop(0)
        X, feat_types, _ = model_utils.process_feat_types(cur_data)

        X_train = X.iloc[self.train_idxs]
        y_train = y.iloc[self.train_idxs]

        X_test = X.iloc[self.test_idxs]
        y_test = y.iloc[self.test_idxs]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            with HiddenPrints():
                automl.fit(X_train, y_train, feat_type=feat_types)

        # Automl has finished fitting:
        self.models.append(copy.deepcopy(automl))

        with self.event_output_widget:
            print("FITTING COMPLETED WITH FITTING TIME PARAMETER AS ",
                  int(run_time / 60), " MINUTES")

        with self.metrics_output_widget:
            y_train_hat = automl.predict(X_train)
            train_accuracy_score = metrics.accuracy_score(y_train, y_train_hat)

            y_test_hat = automl.predict(X_test)
            test_accuracy_score = metrics.accuracy_score(y_test, y_test_hat)

            thresholdout_score = model_utils.thresholdout(
                train_accuracy_score, test_accuracy_score)

            output_str = "Run {}: train acc: {:.4}, noised test acc: {:.4}\n".format(
                self.queries, train_accuracy_score, thresholdout_score)
            print(output_str)

        with self.model_output_widget:
            print("MODELS:")
            print(automl.get_models_with_weights())

        if self.textbox_upload:
            self.upload_button.disabled = False
            self.upload_text.disabled = False
        else:
            self.upload_widget.disabled = False

        if self.queries == self.budget_widget.value:
            self.on_budget_completion()

        return automl
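
HiddenPrints used above is a custom helper, not part of auto-sklearn. A plausible minimal implementation, assuming it only needs to silence stdout while fitting:

# Assumed implementation of HiddenPrints: a context manager muting stdout.
import os
import sys

class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout
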

    def test_fit_n_jobs(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_n_jobs')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_n_jobs')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

        # test parallel Classifier to predict classes, not only indices
        Y_train += 1
        Y_test += 1

        class get_smac_object_wrapper:
            def __call__(self, *args, **kwargs):
                self.n_jobs = kwargs['n_jobs']
                smac = get_smac_object(*args, **kwargs)
                self.dask_n_jobs = smac.solver.tae_runner.n_workers
                self.dask_client_n_jobs = len(
                    smac.solver.tae_runner.client.scheduler_info()['workers'])
                return smac

        get_smac_object_wrapper_instance = get_smac_object_wrapper()

        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=5,
            n_jobs=2,
            include_estimators=['sgd'],
            include_preprocessors=['no_preprocessing'],
            get_smac_object_callback=get_smac_object_wrapper_instance,
            max_models_on_disc=None,
        )
        automl.fit(X_train, Y_train)

        # Test that the argument is correctly passed to SMAC
        self.assertEqual(getattr(get_smac_object_wrapper_instance, 'n_jobs'),
                         2)
        self.assertEqual(
            getattr(get_smac_object_wrapper_instance, 'dask_n_jobs'), 2)
        self.assertEqual(
            getattr(get_smac_object_wrapper_instance, 'dask_client_n_jobs'), 2)

        available_num_runs = set()
        for run_key, run_value in automl.automl_.runhistory_.data.items():
            if run_value.additional_info is not None and 'num_run' in run_value.additional_info:
                available_num_runs.add(run_value.additional_info['num_run'])
        predictions_dir = automl.automl_._backend._get_prediction_output_dir(
            'ensemble')
        available_predictions = set()
        predictions = os.listdir(predictions_dir)
        seeds = set()
        for prediction in predictions:
            match = re.match(MODEL_FN_RE,
                             prediction.replace("predictions_ensemble", ""))
            print(prediction, match)
            if match:
                num_run = int(match.group(2))
                available_predictions.add(num_run)
                seed = int(match.group(1))
                seeds.add(seed)

        # Remove the dummy prediction; it is not part of the runhistory
        available_predictions.remove(1)
        self.assertSetEqual(available_predictions, available_num_runs)

        self.assertEqual(len(seeds), 1)

        ensemble_dir = automl.automl_._backend.get_ensemble_dir()
        ensembles = os.listdir(ensemble_dir)

        seeds = set()
        for ensemble_file in ensembles:
            seeds.add(int(ensemble_file.split('.')[0].split('_')[0]))
        self.assertEqual(len(seeds), 1)

        self.assertGreater(self._count_succeses(automl.cv_results_), 0)

        self._tearDown(tmp)
        self._tearDown(output)
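
MODEL_FN_RE used above is defined elsewhere in the test module. After the "predictions_ensemble" prefix is stripped, the remaining filename has the shape _<seed>_<num_run>[_<budget>].npy, so a pattern along the following lines would match; the exact regex here is an assumption:

# Assumed shape of MODEL_FN_RE, matching "_<seed>_<num_run>[_<budget>].npy".
import re

MODEL_FN_RE = r'_([0-9]+)_([0-9]+)(_[0-9]+\.[0-9]+)?\.npy$'

match = re.match(MODEL_FN_RE, '_1_00030.npy')
if match:
    seed, num_run = int(match.group(1)), int(match.group(2))  # 1, 30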
Example #26
def test_fit_n_jobs(tmp_dir, output_dir):

    X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

    # test parallel Classifier to predict classes, not only indices
    Y_train += 1
    Y_test += 1

    class get_smac_object_wrapper:
        def __call__(self, *args, **kwargs):
            self.n_jobs = kwargs['n_jobs']
            smac = get_smac_object(*args, **kwargs)
            self.dask_n_jobs = smac.solver.tae_runner.n_workers
            self.dask_client_n_jobs = len(
                smac.solver.tae_runner.client.scheduler_info()['workers'])
            return smac

    get_smac_object_wrapper_instance = get_smac_object_wrapper()

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        output_folder=output_dir,
        tmp_folder=tmp_dir,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=5,
        n_jobs=2,
        include_estimators=['sgd'],
        include_preprocessors=['no_preprocessing'],
        get_smac_object_callback=get_smac_object_wrapper_instance,
        max_models_on_disc=None,
    )
    automl.fit(X_train, Y_train)

    # Test that the argument is correctly passed to SMAC
    assert getattr(get_smac_object_wrapper_instance, 'n_jobs') == 2
    assert getattr(get_smac_object_wrapper_instance, 'dask_n_jobs') == 2
    assert getattr(get_smac_object_wrapper_instance, 'dask_client_n_jobs') == 2

    available_num_runs = set()
    for run_key, run_value in automl.automl_.runhistory_.data.items():
        if run_value.additional_info is not None and 'num_run' in run_value.additional_info:
            available_num_runs.add(run_value.additional_info['num_run'])
    available_predictions = set()
    predictions = glob.glob(
        os.path.join(automl.automl_._backend.get_runs_directory(), '*',
                     'predictions_ensemble*.npy'))
    seeds = set()
    for prediction in predictions:
        prediction = os.path.split(prediction)[1]
        match = re.match(MODEL_FN_RE,
                         prediction.replace("predictions_ensemble", ""))
        if match:
            num_run = int(match.group(2))
            available_predictions.add(num_run)
            seed = int(match.group(1))
            seeds.add(seed)

    # Remove the dummy prediction; it is not part of the runhistory
    available_predictions.remove(1)
    assert available_num_runs.issubset(available_predictions)

    assert len(seeds) == 1

    ensemble_dir = automl.automl_._backend.get_ensemble_dir()
    ensembles = os.listdir(ensemble_dir)

    seeds = set()
    for ensemble_file in ensembles:
        seeds.add(int(ensemble_file.split('.')[0].split('_')[0]))
    assert len(seeds) == 1

    assert count_succeses(automl.cv_results_) > 0
    # For travis-ci it is important that the client no longer exists
    assert automl.automl_._dask_client is None
Example #27
def test_type_of_target(mock_estimator):
    # Test that classifier raises error for illegal target types.
    X = np.array([
        [1, 2],
        [2, 3],
        [3, 4],
        [4, 5],
    ])
    # Possible target types
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    y_multiclass_multioutput = np.array([
        [0, 1],
        [1, 3],
        [2, 2],
        [5, 3],
    ])
    y_continuous_multioutput = np.array([
        [0.1, 1.5],
        [1.2, 3.5],
        [2.7, 2.7],
        [5.5, 3.9],
    ])

    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal target types for classification: continuous,
    # multiclass-multioutput, continuous-multioutput.
    expected_msg = r".*Classification with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_multiclass_multioutput)

    expected_msg = r".*Classification with data of type"
    " continuous is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous)

    expected_msg = r".*Classification with data of type"
    " continuous-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        cls.fit(X=X, y=y_continuous_multioutput)

    # Legal target types for classification: binary, multiclass,
    # multilabel-indicator.
    try:
        cls.fit(X, y_binary)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "binary targets")

    try:
        cls.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        cls.fit(X, y_multilabel)
    except ValueError:
        pytest.fail("cls.fit() raised ValueError while fitting "
                    "multilabel-indicator targets")

    # Test that regressor raises error for illegal target types.
    reg = AutoSklearnRegressor(ensemble_size=0)
    # Illegal target types for regression: multilabel-indicator
    # multiclass-multioutput
    expected_msg = r".*Regression with data of type"
    " multilabel-indicator is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multilabel,
        )

    expected_msg = r".*Regression with data of type"
    " multiclass-multioutput is not supported.*"
    with pytest.raises(ValueError, match=expected_msg):
        reg.fit(
            X=X,
            y=y_multiclass_multioutput,
        )

    # Legal target types: continuous, multiclass,
    # continuous-multioutput,
    # binary
    try:
        reg.fit(X, y_continuous)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous targets")

    try:
        reg.fit(X, y_multiclass)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "multiclass targets")

    try:
        reg.fit(X, y_continuous_multioutput)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "continuous_multioutput targets")

    try:
        reg.fit(X, y_binary)
    except ValueError:
        pytest.fail("reg.fit() raised ValueError while fitting "
                    "binary targets")
Example #28
def evaluate_ml_algorithm(dataset,
                          algo,
                          run_id,
                          obj_metric,
                          time_limit=600,
                          seed=1,
                          task_type=None):
    if algo == 'lightgbm':
        _algo = ['LightGBM']
        add_classifier(LightGBM)
    elif algo == 'logistic_regression':
        _algo = ['Logistic_Regression']
        add_classifier(Logistic_Regression)
    else:
        _algo = [algo]
    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))
    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        task_type = BINARY_CLS if len(set(
            train_data.data[1])) == 2 else MULTICLASS_CLS
    print(set(train_data.data[1]))

    raw_data, test_raw_data = load_train_test_data(dataset,
                                                   task_type=MULTICLASS_CLS)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data
    feat_type = [
        'Categorical' if _type == CATEGORICAL else 'Numerical'
        for _type in raw_data.feature_types
    ]
    from autosklearn.metrics import balanced_accuracy as balanced_acc
    automl = AutoSklearnClassifier(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=180,
        n_jobs=1,
        include_estimators=_algo,
        initial_configurations_via_metalearning=0,
        ensemble_memory_limit=16384,
        ml_memory_limit=16384,
        # tmp_folder='/var/folders/0t/mjph32q55hd10x3qr_kdd2vw0000gn/T/autosklearn_tmp',
        ensemble_size=1,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    automl.fit(X.copy(), y.copy(), feat_type=feat_type, metric=balanced_acc)
    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    valid_results = automl.cv_results_['mean_test_score']
    print('Eval num: %d' % (len(valid_results)))

    validation_score = np.max(valid_results)

    # Test performance.
    automl.refit(X.copy(), y.copy())
    predictions = automl.predict(X_test)
    test_score = balanced_accuracy_score(y_test, predictions)

    # Print statistics about the auto-sklearn run such as number of
    # iterations, number of models failed with a time out.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy:', validation_score)
    print("Test Accuracy      :", test_score)

    save_path = save_dir + '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric,
                                                   run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, algo, validation_score, test_score, task_type],
                    f)
Example #29
    include_estimators=include_estimator,
    include_preprocessors=include_preprocessor,
    get_smac_object_callback=get_smac_object_callback,
    metadata_directory=metadata_directory,
)

trajectory = [(0.0, 1.0)]
min_cost = np.inf
loss = np.inf

crashed = False
try:
    automl.fit(
        X_train, y_train,
        dataset_name=str(task_id),
        feat_type=cat,
        metric=balanced_accuracy,
        X_test=X_test,
        y_test=y_test,
    )
    print('Finished Auto-sklearn fitting', flush=True)

    with open(os.path.join(autosklearn_directory, '.auto-sklearn', 'start_time_%d' % seed)) as fh:
        start_time = float(fh.read())

    for run_key, run_data in automl._automl[0].runhistory_.data.items():
        if run_data.cost < min_cost:
            if 'test_loss' in run_data.additional_info:
                num_run = run_data.additional_info['num_run']
                prediction_file_path = os.path.join(
                    autosklearn_directory,
                    '.auto-sklearn',
Example #30
    automl_arguments['metric'] = metric

    if task_type == 'classification':
        automl = AutoSklearnClassifier(**automl_arguments)
        scorer_list = CLASSIFICATION_METRICS
    elif task_type == 'regression':
        automl = AutoSklearnRegressor(**automl_arguments)
        scorer_list = REGRESSION_METRICS
    else:
        raise ValueError(task_type)

    scoring_functions = [scorer for name, scorer in scorer_list.items()]

    automl.fit(X_train,
               y_train,
               dataset_name=dataset_name,
               feat_type=cat,
               X_test=X_test,
               y_test=y_test)
    trajectory = automl.trajectory_

    incumbent_id_to_model = {}
    incumbent_id_to_performance = {}
    validated_trajectory = []

    if is_test:
        memory_limit_factor = 1
    else:
        memory_limit_factor = 2

    print('Starting to validate configurations')
    for i, entry in enumerate(trajectory):
Example #31
    def test_fit_pSMAC(self):
        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        automl = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=15,
            output_folder=output,
            tmp_folder=output,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0)
        automl.fit(X_train, Y_train)

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        true_targets_ensemble = np.load(true_targets_ensemble_path)
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(
            output, '.auto-sklearn', 'predictions_ensemble',
            'predictions_ensemble_1_00030.npy')
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        probas_test = np.zeros((len(Y_test), 3), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        backend = Backend(output, output)
        backend.save_model(dummy, 30, 1)

        automl = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=15,
            output_folder=output,
            tmp_folder=output,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0)
        automl.fit(X_train, Y_train)
        automl.run_ensemble_builder(0, 1, 50).wait()

        score = automl.score(X_test, Y_test)

        self.assertEqual(
            len(os.listdir(os.path.join(output, '.auto-sklearn',
                                        'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #32
    def test_fit_pSMAC(self):
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

        # test parallel Classifier to predict classes, not only indices
        Y_train += 1
        Y_test += 1

        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        with open(true_targets_ensemble_path, 'rb') as fh:
            true_targets_ensemble = np.load(fh)
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        true_targets_ensemble = true_targets_ensemble.astype(int)
        probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)

        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(
            tmp,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_1_00030.npy',
        )
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        probas_test = np.zeros((len(Y_test), 10), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value - 1] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        context = BackendContext(tmp, output, False, False, True)
        backend = Backend(context)
        backend.save_model(dummy, 30, 1)

        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit_ensemble(
            Y_train,
            task=MULTICLASS_CLASSIFICATION,
            metric=accuracy,
            precision='32',
            dataset_name='iris',
            ensemble_size=20,
            ensemble_nbest=50,
        )

        predictions = automl.predict(X_test)
        score = sklearn.metrics.accuracy_score(Y_test, predictions)

        self.assertEqual(
            len(os.listdir(os.path.join(tmp, '.auto-sklearn', 'ensembles'))),
            1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

        models = automl._automl.models_
        classifier_types = [type(c) for c in models.values()]
        self.assertIn(ArrayReturningDummyPredictor, classifier_types)

        del automl
        self._tearDown(tmp)
        self._tearDown(output)
Example #33
0
    # The opening of this call (and any leading arguments) is truncated in
    # the source snippet; only `model = AutoSklearnClassifier(` is restored.
    model = AutoSklearnClassifier(
                                  ensemble_size=50,
                                  ensemble_nbest=50,
                                  seed=1,
                                  ml_memory_limit=12000,
                                  include_estimators=None,
                                  include_preprocessors=None,
                                  resampling_strategy='holdout',
                                  tmp_folder='./tmp/',
                                  output_folder='./out/',
                                  delete_tmp_folder_after_terminate=False,
                                  delete_output_folder_after_terminate=False,
                                  shared_mode=False)

    model.fit(data,
              target,
              metric='f1_metric',
              feat_type=None,
              dataset_name='numerai_20161021')
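    # Note: the string metric ('f1_metric') above and grid_scores_ below
    # belong to the old (pre-0.4) auto-sklearn API; newer releases take
    # autosklearn.metrics objects and expose cv_results_ instead.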

    try:
        report(model.grid_scores_)
    except Exception:
        pass

    with open('result.txt', 'w') as f:
        f.write(model.show_models())

    # The original used the pre-0.18 scikit-learn StratifiedKFold signature
    # and the deprecated DataFrame.ix; modern equivalents are used here.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    for train_idx, test_idx in list(cv.split(data, target))[:1]:
        model.refit(data.iloc[train_idx, :], target[train_idx])
        ans = model.predict_proba(data.iloc[test_idx, :])[:, 1]
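
The report helper called above is not defined in this snippet; a plausible
stand-in (purely an assumption, modeled on the classic scikit-learn
grid-scores reporting recipe) might look like:

    def report(grid_scores, n_top=3):
        # Each old-style grid-scores entry carries a mean_validation_score
        # and the parameter dict that produced it.
        top = sorted(grid_scores, key=lambda s: s.mean_validation_score,
                     reverse=True)[:n_top]
        for rank, entry in enumerate(top, 1):
            print('Rank {0}: {1:.4f} {2}'.format(
                rank, entry.mean_validation_score, entry.parameters))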
Example #34
0
# Imports assumed by this snippet (they are not shown in the source):
import time

from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from autosklearn.classification import AutoSklearnClassifier

print('[INFO] Loading digits dataset.')
X, y = load_digits(return_X_y=True)

print('[INFO] Splitting.')
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    train_size=0.8)

print(f'[INFO] Train shape: {X_train.shape}')
print(f'[INFO] Test shape: {X_test.shape}')

print('[INFO] Finding best model...')
classifier = AutoSklearnClassifier(per_run_time_limit=360,
                                   ml_memory_limit=1024 * 6,
                                   time_left_for_this_task=7200)
start = time.time()

X_train = X_train.astype('float')
classifier.fit(X_train, y_train)
print(
    f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.')

predictions = classifier.predict(X_test)
print('--- CLASSIFICATION REPORT: ---')
print(classification_report(y_test, predictions))
print('\n\n--- MODELS: ---')
print(classifier.show_models())
print('\n\n--- STATISTICS: ---')
print(classifier.sprint_statistics())
Example #35
0
        # 3. Start the client
        with dask.distributed.Client(
                address=cluster.scheduler_address) as client:
            automl = AutoSklearnClassifier(
                time_left_for_this_task=30,
                per_run_time_limit=10,
                memory_limit=1024,
                tmp_folder=tmp_folder,
                seed=777,
                # n_jobs is ignored internally as we pass a dask client.
                n_jobs=1,
                # Pass a dask client which connects to the previously constructed cluster.
                dask_client=client,
            )
            automl.fit(X_train, y_train)

            automl.fit_ensemble(
                y_train,
                task=MULTICLASS_CLASSIFICATION,
                dataset_name='digits',
                ensemble_size=20,
                ensemble_nbest=50,
            )

        predictions = automl.predict(X_test)
        print(automl.sprint_statistics())
        print("Accuracy score",
              sklearn.metrics.accuracy_score(y_test, predictions))

        # Wait until all workers are closed
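
The snippet above starts at step 3, so the cluster and data it references
are not shown. A minimal sketch of the missing steps 1 and 2, assuming a
local dask cluster and the digits data (names and paths are illustrative):

        # 1. Load and split the data
        import dask.distributed
        import sklearn.datasets
        import sklearn.model_selection

        X, y = sklearn.datasets.load_digits(return_X_y=True)
        X_train, X_test, y_train, y_test = \
            sklearn.model_selection.train_test_split(X, y, random_state=1)

        # 2. Create the dask cluster whose scheduler the client connects to
        cluster = dask.distributed.LocalCluster(
            n_workers=2, processes=True, threads_per_worker=1)
        tmp_folder = '/tmp/autosklearn_dask_example_tmp'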
Example #37
0
# # define dataset
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define search
# model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# # perform the search
# model.fit(X, y)
# plot_confusion_matrix(model, X, y)
# # export the best model
# # model.export('tpot_best_model.py')

if __name__ == '__main__':
    # example of auto-sklearn for a classification dataset
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from autosklearn.classification import AutoSklearnClassifier
    # define dataset (this line is missing in the source snippet; the
    # make_classification arguments below are assumed, typical values)
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
                               n_redundant=5, random_state=1)
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
    # define search
    model = AutoSklearnClassifier(time_left_for_this_task=4*60, per_run_time_limit=60, n_jobs=8, resampling_strategy='cv', resampling_strategy_arguments={'folds': 10})
    # perform the search
    model.fit(X_train, y_train)
    # summarize
    print(model.sprint_statistics())
    print(model.cv_results_)
    # refit on the full training set: with resampling_strategy='cv', the
    # models found during the search were fit on individual folds only
    model.refit(X_train, y_train)
    # evaluate best model
    y_hat = model.predict(X_test)
    acc = accuracy_score(y_test, y_hat)
    print("Accuracy: %.3f" % acc)
Example #38
0
def task_executor(task_info):
    """Execute task
    :param task_info: detail of task, dict"""
    data_path = task_info.get("data_path")
    time_max = task_info.get("time_max")
    task_id = task_info.get("task_id")
    model_type = task_info.get("model_type")
    LOG.info("Load data, path=%s", data_path)
    status = "done"
    try:
        data_set = pd.read_csv(data_path)
        x_set = data_set[data_set.columns[:len(data_set.keys()) - 1]]
        y_set = data_set[data_set.columns[-1]]
        x_train, x_test, y_train, y_test = train_test_split(x_set,
                                                            y_set,
                                                            test_size=0.3,
                                                            random_state=0)
        LOG.info("start optimizer.")
        if platform.system() == "Linux":
            from autosklearn.classification import AutoSklearnClassifier
            from autosklearn.regression import AutoSklearnRegressor
            if model_type == "Classification":
                model = AutoSklearnClassifier(
                    time_left_for_this_task=time_max + 5,
                    per_run_time_limit=int(time_max / 10),
                    include_preprocessors=["no_preprocessing"],
                )
            elif model_type == "Regression":
                model = AutoSklearnRegressor(
                    time_left_for_this_task=time_max + 5,
                    per_run_time_limit=int(time_max / 10),
                    include_preprocessors=["no_preprocessing"],
                )
            else:
                LOG.error("unsupported model type=%s", model_type)
                raise ValueError("unsupported model type")
        else:
            from sklearn.ensemble import RandomForestClassifier, \
                RandomForestRegressor
            if model_type == "Classification":
                model = RandomForestClassifier(n_estimators=500)
            elif model_type == "Regression":
                model = RandomForestRegressor(n_estimators=500)
            else:
                LOG.error("unsupported model type=%s", model_type)
                raise ValueError("unsupported model type")
        model.fit(x_train, y_train)
        prediction = model.predict(x_test)

        if model_type == "Classification":
            best_metrics = accuracy_score(y_test, prediction)
            LOG.info("The accuracy is %s", best_metrics)
        else:
            best_metrics = mean_squared_error(y_test, prediction)
            LOG.info("The mse is %s", best_metrics)
    except ServerException as server_error:
        LOG.error("Something went wrong, reason=%s", server_error)
        best_metrics = 0
        status = "failed"

    update = dict(end_time=int(time.time()),
                  best_metrics=best_metrics,
                  status=status)
    Task.objects.filter(task_id=task_id).update(**update)
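
For reference, the fields task_executor reads from its argument (via .get())
suggest a payload like the following; the values here are purely
illustrative, not taken from the source:

    task_executor({
        "data_path": "/data/train.csv",  # CSV whose last column is the label
        "time_max": 600,                 # overall AutoML budget in seconds
        "task_id": 42,                   # row to update in the Task table
        "model_type": "Classification",  # or "Regression"
    })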
Example #39
0
    # XGBOOST params - max_depth, min_child_weight, gamma
    
#    clfs = [XGB(max_depth=x) for x in range(1,10)]+\
#            [XGB(min_child_weight=x) for x in range(1,10)]+\
#            [XGB(gamma=x) for x in np.linspace(0,1,10)]
    
    X = data[0]
    X = getRelevantData(X,'vel_acc')
    X_f = getFeatures(X,'mean_std_max3fftpeaks')
    y = data[1]
    groups = data[2]
    
    scores = []
    clf = AutoC()  # AutoC is presumably AutoSklearnClassifier under an import alias
    cv = GroupShuffleSplit(n_splits=1,test_size=0.2)
    for train_index, test_index in cv.split(X_f,y,groups):
        # Split data to train and test set
        X_train = X[train_index]
        y_train = y[train_index]

        X_test = X[test_index]
        y_test = y[test_index]
        
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test,y_pred)
        scores.append(score)
    
    print("{:.5f} accuracy".format(np.mean(scores)))
    
Example #40
0
# The head of this snippet is truncated in the source; only the opening of
# the automl_arguments dict (its name taken from the call further down) is
# restored here.
automl_arguments = {
    'tmp_folder': tmp_dir,
    'disable_evaluator_output': True,
}

X_train, y_train, X_test, y_test, cat = load_task(task_id)

if task_type == 'classification':
    automl = AutoSklearnClassifier(**automl_arguments)
    metric = balanced_accuracy
elif task_type == 'regression':
    automl = AutoSklearnRegressor(**automl_arguments)
    metric = r2
else:
    raise ValueError(task_type)

automl.fit(X_train, y_train, dataset_name=str(task_id), metric=metric,
           feat_type=cat)
data = automl._automl._backend.load_datamanager()
# The data manager cannot be overwritten in place with save_datamanager; the
# existing pickle has to be deleted first
os.remove(automl._automl._backend._get_datamanager_pickle_filename())
data.data['X_test'] = X_test
data.data['Y_test'] = y_test
automl._automl._backend.save_datamanager(data)
trajectory = automl.trajectory_

incumbent_id_to_model = {}
incumbent_id_to_performance = {}
validated_trajectory = []

if is_test:
    memory_limit_factor = 1