Beispiel #1
0
    def test_pSMAC_wrong_arguments(self):
        """Check that ``shared_mode=True`` without the folders is rejected.

        First only ``shared_mode`` is given and the error about the missing
        ``tmp_folder`` is expected; then ``tmp_folder`` is supplied as well
        and the error about the missing ``output_folder`` is expected.
        """
        features = np.zeros((100, 100))
        targets = np.zeros((100, ))

        # Case 1: neither tmp_folder nor output_folder is given.
        with self.assertRaisesRegex(
                ValueError,
                "If shared_mode == True tmp_folder must not "
                "be None."):
            AutoSklearnClassifier(shared_mode=True).fit(features, targets)

        # Case 2: tmp_folder is given, output_folder is still missing.
        with self.assertRaisesRegex(
                ValueError,
                "If shared_mode == True output_folder must not "
                "be None."):
            AutoSklearnClassifier(
                shared_mode=True,
                tmp_folder='/tmp/duitaredxtvbedb',
            ).fit(features, targets)
Beispiel #2
0
def test_cv_results(tmp_dir, output_dir):
    """Fit a short run on iris and validate ``cv_results_`` and the estimator.

    Checks the container types of the ``cv_results_`` entries (including the
    two additional scoring functions), verifies that fitting did not mutate
    any constructor parameter, and checks that the fitted estimator is
    recognised as a classifier exposing ``classes_``.

    Parameters
    ----------
    tmp_dir, output_dir
        Folders handed to ``tmp_folder`` / ``output_folder``
        (presumably pytest fixtures -- confirm against conftest).
    """
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(time_left_for_this_task=30,
                                per_run_time_limit=5,
                                tmp_folder=tmp_dir,
                                output_folder=output_dir,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0,
                                scoring_functions=[
                                    autosklearn.metrics.precision,
                                    autosklearn.metrics.roc_auc
                                ])

    # Snapshot the constructor parameters before fitting so that any
    # mutation caused by fit() can be detected afterwards.
    params = cls.get_params()
    original_params = copy.deepcopy(params)

    cls.fit(X_train, Y_train)
    cv_results = cls.cv_results_
    assert isinstance(cv_results, dict), type(cv_results)
    assert isinstance(cv_results['mean_test_score'],
                      np.ndarray), type(cv_results['mean_test_score'])
    assert isinstance(cv_results['mean_fit_time'],
                      np.ndarray), type(cv_results['mean_fit_time'])
    assert isinstance(cv_results['params'], list), type(cv_results['params'])
    assert isinstance(cv_results['rank_test_scores'],
                      np.ndarray), type(cv_results['rank_test_scores'])
    assert isinstance(cv_results['metric_precision'],
                      npma.MaskedArray), type(cv_results['metric_precision'])
    assert isinstance(cv_results['metric_roc_auc'],
                      npma.MaskedArray), type(cv_results['metric_roc_auc'])
    cv_result_items = [
        isinstance(val, npma.MaskedArray) for key, val in cv_results.items()
        if key.startswith('param_')
    ]
    assert all(cv_result_items), cv_results.items()

    # Compare the state of the model parameters with the original parameters
    new_params = clone(cls).get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # Taken from Sklearn code:
        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit." %
            (cls, param_name, original_value, new_value))

    # Comply with https://scikit-learn.org/dev/glossary.html#term-classes
    # The result of is_classifier() used to be discarded, so this check
    # could never fail; assert it explicitly.
    assert is_classifier(cls)
    assert hasattr(cls, 'classes_')
Beispiel #3
0
    def test_metadata_directory(self):
        """Check how the ``metadata_directory`` argument is stored and validated.

        A user-specified directory is kept as given, the default is ``None``,
        and fitting with a directory name that does not exist raises a
        ``ValueError`` naming that directory.
        """
        common_kwargs = dict(time_left_for_this_task=30, per_run_time_limit=5)

        # User-specified metadata directory is stored unchanged.
        user_dir = "pyMetaLearn/metadata_dir"
        automl1 = AutoSklearnClassifier(metadata_directory=user_dir,
                                        **common_kwargs)
        self.assertEqual(automl1.metadata_directory,
                         "pyMetaLearn/metadata_dir")

        # Without the argument the attribute stays None.
        automl2 = AutoSklearnClassifier(**common_kwargs)
        self.assertIsNone(automl2.metadata_directory)

        # A non-existent directory is only rejected once fit() is called.
        nonexistent_dir = "nonexistent_dir"
        automl3 = AutoSklearnClassifier(metadata_directory=nonexistent_dir,
                                        **common_kwargs)
        X, y = load_breast_cancer(return_X_y=True)
        expected_message = ("The specified metadata directory "
                            "\'%s\' does not exist!" % nonexistent_dir)
        with self.assertRaisesRegex(ValueError, expected_message):
            automl3.fit(X=X, y=y)
Beispiel #4
0
        def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                             automl_output_folder):
            """Fit one shared-mode auto-sklearn instance for ``seed``.

            Seed 0 starts from 25 meta-learning configurations with default
            SMAC scenario arguments; every other seed uses no meta-learning
            and requests a random initial incumbent. Training data
            (``X_train`` / ``y_train``) comes from the enclosing scope.
            """
            is_first_seed = seed == 0
            n_metalearning_configs = 25 if is_first_seed else 0
            scenario_args = (
                {} if is_first_seed else {'initial_incumbent': 'RANDOM'}
            )

            # Time and memory limits (time_left_for_this_task,
            # per_run_time_limit, ml_memory_limit) are left at their defaults.
            automl = AutoSklearnClassifier(
                # tmp folder will be shared between seeds
                shared_mode=True,
                tmp_folder=automl_tmp_folder,
                output_folder=automl_output_folder,
                delete_tmp_folder_after_terminate=False,
                # ensembles will be built when all optimization runs are
                # finished
                ensemble_size=0,
                initial_configurations_via_metalearning=n_metalearning_configs,
                seed=seed,
                smac_scenario_args=scenario_args,
            )
            automl.fit(X_train, y_train, dataset_name=dataset_name)
    def test_cv_results(self):
        """Fit a short run on iris and check the types in ``cv_results_``.

        Uses dedicated tmp/output folders next to ``self.test_dir`` which are
        prepared with ``self._setUp`` and removed with ``self._tearDown``.
        """
        # TODO restructure and actually use real SMAC output from a long run
        # to do this unittest!
        tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
        output = os.path.join(self.test_dir, '..', '.out_cv_results')
        self._setUp(tmp)
        self._setUp(output)
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                    per_run_time_limit=5,
                                    output_folder=output,
                                    tmp_folder=tmp,
                                    shared_mode=False,
                                    seed=1,
                                    initial_configurations_via_metalearning=0,
                                    ensemble_size=0)
        cls.fit(X_train, Y_train)
        cv_results = cls.cv_results_
        self.assertIsInstance(cv_results, dict)
        self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
        self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
        self.assertIsInstance(cv_results['params'], list)
        self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)

        param_checks = [
            isinstance(val, npma.MaskedArray)
            for key, val in cv_results.items() if key.startswith('param_')
        ]
        # Passing the list itself to assertTrue only proved that it was
        # non-empty. Keep that guarantee and additionally require that every
        # 'param_*' entry really is a masked array.
        self.assertTrue(param_checks)
        self.assertTrue(all(param_checks))

        del cls
        self._tearDown(tmp)
        self._tearDown(output)
Beispiel #6
0
 def __init__(self, name: str, model_params: Dict[str, Any]) -> None:
     """Initialise the base class and create the wrapped classifier.

     ``name`` and ``model_params`` are forwarded to the parent constructor;
     ``model_params`` is also unpacked as keyword arguments for
     ``AutoSklearnClassifier``.
     """
     super().__init__(name, model_params)
     classifier = AutoSklearnClassifier(**model_params)
     self._model = classifier
def zeroconf_fit_ensemble(y):
    """Build an ensemble from the models stored in ``atsklrn_tempdir``.

    Creates a shared-mode ``AutoSklearnClassifier`` (seed 1) pointing at the
    module-level ``atsklrn_tempdir`` for both tmp and output folder, calls
    ``fit_ensemble`` with targets ``y``, waits 20 seconds, logs the models
    and returns the classifier.
    """
    p("Building ensemble")

    classifier_kwargs = dict(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    ensemble = AutoSklearnClassifier(**classifier_kwargs)

    ensemble.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15,
    )

    sleep(20)
    p("Ensemble built")

    p("Show models")
    models_description = str(ensemble.show_models())
    p(models_description)
    return ensemble
def main(working_directory, time_limit, per_run_time_limit, task_id, seed):
    """Run one auto-sklearn fit for a task and write its trajectory to CSV.

    A directory ``<working_directory>/<seed>`` is created if necessary and
    ``<working_directory>/<seed>/<task_id>`` is used as auto-sklearn's tmp
    folder. After fitting, ``score_vanilla.csv`` is written into that tmp
    folder with one row per improvement of the best loss seen so far.

    Parameters
    ----------
    working_directory : str
        Root directory for all outputs.
    time_limit
        Passed as ``time_left_for_this_task``.
    per_run_time_limit
        Passed as ``per_run_time_limit``.
    task_id
        Identifier handed to ``load_task``; also used as dataset name.
    seed
        Seed for auto-sklearn; also names the per-seed output directory.
    """
    configuration_output_dir = os.path.join(working_directory, str(seed))
    try:
        os.makedirs(configuration_output_dir)
    except FileExistsError:
        # Only an existing directory is an expected condition; other errors
        # (e.g. missing permissions) must not be reported as "already
        # created" and are allowed to propagate.
        print(
            "Direcotry {0} aleardy created.".format(configuration_output_dir))

    # The tmp folder itself is not created here (presumably auto-sklearn
    # creates it during fit -- confirm).
    tmp_dir = os.path.join(configuration_output_dir, str(task_id))

    automl_arguments = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': per_run_time_limit,
        'initial_configurations_via_metalearning': 0,
        'ensemble_size': 0,
        'seed': seed,
        'ml_memory_limit': 3072,
        'resampling_strategy': 'holdout',
        'resampling_strategy_arguments': {
            'train_size': 0.67
        },
        'tmp_folder': tmp_dir,
        # Keep the tmp folder: the score file below is written into it.
        'delete_tmp_folder_after_terminate': False,
        'disable_evaluator_output': False,
    }

    X_train, y_train, X_test, y_test, cat = load_task(task_id)

    automl = AutoSklearnClassifier(**automl_arguments)

    automl.fit(X_train,
               y_train,
               dataset_name=str(task_id),
               X_test=X_test,
               y_test=y_test,
               metric=balanced_accuracy)

    with open(os.path.join(tmp_dir, "score_vanilla.csv"), 'w') as fh:
        elapsed = 0
        fh.write("Time,Train Performance,Test Performance\n")
        # Initial row: time 0, train performance 0, test performance 0.
        best_loss = 1
        fh.write("{0},{1},{2}\n".format(elapsed, 0, 0))
        # Only the run values are needed; the run keys are not used.
        for value in automl._automl.runhistory_.data.values():
            elapsed += value.time
            loss = value.cost

            # Emit a row only when the run improves on the best loss so far.
            if loss < best_loss:
                fh.write("{0},{1},{2}\n".format(
                    elapsed, 1 - loss,
                    1 - value.additional_info.get('test_loss', 1.0)))
                best_loss = loss
Beispiel #9
0
class AutoSklearnBaselineModel(Model):
    """Baseline ``Model`` that delegates to an ``AutoSklearnClassifier``."""

    def __init__(
        self,
        name: str,
        model_params: Dict[str, Any],
    ) -> None:
        """Create the wrapped classifier from ``model_params``."""
        super().__init__(name, model_params)
        self._model = AutoSklearnClassifier(**model_params)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit the wrapped classifier on ``X`` and ``y``."""
        self._model.fit(X, y)

    def save(self, path: str) -> None:
        """Pickle this whole object to ``path``."""
        with open(path, 'wb') as handle:
            pickle.dump(self, handle)

    @classmethod
    def load(cls, path: str):
        """Unpickle a previously saved model from ``path``.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(path, 'rb') as handle:
            restored = pickle.load(handle)
        return cast(AutoSklearnBaselineModel, restored)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped classifier's predictions for ``X``."""
        return self._model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self._model.predict_proba(X)
Beispiel #10
0
    def test_grid_scores(self):
        """Check ``grid_scores_`` for a run history mocked on ``_proc_smac``.

        A single fake run with cost 1 is injected; the resulting score tuple
        must report a score of 0.
        """
        output = os.path.join(self.test_dir, '..', '.tmp_grid_scores')
        self._setUp(output)

        cls = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=15,
            output_folder=output,
            tmp_folder=output,
            shared_mode=False,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl = cls.build_automl()._automl
        automl._proc_smac = mock.MagicMock()

        # Minimal stand-ins for the run history key/value records.
        RunKey = collections.namedtuple(
            'RunKey', ['config_id', 'instance_id', 'seed'])
        RunValue = collections.namedtuple(
            'RunValue', ['cost', 'time', 'status', 'additional_info'])

        automl._proc_smac.runhistory.data = {
            RunKey(1, 1, 1): RunValue(1, 1, 1, ''),
        }
        first_score = automl.grid_scores_[0]

        self.assertIsInstance(first_score, _CVScoreTuple)
        # In the runhistory we store losses, thus the score is zero
        self.assertEqual(first_score.mean_validation_score, 0)
        self.assertEqual(first_score.cv_validation_scores, [0])
        self.assertIsInstance(first_score.parameters, mock.MagicMock)

        del automl
        self._tearDown(output)
Beispiel #11
0
def test_binary(tmp_dir, output_dir, dask_client):
    """Fit on binarised iris and check predictions, score and output files.

    The arguments are forwarded to ``tmp_folder``, ``output_folder`` and
    ``dask_client`` of the classifier (presumably pytest fixtures --
    confirm against conftest).
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'iris', make_binary=True)

    automl = AutoSklearnClassifier(
        time_left_for_this_task=40,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    automl.fit(
        X_train,
        Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name='binary_test_dataset',
    )

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)

    n_successful_runs = count_succeses(automl.cv_results_)
    assert n_successful_runs > 0, print_debug_information(automl)

    # Test-set predictions must have been written to the output folder.
    prediction_pattern = os.path.join(
        output_dir, 'binary_test_dataset_test_*.predict')
    output_files = glob.glob(prediction_pattern)
    assert len(output_files) > 0, (output_files,
                                   print_debug_information(automl))
Beispiel #12
0
def spawn_classifier(seed, dataset_name):
    """Run one auto-sklearn instance; intended as a subprocess target.

    auto-sklearn does not spawn worker processes itself. This function is
    called several times from the main block, each call being a separate
    process that runs a single instance of auto-sklearn. ``X_train``,
    ``y_train``, ``tmp_folder`` and ``output_folder`` are taken from module
    scope.
    """
    # Only the process with seed 0 starts from meta-learning configurations,
    # so that the spawned processes do not all evaluate the same
    # configurations.
    n_metalearning_configs = 25 if seed == 0 else 0

    # What differs from a plain auto-sklearn run:
    # 1. all classifiers write to the same output directory
    # 2. shared_mode=True enables sharing of data between models
    # 3. every AutoSklearnClassifier instance must have a different seed!
    settings = {
        # sec., how long should this seed fit process run
        'time_left_for_this_task': 120,
        # sec., each model may only take this long before it's killed
        'per_run_time_limit': 60,
        # MB, memory limit imposed on each call to a ML algorithm
        'ml_memory_limit': 1024,
        # tmp folder will be shared between seeds
        'shared_mode': True,
        'tmp_folder': tmp_folder,
        'output_folder': output_folder,
        'delete_tmp_folder_after_terminate': False,
        # ensembles will be built when all optimization runs are finished
        'ensemble_size': 0,
        'initial_configurations_via_metalearning': n_metalearning_configs,
        'seed': seed,
    }
    automl = AutoSklearnClassifier(**settings)
    automl.fit(X_train, y_train, dataset_name=dataset_name)
Beispiel #13
0
    def test_binary(self):
        """Fit on binarised iris and check predictions, score and output file.

        The tmp/output folder names were swapped relative to the variables
        that hold them (``tmp`` pointed at ``.out_binary_fit`` and ``output``
        at ``.tmp_binary_fit``); they now follow the ``.tmp_*`` / ``.out_*``
        convention used by the sibling tests.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
        output = os.path.join(self.test_dir, '..', '.out_binary_fit')
        self._setUp(output)
        self._setUp(tmp)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris',
                                                             make_binary=True)
        automl = AutoSklearnClassifier(time_left_for_this_task=20,
                                       per_run_time_limit=5,
                                       tmp_folder=tmp,
                                       output_folder=output)

        automl.fit(X_train,
                   Y_train,
                   X_test=X_test,
                   y_test=Y_test,
                   dataset_name='binary_test_dataset')
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, ))
        score = accuracy(Y_test, predictions)
        self.assertGreaterEqual(score, 0.9)

        # Test-set predictions must have been written to the output folder.
        output_files = os.listdir(output)
        self.assertIn('binary_test_dataset_test_1.predict', output_files)
Beispiel #14
0
    def test_cv_results(self):
        """Fit a short run on iris and check the types in ``cv_results_``.

        Uses dedicated tmp/output folders next to ``self.test_dir`` which are
        prepared with ``self._setUp`` and removed with ``self._tearDown``.
        """
        # TODO restructure and actually use real SMAC output from a long run
        # to do this unittest!
        tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
        output = os.path.join(self.test_dir, '..', '.out_cv_results')
        self._setUp(tmp)
        self._setUp(output)
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                    per_run_time_limit=5,
                                    output_folder=output,
                                    tmp_folder=tmp,
                                    shared_mode=False,
                                    seed=1,
                                    initial_configurations_via_metalearning=0,
                                    ensemble_size=0)
        cls.fit(X_train, Y_train)
        cv_results = cls.cv_results_
        self.assertIsInstance(cv_results, dict)
        self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
        self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
        self.assertIsInstance(cv_results['params'], list)
        self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)

        param_checks = [
            isinstance(val, npma.MaskedArray)
            for key, val in cv_results.items() if key.startswith('param_')
        ]
        # Passing the list itself to assertTrue only proved that it was
        # non-empty. Keep that guarantee and additionally require that every
        # 'param_*' entry really is a masked array.
        self.assertTrue(param_checks)
        self.assertTrue(all(param_checks))

        del cls
        self._tearDown(tmp)
        self._tearDown(output)
def test_cv_results(tmp_dir, output_dir):
    """Fit a short run on iris and check the container types in ``cv_results_``.

    ``tmp_dir`` and ``output_dir`` are forwarded to ``tmp_folder`` and
    ``output_folder`` (presumably pytest fixtures -- confirm).
    """
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        seed=1,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
    )
    cls.fit(X_train, Y_train)

    cv_results = cls.cv_results_
    assert isinstance(cv_results, dict), type(cv_results)

    # Entry name -> container type it must have, checked in this order.
    expected_types = (
        ('mean_test_score', np.ndarray),
        ('mean_fit_time', np.ndarray),
        ('params', list),
        ('rank_test_scores', np.ndarray),
    )
    for entry_name, expected_type in expected_types:
        assert isinstance(cv_results[entry_name],
                          expected_type), type(cv_results[entry_name])

    # Every per-hyperparameter column must be a masked array.
    param_columns_ok = [
        isinstance(column, npma.MaskedArray)
        for name, column in cv_results.items()
        if name.startswith('param_')
    ]
    assert all(param_columns_ok), cv_results.items()
Beispiel #16
0
    def test_grid_scores(self):
        """Check ``grid_scores_`` for a run history mocked on ``runhistory_``.

        A single fake run with cost 1 is injected; the resulting score tuple
        must report a score of 0.
        """
        output = os.path.join(self.test_dir, '..', '.tmp_grid_scores')
        self._setUp(output)

        cls = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=output,
            shared_mode=False,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl = cls.build_automl()._automl
        automl.runhistory_ = unittest.mock.MagicMock()

        # Minimal stand-ins for the run history key/value records.
        RunKey = collections.namedtuple(
            'RunKey', ['config_id', 'instance_id', 'seed'])
        RunValue = collections.namedtuple(
            'RunValue', ['cost', 'time', 'status', 'additional_info'])

        automl.runhistory_.data = {
            RunKey(1, 1, 1): RunValue(1, 1, 1, ''),
        }
        first_score = automl.grid_scores_[0]

        self.assertIsInstance(first_score, _CVScoreTuple)
        # In the runhistory we store losses, thus the score is zero
        self.assertEqual(first_score.mean_validation_score, 0)
        self.assertEqual(first_score.cv_validation_scores, [0])
        self.assertIsInstance(first_score.parameters, unittest.mock.MagicMock)

        del automl
        self._tearDown(output)
Beispiel #17
0
 def __init__(self, name: str, model_params: Dict[str, Any],
              classifier_paths: Iterable[Tuple[str, str]]) -> None:
     """Initialise the base class and create the selector classifier.

     All three arguments are forwarded to the parent constructor;
     ``model_params`` is also unpacked as keyword arguments for the
     ``AutoSklearnClassifier`` stored in ``self.selector``.
     """
     super().__init__(name, model_params, classifier_paths)
     selector = AutoSklearnClassifier(**model_params)
     self.selector = selector
Beispiel #18
0
 def __init__(self, **kwargs) -> None:
     """Create a Dask client and an ``AutoSklearnClassifier`` that uses it.

     ``kwargs`` must contain ``n_jobs`` (used as the number of Dask workers;
     a missing key raises ``KeyError``). All of ``kwargs`` is forwarded to
     ``AutoSklearnClassifier`` together with the created client.
     """
     Ensemble.__init__(self)
     client_options = {
         'processes': False,
         'n_workers': kwargs['n_jobs'],
         'threads_per_worker': 1,
         'dashboard_address': None,
     }
     dask_client = Client(**client_options)
     self.model = AutoSklearnClassifier(**kwargs, dask_client=dask_client)
Beispiel #19
0
def AutoSklearn(total_runtime, train_features, train_labels):
    """Fit an ``AutoSklearnClassifier`` restricted to a fixed estimator list.

    ``total_runtime`` is passed as ``time_left_for_this_task``; feature
    preprocessing is limited to ``"no_preprocessing"`` and the fit optimises
    ``balanced_accuracy``. Returns the fitted classifier.
    """
    allowed_estimators = [
        "adaboost",
        "gaussian_nb",
        "extra_trees",
        "gradient_boosting",
        "liblinear_svc",
        "libsvm_svc",
        "random_forest",
        "k_nearest_neighbors",
        "decision_tree",
    ]
    clf = AutoSklearnClassifier(
        time_left_for_this_task=total_runtime,
        include_preprocessors=["no_preprocessing"],
        include_estimators=allowed_estimators,
    )

    clf.fit(train_features, train_labels, metric=balanced_accuracy)
    return clf
def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train auto-sklearn on the training data and predict on ``X_test``.

    For ``m_type == 'classification'`` an ``AutoSklearnClassifier`` is fitted
    with ``f1_weighted`` and class probabilities are returned; for any other
    ``m_type`` an ``AutoSklearnRegressor`` is fitted with
    ``mean_squared_error`` and plain predictions are returned. Feature types
    are read from the ``TYPE`` column of ``df_types`` for every row whose
    ``NAME`` is not ``'target'``. Extra positional arguments are ignored.
    """
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    feature_rows = df_types[df_types.NAME != 'target']
    categ_cols = feature_rows['TYPE'].values.ravel()

    is_classification = m_type == 'classification'

    # Both estimators share the same configuration; only the class differs.
    estimator_class = (AutoSklearnClassifier
                       if is_classification else AutoSklearnRegressor)
    automl = estimator_class(
        time_left_for_this_task=TIME_PER_TASK,
        per_run_time_limit=int(TIME_PER_TASK/8),
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        delete_tmp_folder_after_terminate=False,
    )

    metric = f1_weighted if is_classification else mean_squared_error
    automl.fit(X_train.copy(),
               y_train.copy(),
               feat_type=categ_cols,
               metric=metric)
    automl.refit(X_train.copy(), y_train.copy())

    if is_classification:
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
    def test_fit_pSMAC(self):
        """Check that a second shared-mode run picks up a model left by a first.

        Steps: fit a first classifier (seed 1) in shared mode, plant a
        hand-made dummy model plus its ensemble predictions in the shared
        folder, fit a second classifier (seed 2) on the same folder, run its
        ensemble builder and check the resulting score and ensemble file.
        """
        # One folder serves as both tmp and output folder for both runs.
        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        # First run: seed 1, no meta-learning, no ensemble.
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=1,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correct
        true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        true_targets_ensemble = np.load(true_targets_ensemble_path)
        # Flip the last target (in memory only) so the dummy predictions
        # built below are wrong on exactly that one sample.
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        # One-hot encode the (modified) targets over 3 classes.
        probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        # NOTE(review): the file name presumably encodes seed 1 and model
        # index 30, matching the save_model call below -- confirm.
        dummy_predictions_path = os.path.join(output, '.auto-sklearn',
                                              'predictions_ensemble',
                                              'predictions_ensemble_1_00030.npy')
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        # One-hot encode the true test labels; the dummy predictor is
        # constructed with this array.
        probas_test = np.zeros((len(Y_test), 3), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        backend = Backend(output, output)
        # NOTE(review): arguments are presumably (model, index, seed) --
        # confirm against Backend.save_model.
        backend.save_model(dummy, 30, 1)

        # Second run on the same shared folder, with a different seed.
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=2,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)
        # Build the ensemble explicitly and block until the builder is done.
        # NOTE(review): meaning of the positional arguments (0, 1, 50) is not
        # visible here -- confirm against run_ensemble_builder.
        automl.run_ensemble_builder(0, 1, 50).wait()

        score = automl.score(X_test, Y_test)

        # Exactly one ensemble file is expected in the shared folder.
        self.assertEqual(len(os.listdir(os.path.join(output, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Beispiel #22
0
    def spawn_classifier(
            seed,
            time,
            search_space,
            prep_space,
            metric,
            dataset_name=None):
        """Run one auto-sklearn instance and return its ``cv_results_``.

        auto-sklearn does not spawn worker processes itself. This function is
        called several times from the main block, each call being a separate
        process that runs a single instance of auto-sklearn. ``time`` is the
        time budget in seconds, ``search_space`` / ``prep_space`` restrict
        the estimators / preprocessors, and ``metric`` is optimised during
        fit. ``X_train``, ``y_train``, ``tmp_folder`` and ``output_folder``
        come from the enclosing scope.
        """
        # Only the process with seed 0 starts from meta-learning
        # configurations, so that the spawned processes do not all evaluate
        # the same configurations; the others request a random incumbent.
        is_first_seed = seed == 0
        n_metalearning_configs = 25 if is_first_seed else 0
        scenario_args = (
            {} if is_first_seed else {'initial_incumbent': 'RANDOM'}
        )

        # What differs from a plain auto-sklearn run:
        # 1. all classifiers write to the same output directory
        # 2. shared_mode=True enables sharing of data between models
        # 3. every AutoSklearnClassifier instance must have a different seed!
        settings = {
            # sec., how long should this seed fit process run
            'time_left_for_this_task': time,
            # sec., each model may only take this long before it's killed
            'per_run_time_limit': 15,
            # MB, memory limit imposed on each call to a ML algorithm
            'ml_memory_limit': 1024,
            # tmp folder will be shared between seeds
            'shared_mode': True,
            'tmp_folder': tmp_folder,
            'output_folder': output_folder,
            'delete_tmp_folder_after_terminate': False,
            # ensembles will be built when all optimization runs are finished
            'ensemble_size': 0,
            'include_estimators': search_space,
            'exclude_estimators': None,
            'include_preprocessors': prep_space,
            'exclude_preprocessors': None,
            'initial_configurations_via_metalearning': n_metalearning_configs,
            'seed': seed,
            'smac_scenario_args': scenario_args,
        }
        automl = AutoSklearnClassifier(**settings)
        automl.fit(X_train, y_train, metric=metric, dataset_name=dataset_name)
        return automl.cv_results_
Beispiel #23
0
def simple():
    """Fit a default ``AutoSklearnClassifier`` on stage-2 data and save it.

    All columns but the last of the assistant's training data are used as
    features and the last column as target. The fitted model is dumped with
    joblib to ``stage2_model.joblib`` inside the "model" resource folder.
    """
    train_data = Exp2Assistant(stage=2).train_data
    X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

    # change the time, in this experiment, 1h 12h 24h 48h
    automl = AutoSklearnClassifier()
    automl.fit(X_train, y_train)

    model_path = path.join(path_service.get_resource("model"),
                           "stage2_model.joblib")
    joblib.dump(automl, model_path)
Beispiel #24
0
 def __init__(self, time_left_for_this_task, per_run_time_limit, folds):
     """Create a cross-validating ``AutoSklearnClassifier``.

     The two time limits are forwarded unchanged; ``folds`` is the number of
     folds passed in ``resampling_strategy_arguments``.
     """
     # NOTE(review): `now` is not used in the visible part of this method.
     now = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
     # Folder and shared-mode options (tmp_folder, output_folder,
     # delete_*_folder_after_terminate, shared_mode) are left at defaults.
     classifier_options = {
         'time_left_for_this_task': time_left_for_this_task,
         'per_run_time_limit': per_run_time_limit,
         'resampling_strategy': 'cv',
         'resampling_strategy_arguments': {'folds': folds},
     }
     self.automl = AutoSklearnClassifier(**classifier_options)
def spawn_classifier(seed, dataset_name):
    """Fit one shared-mode ``AutoSklearnClassifier`` for the given seed.

    ``X_train``, ``y_train``, ``tmp_folder`` and ``output_folder`` are read
    from the enclosing scope. Nothing is returned.

    Args:
        seed: Passed to the classifier as ``seed``.
        dataset_name: Passed to ``fit`` as ``dataset_name``.
    """
    settings = {
        # sec., how long should this seed fit process run
        'time_left_for_this_task': 600,
        # sec., each model may only take this long before it's killed
        'per_run_time_limit': 60,
        # MB
        'ml_memory_limit': 1024,
        # tmp folder will be shared between seeds
        'shared_mode': True,
        'tmp_folder': tmp_folder,
        'output_folder': output_folder,
        'delete_tmp_folder_after_terminate': False,
        # no need to build ensembles at this stage
        'ensemble_size': 0,
        # let seeds profit from each other's results
        'initial_configurations_via_metalearning': 0,
        'seed': seed,
    }
    worker = AutoSklearnClassifier(**settings)
    worker.fit(X_train, y_train, dataset_name=dataset_name)
Beispiel #26
0
    def test_classification_methods_returns_self(self):
        """``fit``, ``fit_ensemble`` and ``refit`` must return the estimator."""
        X_train, y_train, X_test, y_test = putil.get_dataset('iris')
        estimator = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            ensemble_size=0,
        )

        # Each call is checked for identity with the object it was called on.
        self.assertIs(estimator, estimator.fit(X_train, y_train))
        self.assertIs(
            estimator,
            estimator.fit_ensemble(y_train, ensemble_size=5),
        )
        self.assertIs(
            estimator,
            estimator.refit(X_train.copy(), y_train.copy()),
        )
Beispiel #27
0
    def test_pSMAC_wrong_arguments(self):
        """Shared mode without explicit folders must fail with ``ValueError``.

        Two cases are checked: ``shared_mode=True`` with no ``tmp_folder``,
        and ``shared_mode=True`` with a ``tmp_folder`` but no
        ``output_folder``. In both cases the error is expected from ``fit``.
        """
        # ``assertRaisesRegex`` replaces the deprecated ``assertRaisesRegexp``
        # alias, which no longer exists on Python 3.12+.
        self.assertRaisesRegex(
            ValueError,
            "If shared_mode == True tmp_folder must not "
            "be None.",
            lambda shared_mode:
            AutoSklearnClassifier(shared_mode=shared_mode).fit(None, None),
            shared_mode=True,
        )

        self.assertRaisesRegex(
            ValueError,
            "If shared_mode == True output_folder must not "
            "be None.",
            lambda shared_mode, tmp_folder:
            AutoSklearnClassifier(
                shared_mode=shared_mode,
                tmp_folder=tmp_folder,
            ).fit(None, None),
            shared_mode=True,
            tmp_folder='/tmp/duitaredxtvbedb',
        )
Beispiel #28
0
    def test_classification_pandas_support(self):
        """Fit, score, refit and predict with pandas inputs.

        Data is fetched from OpenML (``data_id=2``) as a DataFrame / Series.
        Columns containing NaN are dropped before fitting. Both the score
        after ``fit`` and the accuracy after ``refit`` must exceed 0.555.
        """
        X, y = sklearn.datasets.fetch_openml(
            data_id=2,  # cat/num dataset
            return_X_y=True,
            as_frame=True,
        )

        # Drop every column containing NaN. ``axis`` is given by keyword
        # because pandas 2.0 made the arguments of ``dropna`` keyword-only;
        # the former positional form raises ``TypeError`` there.
        X = X.dropna(axis='columns')

        # This test only makes sense if the input is a DataFrame / Series.
        self.assertIsInstance(X, pd.DataFrame)
        self.assertIsInstance(y, pd.Series)
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            exclude_estimators=['libsvm_svc'],
            seed=5,
        )

        automl.fit(X, y)

        # Make sure that at least better than random.
        # We use same X_train==X_test to test code quality
        self.assertGreater(automl.score(X, y), 0.555)

        automl.refit(X, y)

        # Make sure that at least better than random.
        # accuracy in sklearn needs valid data
        # It should be 0.555 as the dataset is unbalanced.
        y = automl._automl[0].InputValidator.encode_target(y)
        prediction = automl._automl[0].InputValidator.encode_target(
            automl.predict(X))
        self.assertGreater(accuracy(y, prediction), 0.555)
    def test_classification_methods_returns_self(self):
        """Check that the three fitting entry points hand back ``self``."""
        X_train, y_train, X_test, y_test = putil.get_dataset('iris')
        estimator = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            ensemble_size=0,
        )

        # fit -> fit_ensemble -> refit, each compared by identity.
        self.assertIs(estimator, estimator.fit(X_train, y_train))
        self.assertIs(
            estimator,
            estimator.fit_ensemble(y_train, ensemble_size=5),
        )
        self.assertIs(
            estimator,
            estimator.refit(X_train.copy(), y_train.copy()),
        )
Beispiel #30
0
def zeroconf_fit_ensemble(y, atsklrn_tempdir):
    """Construct a shared-mode classifier and call ``fit_ensemble`` on it.

    ``atsklrn_tempdir`` is used as both tmp and output folder. Progress is
    logged through ``utl.get_logger`` keyed by this function's name (taken
    from ``inspect.stack()``); the logger is re-fetched after the
    ``fit_ensemble`` call. A failure in ``fit_ensemble`` is logged and
    re-raised.

    Args:
        y: Passed to ``fit_ensemble`` as ``y``.
        atsklrn_tempdir: Folder passed as ``tmp_folder`` and
            ``output_folder``.

    Returns:
        The ``AutoSklearnClassifier`` instance after ``fit_ensemble``.
    """
    log = utl.get_logger(inspect.stack()[0][3])
    log.info("Building ensemble")

    seed = 1
    seed_text = str(seed)

    ensemble = AutoSklearnClassifier(
        time_left_for_this_task=300,
        per_run_time_limit=150,
        ml_memory_limit=20240,
        ensemble_size=50,
        ensemble_nbest=200,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        seed=seed,
    )
    log.info("Done AutoSklearnClassifier - seed:" + seed_text)

    try:
        log.debug("Start ensemble.fit_ensemble - seed:" + seed_text)
        ensemble.fit_ensemble(
            task=BINARY_CLASSIFICATION,
            y=y,
            metric=autosklearn.metrics.f1,
            precision='32',
            dataset_name='foobar',
            ensemble_size=10,
            ensemble_nbest=15,
        )
    except Exception:
        # Re-fetch the logger before reporting, then propagate the error.
        log = utl.get_logger(inspect.stack()[0][3])
        log.exception("Error in ensemble.fit_ensemble - seed:" + seed_text)
        raise

    log = utl.get_logger(inspect.stack()[0][3])
    log.debug("Done ensemble.fit_ensemble - seed:" + seed_text)

    sleep(20)
    log.info("Ensemble built - seed:" + seed_text)
    log.info("Show models - seed:" + seed_text)

    # Emit the model description one log record per line.
    for line in str(ensemble.show_models()).split("\n"):
        log.info(line)

    return ensemble
Beispiel #31
0
    def test_can_pickle_classifier(self):
        """A fitted classifier must survive pickle and joblib round trips.

        The test is skipped when ``self.travis`` is truthy. After fitting on
        iris, the estimator is dumped and restored once with ``pickle`` and
        once with joblib; each restored copy must score above 0.75 on the
        test split and exactly match the accuracy of the original.
        """
        if self.travis:
            self.skipTest('This test does currently not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        workdir = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
        self._setUp(workdir)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=15,
            tmp_folder=workdir,
            output_folder=workdir,
        )
        automl.fit(X_train, Y_train)

        def accuracy_of(model):
            """Accuracy of ``model`` on the held-out iris split."""
            return sklearn.metrics.accuracy_score(
                Y_test, model.predict(X_test))

        initial_accuracy = accuracy_of(automl)
        self.assertTrue(initial_accuracy > 0.75)

        # Round trip through pickle.
        pickle_path = os.path.join(workdir, 'automl.dump.pkl')
        with open(pickle_path, 'wb') as sink:
            pickle.dump(automl, sink)
        with open(pickle_path, 'rb') as source:
            from_pickle = pickle.load(source)

        pickled_accuracy = accuracy_of(from_pickle)
        self.assertTrue(pickled_accuracy > 0.75)
        self.assertEqual(initial_accuracy, pickled_accuracy)

        # Round trip through joblib.
        # NOTE(review): ``sklearn.externals.joblib`` was removed in
        # scikit-learn 0.23; newer environments need the ``joblib`` package.
        joblib_path = os.path.join(workdir, 'automl.dump.joblib')
        sklearn.externals.joblib.dump(automl, joblib_path)
        from_joblib = sklearn.externals.joblib.load(joblib_path)

        joblib_accuracy = accuracy_of(from_joblib)
        self.assertTrue(joblib_accuracy > 0.75)
        self.assertEqual(initial_accuracy, joblib_accuracy)
Beispiel #32
0
    def test_fit_n_jobs_2(self):
        """Fit with ``n_jobs=2`` and inspect the files left in the backend.

        The number of prediction files must equal the number of entries in
        ``cv_results_['mean_test_score']`` plus two, the prediction file
        names must carry two distinct seeds, and the ensemble file names
        exactly one.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('breast_cancer')

        # test parallel Classifier to predict classes, not only indices
        Y_train += 1
        Y_test += 1

        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=5,
            n_jobs=2,
            include_estimators=['sgd'],
            include_preprocessors=['no_preprocessing'],
        )
        automl.fit(X_train, Y_train)
        n_runs = len(automl.cv_results_['mean_test_score'])

        predictions = os.listdir(
            automl._automl[0]._backend._get_prediction_output_dir('ensemble')
        )
        # two instances of the dummy
        self.assertEqual(n_runs, len(predictions) - 2, msg=str(predictions))

        # The seed is the third '_'-separated field of the file stem.
        prediction_seeds = {
            int(name.split('.')[0].split('_')[2]) for name in predictions
        }
        self.assertEqual(len(prediction_seeds), 2)

        ensembles = os.listdir(automl._automl[0]._backend.get_ensemble_dir())

        # For ensemble files the seed is the first '_'-separated field.
        ensemble_seeds = {
            int(name.split('.')[0].split('_')[0]) for name in ensembles
        }
        self.assertEqual(len(ensemble_seeds), 1)
Beispiel #33
0
def spawn_autosklearn_classifier(X_train, y_train, seed, dataset_name,
                                 time_left_for_this_task, per_run_time_limit,
                                 feat_type, memory_limit, atsklrn_tempdir):
    """Create and fit one shared-mode ``AutoSklearnClassifier`` with logging.

    The classifier uses ``atsklrn_tempdir`` as both tmp and output folder.
    Before fitting, the function sleeps for ``seed`` seconds. Any exception
    from construction or from ``fit`` is logged and re-raised; a ``fit``
    failure is logged by both the inner and the outer handler, and its
    traceback is also printed.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        seed: Classifier seed; also the sleep duration before fitting.
        dataset_name: Passed to ``fit`` as ``dataset_name``.
        time_left_for_this_task: Forwarded to the classifier.
        per_run_time_limit: Forwarded to the classifier.
        feat_type: Passed to ``fit`` as ``feat_type``.
        memory_limit: Forwarded to the classifier as ``ml_memory_limit``.
        atsklrn_tempdir: Folder used as ``tmp_folder`` and ``output_folder``.

    Returns:
        None.
    """
    log = utl.get_logger(inspect.stack()[0][3])

    try:
        log.info("Start AutoSklearnClassifier seed=" + str(seed))
        settings = dict(
            time_left_for_this_task=time_left_for_this_task,
            per_run_time_limit=per_run_time_limit,
            ml_memory_limit=memory_limit,
            shared_mode=True,
            tmp_folder=atsklrn_tempdir,
            output_folder=atsklrn_tempdir,
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
            seed=seed,
        )
        classifier = AutoSklearnClassifier(**settings)
    except Exception:
        log.exception("Exception AutoSklearnClassifier seed=" + str(seed))
        raise

    # The logger is re-fetched after each potentially long-running step.
    log = utl.get_logger(inspect.stack()[0][3])
    log.info("Done AutoSklearnClassifier seed=" + str(seed))

    sleep(seed)

    try:
        log.info("Starting seed=" + str(seed))
        try:
            classifier.fit(
                X_train,
                y_train,
                metric=autosklearn.metrics.f1,
                feat_type=feat_type,
                dataset_name=dataset_name,
            )
        except Exception:
            log = utl.get_logger(inspect.stack()[0][3])
            log.exception("Error in clf.fit - seed:" + str(seed))
            raise
    except Exception:
        log = utl.get_logger(inspect.stack()[0][3])
        log.exception("Exception in seed=" + str(seed) + ".  ")
        traceback.print_exc()
        raise

    log = utl.get_logger(inspect.stack()[0][3])
    log.info("####### Finished seed=" + str(seed))
    return None
    def test_can_pickle_classifier(self):
        """A fitted classifier must survive pickle and joblib round trips.

        After fitting on iris the estimator must reach at least 0.75
        accuracy and ``self._count_succeses(automl.cv_results_)`` must be
        positive. It is then dumped and restored once with ``pickle`` and
        once with ``joblib``; each restored copy must reach at least 0.75
        and exactly match the accuracy of the original.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
        output = os.path.join(self.test_dir, '..', '.out_can_pickle')
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            tmp_folder=tmp,
            output_folder=output,
        )
        automl.fit(X_train, Y_train)

        def accuracy_of(model):
            """Accuracy of ``model`` on the held-out iris split."""
            return sklearn.metrics.accuracy_score(
                Y_test, model.predict(X_test))

        initial_accuracy = accuracy_of(automl)
        self.assertGreaterEqual(initial_accuracy, 0.75)
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)

        # Round trip through pickle.
        pickle_path = os.path.join(output, 'automl.dump.pkl')
        with open(pickle_path, 'wb') as sink:
            pickle.dump(automl, sink)
        with open(pickle_path, 'rb') as source:
            from_pickle = pickle.load(source)

        pickled_accuracy = accuracy_of(from_pickle)
        self.assertGreaterEqual(pickled_accuracy, 0.75)
        self.assertEqual(initial_accuracy, pickled_accuracy)

        # Round trip through joblib.
        joblib_path = os.path.join(output, 'automl.dump.joblib')
        joblib.dump(automl, joblib_path)
        from_joblib = joblib.load(joblib_path)

        joblib_accuracy = accuracy_of(from_joblib)
        self.assertGreaterEqual(joblib_accuracy, 0.75)
        self.assertEqual(initial_accuracy, joblib_accuracy)
Beispiel #35
0
def spawn_autosklearn_classifier(X_train, y_train, seed, dataset_name,
                                 time_left_for_this_task, per_run_time_limit,
                                 feat_type):
    """Create and fit one shared-mode ``AutoSklearnClassifier``.

    ``memory_limit`` and ``atsklrn_tempdir`` are read from the enclosing
    scope; the latter is used as both tmp and output folder. Before fitting,
    the function sleeps for ``seed`` seconds. Progress is reported through
    ``p``.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        seed: Classifier seed; also the sleep duration before fitting.
        dataset_name: Passed to ``fit`` as ``dataset_name``.
        time_left_for_this_task: Forwarded to the classifier.
        per_run_time_limit: Forwarded to the classifier.
        feat_type: Passed to ``fit`` as ``feat_type``.

    Raises:
        Exception: Whatever ``fit`` raised, after it has been reported and
            its traceback printed.
    """
    c = AutoSklearnClassifier(
        time_left_for_this_task=time_left_for_this_task,
        per_run_time_limit=per_run_time_limit,
        ml_memory_limit=memory_limit,
        shared_mode=True,
        tmp_folder=atsklrn_tempdir,
        output_folder=atsklrn_tempdir,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        initial_configurations_via_metalearning=0,
        ensemble_size=0,
        seed=seed,
    )
    sleep(seed)
    try:
        p("Starting seed=" + str(seed))
        c.fit(X_train, y_train, metric='f1_metric', feat_type=feat_type,
              dataset_name=dataset_name)
        p("####### Finished seed=" + str(seed))
    except Exception:
        p("Exception in seed=" + str(seed) + ".  ")
        traceback.print_exc()
        # Re-raise from inside the handler. A bare ``raise`` placed after the
        # try/except has no active exception and fails with ``RuntimeError``
        # on every call, including successful ones.
        raise
Beispiel #36
0
    def test_feat_type_wrong_arguments(self):
        """Invalid ``feat_type`` arguments must make ``fit`` raise ``ValueError``.

        Three cases are checked on a 100x100 zero matrix: a list of the wrong
        length, a list of non-string entries, and a list of strings that are
        not accepted feature types.
        """
        cls = AutoSklearnClassifier()
        X = np.zeros((100, 100))
        y = np.zeros((100, ))

        # ``assertRaisesRegex`` replaces the deprecated ``assertRaisesRegexp``
        # alias, which no longer exists on Python 3.12+.
        self.assertRaisesRegex(
            ValueError,
            'Array feat_type does not have same number of '
            'variables as X has features. 1 vs 100.',
            cls.fit,
            X=X,
            y=y,
            feat_type=[True],
        )

        self.assertRaisesRegex(
            ValueError,
            'Array feat_type must only contain strings.',
            cls.fit,
            X=X,
            y=y,
            feat_type=[True] * 100,
        )

        self.assertRaisesRegex(
            ValueError,
            'Only `Categorical` and `Numerical` are '
            'valid feature types, you passed `Car`',
            cls.fit,
            X=X,
            y=y,
            feat_type=['Car'] * 100,
        )
    def test_can_pickle_classifier(self):
        """A fitted classifier must survive pickle and joblib round trips.

        After fitting on iris the estimator must reach at least 0.75
        accuracy. It is then dumped and restored once with ``pickle`` and
        once with joblib; each restored copy must reach at least 0.75 and
        exactly match the accuracy of the original.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_can_pickle')
        output = os.path.join(self.test_dir, '..', '.out_can_pickle')
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            tmp_folder=tmp,
            output_folder=output,
        )
        automl.fit(X_train, Y_train)

        def accuracy_of(model):
            """Accuracy of ``model`` on the held-out iris split."""
            return sklearn.metrics.accuracy_score(
                Y_test, model.predict(X_test))

        initial_accuracy = accuracy_of(automl)
        self.assertGreaterEqual(initial_accuracy, 0.75)

        # Round trip through pickle.
        pickle_path = os.path.join(output, 'automl.dump.pkl')
        with open(pickle_path, 'wb') as sink:
            pickle.dump(automl, sink)
        with open(pickle_path, 'rb') as source:
            from_pickle = pickle.load(source)

        pickled_accuracy = accuracy_of(from_pickle)
        self.assertGreaterEqual(pickled_accuracy, 0.75)
        self.assertEqual(initial_accuracy, pickled_accuracy)

        # Round trip through joblib.
        # NOTE(review): ``sklearn.externals.joblib`` was removed in
        # scikit-learn 0.23; newer environments need the ``joblib`` package.
        joblib_path = os.path.join(output, 'automl.dump.joblib')
        sklearn.externals.joblib.dump(automl, joblib_path)
        from_joblib = sklearn.externals.joblib.load(joblib_path)

        joblib_accuracy = accuracy_of(from_joblib)
        self.assertGreaterEqual(joblib_accuracy, 0.75)
        self.assertEqual(initial_accuracy, joblib_accuracy)
def main():
    """Run four worker processes on digits, then build and score an ensemble.

    The digits data is split with ``random_state=1``. Four processes are
    started with the callable returned by
    ``get_spawn_classifier(X_train, y_train)`` and joined. Afterwards a
    shared-mode classifier pointing at ``tmp_folder`` / ``output_folder``
    (read from the enclosing scope) runs ``fit_ensemble``, and the model
    description and test accuracy are printed.
    """
    X, y = sklearn.datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = (
        sklearn.model_selection.train_test_split(X, y, random_state=1)
    )

    worker_target = get_spawn_classifier(X_train, y_train)
    workers = []
    for worker_seed in range(4):  # set this at roughly half of your cores
        worker = multiprocessing.Process(
            target=worker_target,
            args=(worker_seed, 'digits'),
        )
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()

    print('Starting to build an ensemble!')
    ensemble_settings = dict(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )
    automl = AutoSklearnClassifier(**ensemble_settings)

    # Both the ensemble_size and ensemble_nbest parameters can be changed now
    # if necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        precision='32',
        dataset_name='digits',
        ensemble_size=20,
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score",
          sklearn.metrics.accuracy_score(y_test, predictions))
Beispiel #39
0
    def test_fit(self):
        """Fit on iris and require a test score of at least 0.8.

        One folder serves as both tmp and output folder. The task stored on
        the inner automl object must be ``MULTICLASS_CLASSIFICATION``. The
        folder is torn down at the end.
        """
        workdir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(workdir)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=5,
            tmp_folder=workdir,
            output_folder=workdir,
        )
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(
            automl._automl._automl._task,
            MULTICLASS_CLASSIFICATION,
        )

        # Drop the estimator before removing its working folder.
        del automl
        self._tearDown(workdir)
    def test_multilabel(self):
        """Fit on the multilabel variant of iris and check the outputs.

        Predictions on the test split must have shape ``(50, 3)`` and a
        macro F1 of at least 0.9; the mean predicted probability on the
        training split must be 0.33 to one decimal place.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_multilabel_fit')
        output = os.path.join(self.test_dir, '..', '.out_multilabel_fit')
        for folder in (tmp, output):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris',
            make_multilabel=True,
        )
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            tmp_folder=tmp,
            output_folder=output,
        )
        automl.fit(X_train, Y_train)

        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, 3))
        self.assertGreaterEqual(f1_macro(Y_test, predictions), 0.9)

        mean_probability = np.mean(automl.predict_proba(X_train))
        self.assertAlmostEqual(mean_probability, 0.33, places=1)
    def test_fit(self):
        """Fit on iris and require a test score of at least 0.8.

        The test is skipped when ``self.travis`` is truthy. One folder serves
        as both tmp and output folder. The estimator's ``_task`` must be
        ``MULTICLASS_CLASSIFICATION``. The folder is torn down at the end.
        """
        if self.travis:
            self.skipTest('This test does currently not run on travis-ci. '
                          'Make sure it runs locally on your machine!')

        workdir = os.path.join(self.test_dir, '..', '.tmp_estimator_fit')
        self._setUp(workdir)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')
        automl = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=15,
            tmp_folder=workdir,
            output_folder=workdir,
        )
        automl.fit(X_train, Y_train)
        score = automl.score(X_test, Y_test)
        print(automl.show_models())

        self.assertGreaterEqual(score, 0.8)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        # Drop the estimator before removing its working folder.
        del automl
        self._tearDown(workdir)
    def test_binary(self):
        """Fit on the binary variant of iris, passing the test split to ``fit``.

        Predictions on the test split must have shape ``(50,)`` and an
        accuracy of at least 0.9. The output folder must afterwards contain
        ``binary_test_dataset_test_1.predict``.
        """
        # NOTE(review): the '.out_' / '.tmp_' prefixes look swapped relative
        # to the variable names; the literals are kept exactly as they were.
        tmp = os.path.join(self.test_dir, '..', '.out_binary_fit')
        output = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
        for folder in (output, tmp):
            self._setUp(folder)

        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'iris',
            make_binary=True,
        )
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            tmp_folder=tmp,
            output_folder=output,
        )
        automl.fit(
            X_train,
            Y_train,
            X_test=X_test,
            y_test=Y_test,
            dataset_name='binary_test_dataset',
        )

        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, ))
        self.assertGreaterEqual(accuracy(Y_test, predictions), 0.9)

        self.assertIn('binary_test_dataset_test_1.predict',
                      os.listdir(output))
Beispiel #43
0
def spawn_classifier(seed, dataset_name):
    """Fit one shared-mode ``AutoSklearnClassifier`` on shuffled digits data.

    The digits dataset is loaded and shuffled with NumPy's global RNG (no
    seed is set here), and the first 1000 rows are used for training.
    ``tmp_folder`` and ``output_folder`` are read from the enclosing scope.
    Nothing is returned.

    Args:
        seed: Passed to the classifier as ``seed``.
        dataset_name: Passed to ``fit`` as ``dataset_name``.
    """
    digits = sklearn.datasets.load_digits()
    X = digits.data
    y = digits.target
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    # Only the training part is needed here; the former ``X_test`` /
    # ``y_test`` slices were never used and have been dropped.
    X_train = X[:1000]
    y_train = y[:1000]

    automl = AutoSklearnClassifier(time_left_for_this_task=60,
                                   per_run_time_limit=60,
                                   ml_memory_limit=1024,
                                   shared_mode=True,
                                   tmp_folder=tmp_folder,
                                   output_folder=output_folder,
                                   delete_tmp_folder_after_terminate=False,
                                   ensemble_size=0,
                                   initial_configurations_via_metalearning=0,
                                   seed=seed)
    automl.fit(X_train, y_train, dataset_name=dataset_name)
 def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
     """List inputs must reach the underlying calls as ``np.ndarray``.

     ``fit``, ``refit`` and ``fit_ensemble`` are call-recording stand-ins
     (they expose ``call_count`` and ``call_args``); presumably they are
     injected by patch decorators that are not visible here.
     """
     automl = AutoSklearnClassifier()
     X = [[1], [2], [3]]
     y = [1, 2, 3]

     automl.fit(X, y)
     self.assertEqual(fit.call_count, 1)
     for position in (0, 1):
         self.assertIsInstance(fit.call_args[0][position], np.ndarray)

     automl.refit(X, y)
     self.assertEqual(refit.call_count, 1)
     for position in (0, 1):
         self.assertIsInstance(refit.call_args[0][position], np.ndarray)

     automl.fit_ensemble(y)
     self.assertEqual(fit_ensemble.call_count, 1)
     self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
    'initial_configurations_via_metalearning': 0,
    'ensemble_size': 0,
    'ensemble_nbest': 0,
    'seed': seed,
    'ml_memory_limit': 3072,
    'resampling_strategy': 'partial-cv',
    'resampling_strategy_arguments': {'folds': 10},
    'delete_tmp_folder_after_terminate': False,
    'tmp_folder': tmp_dir,
    'disable_evaluator_output': True,
}

# Load the data for the task; ``cat`` is handed to ``fit`` below as
# ``feat_type``.
X_train, y_train, X_test, y_test, cat = load_task(task_id)

# Choose the estimator class and the metric from the task type; any other
# task type is rejected.
if task_type == 'classification':
    automl = AutoSklearnClassifier(**automl_arguments)
    metric = balanced_accuracy
elif task_type == 'regression':
    automl = AutoSklearnRegressor(**automl_arguments)
    metric = r2
else:
    raise ValueError(task_type)

automl.fit(X_train, y_train, dataset_name=str(task_id), metric=metric,
           feat_type=cat)
# Reload the datamanager written by the run so the test split can be
# attached to it.
data = automl._automl._backend.load_datamanager()
# Data manager can't be replaced with save_datamanager, it has to be deleted
# first
os.remove(automl._automl._backend._get_datamanager_pickle_filename())
data.data['X_test'] = X_test
data.data['Y_test'] = y_test
Beispiel #46
0
    processes = []
    for i in range(4):  # set this at roughly half of your cores
        p = multiprocessing.Process(target=spawn_classifier, args=(i, "digits"))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print("Starting to build an ensemble!")
    automl = AutoSklearnClassifier(
        time_left_for_this_task=15,
        per_run_time_limit=15,
        ml_memory_limit=1024,
        shared_mode=True,
        ensemble_size=50,
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # Both the ensemble_size and ensemble_nbest parameters can be changed now if
    # necessary
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=ACC_METRIC,
        precision="32",
        dataset_name="digits",
        ensemble_size=20,
    def test_fit_pSMAC(self):
        """Shared-mode ensemble building must pick up a hand-planted model.

        Steps:
        1. Fit a first shared-mode classifier (seed 1, no ensemble) on digits.
        2. Write one-hot "dummy" ensemble predictions, derived from the stored
           true targets with the last entry altered, into the shared tmp
           folder, and save a matching ``ArrayReturningDummyPredictor``
           through the backend.
        3. Create a second shared-mode classifier (seed 2) on the same
           folders and call ``fit_ensemble``.
        4. Check that exactly one ensemble file exists, that test accuracy is
           at least 0.90, that the task is ``MULTICLASS_CLASSIFICATION`` and
           that the dummy predictor is among the loaded models.
        """
        tmp = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        output = os.path.join(self.test_dir, '..', '.out_estimator_fit_pSMAC')
        self._setUp(tmp)
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('digits')

        # test parallel Classifier to predict classes, not only indexes
        Y_train += 1
        Y_test += 1

        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl.fit(X_train, Y_train)
        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correct
        true_targets_ensemble_path = os.path.join(tmp, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        with open(true_targets_ensemble_path, 'rb') as fh:
            true_targets_ensemble = np.load(fh)
        # Alter the last target so the dummy is wrong on exactly that entry.
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        true_targets_ensemble = true_targets_ensemble.astype(int)
        probas = np.zeros((len(true_targets_ensemble), 10), dtype=float)

        # One-hot encode the (altered) targets as class probabilities.
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        dummy_predictions_path = os.path.join(
            tmp,
            '.auto-sklearn',
            'predictions_ensemble',
            'predictions_ensemble_1_00030.npy',
        )
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        # One-hot test probabilities; ``value - 1`` undoes the +1 label shift
        # applied to ``Y_test`` above.
        probas_test = np.zeros((len(Y_test), 10), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value - 1] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        context = BackendContext(tmp, output, False, False, True)
        backend = Backend(context)
        # NOTE(review): 30 and 1 presumably correspond to the ``_1_00030``
        # suffix of the predictions file written above -- confirm against
        # ``Backend.save_model``.
        backend.save_model(dummy, 30, 1)

        # Second classifier on the same folders, different seed; it only
        # builds the ensemble.
        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            output_folder=output,
            tmp_folder=tmp,
            shared_mode=True,
            seed=2,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        # NOTE(review): ``dataset_name='iris'`` although the data loaded
        # above is 'digits' -- confirm whether the name matters here.
        automl.fit_ensemble(Y_train, task=MULTICLASS_CLASSIFICATION,
                            metric=accuracy,
                            precision='32',
                            dataset_name='iris',
                            ensemble_size=20,
                            ensemble_nbest=50,
                            )

        predictions = automl.predict(X_test)
        score = sklearn.metrics.accuracy_score(Y_test, predictions)

        self.assertEqual(len(os.listdir(os.path.join(tmp, '.auto-sklearn',
                                                     'ensembles'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._automl._task, MULTICLASS_CLASSIFICATION)

        # The planted dummy must be among the models of the ensemble run.
        models = automl._automl.models_
        classifier_types = [type(c) for c in models.values()]
        self.assertIn(ArrayReturningDummyPredictor, classifier_types)

        del automl
        self._tearDown(tmp)
        self._tearDown(output)
    y_test = y[1000:]

    processes = []
    for i in range(4): # set this at roughly half of your cores
        p = multiprocessing.Process(target=spawn_classifier, args=(i, 'digits'))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                   per_run_time_limit=15,
                                   ml_memory_limit=1024,
                                   shared_mode=True,
                                   ensemble_size=50,
                                   ensemble_nbest=200,
                                   tmp_folder=tmp_folder,
                                   output_folder=output_folder,
                                   initial_configurations_via_metalearning=0,
                                   seed=1)

    # Both the ensemble_size and ensemble_nbest parameters can be changed later
    automl.fit_ensemble(task=MULTICLASS_CLASSIFICATION,
                        metric=ACC_METRIC,
                        precision='32',
                        dataset_name='digits',
                        ensemble_size=10,
                        ensemble_nbest=10)

    predictions = automl.predict(X_test)
    print(automl.show_models())