Example #1
0
    def test_pSMAC_wrong_arguments(self):
        """Expect a ValueError when shared_mode is on but a folder is missing."""
        features = np.zeros((100, 100))
        targets = np.zeros((100, ))

        def fit_with(**estimator_kwargs):
            # Construction and fit both happen inside the callable so that an
            # error raised by either step reaches assertRaisesRegex.
            return AutoSklearnClassifier(**estimator_kwargs).fit(
                features, targets)

        # Only shared_mode given: the message is expected to name tmp_folder.
        self.assertRaisesRegex(
            ValueError,
            "If shared_mode == True tmp_folder must not be None.",
            fit_with,
            shared_mode=True,
        )

        # tmp_folder given as well: the message is expected to name
        # output_folder instead.
        self.assertRaisesRegex(
            ValueError,
            "If shared_mode == True output_folder must not be None.",
            fit_with,
            shared_mode=True,
            tmp_folder='/tmp/duitaredxtvbedb',
        )
Example #2
0
    def test_metadata_directory(self):
        """Check how the ``metadata_directory`` argument is stored and checked.

        A user-supplied value must be kept as given, the attribute must be
        ``None`` when nothing is passed, and fitting with a directory name
        that does not exist is expected to raise a ``ValueError``.
        """
        budget = {'time_left_for_this_task': 30, 'per_run_time_limit': 5}

        # Case 1: user-specified metadata directory.
        user_dir = "pyMetaLearn/metadata_dir"
        with_user_dir = AutoSklearnClassifier(metadata_directory=user_dir,
                                              **budget)
        self.assertEqual(with_user_dir.metadata_directory, user_dir)

        # Case 2: nothing specified, the attribute stays None.
        with_default = AutoSklearnClassifier(**budget)
        self.assertIsNone(with_default.metadata_directory)

        # Case 3: user-specified directory that does not exist.
        nonexistent_dir = "nonexistent_dir"
        with_missing_dir = AutoSklearnClassifier(
            metadata_directory=nonexistent_dir, **budget)
        X, y = load_breast_cancer(return_X_y=True)
        expected_message = ("The specified metadata directory "
                            "'%s' does not exist!" % nonexistent_dir)
        with self.assertRaisesRegex(ValueError, expected_message):
            with_missing_dir.fit(X=X, y=y)
Example #3
0
    def test_fit_pSMAC(self):
        """Fit two shared-mode classifiers on one folder and build an ensemble.

        The first classifier (seed 1) populates the shared folder. A
        hand-made dummy model with near-perfect predictions is then written
        into that folder, a second classifier (seed 2) is fitted on the same
        folder, and the ensemble built afterwards is expected to score at
        least 0.90 on the test split.
        """
        # The same directory serves as tmp and output folder for both runs.
        output = os.path.join(self.test_dir, '..', '.tmp_estimator_fit_pSMAC')
        self._setUp(output)

        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=1,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)

        # Create a 'dummy model' for the first run, which has an accuracy of
        # more than 99%; it should be in the final ensemble if the ensemble
        # building of the second AutoSklearn classifier works correctly
        true_targets_ensemble_path = os.path.join(output, '.auto-sklearn',
                                                  'true_targets_ensemble.npy')
        true_targets_ensemble = np.load(true_targets_ensemble_path)
        # Change the last label so the dummy predictions disagree with the
        # stored targets in exactly one position.
        true_targets_ensemble[-1] = 1 if true_targets_ensemble[-1] != 1 else 0
        # One-hot encode the (altered) targets as class probabilities.
        probas = np.zeros((len(true_targets_ensemble), 3), dtype=float)
        for i, value in enumerate(true_targets_ensemble):
            probas[i, value] = 1.0
        # NOTE(review): the file name presumably encodes seed 1 / run 30,
        # matching the save_model call below - confirm against Backend.
        dummy_predictions_path = os.path.join(output, '.auto-sklearn',
                                              'predictions_ensemble',
                                              'predictions_ensemble_1_00030.npy')
        with open(dummy_predictions_path, 'wb') as fh:
            np.save(fh, probas)

        # One-hot encode the true test labels; the dummy predictor returns
        # this array.
        probas_test = np.zeros((len(Y_test), 3), dtype=float)
        for i, value in enumerate(Y_test):
            probas_test[i, value] = 1.0

        dummy = ArrayReturningDummyPredictor(probas_test)
        backend = Backend(output, output)
        # NOTE(review): positional arguments are presumably (model, run
        # number, seed) - confirm against Backend.save_model.
        backend.save_model(dummy, 30, 1)

        # Second run on the same shared folder, with a different seed.
        automl = AutoSklearnClassifier(time_left_for_this_task=15,
                                       per_run_time_limit=15,
                                       output_folder=output,
                                       tmp_folder=output,
                                       shared_mode=True,
                                       seed=2,
                                       initial_configurations_via_metalearning=0,
                                       ensemble_size=0)
        automl.fit(X_train, Y_train)
        # Both fits used ensemble_size=0, so the ensemble is built explicitly
        # here; wait() blocks until the builder has finished.
        # NOTE(review): meaning of the positional arguments (0, 1, 50) is not
        # visible here - confirm against run_ensemble_builder.
        automl.run_ensemble_builder(0, 1, 50).wait()

        score = automl.score(X_test, Y_test)

        # Exactly one ensemble index file is expected after the explicit build.
        self.assertEqual(len(os.listdir(os.path.join(output, '.auto-sklearn',
                                                     'ensemble_indices'))), 1)
        self.assertGreaterEqual(score, 0.90)
        self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION)

        del automl
        self._tearDown(output)
Example #4
0
    def test_pSMAC_wrong_arguments(self):
        """Expect a ValueError when shared_mode is on but a folder is missing.

        Uses ``assertRaisesRegex``; the former ``assertRaisesRegexp`` spelling
        is a deprecated alias that no longer exists in Python 3.12.
        """
        def fit_with(**estimator_kwargs):
            # Construction and fit both happen inside the callable so that an
            # error raised by either step reaches assertRaisesRegex.
            return AutoSklearnClassifier(**estimator_kwargs).fit(None, None)

        # Only shared_mode given: the message is expected to name tmp_folder.
        self.assertRaisesRegex(ValueError,
                               "If shared_mode == True tmp_folder must not "
                               "be None.",
                               fit_with,
                               shared_mode=True)

        # tmp_folder given as well: the message is expected to name
        # output_folder instead.
        self.assertRaisesRegex(ValueError,
                               "If shared_mode == True output_folder must not "
                               "be None.",
                               fit_with,
                               shared_mode=True,
                               tmp_folder='/tmp/duitaredxtvbedb')
def goldstone_autosklearn():
    """Fit auto-sklearn on the Goldstone table and print its accuracy.

    The CSV is read from a fixed path, preprocessed with ``preprocess``,
    fitted for up to five minutes, refitted on the transformed full table,
    and finally scored on that same table.
    """
    data_path = (
        '/home/shoe/automl_scores/TR13a_Goldstone_Table_1_Full_problem_TRAIN/13-11-2019 01:54:44/splits/all.csv'
    )
    frame = pd.read_csv(data_path)

    predictors = [
        "sftptv2a3", "sftptv2a4", "sftptv2a5", "sftptv2a2", "sftptv2a6",
        "logim", "maccat", "disp4cat", "stratidc"
    ]
    target = 'sftpcons'

    automl = AutoSklearnClassifier(time_left_for_this_task=60 * 5)

    problem_spec = {
        'problem': {
            "predictors": predictors,
            'targets': [target],
            'categorical': [],
        }
    }
    stimulus, preprocessor = preprocess(frame, problem_spec)
    automl.fit(stimulus, frame[target])

    # Refit and evaluate on the complete, transformed table.
    stimulus_all = preprocessor.transform(frame)
    automl.refit(stimulus_all, frame[target])

    predicted = automl.predict(stimulus_all)
    print(accuracy_score(frame[target], predicted))
def test_cv_results(tmp_dir, output_dir):
    """Fit on iris and check the container types exposed by ``cv_results_``."""
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    settings = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 5,
        'tmp_folder': tmp_dir,
        'output_folder': output_dir,
        'seed': 1,
        'initial_configurations_via_metalearning': 0,
        'ensemble_size': 0,
    }
    cls = AutoSklearnClassifier(**settings)
    cls.fit(X_train, Y_train)
    cv_results = cls.cv_results_

    assert isinstance(cv_results, dict), type(cv_results)
    # Fixed columns and the container type each one must have.
    expected_types = (
        ('mean_test_score', np.ndarray),
        ('mean_fit_time', np.ndarray),
        ('params', list),
        ('rank_test_scores', np.ndarray),
    )
    for column, expected in expected_types:
        assert isinstance(cv_results[column],
                          expected), type(cv_results[column])

    # Every per-hyperparameter column must be a masked array.
    param_columns_ok = []
    for key, val in cv_results.items():
        if key.startswith('param_'):
            param_columns_ok.append(isinstance(val, npma.MaskedArray))
    assert all(param_columns_ok), cv_results.items()
Example #7
0
def test_binary(tmp_dir, output_dir, dask_client):
    """Fit on the binary iris variant and check predictions and output files."""
    dataset = putil.get_dataset('iris', make_binary=True)
    X_train, Y_train, X_test, Y_test = dataset

    automl = AutoSklearnClassifier(
        time_left_for_this_task=40,
        per_run_time_limit=10,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
    )
    automl.fit(
        X_train,
        Y_train,
        X_test=X_test,
        y_test=Y_test,
        dataset_name='binary_test_dataset',
    )

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, ), print_debug_information(automl)

    score = accuracy(Y_test, predictions)
    assert score > 0.9, print_debug_information(automl)

    successes = count_succeses(automl.cv_results_)
    assert successes > 0, print_debug_information(automl)

    # At least one test-prediction file must exist for the dataset name
    # passed to fit().
    pattern = os.path.join(output_dir, 'binary_test_dataset_test_*.predict')
    output_files = glob.glob(pattern)
    assert len(output_files) > 0, (output_files,
                                   print_debug_information(automl))
Example #8
0
 def fit_autosk_trial(self, trial, metric, **kwargs):
     """Fit an auto-sklearn classifier for ``trial`` and attach it to the trial.

     Args:
         trial: Object providing ``number`` and ``clf_params``; the fitted
             classifier is stored on it as ``clf``.
         metric: Forwarded to ``AutoSklearnClassifier.fit``.
         **kwargs: Accepted but not used by this method.

     Returns:
         The same ``trial`` object, with ``clf`` set.
     """
     # TODO metrics to trial
     classifier = AutoSklearnClassifier(**trial.clf_params)
     classifier.fit(self.storage.X_train,
                    self.storage.y_train,
                    metric=metric)

     holdout_strategies = ('holdout', 'holdout-iterative-fit')
     needs_refit = classifier.resampling_strategy not in holdout_strategies
     if needs_refit:
         # Any other resampling strategy needs an explicit refit before the
         # classifier can predict.
         self.logger.warning(
             'Predict is currently not implemented for resampling strategy, refit it.'
         )
         self.logger.warning(
             'we call refit() which trains all models in the final ensemble on the whole dataset.'
         )
         classifier.refit(self.storage.X_train, self.storage.y_train)
         statistics = classifier.sprint_statistics()
         self.logger.info('Trial#{0} info :{1}'.format(
             trial.number, statistics))

     trial.clf = classifier
     return trial
Example #9
0
def test_multilabel(tmp_dir, output_dir, dask_client):
    """Fit on the multilabel iris variant and check predictions and history."""
    dataset = putil.get_dataset('iris', make_multilabel=True)
    X_train, Y_train, X_test, Y_test = dataset

    automl = AutoSklearnClassifier(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
    )
    automl.fit(X_train, Y_train)

    predictions = automl.predict(X_test)
    assert predictions.shape == (50, 3), print_debug_information(automl)

    successes = count_succeses(automl.cv_results_)
    assert successes > 0, print_debug_information(automl)

    history_columns = automl.performance_over_time_.columns
    assert includes_train_scores(history_columns) is True
    history_ok = performance_over_time_is_plausible(
        automl.performance_over_time_)
    assert history_ok is True

    score = f1_macro(Y_test, predictions)
    assert score >= 0.9, print_debug_information(automl)

    # The mean predicted probability must be close to 0.33 (10% relative
    # tolerance).
    probs = automl.predict_proba(X_train)
    assert np.mean(probs) == pytest.approx(0.33, rel=1e-1)
Example #10
0
 def __init__(self, name: str, model_params: Dict[str, Any]) -> None:
     """Initialise the parent class and build the wrapped classifier.

     Args:
         name: Forwarded unchanged to the parent constructor.
         model_params: Forwarded to the parent constructor and also expanded
             into keyword arguments for ``AutoSklearnClassifier``.
     """
     super().__init__(name, model_params)
     classifier = AutoSklearnClassifier(**model_params)
     self._model = classifier
Example #11
0
    def test_grid_scores(self):
        """Check ``grid_scores_`` against a hand-made one-entry run history."""
        output = os.path.join(self.test_dir, '..', '.tmp_grid_scores')
        self._setUp(output)

        estimator = AutoSklearnClassifier(
            time_left_for_this_task=15,
            per_run_time_limit=15,
            output_folder=output,
            tmp_folder=output,
            shared_mode=False,
            seed=1,
            initial_configurations_via_metalearning=0,
            ensemble_size=0,
        )
        automl = estimator.build_automl()._automl
        # Replace the SMAC process with a mock so no optimisation is run.
        automl._proc_smac = mock.MagicMock()

        # Minimal stand-ins for the run history's key and value records.
        RunKey = collections.namedtuple(
            'RunKey', ['config_id', 'instance_id', 'seed'])
        RunValue = collections.namedtuple(
            'RunValue', ['cost', 'time', 'status', 'additional_info'])

        automl._proc_smac.runhistory.data = {
            RunKey(1, 1, 1): RunValue(1, 1, 1, ''),
        }
        first_entry = automl.grid_scores_[0]

        self.assertIsInstance(first_entry, _CVScoreTuple)
        # In the runhistory we store losses, thus the score is zero
        self.assertEqual(first_entry.mean_validation_score, 0)
        self.assertEqual(first_entry.cv_validation_scores, [0])
        self.assertIsInstance(first_entry.parameters, mock.MagicMock)

        del automl
        self._tearDown(output)
Example #12
0
    def test_feat_type_wrong_arguments(self):
        """Expect ``fit`` to reject malformed ``feat_type`` arguments.

        Uses ``assertRaisesRegex``; the former ``assertRaisesRegexp`` spelling
        is a deprecated alias that no longer exists in Python 3.12.
        """
        cls = AutoSklearnClassifier()
        X = np.zeros((100, 100))
        y = np.zeros((100, ))

        # Each case pairs an invalid feat_type with the expected message.
        cases = (
            # Wrong length: one entry for 100 features.
            ([True],
             'Array feat_type does not have same number of '
             'variables as X has features. 1 vs 100.'),
            # Right length, but the entries are not strings.
            ([True] * 100,
             'Array feat_type must only contain strings.'),
            # Strings, but not one of the accepted type names.
            (['Car'] * 100,
             'Only `Categorical` and `Numerical` are '
             'valid feature types, you passed `Car`'),
        )
        for feat_type, expected_message in cases:
            self.assertRaisesRegex(ValueError,
                                   expected_message,
                                   cls.fit,
                                   X=X,
                                   y=y,
                                   feat_type=feat_type)
Example #13
0
    def test_cv_results(self):
        """Fit on iris and check the container types exposed by ``cv_results_``."""
        # TODO restructure and actually use real SMAC output from a long run
        # to do this unittest!
        tmp = os.path.join(self.test_dir, '..', '.tmp_cv_results')
        output = os.path.join(self.test_dir, '..', '.out_cv_results')
        self._setUp(tmp)
        self._setUp(output)
        X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

        cls = AutoSklearnClassifier(time_left_for_this_task=20,
                                    per_run_time_limit=5,
                                    output_folder=output,
                                    tmp_folder=tmp,
                                    shared_mode=False,
                                    seed=1,
                                    initial_configurations_via_metalearning=0,
                                    ensemble_size=0)
        cls.fit(X_train, Y_train)
        cv_results = cls.cv_results_
        self.assertIsInstance(cv_results, dict)
        self.assertIsInstance(cv_results['mean_test_score'], np.ndarray)
        self.assertIsInstance(cv_results['mean_fit_time'], np.ndarray)
        self.assertIsInstance(cv_results['params'], list)
        self.assertIsInstance(cv_results['rank_test_scores'], np.ndarray)

        param_columns = [
            val for key, val in cv_results.items()
            if key.startswith('param_')
        ]
        # Previously a list of booleans was handed to assertTrue, which only
        # fails on an empty list and never looked at the booleans. Keep that
        # non-empty check and additionally verify every column.
        self.assertTrue(param_columns)
        for val in param_columns:
            self.assertIsInstance(val, npma.MaskedArray)

        del cls
        self._tearDown(tmp)
        self._tearDown(output)
Example #14
0
    def test_binary(self):
        """Fit on the binary iris variant and check predictions and output."""
        # NOTE(review): the '.out_' name is used for tmp and the '.tmp_' name
        # for output; this looks swapped but is kept as found.
        tmp = os.path.join(self.test_dir, '..', '.out_binary_fit')
        output = os.path.join(self.test_dir, '..', '.tmp_binary_fit')
        for folder in (output, tmp):
            self._setUp(folder)

        dataset = putil.get_dataset('iris', make_binary=True)
        X_train, Y_train, X_test, Y_test = dataset

        automl = AutoSklearnClassifier(
            time_left_for_this_task=20,
            per_run_time_limit=5,
            tmp_folder=tmp,
            output_folder=output,
        )
        automl.fit(
            X_train,
            Y_train,
            X_test=X_test,
            y_test=Y_test,
            dataset_name='binary_test_dataset',
        )

        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (50, ))
        self.assertGreaterEqual(accuracy(Y_test, predictions), 0.9)

        # The test predictions must have been written to the output folder.
        self.assertIn('binary_test_dataset_test_1.predict',
                      os.listdir(output))
 def _init_model(self, inp, time_limit: int = None):
     """Build an ``AutoSklearnClassifier`` restricted to the estimators in ``inp``.

     Args:
         inp: Passed to the classifier as ``include_estimators``.
         time_limit: Optional budget, multiplied by 60 before it is passed as
             ``time_left_for_this_task`` (presumably minutes converted to
             seconds - confirm with callers). When ``None`` the argument is
             omitted so the classifier uses its own default.

     Returns:
         A newly constructed ``AutoSklearnClassifier``.
     """
     classifier_kwargs = {
         'ml_memory_limit': 6144,
         'ensemble_memory_limit': 2048,
         'include_estimators': inp,
     }
     # The previous condition tested the unrelated name ``time`` instead of
     # ``time_limit``, so the default ``time_limit=None`` ended up in
     # ``60 * None`` and raised. Only scale and pass a budget that was given.
     if time_limit is not None:
         classifier_kwargs['time_left_for_this_task'] = 60 * time_limit
     return AutoSklearnClassifier(**classifier_kwargs)
Example #16
0
def main(working_directory, time_limit, per_run_time_limit, task_id, seed):
    """Run auto-sklearn on one task and write its improvement trajectory.

    The classifier is fitted with ``balanced_accuracy`` on the data returned
    by ``load_task(task_id)``. Afterwards ``score_vanilla.csv`` is written to
    ``<working_directory>/<seed>/<task_id>``. It holds a start row followed
    by one row per run whose cost improved on the best cost seen so far:
    accumulated run time, train performance (``1 - cost``) and test
    performance (``1 - test_loss``, where a missing ``test_loss`` counts as
    1.0).

    Args:
        working_directory: Root directory under which the per-seed directory
            is created.
        time_limit: Passed as ``time_left_for_this_task``.
        per_run_time_limit: Passed as ``per_run_time_limit``.
        task_id: Identifier handed to ``load_task``; also used as dataset
            name and as name of the tmp folder.
        seed: Seed for auto-sklearn; also names the per-seed directory.
    """
    configuration_output_dir = os.path.join(working_directory, str(seed))
    try:
        os.makedirs(configuration_output_dir)
    except FileExistsError:
        # Only an already existing directory is benign. Other failures
        # (e.g. permissions) now propagate instead of being reported as
        # "already created".
        print(
            "Direcotry {0} aleardy created.".format(configuration_output_dir))

    # The tmp folder itself is not created here; presumably auto-sklearn
    # creates it during fit - confirm.
    tmp_dir = os.path.join(configuration_output_dir, str(task_id))

    automl_arguments = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': per_run_time_limit,
        'initial_configurations_via_metalearning': 0,
        'ensemble_size': 0,
        'seed': seed,
        'ml_memory_limit': 3072,
        'resampling_strategy': 'holdout',
        'resampling_strategy_arguments': {
            'train_size': 0.67
        },
        #'resampling_strategy': 'cv',
        #'resampling_strategy_arguments': {'folds': 5},
        'tmp_folder': tmp_dir,
        'delete_tmp_folder_after_terminate': False,
        'disable_evaluator_output': False,
    }

    X_train, y_train, X_test, y_test, _ = load_task(task_id)

    automl = AutoSklearnClassifier(**automl_arguments)

    automl.fit(X_train,
               y_train,
               dataset_name=str(task_id),
               X_test=X_test,
               y_test=y_test,
               metric=balanced_accuracy)

    with open(os.path.join(tmp_dir, "score_vanilla.csv"), 'w') as fh:
        elapsed = 0
        fh.write("Time,Train Performance,Test Performance\n")
        # Start row: time 0, train performance 0, test performance 0.
        best_loss = 1
        fh.write("{0},{1},{2}\n".format(elapsed, 0, 0))
        # Only the run values are needed; the run keys are not used.
        for value in automl._automl.runhistory_.data.values():
            loss = value.cost
            elapsed += value.time

            # Emit a row only when this run improves on the best loss so far.
            if loss < best_loss:
                fh.write("{0},{1},{2}\n".format(
                    elapsed, 1 - loss,
                    1 - value.additional_info.get('test_loss', 1.0)))
                best_loss = loss
Example #17
0
    def test_classification_pandas_support(self):
        """Fit, score and refit on a pandas DataFrame / Series input."""
        X, y = sklearn.datasets.fetch_openml(
            data_id=2,  # cat/num dataset
            return_X_y=True,
            as_frame=True,
        )

        # Drop every column that contains a NaN. The axis is passed by
        # keyword: pandas 2.0 no longer accepts it positionally.
        X = X.dropna(axis='columns')

        # This test only makes sense if the input is a dataframe
        self.assertIsInstance(X, pd.DataFrame)
        self.assertIsInstance(y, pd.Series)
        automl = AutoSklearnClassifier(
            time_left_for_this_task=30,
            per_run_time_limit=5,
            exclude_estimators=['libsvm_svc'],
            seed=5,
        )

        automl.fit(X, y)

        # Make sure that at least better than random.
        # We use same X_train==X_test to test code quality
        self.assertGreater(automl.score(X, y), 0.555)

        automl.refit(X, y)

        # Make sure that at least better than random.
        # accuracy in sklearn needs valid data
        # It should be 0.555 as the dataset is unbalanced.
        y = automl._automl[0].InputValidator.encode_target(y)
        prediction = automl._automl[0].InputValidator.encode_target(
            automl.predict(X))
        self.assertGreater(accuracy(y, prediction), 0.555)
def process_auto_sklearn(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train auto-sklearn on the training data and predict for ``X_test``.

    A classifier optimised for ``f1_weighted`` is used when ``m_type`` is
    ``'classification'``; any other value selects a regressor optimised for
    ``mean_squared_error``. Both use 5-fold cross-validation and are refitted
    on the training data before predicting.

    Returns:
        ``predict_proba(X_test)`` for classification, ``predict(X_test)``
        otherwise.
    """
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import f1_weighted
    from autosklearn.metrics import mean_squared_error

    # Feature types of every column except the target.
    feature_types = df_types[df_types.NAME != 'target']['TYPE'].values.ravel()

    is_classification = m_type == 'classification'
    if is_classification:
        estimator_cls, metric = AutoSklearnClassifier, f1_weighted
    else:
        estimator_cls, metric = AutoSklearnRegressor, mean_squared_error

    automl = estimator_cls(
        time_left_for_this_task=TIME_PER_TASK,
        per_run_time_limit=int(TIME_PER_TASK/8),
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        delete_tmp_folder_after_terminate=False,
    )

    # Copies are handed over so the caller's objects are not the ones fitted.
    automl.fit(X_train.copy(),
               y_train.copy(),
               feat_type=feature_types,
               metric=metric)
    automl.refit(X_train.copy(), y_train.copy())

    if is_classification:
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
Example #19
0
        def spawn_classifier(seed, dataset_name, automl_tmp_folder,
                             automl_output_folder):
            """Fit one shared-mode auto-sklearn instance for the given seed.

            Only seed 0 starts from 25 meta-learning configurations; every
            other seed starts from a random incumbent.
            """
            is_first_seed = seed == 0
            if is_first_seed:
                metalearning_configs, scenario_args = 25, {}
            else:
                metalearning_configs = 0
                scenario_args = {'initial_incumbent': 'RANDOM'}

            # time_left_for_this_task, per_run_time_limit and ml_memory_limit
            # are deliberately not passed here.
            automl = AutoSklearnClassifier(
                # tmp folder will be shared between seeds
                shared_mode=True,
                tmp_folder=automl_tmp_folder,
                output_folder=automl_output_folder,
                delete_tmp_folder_after_terminate=False,
                # ensembles will be built when all optimization runs are
                # finished
                ensemble_size=0,
                initial_configurations_via_metalearning=metalearning_configs,
                seed=seed,
                smac_scenario_args=scenario_args,
            )
            automl.fit(X_train, y_train, dataset_name=dataset_name)
Example #20
0
def test_cv_results(tmp_dir, output_dir):
    """Check ``cv_results_`` types and that ``fit`` leaves parameters intact.

    After fitting on iris with two extra scoring functions, the fixed
    columns, the per-metric columns and the ``param_*`` columns of
    ``cv_results_`` are type-checked. The constructor parameters are then
    compared by hash with a snapshot taken before ``fit``, and the estimator
    is checked to be a classifier exposing ``classes_``.
    """
    # TODO restructure and actually use real SMAC output from a long run
    # to do this unittest!
    X_train, Y_train, X_test, Y_test = putil.get_dataset('iris')

    cls = AutoSklearnClassifier(time_left_for_this_task=30,
                                per_run_time_limit=5,
                                tmp_folder=tmp_dir,
                                output_folder=output_dir,
                                seed=1,
                                initial_configurations_via_metalearning=0,
                                ensemble_size=0,
                                scoring_functions=[
                                    autosklearn.metrics.precision,
                                    autosklearn.metrics.roc_auc
                                ])

    # Snapshot of the parameters before fit, compared again further below.
    params = cls.get_params()
    original_params = copy.deepcopy(params)

    cls.fit(X_train, Y_train)
    cv_results = cls.cv_results_
    assert isinstance(cv_results, dict), type(cv_results)
    assert isinstance(cv_results['mean_test_score'],
                      np.ndarray), type(cv_results['mean_test_score'])
    assert isinstance(cv_results['mean_fit_time'],
                      np.ndarray), type(cv_results['mean_fit_time'])
    assert isinstance(cv_results['params'], list), type(cv_results['params'])
    assert isinstance(cv_results['rank_test_scores'],
                      np.ndarray), type(cv_results['rank_test_scores'])
    assert isinstance(cv_results['metric_precision'],
                      npma.MaskedArray), type(cv_results['metric_precision'])
    assert isinstance(cv_results['metric_roc_auc'],
                      npma.MaskedArray), type(cv_results['metric_roc_auc'])
    cv_result_items = [
        isinstance(val, npma.MaskedArray) for key, val in cv_results.items()
        if key.startswith('param_')
    ]
    assert all(cv_result_items), cv_results.items()

    # Compare the state of the model parameters with the original parameters
    new_params = clone(cls).get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # Taken from Sklearn code:
        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit." %
            (cls, param_name, original_value, new_value))

    # Comply with https://scikit-learn.org/dev/glossary.html#term-classes
    # The return value of is_classifier() used to be discarded, so the call
    # checked nothing; assert on it.
    assert is_classifier(cls)
    assert hasattr(cls, 'classes_')
def zeroconf_fit_ensemble(y):
    """Build an ensemble from the shared auto-sklearn folder and return it.

    Args:
        y: Targets handed to ``fit_ensemble``.

    Returns:
        The ``AutoSklearnClassifier`` on which ``fit_ensemble`` was called.
    """
    p("Building ensemble")

    classifier_settings = {
        'time_left_for_this_task': 300,
        'per_run_time_limit': 150,
        'ml_memory_limit': 20240,
        'ensemble_size': 50,
        'ensemble_nbest': 200,
        'shared_mode': True,
        'tmp_folder': atsklrn_tempdir,
        'output_folder': atsklrn_tempdir,
        'delete_tmp_folder_after_terminate': False,
        'delete_output_folder_after_terminate': False,
        'initial_configurations_via_metalearning': 0,
        'seed': 1,
    }
    ensemble = AutoSklearnClassifier(**classifier_settings)

    ensemble.fit_ensemble(
        task=BINARY_CLASSIFICATION,
        y=y,
        metric=F1_METRIC,
        precision='32',
        dataset_name='foobar',
        ensemble_size=10,
        ensemble_nbest=15,
    )

    # Fixed pause before the ensemble is reported as built.
    sleep(20)
    p("Ensemble built")

    p("Show models")
    models_description = str(ensemble.show_models())
    p(models_description)
    return ensemble
Example #22
0
 def __init__(self, name: str, model_params: Dict[str, Any],
              classifier_paths: Iterable[Tuple[str, str]]) -> None:
     """Initialise the parent class and build the selector classifier.

     Args:
         name: Forwarded unchanged to the parent constructor.
         model_params: Forwarded to the parent constructor and also expanded
             into keyword arguments for ``AutoSklearnClassifier``.
         classifier_paths: Forwarded unchanged to the parent constructor.
     """
     super().__init__(name, model_params, classifier_paths)
     selector = AutoSklearnClassifier(**model_params)
     self.selector = selector
 def _init_model(self, inp, time_limit: int = None):
     """Build a single-model ``AutoSklearnClassifier`` limited to ``inp``.

     The classifier uses ``ensemble_size=1`` and no meta-learning
     initial configurations.

     Args:
         inp: Passed to the classifier as ``include_estimators``.
         time_limit: Optional budget, multiplied by 60 before it is passed as
             ``time_left_for_this_task`` (presumably minutes converted to
             seconds - confirm with callers). When ``None`` the argument is
             omitted so the classifier uses its own default.

     Returns:
         A newly constructed ``AutoSklearnClassifier``.
     """
     classifier_kwargs = {
         'ml_memory_limit': 6144,
         'ensemble_memory_limit': 2048,
         'ensemble_size': 1,
         'initial_configurations_via_metalearning': 0,
         'include_estimators': inp,
     }
     # The previous condition tested the unrelated name ``time`` instead of
     # ``time_limit``, so the default ``time_limit=None`` ended up in
     # ``60 * None`` and raised. Only scale and pass a budget that was given.
     if time_limit is not None:
         classifier_kwargs['time_left_for_this_task'] = 60 * time_limit
     return AutoSklearnClassifier(**classifier_kwargs)
Example #24
0
 def __init__(self, **kwargs) -> None:
     """Create a dask client and an auto-sklearn classifier that uses it.

     Args:
         **kwargs: Keyword arguments for ``AutoSklearnClassifier``. Must
             contain ``n_jobs``, which also sets the client's worker count.
     """
     Ensemble.__init__(self)
     worker_count = kwargs['n_jobs']
     # Client runs with processes=False, one thread per worker, no dashboard.
     client = Client(
         n_workers=worker_count,
         threads_per_worker=1,
         processes=False,
         dashboard_address=None,
     )
     self.model = AutoSklearnClassifier(dask_client=client, **kwargs)
Example #25
0
def AutoSklearn(total_runtime, train_features, train_labels):
    """Fit auto-sklearn on a fixed estimator set without preprocessing.

    Args:
        total_runtime: Passed as ``time_left_for_this_task``.
        train_features: Training inputs handed to ``fit``.
        train_labels: Training targets handed to ``fit``.

    Returns:
        The fitted classifier, optimised for ``balanced_accuracy``.
    """
    allowed_estimators = [
        "adaboost",
        "gaussian_nb",
        "extra_trees",
        "gradient_boosting",
        "liblinear_svc",
        "libsvm_svc",
        "random_forest",
        "k_nearest_neighbors",
        "decision_tree",
    ]
    clf = AutoSklearnClassifier(
        time_left_for_this_task=total_runtime,
        include_preprocessors=["no_preprocessing"],
        include_estimators=allowed_estimators,
    )
    clf.fit(train_features, train_labels, metric=balanced_accuracy)
    return clf
Example #26
0
    def spawn_classifier(
            seed,
            time,
            search_space,
            prep_space,
            metric,
            dataset_name=None):
        """Run one auto-sklearn instance; meant to be called once per process.

        auto-sklearn does not spawn worker processes on its own, so the main
        block calls this function several times, each call in a new process
        and with a different seed.

        Returns:
            The ``cv_results_`` of the fitted classifier.
        """
        # Meta-learning start configurations are used by a single process
        # only (seed 0), so the processes do not all evaluate the same
        # configurations; the others start from a random incumbent.
        first_process = seed == 0
        metalearning_configs = 25 if first_process else 0
        scenario_args = {} if first_process else {'initial_incumbent': 'RANDOM'}

        # What differs from a stand-alone auto-sklearn run:
        # 1. every instance writes to the same output directory,
        # 2. shared_mode=True lets the instances share data,
        # 3. every instance must be given a different seed.
        settings = {
            # sec., how long should this seed fit process run
            'time_left_for_this_task': time,
            # sec., each model may only take this long before it's killed
            'per_run_time_limit': 15,
            # MB, memory limit imposed on each call to a ML algorithm
            'ml_memory_limit': 1024,
            # tmp folder will be shared between seeds
            'shared_mode': True,
            'tmp_folder': tmp_folder,
            'output_folder': output_folder,
            'delete_tmp_folder_after_terminate': False,
            # ensembles will be built when all optimization runs are finished
            'ensemble_size': 0,
            'include_estimators': search_space,
            'exclude_estimators': None,
            'include_preprocessors': prep_space,
            'exclude_preprocessors': None,
            'initial_configurations_via_metalearning': metalearning_configs,
            'seed': seed,
            'smac_scenario_args': scenario_args,
        }
        automl = AutoSklearnClassifier(**settings)
        automl.fit(X_train, y_train, metric=metric, dataset_name=dataset_name)
        return automl.cv_results_
    def model_init(self, model_params: Dict[str, Any]) -> None:
        """Initialise the model object according to the configured task type.

        ``self.config['mode']`` selects a classifier or a regressor; for any
        other value ``self.model`` is left untouched.

        Args:
            model_params: Dictionary of model parameters, expanded into
                keyword arguments of the chosen estimator.
        """
        mode = self.config['mode']
        if mode == 'regression':
            self.model = AutoSklearnRegressor(**model_params)
        elif mode == 'classification':
            self.model = AutoSklearnClassifier(**model_params)
Example #28
0
def simple():
    """Fit auto-sklearn on the stage-2 training data and dump the model."""
    train_data = Exp2Assistant(stage=2).train_data
    # The last column is the target, everything before it is a feature.
    features = train_data.iloc[:, :-1]
    labels = train_data.iloc[:, -1]

    # change the time, in this experiment, 1h 12h 24h 48h
    automl = AutoSklearnClassifier()
    automl.fit(features, labels)

    model_dir = path_service.get_resource("model")
    model_file = path.join(model_dir, "stage2_model.joblib")
    joblib.dump(automl, model_file)
Example #29
0
 def fit_model(self, X, y, classifier_params=None, fit_params=None):
     """Fit auto-sklearn on a training split of ``(X, y)`` and return it.

     Args:
         X: Feature data; split with ``train_test_split``.
         y: Target data; split together with ``X``.
         classifier_params: Optional keyword arguments for
             ``AutoSklearnClassifier``.
         fit_params: Optional keyword arguments for ``fit``.

     Returns:
         The fitted classifier. It is additionally refitted on the training
         split when ``resampling_strategy`` is ``"cv"`` and the classifier's
         ``ensemble_size`` is not 0.
     """
     if not classifier_params:
         classifier_params = {}
     if not fit_params:
         fit_params = {}

     # Only the training part of the split is used.
     split = train_test_split(X, y)
     X_train, y_train = split[0], split[2]

     auto = AutoSklearnClassifier(**classifier_params)
     auto.fit(X_train, y_train, **fit_params)

     uses_cv = ("resampling_strategy" in classifier_params
                and classifier_params["resampling_strategy"] == "cv")
     if uses_cv and auto.ensemble_size != 0:
         auto.refit(X_train, y_train)
     return auto
Example #30
0
 def __init__(self, time_left_for_this_task, per_run_time_limit, folds):
     """Create the wrapped auto-sklearn classifier with cross-validation.

     Args:
         time_left_for_this_task: Forwarded to ``AutoSklearnClassifier``.
         per_run_time_limit: Forwarded to ``AutoSklearnClassifier``.
         folds: Number of folds, passed via
             ``resampling_strategy_arguments``.
     """
     # The unused timestamp local ``now`` was removed; nothing read it.
     self.automl = AutoSklearnClassifier(
         time_left_for_this_task=time_left_for_this_task,
         per_run_time_limit=per_run_time_limit,
         #tmp_folder='/tmp/autosklearn_switch_tmp',
         #output_folder='/tmp/autosklearn_switch_out',
         #delete_tmp_folder_after_terminate=False,
         #delete_output_folder_after_terminate=False,
         #shared_mode=True,
         resampling_strategy='cv',
         resampling_strategy_arguments={'folds': folds})