Example #1
0
 def fit_autosk_trial(self, trial, metric, **kwargs):
     """Fit an auto-sklearn classifier for ``trial`` and attach it to the trial.

     The classifier is built from ``trial.clf_params`` and fitted on
     ``self.storage.X_train`` / ``self.storage.y_train`` using ``metric``.
     When the classifier's resampling strategy is neither ``'holdout'``
     nor ``'holdout-iterative-fit'``, two warnings are logged, the
     classifier is refitted on the same training data, and its statistics
     are logged at info level. The statistics are logged only on that
     refit path.

     Args:
         trial: Object exposing ``number`` and ``clf_params``; it receives
             the fitted classifier as ``trial.clf``.
         metric: Forwarded unchanged to ``AutoSklearnClassifier.fit``.
         **kwargs: Accepted but not used by this method.

     Returns:
         The same ``trial`` object, with ``clf`` set.
     """
     trial_number = trial.number
     classifier = AutoSklearnClassifier(**trial.clf_params)

     features = self.storage.X_train
     labels = self.storage.y_train
     classifier.fit(features, labels, metric=metric)

     holdout_strategies = ('holdout', 'holdout-iterative-fit')
     if classifier.resampling_strategy not in holdout_strategies:
         refit_notices = (
             'Predict is currently not implemented for resampling strategy, refit it.',
             'we call refit() which trains all models in the final ensemble on the whole dataset.',
         )
         for notice in refit_notices:
             self.logger.warning(notice)
         classifier.refit(features, labels)
         statistics = classifier.sprint_statistics()
         self.logger.info('Trial#{0} info :{1}'.format(
             trial_number, statistics))

     trial.clf = classifier
     return trial
Example #2
0
def train(X, y):
    """Search for a classifier with auto-sklearn, report it, and pickle it.

    The data is split into train and test parts (33% held out, fixed
    random state), an ``AutoSklearnClassifier`` is fitted on the training
    part, its search statistics and test accuracy are printed, and the
    fitted model is pickled to ``./catanatron/players/estimator.pickle``
    (resolved against the current working directory).

    Args:
        X: Feature data accepted by ``train_test_split``.
        y: Target labels matching ``X``.

    Returns:
        None. Results are printed and the model is written to disk.
    """
    # Hold out a third of the samples for the final accuracy check.
    splits = train_test_split(X, y, test_size=0.33, random_state=1)
    X_train, X_test, y_train, y_test = splits

    # Only the overall search budget is set; other limits use defaults.
    automl = AutoSklearnClassifier(time_left_for_this_task=30)
    automl.fit(X_train, y_train)
    print(automl.sprint_statistics())

    # Score the resulting model on the held-out part.
    predictions = automl.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.3f" % accuracy)

    # Persist the fitted model.
    destination = Path("./catanatron/players/estimator.pickle").resolve()
    with destination.open("wb") as handle:
        pickle.dump(automl, handle)
# # define dataset
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define search
# model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# # perform the search
# model.fit(X, y)
# plot_confusion_matrix(model, X, y)
# # export the best model
# # model.export('tpot_best_model.py')

if __name__ == '__main__':
    # Example of auto-sklearn for a classification dataset, using 10-fold
    # cross-validation during the model search.
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from autosklearn.classification import AutoSklearnClassifier

    # define dataset
    # NOTE(review): the visible code never defined X / y although
    # make_classification was imported for it; the sizes below are an
    # assumption chosen for a quick synthetic demo - adjust as needed.
    X, y = make_classification(n_samples=1000, n_features=10,
                               n_informative=5, n_redundant=5,
                               random_state=1)
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=1)
    # define search
    model = AutoSklearnClassifier(
        time_left_for_this_task=4 * 60,
        per_run_time_limit=60,
        n_jobs=8,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 10},
    )
    # perform the search
    model.fit(X_train, y_train)
    # summarize
    print(model.sprint_statistics())
    print(model.cv_results_)
    # With a cross-validation resampling strategy the models are fitted on
    # individual folds only; refit() trains the final ensemble on the whole
    # training set so that predict() is usable.
    model.refit(X_train.copy(), y_train.copy())
    # evaluate best model
    y_hat = model.predict(X_test)
    acc = accuracy_score(y_test, y_hat)
    print("Accuracy: %.3f" % acc)
Example #4
0
                address=cluster.scheduler_address) as client:
            automl = AutoSklearnClassifier(
                time_left_for_this_task=30,
                per_run_time_limit=10,
                memory_limit=1024,
                tmp_folder=tmp_folder,
                seed=777,
                # n_jobs is ignored internally as we pass a dask client.
                n_jobs=1,
                # Pass a dask client which connects to the previously constructed cluster.
                dask_client=client,
            )
            automl.fit(X_train, y_train)

            automl.fit_ensemble(
                y_train,
                task=MULTICLASS_CLASSIFICATION,
                dataset_name='digits',
                ensemble_size=20,
                ensemble_nbest=50,
            )

        # `automl` was fitted inside the `with ... as client:` block above;
        # evaluate it on the held-out data and report the search statistics.
        predictions = automl.predict(X_test)
        print(automl.sprint_statistics())
        print("Accuracy score",
              sklearn.metrics.accuracy_score(y_test, predictions))

        # Wait until all workers are closed. Join every process in the
        # list; previously the loop variable was unused and the same
        # `process_python_worker` object was joined on each iteration.
        for process in worker_processes:
            process.join()
Example #5
0
def evaluate_ml_algorithm(dataset,
                          algo,
                          run_id,
                          obj_metric,
                          time_limit=600,
                          seed=1,
                          task_type=None):
    """Evaluate one algorithm with auto-sklearn and pickle the scores.

    The auto-sklearn search is restricted to the single estimator named by
    ``algo``. The search is fitted on the training split with a holdout
    resampling strategy, the best validation score is taken from
    ``cv_results_``, the ensemble is refitted on the full training data,
    and the balanced accuracy on the test split is computed. Statistics
    are printed and a result list is pickled under ``save_dir``.

    Args:
        dataset: Dataset name passed to ``load_train_test_data``.
        algo: Estimator name. ``'lightgbm'`` and ``'logistic_regression'``
            are registered via ``add_classifier`` first; any other value
            is passed to auto-sklearn as is.
        run_id: Integer run identifier, used in the log line and file name.
        obj_metric: Used only in the log line and the output file name;
            fitting always uses auto-sklearn's balanced accuracy.
        time_limit: Overall search budget, converted with ``int()``.
        seed: Random seed, converted with ``int()``.
        task_type: Task type for the first data load. If it is in
            ``CLS_TASKS`` it is replaced by ``BINARY_CLS`` or
            ``MULTICLASS_CLS`` depending on the number of distinct
            training labels; the resulting value is stored in the pickle.

    Returns:
        None. Results are printed and written to a ``.pkl`` file.
    """
    # Custom estimators must be registered before they can be included.
    if algo == 'lightgbm':
        add_classifier(LightGBM)
        included = ['LightGBM']
    elif algo == 'logistic_regression':
        add_classifier(Logistic_Regression)
        included = ['Logistic_Regression']
    else:
        included = [algo]

    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))

    # First load: only used to inspect the training labels.
    train_data, _ = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        n_classes = len(set(train_data.data[1]))
        task_type = MULTICLASS_CLS if n_classes != 2 else BINARY_CLS
    print(set(train_data.data[1]))

    # Second load: the data actually used for fitting and testing is always
    # requested with the multiclass task type.
    raw_data, test_raw_data = load_train_test_data(
        dataset, task_type=MULTICLASS_CLS)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data

    # Translate the project's feature types into auto-sklearn's labels.
    feat_type = []
    for column_type in raw_data.feature_types:
        if column_type == CATEGORICAL:
            feat_type.append('Categorical')
        else:
            feat_type.append('Numerical')

    from autosklearn.metrics import balanced_accuracy as balanced_acc
    search_settings = dict(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=180,
        n_jobs=1,
        include_estimators=included,
        initial_configurations_via_metalearning=0,
        ensemble_memory_limit=16384,
        ml_memory_limit=16384,
        ensemble_size=1,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67},
    )
    automl = AutoSklearnClassifier(**search_settings)
    automl.fit(X.copy(), y.copy(), feat_type=feat_type, metric=balanced_acc)

    # Capture the descriptions before refitting.
    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    valid_results = automl.cv_results_['mean_test_score']
    print('Eval num: %d' % (len(valid_results)))

    validation_score = np.max(valid_results)

    # Test performance after refitting on the full training data.
    automl.refit(X.copy(), y.copy())
    predictions = automl.predict(X_test)
    test_score = balanced_accuracy_score(y_test, predictions)

    # Statistics about the auto-sklearn run, followed by the scores.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy:', validation_score)
    print("Test Accuracy      :", test_score)

    file_name = '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric, run_id,
                                        time_limit)
    save_path = save_dir + file_name
    with open(save_path, 'wb') as f:
        record = [dataset, algo, validation_score, test_score, task_type]
        pickle.dump(record, f)
# Script: search for the best classifier on the digits dataset with
# auto-sklearn, then print a classification report, the selected models
# and the search statistics.
print('[INFO] Loading digits dataset.')
X, y = load_digits(return_X_y=True)

print('[INFO] Splitting.')
# 80% of the samples go to training; the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.8)

for split_name, split in (('Train', X_train), ('Test', X_test)):
    print(f'[INFO] {split_name} shape: {split.shape}')

print('[INFO] Finding best model...')
search_budget = {
    'per_run_time_limit': 360,
    'ml_memory_limit': 1024 * 6,
    'time_left_for_this_task': 7200,
}
classifier = AutoSklearnClassifier(**search_budget)

# The timer covers the dtype conversion as well as the search itself.
start = time.time()
X_train = X_train.astype('float')
classifier.fit(X_train, y_train)
print(
    f'[INFO] Elapsed time finding best model: {time.time() - start} seconds.')

predictions = classifier.predict(X_test)

# Each section is a heading followed by the lazily rendered content.
report_sections = (
    ('--- CLASSIFICATION REPORT: ---',
     lambda: classification_report(y_test, predictions)),
    ('\n\n--- MODELS: ---', classifier.show_models),
    ('\n\n--- STATISTICS: ---', classifier.sprint_statistics),
)
for heading, render in report_sections:
    print(heading)
    print(render())