def evaluate_hpsklearn(dataset, run_id, time_limit, seed=1):
    from automlToolkit.utils.hpsklearn_config import tpe_classifier

    # TODO: Specify max_evals
    automl = HyperoptEstimator(preprocessing=None,
                               ex_preprocs=None,
                               classifier=tpe_classifier(),
                               algo=tpe.suggest,
                               max_evals=200,
                               trial_timeout=time_limit,
                               seed=seed)

    raw_data, test_raw_data = load_train_test_data(dataset)
    X_train, y_train = raw_data.data
    X_test, y_test = test_raw_data.data
    X_train, y_train = X_train.astype('float64'), y_train.astype('int')
    X_test, y_test = X_test.astype('float64'), y_test.astype('int')
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_hat)
    print("%d-th Evaluation: accuracy score => %.4f" % (run_id, test_accuracy))

    save_path = save_dir + 'hpsklearn-%s-%d-%d.pkl' % (dataset, time_limit,
                                                       run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([test_accuracy], f)
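These snippets are lifted from larger benchmark scripts, so names like `save_dir`, `load_train_test_data`, and the estimator classes are defined at module level. A minimal sketch of the context the hpsklearn example above assumes (the output path is a placeholder):

# Assumed module-level setup for the hpsklearn example; paths and values are illustrative.
import os
import pickle

from hyperopt import tpe
from hpsklearn import HyperoptEstimator
from sklearn.metrics import accuracy_score

from automlToolkit.datasets.utils import load_train_test_data

save_dir = './data/hpsklearn_results/'  # placeholder output directory
if not os.path.exists(save_dir):
    os.makedirs(save_dir)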
Example 2
def evaluate_tpot(dataset, run_id, time_limit, seed=1, use_fe=True):
    n_job = args.n_job
    # Construct the ML model.
    config = None
    if not use_fe:
        from automlToolkit.utils.tpot_config import classifier_config_dict
        config = classifier_config_dict

    automl = TPOTClassifier(config_dict=config,
                            generations=10000,
                            population_size=20,
                            verbosity=2,
                            n_jobs=n_job,
                            cv=5,  # number of CV folds; TPOT expects an int or a splitter here, not a fraction
                            max_eval_time_mins=2.5,
                            max_time_mins=int(time_limit / 60),
                            random_state=seed)

    raw_data, test_raw_data = load_train_test_data(dataset)
    X_train, y_train = raw_data.data
    X_test, y_test = test_raw_data.data
    X_train, y_train = X_train.astype('float64'), y_train.astype('int')
    X_test, y_test = X_test.astype('float64'), y_test.astype('int')
    automl.fit(X_train, y_train)
    y_hat = automl.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_hat)
    print("%d-th Evaluation: accuracy score => %.4f" % (run_id, test_accuracy))

    save_path = save_dir + 'tpot-%s-%d-%d.pkl' % (dataset, time_limit, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([test_accuracy], f)
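The TPOT example reads `n_job` from a module-level `args` object; a plausible argparse setup is sketched below (the flag names and defaults are assumptions, not the original script's):

# Assumed command-line parsing for the TPOT example; flag names and defaults are illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--n_job', type=int, default=1)
parser.add_argument('--time_limit', type=int, default=1200)
args = parser.parse_args()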
Example 3
def evaluate_1stlayer_bandit(algorithms,
                             run_id,
                             dataset='credit',
                             trial_num=200,
                             n_jobs=1,
                             meta_configs=0,
                             seed=1):
    task_id = '%s-hmab-%d-%d' % (dataset, len(algorithms), trial_num)
    _start_time = time.time()
    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              raw_data,
                              output_dir='logs/%s/' % task_id,
                              per_run_time_limit=per_run_time_limit,
                              dataset_name='%s-%d' % (dataset, run_id),
                              n_jobs=n_jobs,
                              meta_configs=meta_configs,
                              seed=seed,
                              eval_type='holdout')
    bandit.optimize()
    time_cost = int(time.time() - _start_time)
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    validation_accuracy_without_ens0 = np.max(bandit.final_rewards)
    validation_accuracy_without_ens1 = bandit.validate()
    assert np.isclose(validation_accuracy_without_ens0,
                      validation_accuracy_without_ens1)

    test_accuracy_without_ens = bandit.score(test_raw_data)
    # Debugging: also evaluate and report the ensemble variants below.
    mode = True
    if mode:
        test_accuracy_with_ens0 = ensemble_implementation_examples(
            bandit, test_raw_data)
        test_accuracy_with_ens1 = EnsembleBuilder(bandit).score(test_raw_data)

        print('Dataset                     : %s' % dataset)
        print('Validation score without ens: %f - %f' %
              (validation_accuracy_without_ens0,
               validation_accuracy_without_ens1))
        print("Test score without ensemble : %f" % test_accuracy_without_ens)
        print("Test score with ensemble    : %f - %f" %
              (test_accuracy_with_ens0, test_accuracy_with_ens1))

        save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
        with open(save_path, 'wb') as f:
            stats = [
                time_cost, test_accuracy_with_ens0, test_accuracy_with_ens1,
                test_accuracy_without_ens
            ]
            pickle.dump([
                validation_accuracy_without_ens0, test_accuracy_with_ens1,
                stats
            ], f)
    del bandit
    return time_cost
Example 4
def conduct_fe(dataset='pc4',
               classifier_id='random_forest',
               iter_num=100,
               run_id=0,
               seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(default_config,
                                        name='fe',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    val_acc = evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = EvaluationBasedOptimizer(task_type='classification',
                                         input_data=raw_data,
                                         evaluator=evaluator,
                                         model_id=classifier_id,
                                         time_limit_per_trans=240,
                                         mem_limit_per_trans=10000,
                                         seed=seed)

    task_id = 'fe-%s-%s-%d' % (dataset, classifier_id, iter_num)
    val_acc_list, test_acc_list = [], []

    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, incumbent = optimizer.iterate()
        val_acc_list.append(perf)
        train_node = optimizer.apply(raw_data, incumbent)
        test_node = optimizer.apply(test_raw_data, incumbent)
        estimator = fetch_predict_estimator(default_config, train_node.data[0],
                                            train_node.data[1])
        pred = estimator.predict(test_node.data[0])
        test_perf = balanced_accuracy(test_node.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
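A typical invocation of `conduct_fe`, matching the defaults in its signature (the dataset and iteration count are illustrative):

# Run the feature-engineering-only experiment for one dataset/classifier pair.
conduct_fe(dataset='pc4', classifier_id='random_forest', iter_num=100, run_id=0, seed=1)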
def evaluate_autosklearn(algorithms, rep_id, trial_num=100, dataset='credit',
                         time_limit=1200, seed=1, ensemble_enable=True):
    print('%s\nDataset: %s, Run_id: %d, Budget: %d.\n%s' % ('='*50, dataset, rep_id, time_limit, '='*50))
    mth_id = 'ausk-ens' if ensemble_enable else 'ausk'
    task_id = '%s-%s-%d-%d' % (dataset, mth_id, len(algorithms), trial_num)
    include_models = algorithms
    if ensemble_enable:
        ensemble_size = 50
        ensem_nbest = len(algorithms)*20
    else:
        ensemble_size = 1
        ensem_nbest = 1

    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=int(time_limit),
        per_run_time_limit=per_run_time_limit,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=12288,
        ml_memory_limit=12288,
        ensemble_size=ensemble_size,
        ensemble_nbest=ensem_nbest,
        initial_configurations_via_metalearning=0,
        seed=seed,
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.8}
    )
    print(automl)
    raw_data, test_raw_data = load_train_test_data(dataset)
    X, y = raw_data.data
    X_test, y_test = test_raw_data.data
    feat_type = ['Categorical' if _type == CATEGORICAL else 'Numerical'
                 for _type in raw_data.feature_types]
    automl.fit(X.copy(), y.copy(), feat_type=feat_type)
    model_desc = automl.show_models()
    str_stats = automl.sprint_statistics()
    test_results = automl.cv_results_['mean_test_score']
    time_records = automl.cv_results_['mean_fit_time']
    validation_accuracy = np.max(test_results)
    predictions = automl.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    # Print statistics about the auto-sklearn run, such as the number of
    # iterations and the number of models that failed with a timeout.
    print(str_stats)
    print(model_desc)
    print('Validation Accuracy', validation_accuracy)
    print("Test Accuracy", test_accuracy)

    save_path = save_dir + '%s-%d.pkl' % (task_id, rep_id)
    with open(save_path, 'wb') as f:
        stats = [model_desc, str_stats, test_results, time_records, time_limit]
        pickle.dump([validation_accuracy, test_accuracy, stats], f)
Example 6
def conduct_hpo(dataset='pc4',
                classifier_id='random_forest',
                iter_num=100,
                run_id=0,
                seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='hpo',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    default_config = cs.get_default_configuration()
    val_acc = 1. - evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = SMACOptimizer(evaluator,
                              cs,
                              trials_per_iter=2,
                              output_dir='logs',
                              per_run_time_limit=180)
    task_id = 'hpo-%s-%s-%d' % (dataset, classifier_id, iter_num)

    val_acc_list, test_acc_list = [], []
    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, config = optimizer.iterate()
        val_acc_list.append(perf)
        estimator = fetch_predict_estimator(config, raw_data.data[0],
                                            raw_data.data[1])
        pred = estimator.predict(test_raw_data.data[0])
        test_perf = balanced_accuracy(test_raw_data.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
def evaluate_2rd_layered_bandit(run_id, mth='rb', dataset='pc4', algo='libsvm_svc',
                                cv='holdout', time_limit=120000, seed=1):
    train_data, test_data = load_train_test_data(dataset)
    bandit = SecondLayerBandit(algo, train_data, dataset_id=dataset, mth=mth, seed=seed, eval_type=cv)

    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config, name='fe', seed=seed, resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_folder + '%s_%s_%d_%d_%s.pkl' % (mth, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2], f)
Example 8
def evaluate_autosklearn(algorithms,
                         dataset,
                         run_id,
                         trial_num,
                         seed,
                         time_limit=1200):
    print('==> Start to evaluate', dataset, 'budget', time_limit)
    include_models = algorithms
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        per_run_time_limit=per_run_time_limit,
        include_preprocessors=None,
        exclude_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        ensemble_nbest=1,
        initial_configurations_via_metalearning=0,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    print(automl)

    train_data, test_data = load_train_test_data(dataset)
    X, y = train_data.data
    feat_type = [
        'Categorical' if _type == CATEGORICAL else 'Numerical'
        for _type in train_data.feature_types
    ]

    from autosklearn.metrics import balanced_accuracy
    automl.fit(X.copy(),
               y.copy(),
               metric=balanced_accuracy,
               feat_type=feat_type)
    model_desc = automl.show_models()
    print(model_desc)
    val_result = np.max(automl.cv_results_['mean_test_score'])
    print('Best validation accuracy', val_result)

    X_test, y_test = test_data.data
    automl.refit(X.copy(), y.copy())
    y_pred = automl.predict(X_test)
    test_result = balanced_accuracy(y_test, y_pred)
    print('Test accuracy', test_result)
    save_path = project_dir + 'data/ausk_vanilla_%s_%d_%d_%d_%d.pkl' % (
        dataset, trial_num, len(algorithms), seed, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_result, test_result, model_desc], f)
Example 9
def evaluate_ausk_fe(dataset, time_limit, run_id, seed):
    print('[Run ID: %d] Start to Evaluate' % run_id, dataset, 'Budget',
          time_limit)
    from automlToolkit.utils.models.default_random_forest import DefaultRandomForest
    # Add a random forest classifier (with default hyperparameters) as a component to auto-sklearn.
    autosklearn.pipeline.components.classification.add_classifier(
        DefaultRandomForest)
    include_models = ['DefaultRandomForest']

    # Construct the ML model.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        include_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        initial_configurations_via_metalearning=0,
        per_run_time_limit=300,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67}
        # resampling_strategy='cv',
        # resampling_strategy_arguments={'folds': 5}
    )
    print(automl)

    train_data, test_data = load_train_test_data(dataset)

    X, y = train_data.data
    X_test, y_test = test_data.data

    from autosklearn.metrics import balanced_accuracy
    automl.fit(X.copy(), y.copy(), metric=balanced_accuracy)
    model_desc = automl.show_models()
    print(model_desc)

    # print(automl.cv_results_)
    val_result = np.max(automl.cv_results_['mean_test_score'])
    print('Best validation accuracy', val_result)

    # automl.refit(X.copy(), y.copy())
    test_result = automl.score(X_test, y_test)
    print('Test accuracy', test_result)

    save_path = save_dir + 'ausk_fe_%s_%d_%d.pkl' % (dataset, time_limit,
                                                     run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_result, test_result, model_desc], f)
Example 10
def evaluate_issue_source(classifier_id, dataset, opt_type='hpo'):
    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset)

    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()

    seed = 2343
    if opt_type == 'hpo':
        evaluator = Evaluator(default_config,
                              data_node=train_data,
                              name='hpo',
                              resampling_strategy='holdout',
                              seed=seed)
        optimizer = SMACOptimizer(evaluator,
                                  cs,
                                  output_dir='logs/',
                                  per_run_time_limit=300,
                                  trials_per_iter=5,
                                  seed=seed)
    else:
        evaluator = Evaluator(default_config,
                              name='fe',
                              resampling_strategy='holdout',
                              seed=seed)
        optimizer = EvaluationBasedOptimizer('classification', train_data,
                                             evaluator, classifier_id, 300,
                                             1024, seed)

    perf_result = list()
    for iter_id in range(20):
        optimizer.iterate()
        print('=' * 30)
        print('ITERATION: %d' % iter_id)
        if opt_type == 'hpo':
            config = optimizer.incumbent_config
            perf = evaluate(train_data, test_data, config)
        else:
            fe_train_data = optimizer.incumbent
            fe_test_data = optimizer.apply(test_data, fe_train_data)
            perf = evaluate(fe_train_data, fe_test_data, default_config)
        print(perf)
        print('=' * 30)
        perf_result.append(perf)

    print(perf_result)
Example 11
def evaluate_hmab(algorithms,
                  run_id,
                  dataset='credit',
                  trial_num=200,
                  n_jobs=1,
                  meta_configs=0,
                  seed=1,
                  eval_type='holdout'):
    task_id = '%s-hmab-%d-%d' % (dataset, len(algorithms), trial_num)
    _start_time = time.time()
    raw_data, test_raw_data = load_train_test_data(dataset)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              raw_data,
                              output_dir='logs/%s/' % task_id,
                              per_run_time_limit=per_run_time_limit,
                              dataset_name='%s-%d' % (dataset, run_id),
                              n_jobs=n_jobs,
                              meta_configs=meta_configs,
                              seed=seed,
                              eval_type=eval_type)
    bandit.optimize()
    time_cost = int(time.time() - _start_time)
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    validation_accuracy = np.max(bandit.final_rewards)
    # validation_accuracy_without_ens = bandit.validate()
    # assert np.isclose(validation_accuracy, validation_accuracy_without_ens)
    test_accuracy_with_ens = EnsembleBuilder(
        bandit, n_jobs=n_jobs).score(test_raw_data)

    print('Dataset                     : %s' % dataset)
    print('Validation score without ens: %f' % validation_accuracy)
    print("Test score with ensemble    : %f" % test_accuracy_with_ens)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        stats = [time_cost, 0., bandit.time_records, bandit.final_rewards]
        pickle.dump([validation_accuracy, test_accuracy_with_ens, stats], f)
    del bandit
    return time_cost
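A sketch of how `evaluate_hmab` might be called; the algorithm identifiers follow the auto-sklearn names used elsewhere in these examples, and the trial budget is illustrative:

# Example call: run the first-layer bandit over a small pool of algorithms.
algorithms = ['random_forest', 'libsvm_svc', 'extra_trees', 'adaboost']
evaluate_hmab(algorithms, run_id=0, dataset='credit', trial_num=200,
              n_jobs=1, seed=1, eval_type='holdout')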
Example 12
def test_balancer():
    dataset = 'winequality_red'
    sys.path.append(os.getcwd())
    from automlToolkit.datasets.utils import load_data, load_train_test_data
    raw_data, test_raw_data = load_train_test_data(dataset)
    # data = (
    # np.random.random((10, 4)), np.array([0, 0, 0, 0, 0, 2, 2, 2, 2, 2]))
    # feature_type = [NUMERICAL, NUMERICAL, DISCRETE, DISCRETE]
    # datanode = DataNode(data, feature_type)
    print(raw_data, test_raw_data)
    from automlToolkit.components.feature_engineering.transformations.preprocessor.data_balancer import DataBalancer
    balancer = DataBalancer()
    a = balancer.operate(raw_data)
    b = balancer.operate(raw_data)
    c = balancer.operate(raw_data)
    assert a == b and b == c
    print(balancer.operate(raw_data))
    test_data = test_raw_data.copy_()
    test_data.data[1] = None
    print(balancer.operate(test_data))
Example 13
def evaluate_1stlayer_bandit(algorithms,
                             dataset,
                             run_id,
                             trial_num,
                             seed,
                             time_limit=1200):
    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              opt_algo=opt_algo,
                              seed=seed)
    bandit.optimize()
    model_desc = [
        bandit.nbest_algo_ids, bandit.optimal_algo_id, bandit.final_rewards,
        bandit.action_sequence
    ]

    time_taken = time.time() - _start_time
    validation_accuracy = np.max(bandit.final_rewards)
    test_accuracy = bandit.score(test_data, metric_func=balanced_accuracy)
    test_accuracy_with_ens = EnsembleBuilder(bandit).score(
        test_data, metric_func=balanced_accuracy)
    data = [
        dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens,
        time_taken, model_desc
    ]
    print(model_desc)

    print(data[:4])

    save_path = project_dir + 'data/hmab_%s_%s_%d_%d_%d_%d.pkl' % (
        opt_algo, dataset, trial_num, len(algorithms), seed, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
Example 14
def evaluate_base_model(classifier_id, dataset):
    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset)

    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()
    X_train, y_train = train_data.data
    X_test, y_test = test_data.data
    print('X_train/test shapes: %s, %s' %
          (str(X_train.shape), str(X_test.shape)))

    # Build the ML estimator.
    from automlToolkit.components.evaluators.cls_evaluator import fetch_predict_estimator
    estimator = fetch_predict_estimator(default_config, X_train, y_train)

    y_pred = estimator.predict(X_test)
    print(balanced_accuracy(y_test, y_pred))
    print(balanced_accuracy(y_pred, y_test))
Example 15
def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    algorithms = [
        'lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd', 'adaboost',
        'random_forest', 'extra_trees', 'decision_tree'
    ]
    algo_id = np.random.choice(algorithms, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Prepare the configuration for the randomly chosen classifier.
    clf_class = _classifiers[algo_id]
    cs = clf_class.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", algo_id)
    cs.add_hyperparameter(clf_hp)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,
                          task_id=task_id)

    raw_data, test_raw_data = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_data.copy_())
    test_data = pipeline.transform(test_raw_data.copy_())
    train_data_new = pipeline.transform(raw_data.copy_())

    assert (train_data.data[0] == train_data_new.data[0]).all()
    assert (train_data.data[1] == train_data_new.data[1]).all()
    assert (train_data_new == train_data)

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
Example 16
def evaluate_hmab(algorithms,
                  run_id,
                  dataset='credit',
                  trial_num=200,
                  seed=1,
                  eval_type='holdout'):
    task_id = '%s-hmab-%d-%d' % (dataset, len(algorithms), trial_num)
    _start_time = time.time()
    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              raw_data,
                              output_dir='logs/%s/' % task_id,
                              per_run_time_limit=per_run_time_limit,
                              dataset_name='%s-%d' % (dataset, run_id),
                              seed=seed,
                              eval_type=eval_type)
    bandit.optimize()
    time_cost = int(time.time() - _start_time)
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    validation_accuracy = np.max(bandit.final_rewards)
    test_accuracy = bandit.score(test_raw_data)
    # test_accuracy_with_ens = EnsembleBuilder(bandit).score(test_raw_data)

    print('Dataset          : %s' % dataset)
    print('Validation/Test score : %f - %f' %
          (validation_accuracy, test_accuracy))
    # print('Test score with ensem : %f' % test_accuracy_with_ens)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        stats = [time_cost]
        pickle.dump([validation_accuracy, test_accuracy, stats], f)
    return time_cost
Example 17
def check_datasets(datasets):
    for _dataset in datasets:
        try:
            _, _ = load_train_test_data(_dataset, random_state=1)
        except Exception as e:
            raise ValueError('Dataset - %s does not exist!' % _dataset)
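A quick sanity check before launching a batch of runs; the dataset names are taken from the other examples here:

# Verify that every requested dataset can be loaded before starting any experiments.
check_datasets(['pc4', 'credit', 'yeast', 'winequality_red'])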
Example 18
    choices=['none', 'bagging', 'blending', 'stacking', 'ensemble_selection'])
parser.add_argument('--n_jobs', type=int, default=1)

args = parser.parse_args()

dataset = args.dataset
time_limit = args.time_limit
eval_type = args.eval_type
n_jobs = args.n_jobs
ensemble_method = args.ens_method
if ensemble_method == 'none':
    ensemble_method = None

save_dir = './data/eval_exps/automl-toolkit'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

print('==> Start to evaluate with Budget %d' % time_limit)

train_data, test_data = load_train_test_data(dataset)

clf = Classifier(time_limit=time_limit,
                 output_dir=save_dir,
                 ensemble_method=ensemble_method,
                 evaluation=eval_type,
                 metric='acc',
                 n_jobs=n_jobs)
clf.fit(train_data)
pred = clf.predict(test_data)
print(accuracy_score(test_data.data[1], pred))
Example 19
def evaluate_2rd_bandit(dataset, algo, time_limit, run_id, seed):
    print('HMAB-%s-%s: run_id=%d' % (dataset, algo, run_id))
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    train_data, test_data = load_train_test_data(dataset)
    enable_intersect = True
    bandit = SecondLayerBandit(algo,
                               train_data,
                               per_run_time_limit=300,
                               seed=seed,
                               eval_type='holdout',
                               mth='alter_hpo',
                               enable_intersection=enable_intersect)
    mth_id = 'hmab' if enable_intersect else 'hmab0'
    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config,
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_dir + '%s_2rd_bandit_%s_%d_%d_%s.pkl' % (
        mth_id, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2],
                    f)
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]
    return X_train, X_val, y_train, y_val


if __name__ == "__main__":
    trans_id = 4
    trans_types = [
        'fast_ica', 'quantile', 'variance_selector', 'percentile_selector',
        'generic_selector', 'svd', 'feature_agg', 'extra_tree_selector',
        'liblinear_based_selector', 'rfe_selector', 'normalizer', 'scaler1',
        'scaler2', 'scaler3', 'random_tree_embedding', 'polynomial', 'pca',
        'nystronem', 'lda', 'kitchen_sink', 'kernel_pca', 'cross'
    ]
    trans_name = trans_types[trans_id]
    raw_data, _ = load_train_test_data('yeast')
    train_data, valid_data = train_valid_split(raw_data)

    X, y = raw_data.data
    if trans_name == 'fast_ica':
        from sklearn.decomposition import FastICA

        qt = FastICA(n_components=7, random_state=1)
    elif trans_name == 'quantile':
        from automlToolkit.components.feature_engineering.transformations.utils import QuantileTransformer

        qt = QuantileTransformer()
    elif trans_name == 'variance_selector':
        from sklearn.feature_selection import VarianceThreshold

        qt = VarianceThreshold()
Example 21
def evaluate_evaluation_based_fe(dataset, time_limit, run_id, seed):
    from automlToolkit.components.fe_optimizers.evaluation_based_optimizer import EvaluationBasedOptimizer

    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())
    """
    Configuration:
      bootstrap, Value: 'True'
      criterion, Value: 'gini'
      estimator, Constant: 'random_forest'
      max_depth, Constant: 'None'
      max_features, Value: 0.5
      max_leaf_nodes, Constant: 'None'
      min_impurity_decrease, Constant: 0.0
      min_samples_leaf, Value: 1
      min_samples_split, Value: 2
      min_weight_fraction_leaf, Constant: 0.0
      n_estimators, Constant: 100
    """
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    optimizer = EvaluationBasedOptimizer('classification',
                                         train_data,
                                         evaluator,
                                         'random_forest',
                                         300,
                                         10000,
                                         seed,
                                         trans_set=None)

    _start_time = time.time()
    _iter_id = 0
    while True:
        if time.time() > _start_time + time_limit or optimizer.early_stopped_flag:
            break
        score, iteration_cost, inc = optimizer.iterate()
        print('%d - %.4f' % (_iter_id, score))
        _iter_id += 1

    final_train_data = optimizer.apply(train_data, optimizer.incumbent)
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, score)

    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(cs.get_default_configuration(), X_train,
                                  y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)

    from automlToolkit.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'hmab_fe_%s_%d_%d.pkl' % (dataset, time_limit,
                                                     run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
Example 22
def evaluate_package():
    train_data, test_data = load_train_test_data('pc4', data_dir='./')
    Classifier().fit(train_data)