Example #1
def evaluate_autosklearn(algorithms, dataset='credit', time_limit=1200, seed=1):
    print('==> Start to evaluate', dataset, 'budget', time_limit)
    include_models = algorithms
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        per_run_time_limit=per_run_time_limit,
        include_preprocessors=None,
        exclude_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        ensemble_nbest=1,
        initial_configurations_via_metalearning=0,
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5}
    )
    print(automl)
    raw_data = load_data(dataset, datanode_returned=True)
    X, y = raw_data.data
    automl.fit(X.copy(), y.copy())
    model_desc = automl.show_models()
    print(model_desc)

    test_results = automl.cv_results_['mean_test_score']
    time_records = automl.cv_results_['mean_fit_time']
    best_result = np.max(test_results)
    print('Validation Accuracy', best_result)
    save_path = project_dir + 'data/ausk_%s_%d.pkl' % (dataset, len(algorithms))
    with open(save_path, 'wb') as f:
        pickle.dump([test_results, time_records, time_limit, model_desc], f)
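The function references two module-level globals, project_dir and per_run_time_limit, plus the usual imports. A minimal driver sketch, with placeholder values for the assumed globals:

import pickle
import numpy as np
import autosklearn.classification
from automlToolkit.datasets.utils import load_data  # the source of load_data in these examples (see Example #12)

project_dir = './'        # assumed placeholder
per_run_time_limit = 300  # assumed placeholder, in seconds

evaluate_autosklearn(['random_forest', 'libsvm_svc'], dataset='pc4', time_limit=600)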
Example #2
def evaluate_1stlayer_bandit(run_id,
                             B,
                             algorithms,
                             dataset='credit',
                             trial_num=200,
                             seed=1):
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              eval_type='holdout',
                              seed=seed)
    bandit.B = B
    bandit.optimize(strategy='discounted_ucb')
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time

    save_folder = project_dir + 'data/1stlayer-mab/'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_path = save_folder + 'eval_ducb_%.4f_%s_%d_%d_%d.pkl' % (
        B, dataset, run_id, trial_num, len(algorithms))
    with open(save_path, 'wb') as f:
        data = [
            bandit.final_rewards, bandit.time_records, bandit.action_sequence,
            time_cost
        ]
        pickle.dump(data, f)

    return time_cost
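Example #2 selects among algorithms with strategy='discounted_ucb'. The FirstLayerBandit internals are not part of these snippets, so the block below is only a generic discounted-UCB sketch in the Garivier-Moulines style; gamma, xi, and the (arm, reward) history format are illustrative assumptions. As in Example #1, per_run_time_limit and project_dir are module-level globals.

import math

def discounted_ucb_pick(history, n_arms, gamma=0.95, xi=2.0):
    """Pick the next arm from a list of (arm, reward) pairs, oldest first."""
    t = len(history)
    counts = [0.0] * n_arms   # discounted play counts
    sums = [0.0] * n_arms     # discounted reward sums
    for age, (arm, reward) in enumerate(history):
        weight = gamma ** (t - 1 - age)   # recent plays weigh more
        counts[arm] += weight
        sums[arm] += weight * reward
    for arm in range(n_arms):             # play each arm once before using the index
        if counts[arm] == 0.0:
            return arm
    total = sum(counts)
    scores = [sums[i] / counts[i] + xi * math.sqrt(math.log(total) / counts[i])
              for i in range(n_arms)]
    return max(range(n_arms), key=scores.__getitem__)

# e.g. with two arms and three past plays:
# discounted_ucb_pick([(0, 0.80), (1, 0.62), (0, 0.83)], n_arms=2)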
Example #3
def conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=100, iter_mode=True):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)

    if not iter_mode:
        optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=600, output_dir='logs')
        inc, val = optimizer.optimize()
        print(inc, val)
    else:
        import time
        _start_time = time.time()
        optimizer = SMACOptimizer(
            evaluator, cs, trials_per_iter=1,
            output_dir='logs', per_run_time_limit=180
        )
        results = list()
        for _iter in range(iter_num):
            perf, _, _ = optimizer.iterate()
            print(_iter, perf)
            results.append(perf)
        print(results)
        print(time.time() - _start_time)
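Both modes can be exercised directly; the dataset and iteration count here are illustrative:

conduct_hpo(dataset='pc4', classifier_id='random_forest',
            iter_num=50, iter_mode=True)    # one SMAC trial per iteration
conduct_hpo(dataset='pc4', classifier_id='random_forest',
            iter_mode=False)                # single run capped at 600 evaluations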
Example #4
def conduct_hpo(optimizer='smac',
                dataset='pc4',
                classifier_id='random_forest',
                runcount_limit=100):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='hpo',
                                        data_node=raw_data)

    if optimizer == 'smac':
        optimizer = SMACOptimizer(evaluator,
                                  cs,
                                  evaluation_limit=runcount_limit,
                                  output_dir='logs')
    elif optimizer == 'psmac':
        optimizer = PSMACOptimizer(evaluator,
                                   cs,
                                   args.n,
                                   evaluation_limit=runcount_limit,
                                   output_dir='logs',
                                   trials_per_iter=args.trial)
    else:
        raise ValueError('Unsupported optimizer: %s' % optimizer)
    perf, cost, config = optimizer.iterate()
    print(perf, cost, config)
    perf, cost, config = optimizer.iterate()
    print(perf, cost, config)
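The 'psmac' branch reads args.n and args.trial from a module-level argparse namespace that the snippet does not show. A minimal parser that would supply them could look like this; the flag names and defaults are assumptions:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--n', type=int, default=4)      # parallel SMAC runs (assumed)
parser.add_argument('--trial', type=int, default=1)  # trials per iteration (assumed)
args = parser.parse_args()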
Example #5
def evaluate_evaluation_based_fe(dataset, time_limit, seed=1):
    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    evaluator = Evaluator(cs.get_default_configuration(), name='fe', seed=seed)

    raw_data = load_data(dataset, datanode_returned=True)

    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id='random_forest',
                          time_limit_per_trans=300)
    train_data = pipeline.fit_transform(raw_data)

    score = evaluator(None, data_node=train_data)
    print('==> Base validation score', score)

    save_path = proj_dir + 'data/fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, score], f)
    return score
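A direct call; proj_dir is another assumed module-level global, used only for the save path:

proj_dir = './'  # assumed placeholder
score = evaluate_evaluation_based_fe('pc4', time_limit=600)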
Example #6
def evaluate_2rd_layered_bandit(run_id,
                                mth='rb',
                                dataset='pc4',
                                algo='libsvm_svc',
                                cv='holdout',
                                iter_num=100,
                                time_limit=120000,
                                seed=1):
    raw_data = load_data(dataset, datanode_returned=True)
    strategy = 'avg' if mth != 'alter-rb' else 'rb'
    mth_id = mth if mth != 'alter-rb' else 'alter'
    bandit = SecondLayerBandit(algo,
                               raw_data,
                               dataset_id=dataset,
                               mth=mth_id,
                               strategy=strategy,
                               seed=seed,
                               eval_type=cv)

    _start_time = time.time()
    stats = list()

    for _iter in range(iter_num):
        _iter_start_time = time.time()
        bandit.play_once()
        stats.append([_iter, time.time() - _start_time])

        if time.time() > time_limit + _start_time:
            break

        print('%s%s' % ('\n', '=' * 65))
        end_time = time.time()
        print('| %s-%s-%d | Iteration-%d: %.4f | Time_cost: %.2f-%.2f |' %
              (dataset, mth, run_id, _iter, bandit.final_rewards[-1],
               end_time - _iter_start_time, end_time - _start_time))
        print('=' * 65, '\n')

        # Save the intermediate result.
        save_folder = project_dir + 'data/2rdlayer-mab/'
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        file_path = save_folder + '%s-%d_2rdlayer-mab_%s_%s_%d_%d_%s.pkl' % (
            mth, seed, dataset, algo, iter_num, time_limit, cv)
        data = [
            bandit.final_rewards, bandit.action_sequence,
            bandit.evaluation_cost, stats
        ]
        with open(file_path, 'wb') as f:
            pickle.dump(data, f)
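A sample invocation; as before, project_dir is an assumed module-level global and the budgets are illustrative:

project_dir = './'  # assumed placeholder
evaluate_2rd_layered_bandit(run_id=0, mth='rb', dataset='pc4',
                            algo='libsvm_svc', iter_num=10, time_limit=3600)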
Example #7
def get_meta_learning_configs(X,
                              y,
                              task_type,
                              dataset_name='default',
                              metric='accuracy',
                              num_cfgs=5):
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
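A sample call, passing X and y as None so the dataset is loaded by name. The task constant comes from autosklearn.constants (imported with * in Example #11), and the hard-coded metadata_directory above must exist relative to the working directory:

from autosklearn.constants import MULTICLASS_CLASSIFICATION

configs = get_meta_learning_configs(None, None, MULTICLASS_CLASSIFICATION,
                                    dataset_name='diabetes', num_cfgs=5)
print(configs)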
Example #8
def evaluate_ausk_fe(dataset, time_limit, seed=1):
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    from automlToolkit.utils.models.default_random_forest import DefaultRandomForest
    # Add random forest classifier (with default hyperparameter) component to auto-sklearn.
    autosklearn.pipeline.components.classification.add_classifier(
        DefaultRandomForest)
    include_models = ['DefaultRandomForest']

    # Construct the ML model.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        include_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        ensemble_nbest=1,
        initial_configurations_via_metalearning=0,
        per_run_time_limit=600,
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5})
    print(automl)

    data_node = load_data(dataset, datanode_returned=True)

    X, y = data_node.data
    automl.fit(X.copy(), y.copy())
    model_desc = automl.show_models()
    print(model_desc)

    all_test_results = automl.cv_results_['mean_test_score']
    print('Mean test score', all_test_results)
    best_result = np.max(automl.cv_results_['mean_test_score'])
    print('Validation Accuracy', best_result)

    save_path = proj_dir + 'data/ausk_fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, best_result], f)

    return best_result
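Example #8 registers a custom component, but the DefaultRandomForest class itself lives in automlToolkit and is not shown in these snippets. Under the older 0.x auto-sklearn extension API implied above (the one with include_estimators and ml_memory_limit), a fixed-default component looks roughly like this; the class layout follows auto-sklearn's extension interface, and all names and property values below are illustrative:

from ConfigSpace.configuration_space import ConfigurationSpace
from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, PREDICTIONS


class DefaultRandomForest(AutoSklearnClassificationAlgorithm):
    """Random forest with fixed sklearn defaults: nothing for SMAC to tune."""

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        from sklearn.ensemble import RandomForestClassifier
        self.estimator = RandomForestClassifier(random_state=self.random_state)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'DefaultRF',
                'name': 'Default Random Forest',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        # An empty configuration space pins every hyperparameter to its default.
        return ConfigurationSpace()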
Example #9
def evaluate_1stlayer_bandit(algorithms, mode, dataset='credit', trial_num=200, seed=1):
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              share_feature=mode,
                              seed=seed)
    bandit.optimize()
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time

    save_path = project_dir + 'data/shared_hmab_%d_%s_%d_%d_%d.pkl' % (
        mode, dataset, trial_num, len(algorithms), seed)
    with open(save_path, 'wb') as f:
        data = [bandit.final_rewards, bandit.time_records, bandit.action_sequence, time_cost]
        pickle.dump(data, f)

    return time_cost
Example #10
def evaluate_2armed_bandit(dataset='pc4',
                           algo='libsvm_svc',
                           time_limit=120000):
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = SecondLayerBandit(algo, raw_data)

    _start_time = time.time()
    stats = list()

    for _iter in range(20):
        res = bandit.play_once()
        stats.append([_iter, time.time() - _start_time, res])

        if time.time() > time_limit + _start_time:
            break

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(bandit.evaluation_cost['fe'])
    print(bandit.evaluation_cost['hpo'])
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))
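A direct invocation; note that time_limit is in seconds and only checked between plays, so the default of 120000 (about 33 hours) effectively defers to the 20-iteration cap:

evaluate_2armed_bandit(dataset='pc4', algo='libsvm_svc', time_limit=3600)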
Example #11
import os
import sys

sys.path.append(os.getcwd())
from autosklearn.smbo import AutoMLSMBO
from autosklearn.constants import *
from autosklearn.data.xy_data_manager import XYDataManager
from autosklearn.util.backend import create
from autosklearn.util import pipeline, StopWatch

from automlToolkit.datasets.utils import load_data

dataset_name = 'diabetes'
X, y, _ = load_data(dataset_name)


def get_meta_learning_configs(X,
                              y,
                              task_type,
                              dataset_name,
                              metric='accuracy',
                              num_cfgs=5):
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        pass

    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
Example #12
import os
import sys
import argparse
sys.path.append(os.getcwd())
from automlToolkit.datasets.utils import load_data

parser = argparse.ArgumentParser()
dataset_set = 'diabetes,spectf,credit,ionosphere,lymphography,pc4,' \
              'messidor_features,winequality_red,winequality_white,splice,spambase,amazon_employee'
parser.add_argument('--datasets', type=str, default=dataset_set)
args = parser.parse_args()

for dataset in args.datasets.split(','):
    raw_data = load_data(dataset, datanode_returned=True)
    print(raw_data)