Beispiel #1
0
def softmax_method(data):
    """Run model selection with the softmax strategy.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    """
    logger = get_logger('softmax', 'log/sf/softmax.log', level=DEBUG)

    selection = SoftMaxSelection(_get_optimizations())

    logger.info('Begin fitting on {}'.format(data.name))
    features, labels = data.train_data()

    tic = time.time()
    winner = selection.fit(features, labels, temperature=0.5, budget=BUDGET)
    duration = time.time() - tic

    logger.info('Fitting on {} is over, spend {}s'.format(data.name, duration))

    # output locations for this data set
    csv_path = 'log/sf/sf_{}.csv'.format(data.name)
    pkl_path = 'log/sf/sf_{}.pkl'.format(data.name)

    return _get_test_result(winner, data, selection.statistics(), csv_path, pkl_path, logger)
Beispiel #2
0
def ucb_or_random_method(data, method):
    """Run model selection with the traditional UCB (or random) strategy.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data

    method: str
        model selection method (only ucb or random can be chosen)
    """
    log = get_logger(method, 'log/{}/{}.log'.format(method, method), level=DEBUG)

    selection = BanditModelSelection(_get_optimizations(), method)

    log.info('Begin fit on {}'.format(data.name))
    features, labels = data.train_data()

    tic = time.time()
    winner = selection.fit(features, labels, budget=BUDGET)
    log.info('Fitting on {} is done! Spend {}s'.format(data.name, time.time() - tic))

    # per-method output locations
    csv_path = 'log/{}/{}_{}.csv'.format(method, method, data.name)
    pkl_path = 'log/{}/{}_{}.pkl'.format(method, method, data.name)
    return _get_test_result(winner, data, selection.statistics(), csv_path, pkl_path, log)
Beispiel #3
0
def eg_method(data):
    """Run model selection with the epsilon-greedy strategy.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    """
    logger = get_logger('epsilon-greedy', 'log/eg/epsilon-greedy.log', level=DEBUG)

    selection = EpsilonGreedySelection(_get_optimizations())

    logger.info('Begin fitting on {}'.format(data.name))
    features, labels = data.train_data()

    tic = time.time()
    winner = selection.fit(features, labels, budget=BUDGET)
    duration = time.time() - tic

    logger.info('Fitting on {} is over, spend {}s'.format(data.name, duration))

    # output locations for this data set
    csv_path = 'log/eg/eg_{}.csv'.format(data.name)
    pkl_path = 'log/eg/eg_{}.pkl'.format(data.name)

    return _get_test_result(winner, data, selection.statistics(), csv_path, pkl_path, logger)
Beispiel #4
0
def single_arm_method(data, model_gen, budget=BUDGET):
    """Tune a single optimization arm on one data set and dump its samples.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data

    model_gen:
        generator producing the model to tune

    budget: int
        number of optimization steps to run
    """
    model_name = type(model_gen).__name__
    arm = RacosOptimization(model_gen, model_name)
    train_x, train_y = data.train_data()

    log = get_logger('single_arm', 'log/single/single_arm.log')
    log.info(f'Begin to fit {data.name} using {model_name}')

    for i in range(budget):
        log.info(f'Process: {i + 1}/{budget}')
        arm.run_one_step(train_x, train_y)

    log.info(f'Fitting on {data.name} using model {model_name} is over')

    # persist every evaluated instance for later inspection
    arm.instances.to_csv(f'log/single/single_{data.name}_{model_name}.csv')
Beispiel #5
0
def run_extreme_bandit(data):
    """Run model selection with the Extreme Hunter bandit.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    """
    log = get_logger('extreme bandit', 'log/exb/exb.log', level=DEBUG)

    optimizations = _get_optimizations()
    model_selection = ExtremeHunter(optimizations)

    log.info('Begin fit on {}'.format(data.name))
    train_x, train_y = data.train_data()

    best_optimization = model_selection.fit(train_x, train_y, budget=50)

    log.info('Fitting on {} is over'.format(data.name))

    # NOTE(review): log file goes to 'log/exb/' but the csv goes to 'log/exh/' —
    # confirm which directory is intended.
    csv_file = 'log/exh/exh_{}.csv'.format(data.name)

    # Bug fix: every other call site passes (csv, pkl, log) as the last three
    # arguments (new_erucb_method passes '' when no pickle output is wanted);
    # the original call here omitted the pkl path, shifting `log` into its slot.
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv_file, '', log)
Beispiel #6
0
def proposed_method(data, theta, gamma, beta, show_selection_detail=False):
    """Run model selection with the proposed bandit strategy.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data

    theta: float

    gamma: float

    beta: float

    show_selection_detail: bool
        when True, append the per-round parameter-change records to a csv
    """
    log_name = 'proposed-{}-{}'.format(theta, gamma)
    log = get_logger(log_name, 'log/proposed-new/' + log_name + '.log', level=DEBUG)

    selection = BanditModelSelection(_get_optimizations(), 'new',
                                     theta=theta, gamma=gamma, beta=beta)

    log.info('Begin fit on {}'.format(data.name))
    features, labels = data.train_data()

    tic = time.time()
    winner = selection.fit(features, labels, budget=BUDGET)

    # write parameter change information
    if show_selection_detail:
        with open('log/ps_{}_{}_{}.csv'.format(theta, gamma, data.name), 'a') as f:
            t = len(model_generators)
            for record in selection.param_change_info:
                f.write('t = {}'.format(t))
                record.to_csv(f, mode='a')
                f.write('\n\n')
                t += 1

    log.info('Fitting on {} is over, spend {}s'.format(data.name, time.time() - tic))

    csv_path = 'log/proposed-new/proposed_{}_{}_{}_{}.csv'.format(theta, gamma, beta, data.name)
    pkl_path = 'log/proposed-new/proposed_{}_{}_{}_{}.pkl'.format(theta, gamma, beta, data.name)

    return _get_test_result(winner, data, selection.statistics(), csv_path, pkl_path, log)
Beispiel #7
0
def ground_truth_lab():
    """Search every data set for its ground-truth (best) model and log the results."""
    log = get_logger('gt', 'log/gt.log', level=INFO)
    for data in ALL_DATA:
        tic = time.time()
        log.info('Start finding ground truth model for data set {}'.format(data.name))

        rows = [find_ground_truth(data, generator) for generator in model_generators]
        frame = pd.DataFrame(data=rows, columns=['name', 'max', 'mean', 'std', 'best_model', 'time'])
        frame = frame.set_index(frame['name'])
        best_model = frame['max'].idxmax()

        # save to csv
        with open('log/gt_{}.csv'.format(data.name), 'a') as f:
            f.write('best is {}\n'.format(best_model))
            frame.to_csv(f, mode='a')

        log.info('g-test --- Fitting on {} is over, spend {}s'.format(data.name, time.time() - tic))
Beispiel #8
0
def auto_sk_method(data, time_left):
    """Fit auto-sklearn on one data set and report validation/test scores.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data

    time_left: int
        total time budget handed to AutoSklearnClassifier

    Returns
    -------
    (name, best_score, test_v): data set name, best cross-validation
        mean test score, accuracy on the held-out test split
    """
    classifier = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_left,
        per_run_time_limit=300,
        exclude_estimators=exclude_estimators)

    logger = get_logger('log/auto_sk/auto_sk_{}'.format(data.name),
                        'log/auto_sk/auto_sk_{}.log'.format(data.name))

    train_x, train_y = data.train_data()

    logger.info('Start fitting on {}'.format(data.name))
    tic = time.time()

    classifier.fit(train_x, train_y)

    # best validation score over all runs
    scores = classifier.cv_results_['mean_test_score']
    best_score = scores[np.argmax(scores)]

    # evaluate on the held-out test split
    test_x, test_y = data.test_data()
    test_v = accuracy_score(test_y, classifier.predict(test_x))

    # show result information
    logger.info('Fitting on {} is done, spend {}s'.format(
        data.name,
        time.time() - tic))
    logger.info('Sprint statistics\n{}'.format(classifier.sprint_statistics()))
    logger.info('Test V is {}'.format(test_v))
    # logger.info('Show model:\n{}'.format(classifier.show_models()))

    # save cv results
    cv_frame = pd.DataFrame.from_dict(classifier.cv_results_)
    cv_frame.to_csv('log/auto_sk/auto_sk_cv_result_on_{}.csv'.format(
        data.name))
    cv_frame.to_pickle('log/auto_sk/auto_sk_cv_result_on_{}.pkl'.format(
        data.name))

    return data.name, best_score, test_v
Beispiel #9
0
def auto_sk_lab(start, end):
    """Run auto-sklearn over a slice of the benchmark data sets.

    Parameters
    ----------
    start: int
        index of the first data set (inclusive)

    end: int
        index of the last data set (exclusive)
    """
    logger = get_logger('auto-sklearn-{}-{}'.format(start, end),
                        'log/auto_sk/auto-sk-{}-{}.log'.format(start, end))

    result = []
    data_sets = data_loader.data_for_auto_sklearn()[start:end]
    for (data, time_left) in data_sets:
        logger.info('Start fitting {}'.format(data.name))
        # Bug fix: this timer previously rebound the `start` parameter, so the
        # output file names below embedded a float timestamp instead of the
        # slice index.
        fit_start = time.time()

        method_result = auto_sk_method(data, time_left)
        result.append(method_result)

        logger.info('Fitting on {} is over, spend {}s\n'
                    'result:\n'
                    '{}'.format(data.name,
                                time.time() - fit_start, method_result))

    df_result = pd.DataFrame(data=result,
                             columns=['data set', 'best v', 'test v'])
    df_result.to_csv('log/auto_sk/auto-sk-{}to{}.csv'.format(start, end))
    # Bug fix: was '...{}tp{}.pkl' — typo for 'to', inconsistent with the csv name.
    df_result.to_pickle('log/auto_sk/auto-sk-{}to{}.pkl'.format(start, end))
Beispiel #10
0
def new_erucb_method(data, b=B):
    """Run model selection with the new ER-UCB strategy.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data

    b:
        parameter forwarded to _get_model_selection
    """
    log_name = 'new-erucb'
    log = get_logger(log_name, 'log/proposed-new/' + log_name + '.log', level=DEBUG)

    selection = _get_model_selection(b)

    log.info('Begin fit on {}'.format(data.name))
    features, labels = data.train_data()
    tic = time.time()

    winner = selection.fit(features, labels, budget=BUDGET)

    log.info('Fitting on {} ends, spend {}s'.format(data.name, time.time() - tic))

    # append per-round parameter records for offline inspection
    for (prefix, param_info) in selection.param_change_info:
        assert isinstance(param_info, pd.DataFrame)
        with open('log/proposed-new/erucb-process-{}-{}.csv'.format(data.name, b), mode='a') as f:
            f.write(prefix)
            param_info.to_csv(f, mode='a')

    csv = 'log/proposed-new/new_erucb_{}_{}.csv'.format(data.name, b)
    return _get_test_result(winner, data, selection.statistics(), csv, '', log)
Beispiel #11
0
def find_ground_truth(data, model_generator, budget=BUDGET):
    """Find the ground truth model for each dataset

    Parameters
    ----------

    data: utils.data_loader.DataSet
        training data

    model_generator: framework.base.ModelGenerator
        generator for the target model

    budget: int
        number of samples

    Returns
    -------

    evaluation_result: (float, float, float)
        best evaluation result, mean and standard deviation

    """
    features, labels = data.train_data()
    model_name = type(model_generator).__name__
    tic = time.time()
    log = get_logger('gt.model', '', level=INFO)
    log.info('{} --- {} start fitting'.format(data.name, model_name))

    # begin sampling
    frame = random_search(model_generator, features, labels, search_times=budget)

    # rebuild the best-scoring model from its raw parameter vector
    best_idx = frame['Accuracy'].idxmax()
    best_model = model_generator.generate_model(frame['Raw Parameters'][best_idx])

    elapsed = time.time() - tic
    log.info('{} --- {} end running, spend {}s'.format(data.name, model_name, elapsed))
    accuracy = frame['Accuracy']
    return model_name, accuracy.max(), accuracy.mean(), accuracy.std(), best_model, elapsed
Beispiel #12
0
def ground_truth_method(data):
    """Estimate the ground-truth model for one data set by splitting BUDGET
    evenly across all model generators.

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    """
    logger = get_logger('gt', 'log/ground/ground_truth.log', level=INFO)

    per_model_budget = int(BUDGET / len(model_generators))
    logger.info('Begin fitting on {}'.format(data.name))
    tic = time.time()

    rows = [find_ground_truth(data, gen, per_model_budget) for gen in model_generators]

    logger.info('Fitting on {} is over, spend {}s'.format(data.name, time.time() - tic))

    frame = pd.DataFrame(data=rows, columns=['model', 'best v', 'mean', 'std', 'best model', 'time'])
    frame.to_csv('log/ground/ground_{}.csv'.format(data.name))

    # evaluate the winner on the test split
    winner_idx = frame['best v'].idxmax()
    best_model = frame['best model'][winner_idx]
    test_v = _evaluate_test_v(data, best_model)
    logger.info('Test v of {} is {}'.format(data.name, test_v))

    return data.name, frame['best v'].max(), type(best_model).__name__, test_v
Beispiel #13
0
import logging
import random
import signal
import time

import pandas as pd

import framework.base as base
from utils.logging_ import get_logger

# --------------------------------------------------------
# define a logger

# Module-level logger shared by the random-search helpers in this module.
log = get_logger('random_search', 'random_search.log', level=logging.INFO)


def timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted work by raising TimeoutError."""
    raise TimeoutError("Timeout!")


signal.signal(signal.SIGALRM, timeout_handler)


def random_search(model_generator, train_x, train_y, search_times=100):
    evaluator = base.ModelEvaluator(model_generator, train_x, train_y)
    model_name = type(model_generator).__name__
    raw_parameter_list = []
    actual_parameter_list = []
    accuracy_list = []
    time_list = []