Example #1
def evaluate_1stlayer_bandit(algorithms,
                             mode,
                             dataset='credit',
                             trial_num=200,
                             seed=1):
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              share_feature=mode,
                              seed=seed)
    bandit.optimize()
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time

    save_path = project_dir + 'data/shared_hmab_%d_%s_%d_%d_%d.pkl' % (
        mode, dataset, trial_num, len(algorithms), seed)
    with open(save_path, 'wb') as f:
        data = [
            bandit.final_rewards, bandit.time_records, bandit.action_sequence,
            time_cost
        ]
        pickle.dump(data, f)

    return time_cost
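The function above depends on module-level globals (per_run_time_limit, project_dir) and on imports (time, pickle, load_data, FirstLayerBandit) defined elsewhere in the script. A minimal driver sketch, assuming those are in place; the algorithm list and settings are illustrative:

per_run_time_limit = 150  # assumed global: per-trial time limit in seconds
project_dir = './'        # assumed global: output root for the pickle

if __name__ == '__main__':
    cost = evaluate_1stlayer_bandit(['random_forest', 'libsvm_svc'],
                                    mode=True,  # share_feature flag
                                    dataset='credit', trial_num=50, seed=1)
    print('total time cost: %.2f s' % cost)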
Example #2
def evaluate_hmab(algorithms, run_id, dataset='credit', trial_num=200, seed=1, eval_type='holdout', enable_ens=False):
    task_id = '%s-hmab-%d-%d' % (dataset, len(algorithms), trial_num)
    _start_time = time.time()
    raw_data, test_raw_data = load_train_test_data(dataset)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs/%s/' % task_id,
                              per_run_time_limit=per_run_time_limit,
                              dataset_name='%s-%d' % (dataset, run_id),
                              seed=seed,
                              eval_type=eval_type)
    bandit.optimize()
    time_cost = int(time.time() - _start_time)
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    validation_accuracy = np.max(bandit.final_rewards)
    test_accuracy = bandit.score(test_raw_data, metric_func=balanced_accuracy)
    test_accuracy_with_ens = EnsembleBuilder(bandit).score(test_raw_data, metric_func=balanced_accuracy)

    print('Dataset          : %s' % dataset)
    print('Validation/Test score : %f - %f' % (validation_accuracy, test_accuracy))
    print('Test score with ensem : %f' % test_accuracy_with_ens)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        stats = [time_cost, test_accuracy_with_ens, bandit.time_records, bandit.final_rewards]
        pickle.dump([validation_accuracy, test_accuracy, stats], f)
    return time_cost
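The pickle written above mirrors the dump call exactly, so it can be read back for later analysis. A sketch, reusing the save_path constructed in the function:

with open(save_path, 'rb') as f:
    validation_accuracy, test_accuracy, stats = pickle.load(f)
time_cost, test_accuracy_with_ens, time_records, final_rewards = stats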
Example #3
def evaluate_1stlayer_bandit(run_id, B, algorithms, dataset='credit', trial_num=200, seed=1):
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              eval_type='holdout',
                              seed=seed)
    bandit.B = B
    bandit.optimize(strategy='discounted_ucb')
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time

    save_folder = project_dir + 'data/1stlayer-mab/'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_path = save_folder + 'eval_ducb_%.4f_%s_%d_%d_%d.pkl' % (
        B, dataset, run_id, trial_num, len(algorithms))
    with open(save_path, 'wb') as f:
        data = [bandit.final_rewards, bandit.time_records, bandit.action_sequence, time_cost]
        pickle.dump(data, f)

    return time_cost
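Because bandit.B parameterizes the 'discounted_ucb' strategy (presumably its discount factor), a typical experiment sweeps several values of B over repeated runs. A sketch; the value grid and algorithm list are illustrative:

for run_id in range(5):
    for B in (0.9, 0.95, 0.99):
        evaluate_1stlayer_bandit(run_id, B,
                                 ['random_forest', 'libsvm_svc'],
                                 dataset='credit', trial_num=100,
                                 seed=run_id + 1)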
Example #4
    def fit(self, train_data: DataNode, dataset_id=None):
        """
        This function includes the following two procedures.
            1. tune each algorithm's hyperparameters.
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        if self.enable_meta_algorithm_selection:
            try:
                alad = AlgorithmAdvisor(task_type=self.task_type,
                                        n_algorithm=9,
                                        metric=self.metric_id)
                n_algo = 5
                model_candidates = alad.fetch_algorithm_set(
                    train_data, dataset_id=dataset_id)
                include_models = list()
                for algo in model_candidates:
                    if algo in self.include_algorithms and len(
                            include_models) < n_algo:
                        include_models.append(algo)
                self.include_algorithms = include_models
                self.logger.info(
                    'Executing meta-learning based algorithm recommendation!')
                self.logger.info('Algorithms recommended: %s' %
                                 ','.join(self.include_algorithms))
            except Exception as e:
                self.logger.error("Meta-learning failed!")

        # Check whether this dataset is balanced or not.
        if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data):
            # self.include_algorithms = imb_classication_algorithms
            self.logger.info('Input dataset is imbalanced!')
            train_data = DataBalancer().operate(train_data)
        if self.amount_of_resource is None:
            trial_num = len(self.include_algorithms) * 30
        else:
            trial_num = self.amount_of_resource

        self.solver = FirstLayerBandit(
            self.task_type,
            trial_num,
            self.include_algorithms,
            train_data,
            per_run_time_limit=self.per_run_time_limit,
            dataset_name=self.dataset_name,
            ensemble_method=self.ensemble_method,
            ensemble_size=self.ensemble_size,
            inner_opt_algorithm='fixed',
            metric=self.metric,
            fe_algo='bo',
            seed=self.seed,
            time_limit=self.time_limit,
            eval_type=self.evaluation_type,
            output_dir=self.output_dir)
        self.solver.optimize()
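When amount_of_resource is left unset, the trial budget defaults to 30 trials per candidate algorithm; with the meta-recommended set capped at n_algo = 5 this yields at most 150 trials:

include_algorithms = ['extra_trees', 'adaboost', 'liblinear_svc',
                      'random_forest', 'libsvm_svc']  # n_algo = 5
trial_num = len(include_algorithms) * 30              # -> 150 trials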
Example #5
def evaluate_hmab(algorithms,
                  dataset,
                  run_id,
                  trial_num,
                  seed,
                  time_limit=1200):
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(
        train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type,
                              trial_num,
                              algorithms,
                              train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')
    bandit.optimize()
    time_taken = time.time() - _start_time
    model_desc = [
        bandit.nbest_algo_ids, bandit.optimal_algo_id, bandit.final_rewards,
        bandit.action_sequence
    ]

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [
        dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens,
        time_taken, model_desc
    ]
    print(model_desc)
    print(data)

    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id,
        time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
Example #6
def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))
    exclude_datasets = ['gina_prior2', 'pc2', 'abalone', 'wind', 'waveform-5000(2)',
                        'page-blocks(1)', 'winequality_white', 'pollen']
    alad = AlgorithmAdvisor(task_type=MULTICLASS_CLS, n_algorithm=9,
                            metric='bal_acc', exclude_datasets=exclude_datasets)
    n_algo = 5
    assert dataset in exclude_datasets
    meta_infos = alad.fit_meta_learner()
    assert dataset not in meta_infos
    model_candidates = alad.fetch_algorithm_set(dataset)
    include_models = list()
    print(model_candidates)
    for algo in model_candidates:
        if algo in algorithms and len(include_models) < n_algo:
            include_models.append(algo)
    print('After algorithm recommendation', include_models)

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)
    bandit = FirstLayerBandit(cls_task_type, trial_num, include_models, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')
    bandit.optimize()
    time_taken = time.time() - _start_time
    model_desc = [bandit.nbest_algo_ids, bandit.optimal_algo_id, bandit.final_rewards, bandit.action_sequence]

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens, time_taken, model_desc]
    print(model_desc)
    print(data)

    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
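The candidate-filtering loop above recurs in several examples. Extracted as a standalone helper it is plain Python and easy to test (names are illustrative):

def filter_candidates(model_candidates, allowed, n_algo=5):
    # Keep the first n_algo recommended algorithms that are also allowed.
    include_models = []
    for algo in model_candidates:
        if algo in allowed and len(include_models) < n_algo:
            include_models.append(algo)
    return include_models

# filter_candidates(['lightgbm', 'knn', 'random_forest'],
#                   {'random_forest', 'lightgbm'}, n_algo=2)
# -> ['lightgbm', 'random_forest']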
Example #7
def evaluate_hmab(algorithms,
                  run_id,
                  dataset='credit',
                  trial_num=200,
                  n_jobs=1,
                  meta_configs=0,
                  seed=1,
                  eval_type='holdout'):
    task_id = '%s-hmab-%d-%d' % (dataset, len(algorithms), trial_num)
    _start_time = time.time()
    raw_data, test_raw_data = load_train_test_data(dataset)
    bandit = FirstLayerBandit(trial_num,
                              algorithms,
                              raw_data,
                              output_dir='logs/%s/' % task_id,
                              per_run_time_limit=per_run_time_limit,
                              dataset_name='%s-%d' % (dataset, run_id),
                              n_jobs=n_jobs,
                              meta_configs=meta_configs,
                              seed=seed,
                              eval_type=eval_type)
    bandit.optimize()
    time_cost = int(time.time() - _start_time)
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    validation_accuracy = np.max(bandit.final_rewards)
    # validation_accuracy_without_ens = bandit.validate()
    # assert np.isclose(validation_accuracy, validation_accuracy_without_ens)
    test_accuracy_with_ens = EnsembleBuilder(
        bandit, n_jobs=n_jobs).score(test_raw_data)

    print('Dataset                     : %s' % dataset)
    print('Validation score without ens: %f' % validation_accuracy)
    print("Test score with ensemble    : %f" % test_accuracy_with_ens)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        stats = [time_cost, 0., bandit.time_records, bandit.final_rewards]
        pickle.dump([validation_accuracy, test_accuracy_with_ens, stats], f)
    del bandit
    return time_cost
Example #8
def evaluate_autosklearn(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')
    while time.time() - _start_time < time_limit:
        bandit.sub_bandits['random_forest'].optimizer['hpo'].iterate()
    # bandit.optimize()
    # fe_exp_output = bandit.sub_bandits['random_forest'].exp_output['fe']
    # hpo_exp_output = bandit.sub_bandits['random_forest'].exp_output['hpo']
    fe_exp_output = dict()
    hpo_exp_output = bandit.sub_bandits['random_forest'].optimizer['hpo'].exp_output
    inc_config = bandit.sub_bandits['random_forest'].optimizer['hpo'].incumbent_config.get_dictionary()
    inc_config.pop('estimator')
    from solnml.components.models.classification.random_forest import RandomForest
    rf = RandomForest(**inc_config)
    rf.fit(train_data.data[0], train_data.data[1])
    validation_accuracy = bandit.sub_bandits['random_forest'].optimizer['hpo'].incumbent_perf
    best_pred = rf.predict(test_data.data[0])
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # es_pred = bandit._es_predict(test_data)
    # test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)
    data = [dataset, validation_accuracy, test_accuracy, fe_exp_output, hpo_exp_output,
            _start_time]
    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        ausk_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)

    del_path = './logs/'
    for i in os.listdir(del_path):
        file_data = os.path.join(del_path, i)
        if os.path.isfile(file_data):
            os.remove(file_data)
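The while-loop above is the generic wall-clock budget pattern: keep calling iterate() on a single sub-bandit's optimizer until the time limit is reached. In isolation (a sketch; optimizer stands for any object exposing iterate()):

deadline = _start_time + time_limit
while time.time() < deadline:
    optimizer.iterate()  # one optimization step per call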
Example #9
def ensemble_implementation_examples(bandit: FirstLayerBandit, test_data: DataNode):
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import accuracy_score
    from autosklearn.metrics import accuracy
    n_best = 20
    stats = bandit.fetch_ensemble_members(test_data)
    seed = stats['split_seed']
    test_size = 0.2
    train_predictions = []
    test_predictions = []
    for algo_id in bandit.nbest_algo_ids:
        X, y = stats[algo_id]['train_dataset'].data
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
        for train_index, test_index in sss.split(X, y):
            X_train, X_valid = X[train_index], X[test_index]
            y_train, y_valid = y[train_index], y[test_index]

        X_test, y_test = stats[algo_id]['test_dataset'].data
        configs = stats[algo_id]['configurations']
        performance = stats[algo_id]['performance']
        best_index = np.argsort(-np.array(performance))
        best_configs = [configs[i] for i in best_index[:n_best]]

        for config in best_configs:
            try:
                # Build the ML estimator.
                _, estimator = get_estimator(config)
                # print(X_train.shape, X_test.shape)
                estimator.fit(X_train, y_train)
                y_valid_pred = estimator.predict_proba(X_valid)
                y_test_pred = estimator.predict_proba(X_test)
                train_predictions.append(y_valid_pred)
                test_predictions.append(y_test_pred)
            except Exception as e:
                print(str(e))

    es = EnsembleSelection(ensemble_size=50, task_type=1,
                           metric=accuracy, random_state=np.random.RandomState(seed))
    assert len(train_predictions) == len(test_predictions)
    es.fit(train_predictions, y_valid, identifiers=None)
    y_pred = es.predict(test_predictions)
    y_pred = np.argmax(y_pred, axis=-1)
    test_score = accuracy_score(y_test, y_pred)
    return test_score
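Note the descending-sort idiom used above to pick the top configurations: negating the performance array makes np.argsort return indices from best to worst. A small self-contained worked case:

performance = [0.71, 0.85, 0.64]
configs = ['cfg_a', 'cfg_b', 'cfg_c']
best_index = np.argsort(-np.array(performance))      # array([1, 0, 2]): best first
best_configs = [configs[i] for i in best_index[:2]]  # -> ['cfg_b', 'cfg_a']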
Example #10
def evaluate_imbalanced(algorithms,
                        dataset,
                        run_id,
                        trial_num,
                        seed,
                        time_limit=1200):
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))
    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset)
    cls_task_type = BINARY_CLS if len(set(
        train_data.data[1])) == 2 else MULTICLASS_CLS
    # ACC or Balanced_ACC
    balanced_acc_metric = make_scorer(balanced_accuracy)
    bandit = FirstLayerBandit(cls_task_type,
                              trial_num,
                              algorithms,
                              train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              opt_algo=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed)
    bandit.optimize()
    model_desc = [
        bandit.nbest_algo_ids, bandit.optimal_algo_id, bandit.final_rewards,
        bandit.action_sequence
    ]

    time_taken = time.time() - _start_time
    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)
    data = [
        dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens,
        time_taken, model_desc
    ]
    print(model_desc)
    print(data[:4])

    save_path = project_dir + 'data/%s_%s_%s_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
Example #11
def evaluate_1stlayer_bandit(algorithms, run_id, dataset='credit', trial_num=200, n_jobs=1, meta_configs=0, seed=1):
    task_id = '%s-hmab-%d-%d' % (dataset, len(algorithms), trial_num)
    _start_time = time.time()
    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs/%s/' % task_id,
                              per_run_time_limit=per_run_time_limit,
                              dataset_name='%s-%d' % (dataset, run_id),
                              n_jobs=n_jobs,
                              meta_configs=meta_configs,
                              seed=seed,
                              eval_type='holdout')
    bandit.optimize()
    time_cost = int(time.time() - _start_time)
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    validation_accuracy_without_ens0 = np.max(bandit.final_rewards)
    validation_accuracy_without_ens1 = bandit.validate()
    assert np.isclose(validation_accuracy_without_ens0, validation_accuracy_without_ens1)

    test_accuracy_without_ens = bandit.score(test_raw_data)
    # For debug.
    mode = True
    if mode:
        test_accuracy_with_ens0 = ensemble_implementation_examples(bandit, test_raw_data)
        test_accuracy_with_ens1 = EnsembleBuilder(bandit).score(test_raw_data)

        print('Dataset                     : %s' % dataset)
        print('Validation score without ens: %f - %f' % (
            validation_accuracy_without_ens0, validation_accuracy_without_ens1))
        print("Test score without ensemble : %f" % test_accuracy_without_ens)
        print("Test score with ensemble    : %f - %f" % (test_accuracy_with_ens0, test_accuracy_with_ens1))

        save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
        with open(save_path, 'wb') as f:
            stats = [time_cost, test_accuracy_with_ens0, test_accuracy_with_ens1, test_accuracy_without_ens]
            pickle.dump([validation_accuracy_without_ens0, test_accuracy_with_ens1, stats], f)
    del bandit
    return time_cost
Example #12
def evaluate_hmab(algorithms,
                  dataset,
                  run_id,
                  trial_num,
                  seed,
                  time_limit=1200):
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(
        train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type,
                              trial_num,
                              algorithms,
                              train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='partial')
    # while time.time()-_start_time<time_limit:
    #     bandit.sub_bandits['random_forest'].optimizer['fe'].iterate()
    #     # print(bandit.sub_bandits['random_forest'].optimizer['hpo'].exp_output)
    bandit.optimize()
    fe_exp_output = bandit.sub_bandits['random_forest'].optimizer[
        'fe'].exp_output
    hpo_exp_output = bandit.sub_bandits['random_forest'].optimizer[
        'hpo'].exp_output

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [
        dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens,
        fe_exp_output, hpo_exp_output, _start_time
    ]

    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id,
        time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)

    del_path = './logs/'
    for i in os.listdir(del_path):
        file_data = os.path.join(del_path, i)
        if os.path.isfile(file_data):
            os.remove(file_data)
Example #13
class AutoML(object):
    def __init__(self,
                 time_limit=300,
                 dataset_name='default_name',
                 amount_of_resource=None,
                 task_type=None,
                 metric='bal_acc',
                 include_algorithms=None,
                 ensemble_method='ensemble_selection',
                 enable_meta_algorithm_selection=True,
                 per_run_time_limit=150,
                 ensemble_size=50,
                 evaluation='holdout',
                 output_dir="logs",
                 logging_config=None,
                 random_state=1,
                 n_jobs=1):
        self.metric_id = metric
        self.metric = get_metric(self.metric_id)

        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.seed = random_state
        self.per_run_time_limit = per_run_time_limit
        self.output_dir = output_dir
        self.logging_config = logging_config
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.logger = self._get_logger(self.dataset_name)

        self.evaluation_type = evaluation
        self.amount_of_resource = amount_of_resource
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
        self.task_type = task_type
        self.n_jobs = n_jobs
        self.solver = None

        if include_algorithms is not None:
            self.include_algorithms = include_algorithms
        else:
            if task_type in CLS_TASKS:
                self.include_algorithms = list(classification_algorithms)
            elif task_type in REG_TASKS:
                self.include_algorithms = list(regression_algorithms)
            else:
                raise ValueError("Unknown task type %s" % task_type)
        if ensemble_method is not None and ensemble_method not in ensemble_list:
            raise ValueError("%s is not supported for ensemble!" %
                             ensemble_method)

    def _get_logger(self, name):
        logger_name = 'SolnML-%s(%d)' % (name, self.seed)
        setup_logger(
            os.path.join(self.output_dir, '%s.log' % str(logger_name)),
            self.logging_config,
        )
        return get_logger(logger_name)

    def fit(self, train_data: DataNode, dataset_id=None):
        """
        This function includes the following two procedures.
            1. tune each algorithm's hyperparameters.
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        if self.enable_meta_algorithm_selection:
            try:
                alad = AlgorithmAdvisor(task_type=self.task_type,
                                        n_algorithm=9,
                                        metric=self.metric_id)
                n_algo = 5
                model_candidates = alad.fetch_algorithm_set(
                    train_data, dataset_id=dataset_id)
                include_models = list()
                for algo in model_candidates:
                    if algo in self.include_algorithms and len(
                            include_models) < n_algo:
                        include_models.append(algo)
                self.include_algorithms = include_models
                self.logger.info(
                    'Executing meta-learning based algorithm recommendation!')
                self.logger.info('Algorithms recommended: %s' %
                                 ','.join(self.include_algorithms))
            except Exception as e:
                self.logger.error("Meta-learning failed!")

        # Check whether this dataset is balanced or not.
        if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data):
            # self.include_algorithms = imb_classication_algorithms
            self.logger.info('Input dataset is imbalanced!')
            train_data = DataBalancer().operate(train_data)
        if self.amount_of_resource is None:
            trial_num = len(self.include_algorithms) * 30
        else:
            trial_num = self.amount_of_resource

        self.solver = FirstLayerBandit(
            self.task_type,
            trial_num,
            self.include_algorithms,
            train_data,
            per_run_time_limit=self.per_run_time_limit,
            dataset_name=self.dataset_name,
            ensemble_method=self.ensemble_method,
            ensemble_size=self.ensemble_size,
            inner_opt_algorithm='fixed',
            metric=self.metric,
            fe_algo='bo',
            seed=self.seed,
            time_limit=self.time_limit,
            eval_type=self.evaluation_type,
            output_dir=self.output_dir)
        self.solver.optimize()

    def refit(self):
        self.solver.refit()

    def predict_proba(self, test_data: DataNode):
        return self.solver.predict_proba(test_data)

    def predict(self, test_data: DataNode):
        return self.solver.predict(test_data)

    def score(self, test_data: DataNode, metric_func=None):
        if metric_func is None:
            metric_func = self.metric
        return metric_func(self, test_data, test_data.data[1])

    def get_ens_model_info(self):
        if self.ensemble_method is not None:
            return self.solver.es.get_ens_model_info()
        else:
            return None
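A minimal end-to-end usage sketch for the class above; train_node and test_node stand for DataNode instances built elsewhere, and task_type is one of the CLS_TASKS / REG_TASKS constants:

automl = AutoML(time_limit=600,
                dataset_name='credit',
                task_type=task_type,
                metric='bal_acc',
                ensemble_method='ensemble_selection',
                evaluation='holdout')
automl.fit(train_node, dataset_id='credit')
predictions = automl.predict(test_node)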
Example #14
    def fit(self, train_data: DataNode, **kwargs):
        """
        This function includes the following two procedures.
            1. tune each algorithm's hyperparameters.
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        # Check whether this dataset is balanced or not.
        # if self.task_type in CLS_TASKS and is_imbalanced_dataset(train_data):
        #     self.logger.info('Input dataset is imbalanced!')
        #     train_data = DataBalancer().operate(train_data)

        dataset_id = kwargs.get('dataset_id', None)
        inner_opt_algorithm = kwargs.get('opt_strategy', 'alter_hpo')
        self.logger.info('Optimization algorithm in 2nd bandit: %s' %
                         inner_opt_algorithm)

        if self.enable_meta_algorithm_selection:
            try:
                n_algo_recommended = 5
                meta_datasets = kwargs.get('meta_datasets', None)
                self.logger.info(
                    'Executing Meta-Learning based Algorithm Recommendation.')
                alad = RankNetAdvisor(task_type=self.task_type,
                                      n_algorithm=n_algo_recommended,
                                      metric=self.metric_id)
                alad.fit()
                model_candidates = alad.fetch_algorithm_set(
                    dataset_id, datanode=train_data)
                include_models = list()
                for algo in model_candidates:
                    if algo in self.include_algorithms and len(
                            include_models) < n_algo_recommended:
                        include_models.append(algo)
                # if 'logistic_regression' in include_models:
                #     include_models.remove('logistic_regression')
                # if 'adaboost' not in include_models:
                #     include_models.append('adaboost')

                # include_models = ['extra_trees', 'adaboost', 'liblinear_svc', 'random_forest',
                #                   'libsvm_svc', 'lightgbm']
                self.include_algorithms = include_models
                self.logger.info('Final Algorithms Recommended: [%s]' %
                                 ','.join(self.include_algorithms))
            except Exception as e:
                self.logger.error(
                    "Meta-Learning based Algorithm Recommendation FAILED: %s."
                    % str(e))
                traceback.print_exc(file=sys.stdout)

        self.solver = FirstLayerBandit(
            self.task_type,
            self.amount_of_resource,
            self.include_algorithms,
            train_data,
            include_preprocessors=self.include_preprocessors,
            per_run_time_limit=self.per_run_time_limit,
            dataset_name=self.dataset_name,
            ensemble_method=self.ensemble_method,
            ensemble_size=self.ensemble_size,
            inner_opt_algorithm=inner_opt_algorithm,
            metric=self.metric,
            enable_fe=self.enable_fe,
            fe_algo='bo',
            seed=self.seed,
            time_limit=self.time_limit,
            eval_type=self.evaluation_type,
            output_dir=self.output_dir)
        self.solver.optimize()
Example #15
class AutoML(object):
    def __init__(self,
                 time_limit=300,
                 dataset_name='default_name',
                 amount_of_resource=None,
                 task_type=None,
                 metric='bal_acc',
                 include_algorithms=None,
                 include_preprocessors=None,
                 ensemble_method='ensemble_selection',
                 enable_meta_algorithm_selection=True,
                 enable_fe=True,
                 per_run_time_limit=150,
                 ensemble_size=50,
                 evaluation='holdout',
                 output_dir="logs",
                 logging_config=None,
                 random_state=1,
                 n_jobs=1):
        self.metric_id = metric
        self.metric = get_metric(self.metric_id)

        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.seed = random_state
        self.per_run_time_limit = per_run_time_limit
        self.output_dir = output_dir
        self.logging_config = logging_config
        self.logger = self._get_logger(self.dataset_name)

        self.evaluation_type = evaluation
        self.include_preprocessors = include_preprocessors

        self.amount_of_resource = amount_of_resource
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
        self.enable_fe = enable_fe
        self.task_type = task_type
        self.n_jobs = n_jobs
        self.solver = None

        # Disable meta learning
        if self.include_preprocessors is not None:
            self.enable_meta_algorithm_selection = False

        if include_algorithms is not None:
            self.include_algorithms = include_algorithms
        else:
            if task_type in CLS_TASKS:
                if task_type in [IMG_CLS, TEXT_CLS]:
                    raise ValueError(
                        'Please use AutoDL module, instead of AutoML.')
                else:
                    self.include_algorithms = list(classification_algorithms)
            elif task_type in RGS_TASKS:
                self.include_algorithms = list(regression_algorithms)
            else:
                raise ValueError("Unknown task type %s" % task_type)
        if ensemble_method is not None and ensemble_method not in ensemble_list:
            raise ValueError("%s is not supported for ensemble!" %
                             ensemble_method)

    def _get_logger(self, name):
        logger_name = 'SolnML-%s(%d)' % (name, self.seed)
        setup_logger(
            os.path.join(self.output_dir, '%s.log' % str(logger_name)),
            self.logging_config,
        )
        return get_logger(logger_name)

    def fit(self, train_data: DataNode, **kwargs):
        """
        This function includes this following two procedures.
            1. tune each algorithm's hyperparameters.
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        # Check whether this dataset is balanced or not.
        # if self.task_type in CLS_TASKS and is_imbalanced_dataset(train_data):
        #     self.logger.info('Input dataset is imbalanced!')
        #     train_data = DataBalancer().operate(train_data)

        dataset_id = kwargs.get('dataset_id', None)
        inner_opt_algorithm = kwargs.get('opt_strategy', 'alter_hpo')
        self.logger.info('Optimization algorithm in 2nd bandit: %s' %
                         inner_opt_algorithm)

        if self.enable_meta_algorithm_selection:
            try:
                n_algo_recommended = 5
                meta_datasets = kwargs.get('meta_datasets', None)
                self.logger.info(
                    'Executing Meta-Learning based Algorithm Recommendation.')
                alad = RankNetAdvisor(task_type=self.task_type,
                                      n_algorithm=n_algo_recommended,
                                      metric=self.metric_id)
                alad.fit()
                model_candidates = alad.fetch_algorithm_set(
                    dataset_id, datanode=train_data)
                include_models = list()
                for algo in model_candidates:
                    if algo in self.include_algorithms and len(
                            include_models) < n_algo_recommended:
                        include_models.append(algo)
                # if 'logistic_regression' in include_models:
                #     include_models.remove('logistic_regression')
                # if 'adaboost' not in include_models:
                #     include_models.append('adaboost')

                # include_models = ['extra_trees', 'adaboost', 'liblinear_svc', 'random_forest',
                #                   'libsvm_svc', 'lightgbm']
                self.include_algorithms = include_models
                self.logger.info('Final Algorithms Recommended: [%s]' %
                                 ','.join(self.include_algorithms))
            except Exception as e:
                self.logger.error(
                    "Meta-Learning based Algorithm Recommendation FAILED: %s."
                    % str(e))
                traceback.print_exc(file=sys.stdout)

        self.solver = FirstLayerBandit(
            self.task_type,
            self.amount_of_resource,
            self.include_algorithms,
            train_data,
            include_preprocessors=self.include_preprocessors,
            per_run_time_limit=self.per_run_time_limit,
            dataset_name=self.dataset_name,
            ensemble_method=self.ensemble_method,
            ensemble_size=self.ensemble_size,
            inner_opt_algorithm=inner_opt_algorithm,
            metric=self.metric,
            enable_fe=self.enable_fe,
            fe_algo='bo',
            seed=self.seed,
            time_limit=self.time_limit,
            eval_type=self.evaluation_type,
            output_dir=self.output_dir)
        self.solver.optimize()

    def refit(self):
        self.solver.refit()

    def predict_proba(self, test_data: DataNode):
        return self.solver.predict_proba(test_data)

    def predict(self, test_data: DataNode):
        return self.solver.predict(test_data)

    def score(self, test_data: DataNode, metric_func=None):
        if metric_func is None:
            metric_func = self.metric
        return metric_func(self, test_data, test_data.data[1])

    def get_ens_model_info(self):
        if self.ensemble_method is not None:
            return self.solver.es.get_ens_model_info()
        else:
            return None

    def get_val_stats(self):
        return self.solver.get_stats()
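Unlike Example #13, this variant routes its optimization strategy through fit(**kwargs), falling back to 'alter_hpo' when opt_strategy is omitted, and disables meta-learning whenever include_preprocessors is given. A call sketch under the same assumptions as before:

automl = AutoML(task_type=task_type,
                include_preprocessors=None,  # a non-None value disables meta-learning
                enable_fe=True)
automl.fit(train_node, dataset_id='credit', opt_strategy='alter_hpo')
val_stats = automl.get_val_stats()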