Beispiel #1
0
def evaluate_hmab(algorithms,
                  dataset,
                  run_id,
                  trial_num,
                  seed,
                  time_limit=1200):
    """Optimize a ``FirstLayerBandit`` on ``dataset`` and pickle its scores.

    The training split is passed through ``DataBalancer`` when
    ``is_unbalanced_dataset`` flags it. After ``optimize()`` the predictions
    of ``_best_predict`` and, following ``refit()``, of ``_es_predict`` are
    scored on the test split with ``balanced_accuracy``.

    Relies on the module-level names ``hmab_flag``, ``opt_algo``,
    ``per_run_time_limit`` and ``project_dir``.

    :param algorithms: candidate algorithms forwarded to the bandit; its
        length is also part of the result file name.
    :param dataset: dataset name passed to ``load_train_test_data``.
    :param run_id: run index, used in the log line and the file name.
    :param trial_num: forwarded to the bandit and used in the file name.
    :param seed: forwarded to the bandit and used in the file name.
    :param time_limit: forwarded to the bandit and used in the file name.
    :return: None; results are printed and written to a ``.pkl`` file.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    started_at = time.time()
    train_data, test_data = load_train_test_data(
        dataset, task_type=MULTICLASS_CLS)

    # Two distinct training labels mean a binary task, anything else is
    # treated as multi-class.
    label_count = len(set(train_data.data[1]))
    if label_count == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit_options = dict(
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='holdout',
    )
    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms,
                              train_data, **bandit_options)
    bandit.optimize()
    time_taken = time.time() - started_at

    model_desc = [
        bandit.nbest_algo_ids,
        bandit.optimal_algo_id,
        bandit.final_rewards,
        bandit.action_sequence,
    ]

    # Score the prediction of the best configuration first ...
    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # ... then refit and score the ensemble prediction.
    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [
        dataset,
        validation_accuracy,
        test_accuracy,
        test_accuracy_with_ens,
        time_taken,
        model_desc,
    ]
    print(model_desc)
    print(data)

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed,
        run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
    def operate(self, input_datanode, target_fields=None):
        """Return a copy of ``input_datanode`` tagged with this transformation.

        ``self.type`` is appended to the copy's ``trans_hist``, and the copy's
        ``data_balance`` is set to 1 when ``is_unbalanced_dataset`` reports
        the input as unbalanced. ``target_fields`` is accepted but not used.

        :param input_datanode: node to copy; it is not modified here.
        :param target_fields: unused.
        :return: the tagged copy.
        """
        tagged_node = input_datanode.copy_()
        tagged_node.trans_hist.append(self.type)

        needs_balancing = is_unbalanced_dataset(input_datanode)
        if needs_balancing:
            tagged_node.data_balance = 1

        return tagged_node
Beispiel #3
0
def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    """Run ``FirstLayerBandit`` on algorithms recommended by ``AlgorithmAdvisor``.

    The advisor is built with a fixed list of excluded datasets; ``dataset``
    must be in that list and must not appear in the result of
    ``fit_meta_learner()`` (both enforced with ``assert``). At most five of the
    advisor's candidates that also occur in ``algorithms`` are handed to the
    bandit, in the advisor's order.

    Relies on the module-level names ``hmab_flag``, ``opt_algo``,
    ``per_run_time_limit`` and ``project_dir``.

    :param algorithms: allowed algorithms; its length (not the number of
        recommended ones) is part of the result file name.
    :param dataset: dataset name used for recommendation and data loading.
    :param run_id: run index, used in the log line and the file name.
    :param trial_num: forwarded to the bandit and used in the file name.
    :param seed: forwarded to the bandit and used in the file name.
    :param time_limit: forwarded to the bandit and used in the file name.
    :return: None; results are printed and written to a ``.pkl`` file.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    held_out = [
        'gina_prior2', 'pc2', 'abalone', 'wind', 'waveform-5000(2)',
        'page-blocks(1)', 'winequality_white', 'pollen',
    ]
    advisor = AlgorithmAdvisor(task_type=MULTICLASS_CLS,
                               n_algorithm=9,
                               metric='bal_acc',
                               exclude_datasets=held_out)
    max_models = 5
    assert dataset in held_out
    meta_infos = advisor.fit_meta_learner()
    assert dataset not in meta_infos

    model_candidates = advisor.fetch_algorithm_set(dataset)
    print(model_candidates)
    # Keep the first `max_models` recommendations that the caller allows.
    include_models = [
        candidate for candidate in model_candidates if candidate in algorithms
    ][:max_models]
    print('After algorithm recommendation', include_models)

    started_at = time.time()
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    if len(set(train_data.data[1])) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(
        cls_task_type,
        trial_num,
        include_models,
        train_data,
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='holdout',
    )
    bandit.optimize()
    time_taken = time.time() - started_at
    model_desc = [
        bandit.nbest_algo_ids,
        bandit.optimal_algo_id,
        bandit.final_rewards,
        bandit.action_sequence,
    ]

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # The ensemble is scored only after the bandit has been refit.
    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [
        dataset,
        validation_accuracy,
        test_accuracy,
        test_accuracy_with_ens,
        time_taken,
        model_desc,
    ]
    print(model_desc)
    print(data)

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
Beispiel #4
0
    def fit(self, train_data: DataNode, dataset_id=None):
        """
        Build a ``FirstLayerBandit`` solver for ``train_data`` and optimize it.

        This function includes the following two procedures.
            1. tune each algorithm's hyperparameters.
            2. engineer each algorithm's features automatically.

        When ``self.enable_meta_algorithm_selection`` is set, the candidate
        algorithms are first narrowed to at most five of those recommended by
        ``AlgorithmAdvisor``. That step is best effort: if it raises, the
        error is logged and fitting continues with whatever
        ``self.include_algorithms`` holds at that point.

        :param train_data: training data node; it is passed through
            ``DataBalancer`` for classification tasks that
            ``is_unbalanced_dataset`` flags.
        :param dataset_id: optional identifier forwarded to
            ``AlgorithmAdvisor.fetch_algorithm_set``.
        :return: None; the optimized solver is stored in ``self.solver``.
        """
        if self.enable_meta_algorithm_selection:
            try:
                alad = AlgorithmAdvisor(task_type=self.task_type,
                                        n_algorithm=9,
                                        metric=self.metric_id)
                n_algo = 5
                model_candidates = alad.fetch_algorithm_set(
                    train_data, dataset_id=dataset_id)
                # Keep the first `n_algo` recommendations that are also in the
                # currently allowed algorithm list, in the advisor's order.
                # NOTE(review): this can leave the list empty when nothing
                # overlaps - confirm that is handled downstream.
                include_models = [
                    algo for algo in model_candidates
                    if algo in self.include_algorithms
                ][:n_algo]
                self.include_algorithms = include_models
                self.logger.info(
                    'Executing meta-learning based algorithm recommendation!')
                self.logger.info('Algorithms recommended: %s' %
                                 ','.join(self.include_algorithms))
            except Exception as e:
                # Recommendation is optional, so do not abort the fit; but do
                # record what went wrong instead of discarding the exception.
                self.logger.error("Meta-learning failed! %s: %s" %
                                  (type(e).__name__, e))

        # Check whether this dataset is balanced or not.
        if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data):
            self.logger.info('Input dataset is imbalanced!')
            train_data = DataBalancer().operate(train_data)

        # Without an explicit budget, allow 30 trials per candidate algorithm.
        if self.amount_of_resource is None:
            trial_num = len(self.include_algorithms) * 30
        else:
            trial_num = self.amount_of_resource

        self.solver = FirstLayerBandit(
            self.task_type,
            trial_num,
            self.include_algorithms,
            train_data,
            per_run_time_limit=self.per_run_time_limit,
            dataset_name=self.dataset_name,
            ensemble_method=self.ensemble_method,
            ensemble_size=self.ensemble_size,
            inner_opt_algorithm='fixed',
            metric=self.metric,
            fe_algo='bo',
            seed=self.seed,
            time_limit=self.time_limit,
            eval_type=self.evaluation_type,
            output_dir=self.output_dir)
        self.solver.optimize()
Beispiel #5
0
def evaluate_autosklearn(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    """Tune only the random-forest HPO optimizer for ``time_limit`` seconds.

    A ``FirstLayerBandit`` is built as in the HMAB experiments, but instead of
    calling ``optimize()`` the ``'hpo'`` optimizer of its ``'random_forest'``
    sub-bandit is iterated until the time limit has elapsed. A
    ``RandomForest`` is then fit on the training split with the incumbent
    configuration (minus its ``'estimator'`` key) and scored on the test
    split with ``balanced_accuracy``.

    Relies on the module-level names ``ausk_flag``, ``opt_algo``,
    ``per_run_time_limit`` and ``project_dir``.

    :param algorithms: algorithms forwarded to the bandit; its length is also
        part of the result file name.
    :param dataset: dataset name passed to ``load_train_test_data``.
    :param run_id: run index, used in the log line and the file name.
    :param trial_num: forwarded to the bandit and used in the file name.
    :param seed: forwarded to the bandit and used in the file name.
    :param time_limit: wall-clock budget in seconds for the HPO loop; also
        forwarded to the bandit.
    :return: None; results are pickled and regular files directly inside
        ``./logs/`` are deleted afterwards.
    """
    # Log under the same flag that names the result file below.
    print('%s-%s-%d: %d' % (ausk_flag, dataset, run_id, time_limit))

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')

    # The optimizer is looked up on every use rather than cached, in case the
    # sub-bandit replaces it between iterations.
    while time.time() - _start_time < time_limit:
        bandit.sub_bandits['random_forest'].optimizer['hpo'].iterate()

    # No feature-engineering search is run here, so its output stays empty.
    fe_exp_output = dict()
    hpo_exp_output = bandit.sub_bandits['random_forest'].optimizer['hpo'].exp_output
    inc_config = bandit.sub_bandits['random_forest'].optimizer['hpo'].incumbent_config.get_dictionary()
    # 'estimator' is not passed on to the RandomForest constructor.
    inc_config.pop('estimator')
    from solnml.components.models.classification.random_forest import RandomForest
    rf = RandomForest(**inc_config)
    rf.fit(train_data.data[0], train_data.data[1])
    validation_accuracy = bandit.sub_bandits['random_forest'].optimizer['hpo'].incumbent_perf
    best_pred = rf.predict(test_data.data[0])
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # The last entry is the start timestamp, not the elapsed time.
    data = [dataset, validation_accuracy, test_accuracy, fe_exp_output, hpo_exp_output,
            _start_time]
    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        ausk_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)

    # Remove regular files directly inside the log directory; sub-directories
    # are left alone.
    del_path = './logs/'
    for entry in os.listdir(del_path):
        file_data = os.path.join(del_path, entry)
        if os.path.isfile(file_data):
            os.remove(file_data)
Beispiel #6
0
    def __init__(self, name, task_type, datanode, seed=1):
        """Initialise the instance around a private copy of ``datanode``.

        The copy becomes both the root node and the initial incumbent, and is
        the first node added to a fresh ``TransformationGraph``. Time budget
        and maximum evaluation count start out unset.

        :param name: stored as ``self.name``.
        :param task_type: stored as ``self.task_type``.
        :param datanode: source data node; copied with ``copy_()`` and also
            checked with ``is_unbalanced_dataset``.
        :param seed: stored as ``self._seed``.
        """
        self.name = name
        self._seed = seed

        root = datanode.copy_()
        self.root_node = root
        self.incumbent = root
        self.task_type = task_type

        self.graph = TransformationGraph()
        self.graph.add_node(self.root_node)

        self.time_budget = None
        self.maximum_evaluation_num = None

        logger_name = ".".join([self.__module__, self.__class__.__name__])
        self.logger = get_logger(logger_name)

        # True when the dataset is NOT reported as unbalanced.
        self.if_bal = not is_unbalanced_dataset(data_node=datanode)
def evaluate_hmab(algorithms,
                  dataset,
                  run_id,
                  trial_num,
                  seed,
                  time_limit=1200):
    """Optimize a ``FirstLayerBandit`` with ``eval_type='partial'`` and pickle it.

    Besides the validation/test scores, the ``exp_output`` of the ``'fe'`` and
    ``'hpo'`` optimizers of the ``'random_forest'`` sub-bandit and the start
    timestamp are stored. Regular files directly inside ``./logs/`` are
    deleted at the end.

    Relies on the module-level names ``hmab_flag``, ``opt_algo``,
    ``per_run_time_limit`` and ``project_dir``.

    :param algorithms: candidate algorithms forwarded to the bandit; its
        length is also part of the result file name.
    :param dataset: dataset name passed to ``load_train_test_data``.
    :param run_id: run index, used in the log line and the file name.
    :param trial_num: forwarded to the bandit and used in the file name.
    :param seed: forwarded to the bandit and used in the file name.
    :param time_limit: forwarded to the bandit and used in the file name.
    :return: None.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    started_at = time.time()
    train_data, test_data = load_train_test_data(
        dataset, task_type=MULTICLASS_CLS)
    if len(set(train_data.data[1])) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit_options = dict(
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='partial',
    )
    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms,
                              train_data, **bandit_options)

    # Disabled alternative: iterate only the random-forest 'fe' optimizer
    # until the time limit is reached instead of running the full bandit.
    bandit.optimize()

    rf_optimizers = bandit.sub_bandits['random_forest'].optimizer
    fe_exp_output = rf_optimizers['fe'].exp_output
    hpo_exp_output = rf_optimizers['hpo'].exp_output

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # The ensemble is scored only after the bandit has been refit.
    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    # The last entry is the start timestamp, not the elapsed time.
    data = [
        dataset,
        validation_accuracy,
        test_accuracy,
        test_accuracy_with_ens,
        fe_exp_output,
        hpo_exp_output,
        started_at,
    ]

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed,
        run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)

    # Remove regular files directly inside the log directory.
    del_path = './logs/'
    for entry in os.listdir(del_path):
        file_data = del_path + "/" + entry
        if not os.path.isfile(file_data):
            continue
        os.remove(file_data)