def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": BAC_METRIC, "task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(D.info, include_estimators=["lda"], include_preprocessors=["pca"])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration, all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)

        print("Number of times it was worse than random guessing:" + str(np.sum(err > 1)))
    def test_5000_classes(self):
        weights = ([0.0002] * 4750) + ([0.0001] * 250)
        X, Y = sklearn.datasets.make_classification(
            n_samples=10000,
            n_features=20,
            n_classes=5000,
            n_clusters_per_class=1,
            n_informative=15,
            n_redundant=5,
            n_repeated=0,
            weights=weights,
            flip_y=0,
            class_sep=1.0,
            hypercube=True,
            shift=None,
            scale=1.0,
            shuffle=True,
            random_state=1,
        )

        self.assertEqual(250, np.sum(np.bincount(Y) == 1))
        D = Dummy()
        D.info = {"metric": ACC_METRIC, "task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 1}
        D.data = {"X_train": X, "Y_train": Y, "X_valid": X, "X_test": X}
        D.feat_type = ["numerical"] * 5000

        configuration_space = get_configuration_space(
            D.info, include_estimators=["lda"], include_preprocessors=["no_preprocessing"]
        )
        configuration = configuration_space.sample_configuration()
        D_ = copy.deepcopy(D)
        evaluator = HoldoutEvaluator(D_, configuration)
        evaluator.fit()
    def test_with_abalone(self):
        dataset = 'abalone'
        dataset_path = os.path.join(os.path.dirname(__file__), '.datasets',
                                    dataset)
        D = CompetitionDataManager(dataset_path)
        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        errors = []
        for i in range(N_TEST_RUNS):
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_, configuration,
                                          inner_cv_folds=2,
                                          outer_cv_folds=2)
            if not self._fit(evaluator):
                continue
            err = evaluator.predict()
            self.assertLess(err, 0.99)
            self.assertTrue(np.isfinite(err))
            errors.append(err)
        self.assertEqual(10, len(errors))
        # This is a reasonable bound
        self.assertLess(min(errors), 0.77)
Example #4
def _create_search_space(tmp_dir, data_info, watcher, log_function):
    task_name = 'CreateConfigSpace'
    watcher.start_task(task_name)
    config_space_path = os.path.join(tmp_dir, 'space.pcs')
    configuration_space = paramsklearn.get_configuration_space(
        data_info)
    sp_string = pcs_parser.write(configuration_space)
    _write_file_with_data(config_space_path, sp_string,
                          'Configuration space', log_function)
    watcher.stop_task(task_name)

    return configuration_space, config_space_path
def main():
    parser = ArgumentParser()

    parser.add_argument("configuration_directory",
                        metavar="configuration-directory")
    parser.add_argument("output_directory", metavar="output-directory")
    parser.add_argument("--cutoff", type=int, default=-1,
                        help="Only consider the validation performances up to "
                             "this time.")
    parser.add_argument("--num-runs", type=int, default=1)
    parser.add_argument("--only-best", type=bool, default=False,
                        help="Look only for the best configuration in the "
                             "validation files.")

    args = parser.parse_args()
    configuration_directory = args.configuration_directory
    output_dir = args.output_directory
    cutoff = int(args.cutoff)
    num_runs = args.num_runs

    for sparse, task in [(1, BINARY_CLASSIFICATION),
                         (1, MULTICLASS_CLASSIFICATION),
                         (0, BINARY_CLASSIFICATION),
                         (0, MULTICLASS_CLASSIFICATION)]:

        for metric in ['acc_metric', 'auc_metric', 'bac_metric', 'f1_metric',
                       'pac_metric']:

            output_dir_ = os.path.join(output_dir, '%s_%s_%s' % (
                metric, TASK_TYPES_TO_STRING[task], 'sparse' if sparse else 'dense'))

            configuration_space = paramsklearn.get_configuration_space(
                {'is_sparse': sparse, 'task': task}
            )

            try:
                os.makedirs(output_dir_)
            except OSError:
                # The output directory may already exist.
                pass

            outputs, configurations = retrieve_matadata(
                validation_directory=configuration_directory,
                num_runs=num_runs,
                metric=metric,
                cutoff=cutoff,
                configuration_space=configuration_space,
                only_best=args.only_best)

            if len(outputs) == 0:
                raise ValueError("Nothing found!")

            write_output(outputs, configurations, output_dir_,
                         configuration_space, metric)
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")

        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": BAC_METRIC, "task": MULTICLASS_CLASSIFICATION, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(
            D.info, include_estimators=["extra_trees"], include_preprocessors=["select_rates"]
        )

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = CVEvaluator(D_, configuration, with_predictions=True)

            if not self._fit(evaluator):
                print()
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = evaluator.predict()
            err[i] = e_
            print(err[i], configuration["classifier:__choice__"])

            num_targets = len(np.unique(Y_train))
            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that ten models were trained
            self.assertEqual(len(evaluator.models), 10)
            self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0])
            self.assertEqual(Y_optimization_pred.shape[1], num_targets)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_valid_pred.shape[1], num_targets)
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            self.assertEqual(Y_test_pred.shape[1], num_targets)
            # Test some basic statistics of the predictions
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.01)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.01)
                num_models_better_than_random += 1
        self.assertGreater(num_models_better_than_random, 5)
    def test_evaluate_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': AUC_METRIC,
            'task': BINARY_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 2
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        class Dummy2(object):

            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': task_type,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['select_rates'])
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
    def test_metalearning(self):
        dataset_name = 'digits'

        initial_challengers = {
            ACC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            AUC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'",
            BAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'weighting' "
                        "-classifier:__choice__ 'proj_logit'",
            F1_METRIC: "--initial-challengers \" "
                       "-balancing:strategy 'weighting' "
                       "-classifier:__choice__ 'proj_logit'",
            PAC_METRIC: "--initial-challengers \" "
                        "-balancing:strategy 'none' "
                        "-classifier:__choice__ 'random_forest'"
        }

        for metric in initial_challengers:
            configuration_space = get_configuration_space(
                {
                    'metric': metric,
                    'task': MULTICLASS_CLASSIFICATION,
                    'is_sparse': False
                },
                include_preprocessors=['no_preprocessing'])

            X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
            categorical = [False] * X_train.shape[1]

            meta_features_label = calc_meta_features(X_train, Y_train,
                                                     categorical, dataset_name)
            meta_features_encoded_label = calc_meta_features_encoded(X_train,
                                                                     Y_train,
                                                                     categorical,
                                                                     dataset_name)
            initial_configuration_strings_for_smac = \
                create_metalearning_string_for_smac_call(
                    meta_features_label,
                    meta_features_encoded_label,
                    configuration_space, dataset_name, metric,
                    MULTICLASS_CLASSIFICATION, False, 1, None)

            print(metric)
            print(initial_configuration_strings_for_smac[0])
            self.assertTrue(initial_configuration_strings_for_smac[
                                0].startswith(initial_challengers[metric]))
Example #10
def _create_search_space(
    tmp_dir, data_info, backend, watcher, logger, include_estimators=None, include_preprocessors=None
):
    task_name = "CreateConfigSpace"
    watcher.start_task(task_name)
    configspace_path = os.path.join(tmp_dir, "space.pcs")
    configuration_space = paramsklearn.get_configuration_space(
        data_info, include_estimators=include_estimators, include_preprocessors=include_preprocessors
    )
    sp_string = pcs_parser.write(configuration_space)
    backend.write_txt_file(configspace_path, sp_string, "Configuration space")
    watcher.stop_task(task_name)

    return configuration_space, configspace_path
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), '.test')

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset('boston')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']
        D.name = 'test'

        configuration_space = get_configuration_space(D.info)

        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(D, configuration,
                                         with_predictions=True,
                                         all_scoring_functions=True,
                                         output_dir=output_dir,
                                         output_y_test=True)

            if not self._fit(evaluator):
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(os.path.exists(os.path.join(
                output_dir, '.auto-sklearn', 'true_targets_ensemble.npy')))
            break
    def test_evaluate_multiclass_classification_all_metrics(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': BAC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        # Test all scoring functions
        err = []
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration,
                                         all_scoring_functions=True)
            if not self._fit(evaluator):
                continue

            err.append(evaluator.predict())
            print(err[-1])

            self.assertIsInstance(err[-1], dict)
            for key in err[-1]:
                self.assertEqual(len(err[-1]), 5)
                self.assertTrue(np.isfinite(err[-1][key]))
                self.assertGreaterEqual(err[-1][key], 0.0)
    def test_evaluate_multilabel_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')
        Y_train = np.array(convert_to_bin(Y_train, 3))
        Y_train[:, -1] = 1
        Y_test = np.array(convert_to_bin(Y_test, 3))
        Y_test[:, -1] = 1

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': F1_METRIC,
            'task': MULTILABEL_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            print(err[i])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
    def test_evaluate_regression(self):
        X_train, Y_train, X_test, Y_test = get_dataset("boston")

        X_valid = X_test[:200,]
        Y_valid = Y_test[:200,]
        X_test = X_test[200:,]
        Y_test = Y_test[200:,]

        D = Dummy()
        D.info = {"metric": R2_METRIC, "task": REGRESSION, "is_sparse": False, "label_num": 1}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = [
            "numerical",
            "Numerical",
            "numerical",
            "numerical",
            "numerical",
            "numerical",
            "numerical",
            "numerical",
            "numerical",
            "numerical",
            "numerical",
        ]

        configuration_space = get_configuration_space(
            D.info, include_estimators=["extra_trees"], include_preprocessors=["no_preprocessing"]
        )

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)

        print("Number of times it was worse than random guessing:" + str(np.sum(err > 1)))
    def test_evaluate_regression(self):
        X_train, Y_train, X_test, Y_test = get_dataset('boston')

        X_valid = X_test[:200, ]
        Y_valid = Y_test[:200, ]
        X_test = X_test[200:, ]
        Y_test = Y_test[200:, ]

        D = Dummy()
        D.info = {
            'metric': R2_METRIC,
            'task': REGRESSION,
            'is_sparse': False,
            'label_num': 1
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical',
                       'numerical', 'numerical', 'numerical', 'numerical',
                       'numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['extra_trees'],
            include_preprocessors=['no_preprocessing'])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)
    def test_file_output(self):
        output_dir = os.path.join(os.getcwd(), ".test")

        try:
            shutil.rmtree(output_dir)
        except Exception:
            pass

        X_train, Y_train, X_test, Y_test = get_dataset("boston")
        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": R2_METRIC, "task": REGRESSION, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]
        D.name = "test"

        configuration_space = get_configuration_space(D.info)

        while True:
            configuration = configuration_space.sample_configuration()
            evaluator = HoldoutEvaluator(
                D,
                configuration,
                with_predictions=True,
                all_scoring_functions=True,
                output_dir=output_dir,
                output_y_test=True,
            )

            if not self._fit(evaluator):
                continue
            evaluator.predict()
            evaluator.file_output()

            self.assertTrue(os.path.exists(os.path.join(output_dir, ".auto-sklearn", "true_targets_ensemble.npy")))
            break
    def test_evaluate_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": AUC_METRIC, "task": BINARY_CLASSIFICATION, "is_sparse": False, "label_num": 2}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(D.info, include_estimators=["lda"], include_preprocessors=["pca"])

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)

            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            self.assertTrue(np.isfinite(err[i]))
            print(err[i])

            self.assertGreaterEqual(err[i], 0.0)

        print("Number of times it was worse than random guessing:" + str(np.sum(err > 1)))
    def test_predict_proba_binary_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")

        eliminate_class_two = Y_train != 2
        X_train = X_train[eliminate_class_two]
        Y_train = Y_train[eliminate_class_two]

        eliminate_class_two = Y_test != 2
        X_test = X_test[eliminate_class_two]
        Y_test = Y_test[eliminate_class_two]

        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        class Dummy2(object):
            def predict_proba(self, y, batch_size=200):
                return np.array([[0.1, 0.9], [0.7, 0.3]])

        model = Dummy2()
        task_type = BINARY_CLASSIFICATION

        D = Dummy()
        D.info = {"metric": BAC_METRIC, "task": task_type, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(
            D.info, include_estimators=["lda"], include_preprocessors=["select_rates"]
        )
        configuration = configuration_space.sample_configuration()

        evaluator = HoldoutEvaluator(D, configuration)
        pred = evaluator.predict_proba(None, model, task_type)
        expected = [[0.9], [0.3]]
        for i in range(len(expected)):
            self.assertEqual(expected[i], pred[i])
    def test_evaluate_multilabel_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset("iris")
        Y_train = np.array(convert_to_bin(Y_train, 3))
        Y_train[:, -1] = 1
        Y_test = np.array(convert_to_bin(Y_test, 3))
        Y_test[:, -1] = 1

        X_valid = X_test[:25,]
        Y_valid = Y_test[:25,]
        X_test = X_test[25:,]
        Y_test = Y_test[25:,]

        D = Dummy()
        D.info = {"metric": F1_METRIC, "task": MULTILABEL_CLASSIFICATION, "is_sparse": False, "label_num": 3}
        D.data = {"X_train": X_train, "Y_train": Y_train, "X_valid": X_valid, "X_test": X_test}
        D.feat_type = ["numerical", "Numerical", "numerical", "numerical"]

        configuration_space = get_configuration_space(
            D.info, include_estimators=["extra_trees"], include_preprocessors=["no_preprocessing"]
        )

        err = np.zeros([N_TEST_RUNS])
        for i in range(N_TEST_RUNS):
            print("Evaluate configuration: %d; result:" % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = HoldoutEvaluator(D_, configuration)
            if not self._fit(evaluator):
                continue
            err[i] = evaluator.predict()
            print(err[i])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)

        print("Number of times it was worse than random guessing:" + str(np.sum(err > 1)))
    def test_evaluate_multiclass_classification(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': ACC_METRIC,
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'label_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['lda'],
            include_preprocessors=['pca'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = NestedCVEvaluator(D_, configuration,
                                          with_predictions=True,
                                          all_scoring_functions=True)

            if not self._fit(evaluator):
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_[ACC_METRIC]
            print(err[i], configuration['classifier:__choice__'])
            print(e_['outer:bac_metric'], e_[BAC_METRIC])

            # Test the outer CV
            num_targets = len(np.unique(Y_train))
            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Test that five outer models were trained
            self.assertEqual(len(evaluator.outer_models), 5)
            self.assertTrue(all([model is not None
                                 for model in evaluator.outer_models]))

            self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0])
            self.assertEqual(Y_optimization_pred.shape[1], num_targets)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_valid_pred.shape[1], num_targets)
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            self.assertEqual(Y_test_pred.shape[1], num_targets)
            # Test some basic statistics of the predictions
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.1)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.1)
                num_models_better_than_random += 1

            # Test the inner CV
            self.assertEqual(len(evaluator.inner_models), 5)
            for fold in range(5):
                self.assertEqual(len(evaluator.inner_models[fold]), 5)
                self.assertTrue(all([model is not None
                                     for model in evaluator.inner_models[fold]
                                     ]))
                self.assertGreaterEqual(len(evaluator.outer_indices[fold][0]),
                                        75)
                for inner_fold in range(5):
                    self.assertGreaterEqual(
                        len(evaluator.inner_indices[fold][inner_fold][0]), 60)

        self.assertGreater(num_models_better_than_random, 9)
    def test_metalearning(self):
        dataset_name = 'digits'

        initial_challengers = {
            'acc_metric': ["--initial-challengers \" "
                           "-adaboost:algorithm 'SAMME.R' "
                           "-adaboost:learning_rate '0.400363929326' "
                           "-adaboost:max_depth '5' "
                           "-adaboost:n_estimators '319' "
                           "-balancing:strategy 'none' "
                           "-classifier 'adaboost' "
                           "-imputation:strategy 'most_frequent' "
                           "-preprocessor 'no_preprocessing' "
                           "-rescaling:strategy 'min/max'\""],
            'auc_metric': ["--initial-challengers \" "
                           "-adaboost:algorithm 'SAMME.R' "
                           "-adaboost:learning_rate '0.966883114819' "
                           "-adaboost:max_depth '5' "
                           "-adaboost:n_estimators '412' "
                           "-balancing:strategy 'weighting' "
                           "-classifier 'adaboost' "
                           "-imputation:strategy 'median' "
                           "-preprocessor 'no_preprocessing' "
                           "-rescaling:strategy 'min/max'\""],
            'bac_metric': ["--initial-challengers \" "
                           "-adaboost:algorithm 'SAMME.R' "
                           "-adaboost:learning_rate '0.400363929326' "
                           "-adaboost:max_depth '5' "
                           "-adaboost:n_estimators '319' "
                           "-balancing:strategy 'none' "
                           "-classifier 'adaboost' "
                           "-imputation:strategy 'most_frequent' "
                           "-preprocessor 'no_preprocessing' "
                           "-rescaling:strategy 'min/max'\""],
            'f1_metric': ["--initial-challengers \" "
                          "-adaboost:algorithm 'SAMME.R' "
                          "-adaboost:learning_rate '0.966883114819' "
                          "-adaboost:max_depth '5' "
                          "-adaboost:n_estimators '412' "
                          "-balancing:strategy 'weighting' "
                          "-classifier 'adaboost' "
                          "-imputation:strategy 'median' "
                          "-preprocessor 'no_preprocessing' "
                          "-rescaling:strategy 'min/max'\""],
            'pac_metric': ["--initial-challengers \" "
                           "-adaboost:algorithm 'SAMME.R' "
                           "-adaboost:learning_rate '0.400363929326' "
                           "-adaboost:max_depth '5' "
                           "-adaboost:n_estimators '319' "
                           "-balancing:strategy 'none' "
                           "-classifier 'adaboost' "
                           "-imputation:strategy 'most_frequent' "
                           "-preprocessor 'no_preprocessing' "
                           "-rescaling:strategy 'min/max'\""]
        }

        for metric in initial_challengers:
            configuration_space = get_configuration_space(
                {
                    'metric': metric,
                    'task': MULTICLASS_CLASSIFICATION,
                    'is_sparse': False
                },
                include_preprocessors=['no_preprocessing'])

            X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
            categorical = [False] * X_train.shape[1]

            meta_features_label = calc_meta_features(X_train, Y_train,
                                                     categorical, dataset_name)
            meta_features_encoded_label = calc_meta_features_encoded(X_train,
                                                                     Y_train,
                                                                     categorical,
                                                                     dataset_name)
            initial_configuration_strings_for_smac = \
                create_metalearning_string_for_smac_call(
                    meta_features_label,
                    meta_features_encoded_label,
                    configuration_space, dataset_name, metric,
                    MULTICLASS_CLASSIFICATION, False, 1, None)

            print(metric)
            self.assertEqual(initial_challengers[metric],
                             initial_configuration_strings_for_smac)
def main(dataset_info, mode, seed, params, mode_args=None):
    """This command line interface has three different operation modes:

    * CV: useful for the Tweakathon
    * 1/3 test split: useful to evaluate a configuration
    * cv on 2/3 train split: useful to optimize hyperparameters in a training
      mode before testing a configuration on the 1/3 test split.

    It must by no means be used for the Auto part of the competition!
    """

    debug_log("Run script")
    num_run = None
    if mode != 'test':
        num_run = get_new_run_num()

    for key in params:
        try:
            params[key] = int(params[key])
        except Exception:
            try:
                params[key] = float(params[key])
            except Exception:
                pass

    if seed is not None:
        seed = int(float(seed))
    else:
        seed = 1

    output_dir = os.getcwd()

    D = store_and_or_load_data(dataset_info=dataset_info, outputdir=output_dir)

    cs = get_configuration_space(D.info)
    configuration = configuration_space.Configuration(cs, params)
    metric = D.info['metric']

    global evaluator
    # Train/test split
    if mode == 'holdout':
        make_mode_holdout(
            D,
            seed,
            configuration,
            num_run)
    elif mode == 'test':
        make_mode_holdout(
            D,
            seed,
            configuration, metric)

    elif mode == 'cv':
        make_mode_cv(
            D,
            seed,
            configuration,
            num_run,
            mode_args['folds'])
    elif mode == 'partial_cv':
        make_mode_partial_cv(
            D,
            seed,
            configuration,
            num_run,
            metric,
            mode_args['folds'],
            mode_args['fold'])
    elif mode == 'nested_cv':
        make_mode_nested_cv(
            D,
            seed,
            configuration,
            num_run,
            mode_args['inner_folds'],
            mode_args['outer_folds'])
    else:
        raise ValueError('Must choose a legal mode.')
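
# For orientation, a minimal, hypothetical sketch of how main() above might be
# invoked for the nested-CV mode described in its docstring. The dataset path,
# seed, and hyperparameter values below are placeholders (in practice they are
# supplied by SMAC); this only illustrates the call signature and the mode_args
# keys used by the dispatch above.
example_params = {
    'balancing:strategy': 'none',   # flattened hyperparameters as strings,
    'classifier': 'random_forest',  # normally produced by SMAC (placeholder values)
}
main(dataset_info='/path/to/dataset',  # placeholder dataset location
     mode='nested_cv',                 # one of: holdout, test, cv, partial_cv, nested_cv
     seed=1,
     params=example_params,
     mode_args={'inner_folds': 2, 'outer_folds': 2})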
    def test_evaluate_multiclass_classification_partial_fit(self):
        X_train, Y_train, X_test, Y_test = get_dataset('iris')

        X_valid = X_test[:25, ]
        Y_valid = Y_test[:25, ]
        X_test = X_test[25:, ]
        Y_test = Y_test[25:, ]

        D = Dummy()
        D.info = {
            'metric': 'bac_metric',
            'task': MULTICLASS_CLASSIFICATION,
            'is_sparse': False,
            'target_num': 3
        }
        D.data = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'X_test': X_test
        }
        D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical']

        configuration_space = get_configuration_space(
            D.info,
            include_estimators=['ridge'],
            include_preprocessors=['select_rates'])

        err = np.zeros([N_TEST_RUNS])
        num_models_better_than_random = 0
        for i in range(N_TEST_RUNS):
            print('Evaluate configuration: %d; result:' % i)
            configuration = configuration_space.sample_configuration()
            D_ = copy.deepcopy(D)
            evaluator = CVEvaluator(D_, configuration, with_predictions=True)

            if not self._partial_fit(evaluator, fold=i % 10):
                print()
                continue
            e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \
                evaluator.predict()
            err[i] = e_
            print(err[i], configuration['classifier'])

            self.assertTrue(np.isfinite(err[i]))
            self.assertGreaterEqual(err[i], 0.0)
            # Ten model slots are created, but only the requested fold's model
            # was actually fit
            self.assertEqual(len(evaluator.models), 10)
            self.assertEqual(1, np.sum([model is not None
                                        for model in evaluator.models]))
            self.assertLess(Y_optimization_pred.shape[0], 13)
            self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0])
            self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0])
            # Test some basic statistics of the predictions
            if err[i] < 0.5:
                self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_valid_pred.std(), 0.01)
                self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666)
                self.assertGreaterEqual(Y_test_pred.std(), 0.01)
                num_models_better_than_random += 1
        self.assertGreaterEqual(num_models_better_than_random, 5)